Apache Mesos
metrics.hpp
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 
17 #ifndef __MASTER_METRICS_HPP__
18 #define __MASTER_METRICS_HPP__
19 
20 #include <string>
21 #include <vector>
22 
24 
29 
30 #include <stout/hashmap.hpp>
31 
32 #include "mesos/mesos.hpp"
33 #include "mesos/type_utils.hpp"
34 
35 namespace mesos {
36 namespace internal {
37 namespace master {
38 
39 class Master;
40 
41 struct Metrics
42 {
43  explicit Metrics(const Master& master);
44 
45  ~Metrics();
46 
49 
55 
60 
62 
63  // Contains gauges and counters named 'prefix/pending', 'prefix/recovering', etc.
64  struct OperationStates {
65  OperationStates(const std::string& prefix);
67 
68  void update(const OperationState& state, int delta);
69 
71 
80  };
81 
82  // Operation states are tracked at two granularities: master-wide and
83  // per-operation-type. Additionally, for every framework the types of
84  // operations are tracked, but not their states.
85  //
86  // NOTE: These metrics are missing the implicit operation statuses that
87  // are generated on operation reconciliation. For example, when a framework
88  // queries the state of an unknown operation on an unreachable agent,
89  // the master will generate an `OPERATION_UNREACHABLE` update that is not
90  // counted by these metrics.
93 
96  const OperationState& state);
97 
100  const OperationState& state);
101 
104  const OperationState& oldState,
105  const OperationState& newState);
106 
108 
109  // Task state metrics.
123 
126 
127  // NOTE: Sources and reasons are only tracked for terminal task states.
129 
130  // Message counters.
132 
133  // HTTP cache hits.
134  // TODO(bevers): Collect these per endpoint once per-endpoint
135  // metrics get merged.
137 
138  // Metrics specific to frameworks of a common principal.
139  // These metrics have names prefixed by "frameworks/<principal>/".
140  struct Frameworks
141  {
142  // Counters for messages from all frameworks of this principal.
143  // NOTE: We only count messages from active scheduler
144  // *instances* while they are *registered*, i.e., messages
145  // prior to the completion of (re)registration
146  // (AuthenticateMessage and (Re)RegisterFrameworkMessage) and
147  // messages from an inactive scheduler instance (after the
148  // framework has failed over) are not counted.
149 
150  // Framework messages received (before processing).
152 
153  // Framework messages processed.
154  // NOTE: This doesn't include dropped messages. Processing of
155  // a message may be throttled by a RateLimiter if one is
156  // configured for this principal. Also, due to the Master's
157  // asynchronous nature, this doesn't necessarily mean the work
158  // requested by this message has finished.
160 
161  explicit Frameworks(const std::string& principal)
162  : messages_received("frameworks/" + principal + "/messages_received"),
163  messages_processed("frameworks/" + principal + "/messages_processed")
164  {
165  process::metrics::add(messages_received);
166  process::metrics::add(messages_processed);
167  }
168 
170  {
171  process::metrics::remove(messages_received);
172  process::metrics::remove(messages_processed);
173  }
174  };
175 
176  // Per-framework-principal metrics keyed by the framework
177  // principal.
179 
180  // Messages from schedulers.
196 
197  // Messages from executors.
199 
200  // Messages from slaves.
208 
209  // Messages from both schedulers and slaves.
211 
216 
219 
222 
225 
228 
229  // Recovery counters.
231 
232  // Process metrics.
236 
237  // Successful registry operations.
244 
245  // Slave observer metrics.
246  //
247  // TODO(neilc): The `slave_shutdowns_xxx` metrics are deprecated and
248  // will always be zero. Remove in Mesos 2.0.
252 
256 
257  // Non-revocable resources.
258  std::vector<process::metrics::PullGauge> resources_total;
259  std::vector<process::metrics::PullGauge> resources_used;
260  std::vector<process::metrics::PullGauge> resources_percent;
261 
262  // Revocable resources.
263  std::vector<process::metrics::PullGauge> resources_revocable_total;
264  std::vector<process::metrics::PullGauge> resources_revocable_used;
265  std::vector<process::metrics::PullGauge> resources_revocable_percent;
266 
268 
270  const TaskState& state,
271  const TaskStatus::Source& source,
272  const TaskStatus::Reason& reason);
273 };
274 
275 
277 {
279  const FrameworkInfo& _frameworkInfo,
280  bool publishPerFrameworkMetrics);
281 
282  ~FrameworkMetrics();
283 
284  void incrementCall(const mesos::scheduler::Call::Type& callType);
285 
286  void incrementEvent(const mesos::scheduler::Event& event);
287 
288  // Overloads to convert unversioned messages into events.
289  void incrementEvent(const FrameworkErrorMessage& message);
290  void incrementEvent(const ExitedExecutorMessage& message);
291  void incrementEvent(const LostSlaveMessage& message);
292  void incrementEvent(const InverseOffersMessage& message);
293  void incrementEvent(const ExecutorToFrameworkMessage& message);
294  void incrementEvent(const ResourceOffersMessage& message);
295  void incrementEvent(const RescindResourceOfferMessage& message);
296  void incrementEvent(const RescindInverseOfferMessage& message);
297  void incrementEvent(const FrameworkRegisteredMessage& message);
298  void incrementEvent(const FrameworkReregisteredMessage& message);
299  void incrementEvent(const StatusUpdateMessage& message);
300  void incrementEvent(const UpdateOperationStatusMessage& message);
301 
302  void incrementTaskState(const TaskState& state);
303  void decrementActiveTaskState(const TaskState& state);
304 
305  void incrementOperation(const Offer::Operation& operation);
306 
307  template <typename T> void addMetric(const T& metric);
308  template <typename T> void removeMetric(const T& metric);
309 
310  const std::string metricPrefix;
311 
313 
315 
318 
321 
326 
328 
330 
333 };
334 
335 
336 std::string getFrameworkMetricPrefix(const FrameworkInfo& frameworkInfo);
337 
338 } // namespace master {
339 } // namespace internal {
340 } // namespace mesos {
341 
342 #endif // __MASTER_METRICS_HPP__
process::metrics::Counter messages_executor_to_framework
Definition: metrics.hpp:198
process::metrics::Counter invalid_operation_status_updates
Definition: metrics.hpp:224
process::metrics::Counter messages_reregister_slave
Definition: metrics.hpp:202
~Frameworks()
Definition: metrics.hpp:169
process::metrics::PullGauge outstanding_offers
Definition: metrics.hpp:61
process::metrics::PullGauge frameworks_inactive
Definition: metrics.hpp:59
process::metrics::Counter messages_reregister_framework
Definition: metrics.hpp:182
process::metrics::Counter recovery_slave_removals
Definition: metrics.hpp:230
Definition: master.hpp:27
process::metrics::PullGauge tasks_killing
Definition: metrics.hpp:114
Future< Nothing > remove(const Metric &metric)
Definition: metrics.hpp:109
process::metrics::Counter tasks_finished
Definition: metrics.hpp:115
hashmap< TaskStatus::Source, Reasons > SourcesReasons
Definition: metrics.hpp:125
process::metrics::PushGauge pending
Definition: metrics.hpp:72
process::metrics::Counter messages_reconcile_tasks
Definition: metrics.hpp:193
process::metrics::PullGauge slaves_connected
Definition: metrics.hpp:50
std::vector< process::metrics::PullGauge > resources_total
Definition: metrics.hpp:258
process::metrics::PushGauge unreachable
Definition: metrics.hpp:74
hashmap< TaskState, process::metrics::PushGauge > active_task_states
Definition: metrics.hpp:329
process::metrics::Counter messages_deactivate_framework
Definition: metrics.hpp:184
OperationStates operation_states
Definition: metrics.hpp:91
process::metrics::Counter slave_unreachable_completed
Definition: metrics.hpp:254
process::metrics::Counter valid_status_updates
Definition: metrics.hpp:217
void decrementOperationState(Offer::Operation::Type type, const OperationState &state)
process::metrics::Counter total
Definition: metrics.hpp:70
process::metrics::Counter finished
Definition: metrics.hpp:75
process::metrics::PullGauge event_queue_dispatches
Definition: metrics.hpp:234
constexpr const char * prefix
Definition: os.hpp:96
process::metrics::Counter messages_update_slave
Definition: metrics.hpp:207
mesos::v1::scheduler::Call Call
Definition: mesos.hpp:2882
process::metrics::Counter calls
Definition: metrics.hpp:316
process::metrics::PullGauge tasks_unreachable
Definition: metrics.hpp:113
process::metrics::PullGauge tasks_staging
Definition: metrics.hpp:110
void incrementOperationState(Offer::Operation::Type type, const OperationState &state)
const std::string metricPrefix
Definition: metrics.hpp:310
process::metrics::Counter slave_shutdowns_scheduled
Definition: metrics.hpp:249
void incrementTasksStates(const TaskState &state, const TaskStatus::Source &source, const TaskStatus::Reason &reason)
process::metrics::Counter messages_kill_task
Definition: metrics.hpp:185
process::metrics::Counter tasks_failed
Definition: metrics.hpp:116
hashmap< Offer::Operation::Type, OperationStates > operation_type_states
Definition: metrics.hpp:92
process::metrics::Counter error
Definition: metrics.hpp:77
void incrementInvalidSchedulerCalls(const mesos::scheduler::Call &call)
process::metrics::Counter valid_status_update_acknowledgements
Definition: metrics.hpp:220
process::metrics::Counter messages_processed
Definition: metrics.hpp:159
Operation
Definition: cgroups.hpp:444
process::metrics::Counter tasks_lost
Definition: metrics.hpp:118
process::metrics::Counter slave_reregistrations
Definition: metrics.hpp:239
Future< Nothing > add(const T &metric)
Definition: metrics.hpp:95
hashmap< std::string, process::Owned< Frameworks > > frameworks
Definition: metrics.hpp:178
process::metrics::PushGauge subscribed
Definition: metrics.hpp:314
process::metrics::Counter http_cache_hits
Definition: metrics.hpp:136
process::metrics::Counter events
Definition: metrics.hpp:319
void transitionOperationState(Offer::Operation::Type type, const OperationState &oldState, const OperationState &newState)
Definition: counter.hpp:26
std::vector< process::metrics::PullGauge > resources_revocable_total
Definition: metrics.hpp:263
mesos::v1::scheduler::Event Event
Definition: mesos.hpp:2883
Definition: hashmap.hpp:38
process::metrics::Counter messages_status_update_acknowledgement
Definition: metrics.hpp:186
process::metrics::Counter valid_framework_to_executor_messages
Definition: metrics.hpp:212
process::metrics::PullGauge slaves_disconnected
Definition: metrics.hpp:51
process::metrics::PushGauge recovering
Definition: metrics.hpp:73
process::metrics::Counter messages_authenticate
Definition: metrics.hpp:210
process::metrics::Counter invalid_executor_to_framework_messages
Definition: metrics.hpp:215
void update(const OperationState &state, int delta)
process::metrics::PullGauge frameworks_active
Definition: metrics.hpp:58
std::vector< process::metrics::PullGauge > resources_revocable_used
Definition: metrics.hpp:264
process::metrics::Counter slave_unreachable_canceled
Definition: metrics.hpp:255
process::metrics::PullGauge event_queue_http_requests
Definition: metrics.hpp:235
process::metrics::Counter tasks_error
Definition: metrics.hpp:119
process::metrics::Counter valid_executor_to_framework_messages
Definition: metrics.hpp:214
Definition: pull_gauge.hpp:46
process::metrics::PullGauge tasks_running
Definition: metrics.hpp:112
std::vector< process::metrics::PullGauge > resources_percent
Definition: metrics.hpp:260
process::metrics::Counter tasks_gone
Definition: metrics.hpp:121
process::metrics::Counter messages_launch_tasks
Definition: metrics.hpp:188
process::metrics::Counter slave_unreachable_scheduled
Definition: metrics.hpp:253
process::metrics::Counter dropped
Definition: metrics.hpp:78
process::metrics::Counter dropped_messages
Definition: metrics.hpp:131
process::metrics::Counter invalid_framework_to_executor_messages
Definition: metrics.hpp:213
Definition: agent.hpp:25
process::metrics::Counter messages_suppress_offers
Definition: metrics.hpp:191
process::metrics::PullGauge uptime_secs
Definition: metrics.hpp:47
process::metrics::PullGauge frameworks_disconnected
Definition: metrics.hpp:57
process::metrics::Counter slave_shutdowns_canceled
Definition: metrics.hpp:251
process::metrics::Counter offers_rescinded
Definition: metrics.hpp:325
process::metrics::PullGauge slaves_inactive
Definition: metrics.hpp:53
Metrics(const Master &master)
process::metrics::Counter messages_status_update
Definition: metrics.hpp:204
process::metrics::Counter messages_revive_offers
Definition: metrics.hpp:190
process::metrics::Counter slave_removals
Definition: metrics.hpp:240
process::metrics::Counter tasks_killed
Definition: metrics.hpp:117
process::metrics::Counter messages_register_framework
Definition: metrics.hpp:181
process::metrics::PushGauge operator_event_stream_subscribers
Definition: metrics.hpp:107
process::metrics::Counter operations
Definition: metrics.hpp:331
process::metrics::Counter slave_shutdowns_completed
Definition: metrics.hpp:250
process::metrics::Counter messages_register_slave
Definition: metrics.hpp:201
process::metrics::PullGauge slaves_active
Definition: metrics.hpp:52
process::metrics::Counter slave_removals_reason_unregistered
Definition: metrics.hpp:242
process::metrics::Counter messages_operation_status_update_acknowledgement
Definition: metrics.hpp:195
hashmap< mesos::scheduler::Call::Type, process::metrics::Counter > call_types
Definition: metrics.hpp:317
Definition: attributes.hpp:24
process::metrics::Counter messages_exited_executor
Definition: metrics.hpp:206
process::metrics::Counter messages_resource_request
Definition: metrics.hpp:187
process::metrics::Counter gone_by_operator
Definition: metrics.hpp:79
process::metrics::Counter messages_framework_to_executor
Definition: metrics.hpp:194
hashmap< TaskState, SourcesReasons > tasks_states
Definition: metrics.hpp:128
bool publishPerFrameworkMetrics
Definition: metrics.hpp:312
process::metrics::Counter slave_removals_reason_registered
Definition: metrics.hpp:243
Type
Definition: capabilities.hpp:79
process::metrics::Counter invalid_operation_status_update_acknowledgements
Definition: metrics.hpp:227
std::string getFrameworkMetricPrefix(const FrameworkInfo &frameworkInfo)
std::vector< process::metrics::PullGauge > resources_used
Definition: metrics.hpp:259
process::metrics::Counter messages_unregister_slave
Definition: metrics.hpp:203
hashmap< mesos::scheduler::Event::Type, process::metrics::Counter > event_types
Definition: metrics.hpp:320
process::metrics::Counter tasks_gone_by_operator
Definition: metrics.hpp:122
process::metrics::Counter messages_decline_offers
Definition: metrics.hpp:189
Try< uint32_t > type(const std::string &path)
process::metrics::Counter valid_operation_status_updates
Definition: metrics.hpp:223
hashmap< TaskState, process::metrics::Counter > terminal_task_states
Definition: metrics.hpp:327
process::metrics::Counter messages_operation_status_update
Definition: metrics.hpp:205
process::metrics::Counter messages_reconcile_operations
Definition: metrics.hpp:192
Definition: metrics.hpp:41
Frameworks(const std::string &principal)
Definition: metrics.hpp:161
process::metrics::Counter messages_unregister_framework
Definition: metrics.hpp:183
process::metrics::PullGauge tasks_starting
Definition: metrics.hpp:111
process::metrics::Counter failed
Definition: metrics.hpp:76
process::metrics::Counter messages_received
Definition: metrics.hpp:151
process::metrics::Counter invalid_status_updates
Definition: metrics.hpp:218
process::metrics::Counter invalid_status_update_acknowledgements
Definition: metrics.hpp:221
process::metrics::PullGauge frameworks_connected
Definition: metrics.hpp:56
hashmap< Offer::Operation::Type, process::metrics::Counter > operation_types
Definition: metrics.hpp:332
process::metrics::Counter offers_declined
Definition: metrics.hpp:324
Definition: master.hpp:355
Definition: push_gauge.hpp:41
std::vector< process::metrics::PullGauge > resources_revocable_percent
Definition: metrics.hpp:265
process::metrics::PullGauge event_queue_messages
Definition: metrics.hpp:233
process::metrics::Counter slave_removals_reason_unhealthy
Definition: metrics.hpp:241
hashmap< TaskStatus::Reason, process::metrics::Counter > Reasons
Definition: metrics.hpp:124
process::metrics::PullGauge slaves_unreachable
Definition: metrics.hpp:54
process::metrics::Counter valid_operation_status_update_acknowledgements
Definition: metrics.hpp:226
process::metrics::Counter slave_registrations
Definition: metrics.hpp:238
process::metrics::Counter tasks_dropped
Definition: metrics.hpp:120
process::metrics::Counter offers_sent
Definition: metrics.hpp:322
process::metrics::PullGauge elected
Definition: metrics.hpp:48
process::metrics::Counter offers_accepted
Definition: metrics.hpp:323