Apache Mesos
state.hpp
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 
17 #ifndef __SLAVE_STATE_HPP__
18 #define __SLAVE_STATE_HPP__
19 
20 #ifndef __WINDOWS__
21 #include <unistd.h>
22 #endif // __WINDOWS__
23 
24 #include <vector>
25 
26 #include <mesos/resources.hpp>
27 #include <mesos/type_utils.hpp>
28 
29 #include <process/pid.hpp>
30 
31 #include <stout/hashmap.hpp>
32 #include <stout/hashset.hpp>
33 #include <stout/path.hpp>
34 #include <stout/protobuf.hpp>
35 #include <stout/try.hpp>
36 #include <stout/utils.hpp>
37 #include <stout/uuid.hpp>
38 
39 #include <stout/os/mkdir.hpp>
40 #include <stout/os/mktemp.hpp>
41 #include <stout/os/rename.hpp>
42 #include <stout/os/rm.hpp>
43 #include <stout/os/write.hpp>
44 
46 
47 #include "messages/messages.hpp"
48 
49 namespace mesos {
50 namespace internal {
51 namespace slave {
52 namespace state {
53 
// Forward declarations of the recovered-state node types. Each one is
// a struct that is defined in full later in this header.
struct State;
struct SlaveState;
struct ResourcesState;
struct FrameworkState;
struct ExecutorState;
struct RunState;
struct TaskState;
62 
63 
64 // This function performs recovery from the state stored at 'rootDir'.
65 // If the 'strict' flag is set, any errors encountered while
66 // recovering a state are considered fatal and hence the recovery is
67 // short-circuited and returns an error. There might be orphaned
68 // executors that need to be manually cleaned up. If the 'strict' flag
69 // is not set, any errors encountered are considered non-fatal and the
70 // recovery continues by recovering as much of the state as possible,
71 // while increasing the 'errors' count. Note that 'errors' on a struct
72 // includes the 'errors' encountered recursively. In other words,
73 // 'State.errors' is the sum total of all recovery errors.
74 Try<State> recover(const std::string& rootDir, bool strict);
75 
76 
77 // Reads the protobuf message(s) from the given path.
78 // `T` may be either a single protobuf message or a sequence of messages
79 // if `T` is a specialization of `google::protobuf::RepeatedPtrField`.
80 template <typename T>
81 Result<T> read(const std::string& path)
82 {
83  Result<T> result = ::protobuf::read<T>(path);
84  if (result.isSome()) {
85  upgradeResources(&result.get());
86  }
87 
88  return result;
89 }
90 
91 
92 // While we return a `Result<string>` here in order to keep the return
93 // type of `state::read` consistent, the `None` case does not arise here.
94 // That is, an empty file will result in an empty string, rather than
95 // the `Result` ending up in a `None` state.
96 template <>
97 inline Result<std::string> read<std::string>(const std::string& path)
98 {
99  return os::read(path);
100 }
101 
102 
103 template <>
104 inline Result<Resources> read<Resources>(const std::string& path)
105 {
107  read<google::protobuf::RepeatedPtrField<Resource>>(path);
108 
109  if (resources.isError()) {
110  return Error(resources.error());
111  }
112 
113  if (resources.isNone()) {
114  return None();
115  }
116 
117  return std::move(resources.get());
118 }
119 
120 
121 namespace internal {
122 
124  const std::string& path,
125  const std::string& message)
126 {
127  return ::os::write(path, message);
128 }
129 
130 
131 template <
132  typename T,
133  typename std::enable_if<
134  std::is_convertible<T*, google::protobuf::Message*>::value,
135  int>::type = 0>
136 inline Try<Nothing> checkpoint(const std::string& path, T message)
137 {
138  // If the `Try` from `downgradeResources` returns an `Error`, we currently
139  // continue to checkpoint the resources in a partially downgraded state.
140  // This implies that an agent with refined reservations cannot be downgraded
141  // to versions before reservation refinement support, which was introduced
142  // in 1.4.0.
143  //
144  // TODO(mpark): Do something smarter with the result once
145  // something like an agent recovery capability is introduced.
146  downgradeResources(&message);
147  return ::protobuf::write(path, message);
148 }
149 
150 
152  const std::string& path,
153  google::protobuf::RepeatedPtrField<Resource> resources)
154 {
155  // If the `Try` from `downgradeResources` returns an `Error`, we currently
156  // continue to checkpoint the resources in a partially downgraded state.
157  // This implies that an agent with refined reservations cannot be downgraded
158  // to versions before reservation refinement support, which was introduced
159  // in 1.4.0.
160  //
161  // TODO(mpark): Do something smarter with the result once
162  // something like an agent recovery capability is introduced.
163  downgradeResources(&resources);
164  return ::protobuf::write(path, resources);
165 }
166 
167 
169  const std::string& path,
170  const Resources& resources)
171 {
172  const google::protobuf::RepeatedPtrField<Resource>& messages = resources;
173  return checkpoint(path, messages);
174 }
175 
176 } // namespace internal {
177 
178 
179 // Thin wrapper to checkpoint data to disk and perform the necessary
180 // error checking. It checkpoints an instance of T at the given path.
181 // We can checkpoint anything as long as T is supported by
182 // internal::checkpoint. Currently the list of supported Ts are:
183 // - std::string
184 // - google::protobuf::Message
185 // - google::protobuf::RepeatedPtrField<T>
186 // - mesos::Resources
187 //
188 // NOTE: We provide atomic (all-or-nothing) semantics here by always
189 // writing to a temporary file first then using os::rename to atomically
190 // move it to the desired path.
191 template <typename T>
192 Try<Nothing> checkpoint(const std::string& path, const T& t)
193 {
194  // Create the base directory.
195  std::string base = Path(path).dirname();
196 
197  Try<Nothing> mkdir = os::mkdir(base);
198  if (mkdir.isError()) {
199  return Error("Failed to create directory '" + base + "': " + mkdir.error());
200  }
201 
202  // NOTE: We create the temporary file at 'base/XXXXXX' to make sure
203  // rename below does not cross devices (MESOS-2319).
204  //
205  // TODO(jieyu): It's possible that the temporary file becomes
206  // dangling if slave crashes or restarts while checkpointing.
207  // Consider adding a way to garbage collect them.
208  Try<std::string> temp = os::mktemp(path::join(base, "XXXXXX"));
209  if (temp.isError()) {
210  return Error("Failed to create temporary file: " + temp.error());
211  }
212 
213  // Now checkpoint the instance of T to the temporary file.
215  if (checkpoint.isError()) {
216  // Try removing the temporary file on error.
217  os::rm(temp.get());
218 
219  return Error("Failed to write temporary file '" + temp.get() +
220  "': " + checkpoint.error());
221  }
222 
223  // Rename the temporary file to the path.
224  Try<Nothing> rename = os::rename(temp.get(), path);
225  if (rename.isError()) {
226  // Try removing the temporary file on error.
227  os::rm(temp.get());
228 
229  return Error("Failed to rename '" + temp.get() + "' to '" +
230  path + "': " + rename.error());
231  }
232 
233  return Nothing();
234 }
235 
236 
237 // NOTE: The *State structs (e.g., TaskState, RunState, etc) are
238 // defined in reverse dependency order because many of them have
239 // Option<*State> dependencies which means we need them declared in
240 // their entirety in order to compile because things like
241 // Option<*State> need to know the final size of the types.
242 
243 struct TaskState
244 {
245  TaskState() : errors(0) {}
246 
247  static Try<TaskState> recover(
248  const std::string& rootDir,
249  const SlaveID& slaveId,
250  const FrameworkID& frameworkId,
251  const ExecutorID& executorId,
252  const ContainerID& containerId,
253  const TaskID& taskId,
254  bool strict);
255 
256  TaskID id;
258  std::vector<StatusUpdate> updates;
260  unsigned int errors;
261 };
262 
263 
264 struct RunState
265 {
266  RunState() : completed(false), errors(0) {}
267 
268  static Try<RunState> recover(
269  const std::string& rootDir,
270  const SlaveID& slaveId,
271  const FrameworkID& frameworkId,
272  const ExecutorID& executorId,
273  const ContainerID& containerId,
274  bool strict);
275 
280 
281  // This represents if the executor is connected via HTTP. It can be None()
282  // when the connection type is unknown.
284 
285  // Executor terminated and all its updates acknowledged.
286  bool completed;
287 
288  unsigned int errors;
289 };
290 
291 
293 {
294  ExecutorState() : errors(0) {}
295 
297  const std::string& rootDir,
298  const SlaveID& slaveId,
299  const FrameworkID& frameworkId,
300  const ExecutorID& executorId,
301  bool strict);
302 
303  ExecutorID id;
307  unsigned int errors;
308 };
309 
310 
312 {
313  FrameworkState() : errors(0) {}
314 
316  const std::string& rootDir,
317  const SlaveID& slaveId,
318  const FrameworkID& frameworkId,
319  bool strict);
320 
321  FrameworkID id;
323 
324  // Note that HTTP frameworks (supported in 0.24.0) do not have a
325  // PID, in which case 'pid' is Some(UPID()) rather than None().
327 
329  unsigned int errors;
330 };
331 
332 
334 {
335  ResourcesState() : errors(0) {}
336 
338  const std::string& rootDir,
339  bool strict);
340 
343  unsigned int errors;
344 };
345 
346 
348 {
349  SlaveState() : errors(0) {}
350 
351  static Try<SlaveState> recover(
352  const std::string& rootDir,
353  const SlaveID& slaveId,
354  bool strict);
355 
356  SlaveID id;
359  unsigned int errors;
360 };
361 
362 
363 // The top level state. The members are child nodes in the tree. Each
364 // child node (recursively) recovers the checkpointed state.
365 struct State
366 {
367  State() : errors(0) {}
368 
371  bool rebooted = false;
372 
373  // TODO(jieyu): Consider using a vector of Option<Error> here so
374  // that we can print all the errors. This also applies to all the
375  // State structs above.
376  unsigned int errors;
377 };
378 
379 } // namespace state {
380 } // namespace slave {
381 } // namespace internal {
382 } // namespace mesos {
383 
384 #endif // __SLAVE_STATE_HPP__
FrameworkState()
Definition: state.hpp:313
Definition: path.hpp:26
unsigned int errors
Definition: state.hpp:288
Try< Nothing > downgradeResources(google::protobuf::RepeatedPtrField< Resource > *resources)
bool isNone() const
Definition: result.hpp:112
hashmap< FrameworkID, FrameworkState > frameworks
Definition: state.hpp:358
Definition: nothing.hpp:16
Definition: errorbase.hpp:36
Try< Nothing > rm(const std::string &path)
Definition: rm.hpp:26
Option< ResourcesState > resources
Definition: state.hpp:369
Option< process::UPID > libprocessPid
Definition: state.hpp:279
Definition: check.hpp:33
static Result< T > error(const std::string &message)
Definition: result.hpp:53
unsigned int errors
Definition: state.hpp:343
Option< FrameworkInfo > info
Definition: state.hpp:322
Option< Resources > target
Definition: state.hpp:342
hashmap< TaskID, TaskState > tasks
Definition: state.hpp:277
Try< std::string > mktemp(const std::string &path=path::join(os::temp(),"XXXXXX"))
Definition: mktemp.hpp:36
Result< Resources > read< Resources >(const std::string &path)
Definition: state.hpp:104
Option< ContainerID > id
Definition: state.hpp:276
unsigned int errors
Definition: state.hpp:260
Definition: resources.hpp:81
hashset< id::UUID > acks
Definition: state.hpp:259
event_base * base
std::vector< StatusUpdate > updates
Definition: state.hpp:258
std::string join(const std::string &path1, const std::string &path2, const char _separator=os::PATH_SEPARATOR)
Definition: path.hpp:56
Option< ContainerID > latest
Definition: state.hpp:305
Definition: check.hpp:30
Option< bool > http
Definition: state.hpp:283
void upgradeResources(google::protobuf::RepeatedPtrField< Resource > *resources)
Definition: hashmap.hpp:38
ExecutorID id
Definition: state.hpp:303
Resources resources
Definition: state.hpp:341
Represents a POSIX or Windows file system path and offers common path manipulations.
Definition: path.hpp:145
Try< Nothing > mkdir(const std::string &directory, bool recursive=true)
Definition: mkdir.hpp:31
RunState()
Definition: state.hpp:266
Definition: spec.hpp:26
hashmap< ContainerID, RunState > runs
Definition: state.hpp:306
ExecutorState()
Definition: state.hpp:294
Option< pid_t > forkedPid
Definition: state.hpp:278
Option< ExecutorInfo > info
Definition: state.hpp:304
const T & get() const
Definition: result.hpp:115
unsigned int errors
Definition: state.hpp:376
State()
Definition: state.hpp:367
static Try error(const E &e)
Definition: try.hpp:42
Try< State > recover(const std::string &rootDir, bool strict)
unsigned int errors
Definition: state.hpp:329
Option< Task > info
Definition: state.hpp:257
std::string dirname() const
Extracts the component up to, but not including, the final '/'.
Definition: path.hpp:241
FrameworkID id
Definition: state.hpp:321
Result< T > read(const std::string &path)
Definition: state.hpp:81
Result< std::string > read(int_fd fd, size_t size)
Definition: read.hpp:55
Try< Nothing > checkpoint(const std::string &path, const std::string &message)
Definition: state.hpp:123
Try< Nothing > rename(const std::string &from, const std::string &to)
Definition: rename.hpp:27
Definition: none.hpp:27
Definition: attributes.hpp:24
bool isError() const
Definition: try.hpp:71
ResourcesState()
Definition: state.hpp:335
TaskState()
Definition: state.hpp:245
SlaveID id
Definition: state.hpp:356
Protocol< WriteRequest, WriteResponse > write
unsigned int errors
Definition: state.hpp:359
Try< uint32_t > type(const std::string &path)
std::string temp()
Definition: temp.hpp:27
bool isSome() const
Definition: result.hpp:111
bool isError() const
Definition: result.hpp:113
Option< SlaveState > slave
Definition: state.hpp:370
unsigned int errors
Definition: state.hpp:307
SlaveState()
Definition: state.hpp:349
bool completed
Definition: state.hpp:286
Option< process::UPID > pid
Definition: state.hpp:326
Option< SlaveInfo > info
Definition: state.hpp:357
TaskID id
Definition: state.hpp:256
hashmap< ExecutorID, ExecutorState > executors
Definition: state.hpp:328
Definition: state.hpp:365