Apache Mesos
port_mapping.hpp
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 
17 #ifndef __PORT_MAPPING_ISOLATOR_HPP__
18 #define __PORT_MAPPING_ISOLATOR_HPP__
19 
20 #include <stdint.h>
21 
22 #include <sys/types.h>
23 
24 #include <set>
25 #include <string>
26 #include <vector>
27 
28 #include <process/id.hpp>
29 #include <process/owned.hpp>
30 #include <process/subprocess.hpp>
31 
34 
35 #include <stout/bytes.hpp>
36 #include <stout/hashmap.hpp>
37 #include <stout/hashset.hpp>
38 #include <stout/ip.hpp>
39 #include <stout/interval.hpp>
40 #include <stout/mac.hpp>
41 #include <stout/none.hpp>
42 #include <stout/option.hpp>
43 #include <stout/subcommand.hpp>
44 
46 
47 #include "slave/flags.hpp"
48 
50 
51 namespace mesos {
52 namespace internal {
53 namespace slave {
54 
55 // Returns the name prefix ("mesos") this isolator uses for the virtual
56 // ethernet devices. NOTE: This constant is exposed for testing.
57 inline std::string PORT_MAPPING_VETH_PREFIX() { return "mesos"; }
58 
59 
60 // Returns the root directory under which all the network namespace
61 // handles are bind mounted. '/var/run/netns' is chosen so that the
62 // iproute2 suite (e.g., ip netns show/exec) can be used to inspect or
63 // enter a network namespace, which is very useful for debugging.
64 // NOTE: This constant is exposed for testing.
65 inline std::string PORT_MAPPING_BIND_MOUNT_ROOT() { return "/var/run/netns"; }
66 
67 // The root directory where we keep all the namespace handle
68 // symlinks. This was introduced in 0.23.0.
69 // NOTE: This constant is exposed for testing.
71 {
72  return "/var/run/mesos/netns";
73 }
74 
75 
76 // Names used to identify the traffic control statistics output, one
77 // for each of the Linux Traffic Control qdiscs we report on.
78 constexpr char NET_ISOLATOR_BW_LIMIT[] = "bw_limit";
79 constexpr char NET_ISOLATOR_BLOAT_REDUCTION[] = "bloat_reduction";
80 
81 
82 // Responsible for allocating ephemeral ports for the port mapping
83 // network isolator. This class is exposed mainly for unit testing.
85 {
86 public:
88  const IntervalSet<uint16_t>& total,
89  size_t _portsPerContainer)
90  : free(total),
91  portsPerContainer_(_portsPerContainer) {}
92 
93  // Returns the number of ephemeral ports for each container.
94  size_t portsPerContainer() const { return portsPerContainer_; }
95 
96  // Allocate an ephemeral port range for a container. The allocator
97  // will automatically find one port range with the given container
98  // size. Returns error if the allocation cannot be fulfilled (e.g.,
99  // exhausting available ephemeral ports).
101 
102  // Mark the specified ephemeral port range as allocated.
103  void allocate(const Interval<uint16_t>& ports);
104 
105  // Deallocate the specified ephemeral port range.
106  void deallocate(const Interval<uint16_t>& ports);
107 
108  // Return true if the specified ephemeral port range lies within the
109  // union of the free and used ports, whether it is allocated or not.
110  bool isManaged(const Interval<uint16_t>& ports) const
111  {
112  return (free + used).contains(ports);
113  }
114 
115 private:
116  // Given an integer x, return the smallest integer t such that t >=
117  // x and t % m == 0.
118  static uint32_t nextMultipleOf(uint32_t x, uint32_t m);
119 
122 
123  // The number of ephemeral ports for each container.
124  size_t portsPerContainer_;
125 };
126 
127 
128 // For the specified ports, generates a set of port ranges, each of
129 // which can be used by a single IP filter. In other words, each port
130 // range must satisfy the following two conditions: 1) the size of
131 // the range is 2^n (n=0,1,2...); 2) the beginning of the range is
132 // size-aligned (i.e., begin % size == 0). This function is exposed
133 // mainly for unit testing.
134 std::vector<routing::filter::ip::PortRange> getPortRanges(
135  const IntervalSet<uint16_t>& ports);
136 
137 
138 // Provides network isolation using port mapping. Each container is
139 // assigned a fixed set of ports (including ephemeral ports). The
140 // isolator will set up filters on the host such that network traffic
141 // to the host will be properly redirected to the corresponding
142 // container depending on the destination ports. The network traffic
143 // from containers will also be properly relayed to the host. This
144 // isolator is useful when the operator wants to reuse the host IP for
145 // all containers running on the host (e.g., there are insufficient
146 // IPs).
148 {
149 public:
151 
153 
155  const std::vector<mesos::slave::ContainerState>& states,
156  const hashset<ContainerID>& orphans) override;
157 
159  const ContainerID& containerId,
160  const mesos::slave::ContainerConfig& containerConfig) override;
161 
163  const ContainerID& containerId,
164  pid_t pid) override;
165 
167  const ContainerID& containerId) override;
168 
170  const ContainerID& containerId,
171  const Resources& resourceRequests,
172  const google::protobuf::Map<
173  std::string, Value::Scalar>& resourceLimits = {}) override;
174 
176  const ContainerID& containerId) override;
177 
179  const ContainerID& containerId) override;
180 
181 private:
182  struct Info // Port, pid and flow id bookkeeping kept for a container.
183  {
184  Info(const IntervalSet<uint16_t>& _nonEphemeralPorts,
185  const Interval<uint16_t>& _ephemeralPorts,
186  const Option<pid_t>& _pid = None())
187  : nonEphemeralPorts(_nonEphemeralPorts),
188  ephemeralPorts(_ephemeralPorts),
189  pid(_pid) {}
190 
191  // Non-ephemeral ports used by the container. A container may not
192  // use any non-ephemeral ports at all, in which case
193  // 'nonEphemeralPorts' is empty. This variable could change upon
194  // 'update'.
195  IntervalSet<uint16_t> nonEphemeralPorts;
196 
197  // Each container has one and only one range of ephemeral ports.
198  // It cannot have more than one range of ephemeral ports because
199  // we need to set up the ip_local_port_range (which only accepts a
200  // single interval) inside the container to restrict the ephemeral
201  // ports used by the container.
202  const Interval<uint16_t> ephemeralPorts;
203 
204  Option<pid_t> pid; // None() unless a pid is given to the constructor.
205  Option<uint16_t> flowId; // Not initialized by the constructor.
206  };
207 
208  // Define the metrics used by the port mapping network isolator.
209  struct Metrics
210  {
211  Metrics();
212  ~Metrics();
213 
214  process::metrics::Counter adding_eth0_ip_filters_errors;
215  process::metrics::Counter adding_eth0_ip_filters_already_exist;
216  process::metrics::Counter adding_eth0_egress_filters_errors;
217  process::metrics::Counter adding_eth0_egress_filters_already_exist;
218  process::metrics::Counter adding_lo_ip_filters_errors;
219  process::metrics::Counter adding_lo_ip_filters_already_exist;
220  process::metrics::Counter adding_veth_ip_filters_errors;
221  process::metrics::Counter adding_veth_ip_filters_already_exist;
222  process::metrics::Counter adding_veth_icmp_filters_errors;
223  process::metrics::Counter adding_veth_icmp_filters_already_exist;
224  process::metrics::Counter adding_veth_arp_filters_errors;
225  process::metrics::Counter adding_veth_arp_filters_already_exist;
226  process::metrics::Counter adding_eth0_icmp_filters_errors;
227  process::metrics::Counter adding_eth0_icmp_filters_already_exist;
228  process::metrics::Counter adding_eth0_arp_filters_errors;
229  process::metrics::Counter adding_eth0_arp_filters_already_exist;
230  process::metrics::Counter removing_eth0_ip_filters_errors;
231  process::metrics::Counter removing_eth0_ip_filters_do_not_exist;
232  process::metrics::Counter removing_eth0_egress_filters_errors;
233  process::metrics::Counter removing_eth0_egress_filters_do_not_exist;
234  process::metrics::Counter removing_lo_ip_filters_errors;
235  process::metrics::Counter removing_lo_ip_filters_do_not_exist;
236  process::metrics::Counter removing_veth_ip_filters_errors;
237  process::metrics::Counter removing_veth_ip_filters_do_not_exist;
238  process::metrics::Counter removing_eth0_icmp_filters_errors;
239  process::metrics::Counter removing_eth0_icmp_filters_do_not_exist;
240  process::metrics::Counter removing_eth0_arp_filters_errors;
241  process::metrics::Counter removing_eth0_arp_filters_do_not_exist;
242  process::metrics::Counter updating_eth0_icmp_filters_errors;
243  process::metrics::Counter updating_eth0_icmp_filters_already_exist;
244  process::metrics::Counter updating_eth0_icmp_filters_do_not_exist;
245  process::metrics::Counter updating_eth0_arp_filters_errors;
246  process::metrics::Counter updating_eth0_arp_filters_already_exist;
247  process::metrics::Counter updating_eth0_arp_filters_do_not_exist;
248  process::metrics::Counter updating_container_ip_filters_errors;
249  } metrics;
250 
252  const Flags& _flags,
253  const std::string& _bindMountRoot,
254  const std::string& _eth0,
255  const std::string& _lo,
256  const net::MAC& _hostMAC,
257  const net::IP::Network& _hostIPNetwork,
258  const size_t _hostEth0MTU,
259  const net::IP& _hostDefaultGateway,
260  const routing::Handle& _hostTxFqCodelHandle,
261  const hashmap<std::string, std::string>& _hostNetworkConfigurations,
262  const Option<Bytes>& _egressRateLimitPerContainer,
263  const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
264  const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator,
265  const std::set<uint16_t>& _flowIDs)
266  : ProcessBase(process::ID::generate("mesos-port-mapping-isolator")),
267  flags(_flags),
268  bindMountRoot(_bindMountRoot),
269  eth0(_eth0),
270  lo(_lo),
271  hostMAC(_hostMAC),
272  hostIPNetwork(_hostIPNetwork),
273  hostEth0MTU(_hostEth0MTU),
274  hostDefaultGateway(_hostDefaultGateway),
275  hostTxFqCodelHandle(_hostTxFqCodelHandle),
276  hostNetworkConfigurations(_hostNetworkConfigurations),
277  egressRateLimitPerContainer(_egressRateLimitPerContainer),
278  managedNonEphemeralPorts(_managedNonEphemeralPorts),
279  ephemeralPortsAllocator(_ephemeralPortsAllocator),
280  freeFlowIds(_flowIDs) {}
281 
282  // Continuations.
283  Try<Nothing> _cleanup(Info* info, const Option<ContainerID>& containerId);
284  Try<Info*> _recover(pid_t pid);
285 
286  void _update(
287  const ContainerID& containerId,
289 
291  const ResourceStatistics& result,
292  const process::Subprocess& s);
293 
295  ResourceStatistics result,
296  const process::Future<std::string>& out);
297 
298  // Helper functions.
299  Try<Nothing> addHostIPFilters(
300  const routing::filter::ip::PortRange& range,
301  const Option<uint16_t>& flowId,
302  const std::string& veth);
303 
304  Try<Nothing> removeHostIPFilters(
305  const routing::filter::ip::PortRange& range,
306  const std::string& veth,
307  bool removeFiltersOnVeth = true);
308 
309  // Return the scripts that will be executed in the child context.
310  std::string scripts(Info* info);
311 
312  uint16_t getNextFlowId();
313 
314  const Flags flags;
315  const std::string bindMountRoot;
316 
317  const std::string eth0;
318  const std::string lo;
319  const net::MAC hostMAC;
320  const net::IP::Network hostIPNetwork;
321  const size_t hostEth0MTU;
322  const net::IP hostDefaultGateway;
323  const routing::Handle hostTxFqCodelHandle;
324 
325  // Describes the host network configurations as a map from
326  // configuration proc files (e.g., /proc/sys/net/core/somaxconn) to
327  // the values of those proc files.
328  const hashmap<std::string, std::string> hostNetworkConfigurations;
329 
330  // The optional throughput limit to containers' egress traffic.
331  const Option<Bytes> egressRateLimitPerContainer;
332 
333  // All the non-ephemeral ports managed by the slave, as passed in
334  // via flags.resources.
335  const IntervalSet<uint16_t> managedNonEphemeralPorts;
336 
337  process::Owned<EphemeralPortsAllocator> ephemeralPortsAllocator;
338 
339  // Store a set of unused flow IDs on this slave.
340  std::set<uint16_t> freeFlowIds;
341 
343 
344  // Recovered containers from a previous run that weren't managed by
345  // the network isolator.
346  hashset<ContainerID> unmanaged;
347 };
348 
349 
350 // Defines the subcommand for 'update' that needs to be executed by a
351 // subprocess to update the filters inside a container.
353 {
354 public:
355  static const char* NAME;
356 
357  struct Flags : public virtual flags::FlagsBase
358  {
359  Flags();
360 
366  };
367 
369 
371 
372 protected:
373  int execute() override;
374  flags::FlagsBase* getFlags() override { return &flags; }
375 };
376 
377 
378 // Defines the subcommand for 'statistics' that needs to be executed
379 // by a subprocess to retrieve network statistics from inside a
380 // container.
382 {
383 public:
384  static const char* NAME;
385 
386  struct Flags : public virtual flags::FlagsBase
387  {
388  Flags();
389 
395  };
396 
398 
400 
401 protected:
402  int execute() override;
403  flags::FlagsBase* getFlags() override { return &flags; }
404 };
405 
406 } // namespace slave {
407 } // namespace internal {
408 } // namespace mesos {
409 
410 #endif // __PORT_MAPPING_ISOLATOR_HPP__
Protocol< RecoverRequest, RecoverResponse > recover
Try< Nothing > isolate(const std::string &hierarchy, const std::string &cgroup, pid_t pid)
std::string generate(const std::string &prefix="")
Returns 'prefix(N)' where N represents the number of instances where the same prefix (wrt...
Option< std::string > lo_name
Definition: port_mapping.hpp:362
EphemeralPortsAllocator(const IntervalSet< uint16_t > &total, size_t _portsPerContainer)
Definition: port_mapping.hpp:87
void execute(const std::string &script)
Try< bool > update(const std::string &link, const Handle &parent, uint16_t protocol, const action::Mirror &mirror)
Option< std::string > eth0_name
Definition: port_mapping.hpp:361
std::vector< routing::filter::ip::PortRange > getPortRanges(const IntervalSet< uint16_t > &ports)
constexpr char NET_ISOLATOR_BLOAT_REDUCTION[]
Definition: port_mapping.hpp:79
Definition: check.hpp:33
bool enable_snmp_statistics
Definition: port_mapping.hpp:394
Flags flags
Definition: port_mapping.hpp:370
PortMappingUpdate()
Definition: port_mapping.hpp:368
process::Future< bool > cleanup(const std::string &hierarchy)
Result< ProcessStatus > status(pid_t pid)
Definition: proc.hpp:166
Definition: handle.hpp:38
Definition: resources.hpp:83
constexpr char NET_ISOLATOR_BW_LIMIT[]
Definition: port_mapping.hpp:78
bool enable_socket_statistics_details
Definition: port_mapping.hpp:393
PortMappingStatistics()
Definition: port_mapping.hpp:397
Definition: flags.hpp:39
std::string PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()
Definition: port_mapping.hpp:70
Definition: counter.hpp:26
Definition: port_mapping.hpp:352
Flags flags
Definition: port_mapping.hpp:399
Definition: ip.hpp:73
Represents a fork() exec()ed subprocess.
Definition: subprocess.hpp:43
DWORD pid_t
Definition: windows.hpp:181
Try< ResourceStatistics > usage(pid_t pid, bool mem=true, bool cpus=true)
flags::FlagsBase * getFlags() override
Definition: port_mapping.hpp:374
Definition: subcommand.hpp:41
Option< JSON::Object > ports_to_add
Definition: port_mapping.hpp:364
Definition: port_mapping.hpp:381
Try< std::vector< Info > > infos(int familiy, int states)
size_t portsPerContainer() const
Definition: port_mapping.hpp:94
Option< pid_t > pid
Definition: port_mapping.hpp:363
static const char * NAME
Definition: port_mapping.hpp:355
Option< pid_t > pid
Definition: port_mapping.hpp:391
Definition: flags.hpp:44
Definition: agent.hpp:25
Definition: port_mapping.hpp:357
flags::FlagsBase * getFlags() override
Definition: port_mapping.hpp:403
Definition: ip.hpp:49
static const char * NAME
Definition: port_mapping.hpp:384
Definition: ip.hpp:203
Definition: mac.hpp:74
bool enable_socket_statistics_summary
Definition: port_mapping.hpp:392
#define flags
Definition: decoder.hpp:18
Definition: none.hpp:27
Definition: attributes.hpp:24
Option< std::string > eth0_name
Definition: port_mapping.hpp:390
void deallocate(const Interval< uint16_t > &ports)
std::string PORT_MAPPING_BIND_MOUNT_ROOT()
Definition: port_mapping.hpp:65
std::string PORT_MAPPING_VETH_PREFIX()
Definition: port_mapping.hpp:57
Try< std::string > prepare(const std::string &baseHierarchy, const std::string &subsystem, const std::string &cgroup)
bool isManaged(const Interval< uint16_t > &ports)
Definition: port_mapping.hpp:110
JSON::Object Metrics()
Try< Nothing > create(const std::string &hierarchy, const std::string &cgroup, bool recursive=false)
Definition: owned.hpp:36
Try< Interval< uint16_t > > allocate()
Definition: parse.hpp:33
PID< MetricsProcess > metrics
~PortMappingIsolatorProcess() override
Definition: port_mapping.hpp:152
Option< JSON::Object > ports_to_remove
Definition: port_mapping.hpp:365