Apache Mesos
port_mapping.hpp
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 
17 #ifndef __PORT_MAPPING_ISOLATOR_HPP__
18 #define __PORT_MAPPING_ISOLATOR_HPP__
19 
20 #include <stdint.h>
21 
22 #include <sys/types.h>
23 
24 #include <set>
25 #include <string>
26 #include <vector>
27 
28 #include <process/id.hpp>
29 #include <process/owned.hpp>
30 #include <process/subprocess.hpp>
31 
34 
35 #include <stout/bytes.hpp>
36 #include <stout/hashmap.hpp>
37 #include <stout/hashset.hpp>
38 #include <stout/ip.hpp>
39 #include <stout/interval.hpp>
40 #include <stout/mac.hpp>
41 #include <stout/none.hpp>
42 #include <stout/option.hpp>
43 #include <stout/subcommand.hpp>
44 
46 
47 #include "slave/flags.hpp"
48 
50 
51 namespace mesos {
52 namespace internal {
53 namespace slave {
54 
55 // The prefix this isolator uses for the virtual ethernet devices.
56 // NOTE: This constant is exposed for testing.
57 inline std::string PORT_MAPPING_VETH_PREFIX() { return "mesos"; }
58 
59 
60 // The root directory where we bind mount all the namespace handles.
61 // We choose the directory '/var/run/netns' so that we can use the
62 // iproute2 suite (e.g., 'ip netns show/exec') to inspect or enter the
63 // network namespace. This is very useful for debugging purposes.
64 // NOTE: This constant is exposed for testing.
65 inline std::string PORT_MAPPING_BIND_MOUNT_ROOT() { return "/var/run/netns"; }
66 
67 // The root directory where we keep all the namespace handle
68 // symlinks. This is introduced in 0.23.0.
69 // NOTE: This constant is exposed for testing.
71 {
72  return "/var/run/mesos/netns";
73 }
74 
75 
76 // These names are used to identify the traffic control statistics
77 // output for each of the Linux Traffic Control Qdiscs we report.
78 constexpr char NET_ISOLATOR_BW_LIMIT[] = "bw_limit";
79 constexpr char NET_ISOLATOR_BLOAT_REDUCTION[] = "bloat_reduction";
80 
81 
82 // Responsible for allocating ephemeral ports for the port mapping
83 // network isolator. This class is exposed mainly for unit testing.
85 {
86 public:
88  const IntervalSet<uint16_t>& total,
89  size_t _portsPerContainer)
90  : free(total),
91  portsPerContainer_(_portsPerContainer) {}
92 
93  // Returns the number of ephemeral ports for each container.
94  size_t portsPerContainer() const { return portsPerContainer_; }
95 
96  // Allocate an ephemeral port range for a container. The allocator
97  // will automatically find one port range with the given container
98  // size. Returns error if the allocation cannot be fulfilled (e.g.,
99  // exhausting available ephemeral ports).
101 
102  // Mark the specified ephemeral port range as allocated.
103  void allocate(const Interval<uint16_t>& ports);
104 
105  // Deallocate the specified ephemeral port range.
106  void deallocate(const Interval<uint16_t>& ports);
107 
108  // Return true if the specified ephemeral port range is managed by the
109  // allocator (i.e., contained in 'free + used'), whether or not it is currently allocated.
110  bool isManaged(const Interval<uint16_t>& ports)
111  {
112  return (free + used).contains(ports);
113  }
114 
115 private:
116  // Given an integer x, return the smallest integer t such that t >=
117  // x and t % m == 0.
118  static uint32_t nextMultipleOf(uint32_t x, uint32_t m);
119 
122 
123  // The number of ephemeral ports for each container.
124  size_t portsPerContainer_;
125 };
126 
127 
128 // For the specified ports, generate a set of port ranges each of
129 // which can be used by a single IP filter. In other words, each port
130 // range needs to satisfy the following two conditions: 1) the size of
131 // the range is 2^n (n=0,1,2...); 2) the begin of the range is size
132 // aligned (i.e., begin % size == 0). This function is exposed mainly
133 // for unit testing.
134 std::vector<routing::filter::ip::PortRange> getPortRanges(
135  const IntervalSet<uint16_t>& ports);
136 
137 
138 // Provides network isolation using port mapping. Each container is
139 // assigned a fixed set of ports (including ephemeral ports). The
140 // isolator will set up filters on the host such that network traffic
141 // to the host will be properly redirected to the corresponding
142 // container depending on the destination ports. The network traffic
143 // from containers will also be properly relayed to the host. This
144 // isolator is useful when the operator wants to reuse the host IP for
145 // all containers running on the host (e.g., there are insufficient
146 // IPs).
148 {
149 public:
151 
153 
155  const std::vector<mesos::slave::ContainerState>& states,
156  const hashset<ContainerID>& orphans) override;
157 
159  const ContainerID& containerId,
160  const mesos::slave::ContainerConfig& containerConfig) override;
161 
163  const ContainerID& containerId,
164  pid_t pid) override;
165 
167  const ContainerID& containerId) override;
168 
170  const ContainerID& containerId,
171  const Resources& resources) override;
172 
174  const ContainerID& containerId) override;
175 
177  const ContainerID& containerId) override;
178 
179 private:
180  struct Info // Port assignment and bookkeeping data kept for one container.
181  {
182  Info(const IntervalSet<uint16_t>& _nonEphemeralPorts,
183  const Interval<uint16_t>& _ephemeralPorts,
184  const Option<pid_t>& _pid = None())
185  : nonEphemeralPorts(_nonEphemeralPorts),
186  ephemeralPorts(_ephemeralPorts),
187  pid(_pid) {}
188 
189  // Non-ephemeral ports used by the container. It's possible that a
190  // container does not use any non-ephemeral ports. In that case,
191  // 'nonEphemeralPorts' will be empty. This variable could change
192  // upon 'update'.
193  IntervalSet<uint16_t> nonEphemeralPorts;
194 
195  // Each container has one and only one range of ephemeral ports.
196  // It cannot have more than one range of ephemeral ports because
197  // we need to set up the ip_local_port_range (which only accepts a
198  // single interval) inside the container to restrict the ephemeral
199  // ports used by the container.
200  const Interval<uint16_t> ephemeralPorts;
201 
202  Option<pid_t> pid; // Defaults to None() when not passed to the constructor.
203  Option<uint16_t> flowId; // Not set by the constructor; left default-constructed.
204  };
205 
206  // Define the metrics used by the port mapping network isolator.
207  struct Metrics
208  {
209  Metrics();
210  ~Metrics();
211 
212  process::metrics::Counter adding_eth0_ip_filters_errors;
213  process::metrics::Counter adding_eth0_ip_filters_already_exist;
214  process::metrics::Counter adding_eth0_egress_filters_errors;
215  process::metrics::Counter adding_eth0_egress_filters_already_exist;
216  process::metrics::Counter adding_lo_ip_filters_errors;
217  process::metrics::Counter adding_lo_ip_filters_already_exist;
218  process::metrics::Counter adding_veth_ip_filters_errors;
219  process::metrics::Counter adding_veth_ip_filters_already_exist;
220  process::metrics::Counter adding_veth_icmp_filters_errors;
221  process::metrics::Counter adding_veth_icmp_filters_already_exist;
222  process::metrics::Counter adding_veth_arp_filters_errors;
223  process::metrics::Counter adding_veth_arp_filters_already_exist;
224  process::metrics::Counter adding_eth0_icmp_filters_errors;
225  process::metrics::Counter adding_eth0_icmp_filters_already_exist;
226  process::metrics::Counter adding_eth0_arp_filters_errors;
227  process::metrics::Counter adding_eth0_arp_filters_already_exist;
228  process::metrics::Counter removing_eth0_ip_filters_errors;
229  process::metrics::Counter removing_eth0_ip_filters_do_not_exist;
230  process::metrics::Counter removing_eth0_egress_filters_errors;
231  process::metrics::Counter removing_eth0_egress_filters_do_not_exist;
232  process::metrics::Counter removing_lo_ip_filters_errors;
233  process::metrics::Counter removing_lo_ip_filters_do_not_exist;
234  process::metrics::Counter removing_veth_ip_filters_errors;
235  process::metrics::Counter removing_veth_ip_filters_do_not_exist;
236  process::metrics::Counter removing_eth0_icmp_filters_errors;
237  process::metrics::Counter removing_eth0_icmp_filters_do_not_exist;
238  process::metrics::Counter removing_eth0_arp_filters_errors;
239  process::metrics::Counter removing_eth0_arp_filters_do_not_exist;
240  process::metrics::Counter updating_eth0_icmp_filters_errors;
241  process::metrics::Counter updating_eth0_icmp_filters_already_exist;
242  process::metrics::Counter updating_eth0_icmp_filters_do_not_exist;
243  process::metrics::Counter updating_eth0_arp_filters_errors;
244  process::metrics::Counter updating_eth0_arp_filters_already_exist;
245  process::metrics::Counter updating_eth0_arp_filters_do_not_exist;
246  process::metrics::Counter updating_container_ip_filters_errors;
247  } metrics;
248 
250  const Flags& _flags,
251  const std::string& _bindMountRoot,
252  const std::string& _eth0,
253  const std::string& _lo,
254  const net::MAC& _hostMAC,
255  const net::IP::Network& _hostIPNetwork,
256  const size_t _hostEth0MTU,
257  const net::IP& _hostDefaultGateway,
258  const routing::Handle& _hostTxFqCodelHandle,
259  const hashmap<std::string, std::string>& _hostNetworkConfigurations,
260  const Option<Bytes>& _egressRateLimitPerContainer,
261  const IntervalSet<uint16_t>& _managedNonEphemeralPorts,
262  const process::Owned<EphemeralPortsAllocator>& _ephemeralPortsAllocator,
263  const std::set<uint16_t>& _flowIDs)
264  : ProcessBase(process::ID::generate("mesos-port-mapping-isolator")),
265  flags(_flags),
266  bindMountRoot(_bindMountRoot),
267  eth0(_eth0),
268  lo(_lo),
269  hostMAC(_hostMAC),
270  hostIPNetwork(_hostIPNetwork),
271  hostEth0MTU(_hostEth0MTU),
272  hostDefaultGateway(_hostDefaultGateway),
273  hostTxFqCodelHandle(_hostTxFqCodelHandle),
274  hostNetworkConfigurations(_hostNetworkConfigurations),
275  egressRateLimitPerContainer(_egressRateLimitPerContainer),
276  managedNonEphemeralPorts(_managedNonEphemeralPorts),
277  ephemeralPortsAllocator(_ephemeralPortsAllocator),
278  freeFlowIds(_flowIDs) {}
279 
280  // Continuations.
281  Try<Nothing> _cleanup(Info* info, const Option<ContainerID>& containerId);
282  Try<Info*> _recover(pid_t pid);
283 
284  void _update(
285  const ContainerID& containerId,
287 
289  const ResourceStatistics& result,
290  const process::Subprocess& s);
291 
293  ResourceStatistics result,
294  const process::Future<std::string>& out);
295 
296  // Helper functions.
297  Try<Nothing> addHostIPFilters(
298  const routing::filter::ip::PortRange& range,
299  const Option<uint16_t>& flowId,
300  const std::string& veth);
301 
302  Try<Nothing> removeHostIPFilters(
303  const routing::filter::ip::PortRange& range,
304  const std::string& veth,
305  bool removeFiltersOnVeth = true);
306 
307  // Return the scripts that will be executed in the child context.
308  std::string scripts(Info* info);
309 
310  uint16_t getNextFlowId();
311 
312  const Flags flags;
313  const std::string bindMountRoot;
314 
315  const std::string eth0;
316  const std::string lo;
317  const net::MAC hostMAC;
318  const net::IP::Network hostIPNetwork;
319  const size_t hostEth0MTU;
320  const net::IP hostDefaultGateway;
321  const routing::Handle hostTxFqCodelHandle;
322 
323  // Describes the host network configurations. It is a map from
324  // configuration proc files (e.g., /proc/sys/net/core/somaxconn) to
325  // the values of those proc files.
326  const hashmap<std::string, std::string> hostNetworkConfigurations;
327 
328  // The optional throughput limit to containers' egress traffic.
329  const Option<Bytes> egressRateLimitPerContainer;
330 
331  // All the non-ephemeral ports managed by the slave, as passed in
332  // via flags.resources.
333  const IntervalSet<uint16_t> managedNonEphemeralPorts;
334 
335  process::Owned<EphemeralPortsAllocator> ephemeralPortsAllocator;
336 
337  // Store a set of unused flow IDs on this slave.
338  std::set<uint16_t> freeFlowIds;
339 
341 
342  // Recovered containers from a previous run that weren't managed by
343  // the network isolator.
344  hashset<ContainerID> unmanaged;
345 };
346 
347 
348 // Defines the subcommand for 'update' that needs to be executed by a
349 // subprocess to update the filters inside a container.
351 {
352 public:
353  static const char* NAME;
354 
355  struct Flags : public virtual flags::FlagsBase
356  {
357  Flags();
358 
364  };
365 
367 
369 
370 protected:
371  int execute() override;
372  flags::FlagsBase* getFlags() override { return &flags; }
373 };
374 
375 
376 // Defines the subcommand for 'statistics' that needs to be executed
377 // by a subprocess to retrieve network statistics from inside a
378 // container.
380 {
381 public:
382  static const char* NAME;
383 
384  struct Flags : public virtual flags::FlagsBase
385  {
386  Flags();
387 
393  };
394 
396 
398 
399 protected:
400  int execute() override;
401  flags::FlagsBase* getFlags() override { return &flags; }
402 };
403 
404 } // namespace slave {
405 } // namespace internal {
406 } // namespace mesos {
407 
408 #endif // __PORT_MAPPING_ISOLATOR_HPP__
Protocol< RecoverRequest, RecoverResponse > recover
Try< Nothing > isolate(const std::string &hierarchy, const std::string &cgroup, pid_t pid)
std::string generate(const std::string &prefix="")
Returns 'prefix(N)' where N represents the number of instances where the same prefix (wrt...
Option< std::string > lo_name
Definition: port_mapping.hpp:360
EphemeralPortsAllocator(const IntervalSet< uint16_t > &total, size_t _portsPerContainer)
Definition: port_mapping.hpp:87
void execute(const std::string &script)
Try< bool > update(const std::string &link, const Handle &parent, uint16_t protocol, const action::Mirror &mirror)
Option< std::string > eth0_name
Definition: port_mapping.hpp:359
std::vector< routing::filter::ip::PortRange > getPortRanges(const IntervalSet< uint16_t > &ports)
constexpr char NET_ISOLATOR_BLOAT_REDUCTION[]
Definition: port_mapping.hpp:79
Definition: check.hpp:33
bool enable_snmp_statistics
Definition: port_mapping.hpp:392
Flags flags
Definition: port_mapping.hpp:368
PortMappingUpdate()
Definition: port_mapping.hpp:366
process::Future< bool > cleanup(const std::string &hierarchy)
Result< ProcessStatus > status(pid_t pid)
Definition: proc.hpp:166
Definition: handle.hpp:38
Definition: resources.hpp:79
constexpr char NET_ISOLATOR_BW_LIMIT[]
Definition: port_mapping.hpp:78
bool enable_socket_statistics_details
Definition: port_mapping.hpp:391
PortMappingStatistics()
Definition: port_mapping.hpp:395
Definition: flags.hpp:39
std::string PORT_MAPPING_BIND_MOUNT_SYMLINK_ROOT()
Definition: port_mapping.hpp:70
Definition: counter.hpp:26
Definition: port_mapping.hpp:350
Flags flags
Definition: port_mapping.hpp:397
Definition: ip.hpp:73
Represents a fork() exec()ed subprocess.
Definition: subprocess.hpp:43
DWORD pid_t
Definition: windows.hpp:181
Try< ResourceStatistics > usage(pid_t pid, bool mem=true, bool cpus=true)
flags::FlagsBase * getFlags() override
Definition: port_mapping.hpp:372
Definition: subcommand.hpp:41
Option< JSON::Object > ports_to_add
Definition: port_mapping.hpp:362
Definition: port_mapping.hpp:379
Try< std::vector< Info > > infos(int familiy, int states)
size_t portsPerContainer() const
Definition: port_mapping.hpp:94
Option< pid_t > pid
Definition: port_mapping.hpp:361
static const char * NAME
Definition: port_mapping.hpp:353
Option< pid_t > pid
Definition: port_mapping.hpp:389
Definition: flags.hpp:44
Definition: spec.hpp:30
Definition: port_mapping.hpp:355
flags::FlagsBase * getFlags() override
Definition: port_mapping.hpp:401
Definition: ip.hpp:49
static const char * NAME
Definition: port_mapping.hpp:382
Definition: ip.hpp:203
Definition: mac.hpp:74
bool enable_socket_statistics_summary
Definition: port_mapping.hpp:390
#define flags
Definition: decoder.hpp:18
Definition: none.hpp:27
Definition: attributes.hpp:24
Option< std::string > eth0_name
Definition: port_mapping.hpp:388
void deallocate(const Interval< uint16_t > &ports)
std::string PORT_MAPPING_BIND_MOUNT_ROOT()
Definition: port_mapping.hpp:65
std::string PORT_MAPPING_VETH_PREFIX()
Definition: port_mapping.hpp:57
Try< std::string > prepare(const std::string &baseHierarchy, const std::string &subsystem, const std::string &cgroup)
bool isManaged(const Interval< uint16_t > &ports)
Definition: port_mapping.hpp:110
JSON::Object Metrics()
Try< Nothing > create(const std::string &hierarchy, const std::string &cgroup, bool recursive=false)
Definition: owned.hpp:36
Try< Interval< uint16_t > > allocate()
Definition: parse.hpp:33
PID< MetricsProcess > metrics
~PortMappingIsolatorProcess() override
Definition: port_mapping.hpp:152
Option< JSON::Object > ports_to_remove
Definition: port_mapping.hpp:363