Apache Mesos
isolator.hpp
Go to the documentation of this file.
1 // Licensed to the Apache Software Foundation (ASF) under one
2 // or more contributor license agreements. See the NOTICE file
3 // distributed with this work for additional information
4 // regarding copyright ownership. The ASF licenses this file
5 // to you under the Apache License, Version 2.0 (the
6 // "License"); you may not use this file except in compliance
7 // with the License. You may obtain a copy of the License at
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 
17 #ifndef __NVIDIA_GPU_ISOLATOR_HPP__
18 #define __NVIDIA_GPU_ISOLATOR_HPP__
19 
#include <map>
#include <set>
#include <vector>

#include <process/future.hpp>

#include <stout/hashmap.hpp>
#include <stout/option.hpp>
#include <stout/path.hpp>
#include <stout/try.hpp>

#include "linux/cgroups.hpp"

#include "slave/flags.hpp"

#include "slave/containerizer/mesos/isolator.hpp"

#include "slave/containerizer/mesos/isolators/gpu/allocator.hpp"
#include "slave/containerizer/mesos/isolators/gpu/components.hpp"
#include "slave/containerizer/mesos/isolators/gpu/volume.hpp"

41 namespace mesos {
42 namespace internal {
43 namespace slave {
44 
45 // This isolator uses the cgroups devices subsystem to control
46 // access to Nvidia GPUs. Since this is the very first device
47 // isolator, it currently contains generic device isolation
48 // logic that needs to be pulled up into a generic device
49 // isolator.
50 //
51 // GPUs are allocated to containers in an arbitrary fashion.
52 // For example, if a container requires 2 GPUs, we will
53 // arbitrarily choose 2 from the GPUs that are available.
54 // This may not behave well if tasks within an executor use
55 // GPUs since we cannot identify which tasks are using which
56 // GPUs (i.e. when a task terminates, we may remove a GPU
57 // that is still being used by a different task!).
58 //
59 // Note that this isolator is not responsible for ensuring
60 // that the necessary Nvidia libraries are visible in the
61 // container. If filesystem isolation is not enabled, this
62 // means that the container can simply use the libraries
63 // available on the host. When filesystem isolation is
64 // enabled, it is the responsibility of the operator /
65 // application developer to ensure that the necessary
66 // libraries are visible to the container (note that they
67 // must be version compatible with the kernel driver on
68 // the host).
69 //
70 // TODO(klueska): To better support containers with a
71 // provisioned filesystem, we will need to add a mechanism
72 // for operators to inject the libraries as a volume into
73 // containers that require GPU access.
74 //
75 // TODO(klueska): If multiple containerizers are enabled,
76 // they need to co-ordinate their allocation of GPUs.
77 //
78 // TODO(klueska): Move generic device isolation logic
79 // out into its own component.
81 {
82 public:
84  const Flags& flags,
85  const NvidiaComponents& components);
86 
87  bool supportsNesting() override;
88  bool supportsStandalone() override;
89 
91  const std::vector<mesos::slave::ContainerState>& states,
92  const hashset<ContainerID>& orphans) override;
93 
95  const ContainerID& containerId,
96  const mesos::slave::ContainerConfig& containerConfig) override;
97 
99  const ContainerID& containerId,
100  const Resources& resourceRequests,
101  const google::protobuf::Map<
102  std::string, Value::Scalar>& resourceLimits = {}) override;
103 
105  const ContainerID& containerId) override;
106 
108  const ContainerID& containerId) override;
109 
110 private:
112  const Flags& _flags,
113  const std::string& hierarchy,
114  const NvidiaGpuAllocator& _allocator,
115  const NvidiaVolume& _volume,
116  const std::map<Path, cgroups::devices::Entry>& _controlDeviceEntries);
117 
119  const ContainerID& containerId,
120  const mesos::slave::ContainerConfig& containerConfig);
121 
122  process::Future<Nothing> _update(
123  const ContainerID& containerId,
124  const std::set<Gpu>& allocation);
125 
126  struct Info
127  {
128  Info(const ContainerID& _containerId, const std::string& _cgroup)
129  : containerId(_containerId), cgroup(_cgroup) {}
130 
131  const ContainerID containerId;
132  const std::string cgroup;
133  std::set<Gpu> allocated;
134  };
135 
136  const Flags flags;
137 
138  // The path to the cgroups subsystem hierarchy root.
139  const std::string hierarchy;
140 
141  // TODO(bmahler): Use Owned<Info>.
143 
144  NvidiaGpuAllocator allocator;
145  NvidiaVolume volume;
146 
147  const std::map<Path, cgroups::devices::Entry> controlDeviceEntries;
148 };
149 
150 } // namespace slave {
151 } // namespace internal {
152 } // namespace mesos {
153 
154 #endif // __NVIDIA_GPU_ISOLATOR_HPP__
Definition: check.hpp:33
Definition: resources.hpp:83
process::Future< Nothing > cleanup(const ContainerID &containerId) override
Definition: allocator.hpp:52
process::Future< Nothing > recover(const std::vector< mesos::slave::ContainerState > &states, const hashset< ContainerID > &orphans) override
Definition: flags.hpp:39
process::Future< Nothing > update(const ContainerID &containerId, const Resources &resourceRequests, const google::protobuf::Map< std::string, Value::Scalar > &resourceLimits={}) override
process::Future< Option< mesos::slave::ContainerLaunchInfo > > prepare(const ContainerID &containerId, const mesos::slave::ContainerConfig &containerConfig) override
process::Future< ResourceStatistics > usage(const ContainerID &containerId) override
Definition: components.hpp:34
Definition: agent.hpp:25
Result< std::string > cgroup(pid_t pid)
static Try< mesos::slave::Isolator * > create(const Flags &flags, const NvidiaComponents &components)
Definition: attributes.hpp:24
Definition: parse.hpp:33
Definition: volume.hpp:36