      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 // TODO(opensource): Use a more generic sounding preprocessor name than
     17 // GOOGLE_CUDA
     18 #if GOOGLE_CUDA
     19 
     20 #define EIGEN_USE_GPU
     21 
     22 #include "tensorflow/core/common_runtime/gpu/gpu_device.h"
     23 
     24 #include <stdlib.h>
     25 #include <string.h>
     26 #include <algorithm>
     27 #include <list>
     28 #include <map>
     29 #include <tuple>
     30 #include <vector>
     31 
     32 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
     33 #include "tensorflow/core/common_runtime/device_factory.h"
     34 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
     35 #include "tensorflow/core/common_runtime/gpu/gpu_id.h"
     36 #include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
     37 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
     38 #include "tensorflow/core/common_runtime/gpu/gpu_init.h"
     39 #include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
     40 #include "tensorflow/core/common_runtime/gpu/gpu_util.h"
     41 #include "tensorflow/core/common_runtime/gpu/process_state.h"
     42 #include "tensorflow/core/common_runtime/gpu_device_context.h"
     43 #include "tensorflow/core/common_runtime/local_device.h"
     44 #include "tensorflow/core/framework/allocator.h"
     45 #include "tensorflow/core/framework/device_base.h"
     46 #include "tensorflow/core/framework/op_kernel.h"
     47 #include "tensorflow/core/framework/tensor.h"
     48 #include "tensorflow/core/framework/tensor.pb.h"
     49 #include "tensorflow/core/framework/types.h"
     50 #include "tensorflow/core/framework/variant_op_registry.h"
     51 #include "tensorflow/core/graph/types.h"
     52 #include "tensorflow/core/lib/core/errors.h"
     53 #include "tensorflow/core/lib/core/status.h"
     54 #include "tensorflow/core/lib/gtl/stl_util.h"
     55 #include "tensorflow/core/lib/strings/numbers.h"
     56 #include "tensorflow/core/lib/strings/str_util.h"
     57 #include "tensorflow/core/lib/strings/strcat.h"
     58 #include "tensorflow/core/platform/cuda.h"
     59 #include "tensorflow/core/platform/logging.h"
     60 #include "tensorflow/core/platform/macros.h"
     61 #include "tensorflow/core/platform/stream_executor.h"
     62 #include "tensorflow/core/platform/tracing.h"
     63 #include "tensorflow/core/platform/types.h"
     64 #include "tensorflow/core/public/session_options.h"
     65 #include "tensorflow/core/util/device_name_utils.h"
     66 #include "tensorflow/core/util/env_var.h"
     67 #include "tensorflow/core/util/stream_executor_util.h"
     68 
     69 #if !defined(PLATFORM_GOOGLE)
     70 #include "cuda/cuda_config.h"
     71 #endif
     72 
     73 namespace tensorflow {
     74 
     75 // Eigen Ops directly allocate memory only for temporary buffers used
     76 // during OpKernel::Compute().  The recommended way of allocating such
     77 // memory is via OpKernelContext::allocate_temp().  However, Eigen Ops
     78 // don't have access to OpKernelContext, instead they get access to
     79 // memory directly through the device allocator.  As an Open Source
     80 // project, Eigen assumes allocator semantics similar to those of the
     81 // CUDA memory allocator, and may not work correctly due to race
     82 // conditions if used with some other allocator.  For safety, we
     83 // delay deallocation calls made by Eigen until all events on the
     84 // corresponding stream have completed.  The EigenCudaStreamDevice
     85 // class below serves this purpose.
     86 
     87 class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
     88  public:
     89   EigenCudaStreamDevice()
     90       : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
     91     Eigen::initializeDeviceProp();
     92   }
     93   ~EigenCudaStreamDevice() override {}
     94   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
     95                     TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
     96                     char* scratch) {
     97     if (LogMemory::IsEnabled()) {
     98       operation_ = context->op_kernel().name() + "/EigenAllocator";
     99       step_id_ = context->step_id();
    100     }
    101     context_ = context;
    102     scratch_ = scratch;
    103     semaphore_ =
    104         reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
    105     stream_ = cuda_stream;
    106     allocator_ = alloc;
    107     const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value();
    108     device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
    109   }
    110 
    111   const cudaStream_t& stream() const override { return *stream_; }
    112   const cudaDeviceProp& deviceProperties() const override {
    113     return *device_prop_;
    114   }
    115 
    116   void* allocate(size_t num_bytes) const override {
    117     void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
    118     if (ret == nullptr) {
    119       if (context_) {
    120         context_->SetStatus(errors::ResourceExhausted(
    121             strings::StrCat("Ran out of GPU memory when allocating ", num_bytes,
    122                             " bytes for ", operation_)));
    123       } else {
    124         LOG(FATAL)
    125             << "EigenAllocator for GPU ran out of memory when allocating "
    126             << num_bytes << ". See error logs for more detailed info.";
    127       }
    128     }
    129     if (LogMemory::IsEnabled() && ret != nullptr) {
    130       LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
    131                                      allocator_);
    132     }
    133     return ret;
    134   }
    135   void deallocate(void* buffer) const override {
    136     if (LogMemory::IsEnabled() && buffer != nullptr) {
    137       LogMemory::RecordRawDeallocation(operation_, step_id_, buffer, allocator_,
    138                                        true);
    139     }
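            // The actual free is enqueued as a stream callback, so it runs only
            // after all work previously submitted to the stream has completed,
            // implementing the deferred deallocation described at the top of
            // this file.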
    140     AsyncFreeData* afData =
    141         new AsyncFreeData(allocator_, buffer, operation_, step_id_);
    142     cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
    143     CHECK_EQ(err, cudaSuccess);
    144   }
    145 
    146   // Return a pointer to a per-stream scratchpad of 1024 bytes residing
    147   // in global memory.
    148   void* scratchpad() const override { return scratch_; }
    149 
    150   // Return a semaphore. The semaphore is initialized to 0, and each
    151   // kernel using it is responsible for resetting it to 0 upon completion,
    152   // maintaining the invariant that the semaphore always equals 0 when a
    153   // kernel starts.
    154   unsigned int* semaphore() const override { return semaphore_; }
    155 
    156  private:
    157   struct AsyncFreeData {
    158     AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
    159                   const int64 s)
    160         : allocator_(a), address_(p), operation_(o), step_id_(s) {}
    161     ::tensorflow::Allocator* allocator_;
    162     void* address_;
    163     const string operation_;
    164     const int64 step_id_;
    165   };
    166 
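          // Invoked by the CUDA runtime on a callback thread once the stream
          // reaches the point at which deallocate() enqueued it; performs the
          // deferred deallocation.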
    167   static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
    168                                   void* userData) {
    169     AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
    170     if (LogMemory::IsEnabled()) {
    171       LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
    172                                        data->address_, data->allocator_, false);
    173     }
    174     data->allocator_->DeallocateRaw(data->address_);
    175     delete data;
    176   }
    177 
    178   string operation_;
    179   int64 step_id_;
    180   const cudaStream_t* stream_;          // Not owned.
    181   const cudaDeviceProp* device_prop_;   // Not owned.
    182   ::tensorflow::Allocator* allocator_;  // Not owned.
    183   mutable char* scratch_;
    184   mutable unsigned int* semaphore_;
    185   OpKernelContext* context_;
    186 
    187   TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
    188 };
    189 
    190 // This factory helps to ensure that different GPU device objects that refer to
    191 // the same physical device and stream group id use the same stream group
    192 // object (and therefore the same CUDA streams). This is necessary since there
    193 // is a single memory allocator per device (see ProcessState::GetGPUAllocator)
    194 // and allocators must not be shared across streams.
    195 class BaseGPUDevice::StreamGroupFactory {
    196  public:
    197   // Returns the unique stream group for use with the stream defined by
    198   // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
    199   // exist.
    200   // This function is thread safe.
    201   BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
    202                                           int stream_group_within_gpu,
    203                                           gpu::StreamExecutor* executor) {
    204     mutex_lock guard(lock_);
    205     StreamGroup* group =
    206         &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
    207     if (!group->compute) {
    208       group->compute = new gpu::Stream(executor);
    209       group->compute->Init();
    210       VLOG(2) << "Created stream[" << stream_group_within_gpu
    211               << "] = " << group->compute;
    212 
    213       group->host_to_device = new gpu::Stream(executor);
    214       group->host_to_device->Init();
    215       VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
    216               << "] = " << group->host_to_device;
    217 
    218       group->device_to_host = new gpu::Stream(executor);
    219       group->device_to_host->Init();
    220       VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
    221               << "] = " << group->device_to_host;
    222 
    223       group->device_to_device = new gpu::Stream(executor);
    224       group->device_to_device->Init();
    225       VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
    226               << "] = " << group->device_to_device;
    227     }
    228     return group;
    229   }
    230 
    231   // Returns a reference to the StreamGroupFactory singleton. Note that this is
    232   // never destroyed, so the objects it owns are never deleted.
    233   static StreamGroupFactory& Global() {
    234     static StreamGroupFactory* instance = new StreamGroupFactory();
    235     return *instance;
    236   }
    237 
    238  private:
    239   mutex lock_;
    240   using key_type = std::tuple<int, int>;
    241   std::map<key_type, StreamGroup> streams_;
    242 
    243   // StreamGroupFactory cannot be created directly; Call
    244   // StreamGroupFactory::Global() to get the global instance.
    245   StreamGroupFactory() = default;
    246   TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
    247 };
    248 
    249 BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
    250                              Bytes memory_limit, const DeviceLocality& locality,
    251                              TfGpuId tf_gpu_id,
    252                              const string& physical_device_desc,
    253                              Allocator* gpu_allocator, Allocator* cpu_allocator,
    254                              bool sync_every_op, int32 max_streams)
    255     : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
    256                                                          memory_limit, locality,
    257                                                          physical_device_desc)),
    258       gpu_allocator_(gpu_allocator),
    259       cpu_allocator_(cpu_allocator),
    260       tf_gpu_id_(tf_gpu_id),
    261       sync_every_op_(sync_every_op),
    262       max_streams_(max_streams) {
    263   ProcessState::singleton()->EnableGPUDevice();
    264 }
    265 
    266 BaseGPUDevice::~BaseGPUDevice() {
    267   delete gpu_device_info_;
    268   for (auto ctx : device_contexts_) ctx->Unref();
    269 }
    270 
    271 Status BaseGPUDevice::Init(const SessionOptions& options) {
    272   auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
    273   if (!executor_status.status().ok()) {
    274     return errors::Internal("Failed to get StreamExecutor for device ",
    275                             tf_gpu_id_.value());
    276   }
    277 
    278   executor_ = executor_status.ValueOrDie();
    279   em_.reset(new EventMgr(executor_, options.config.gpu_options()));
    280 
    281   if (max_streams_ < 1) {
    282     return errors::InvalidArgument("Invalid value for max_streams.");
    283   }
    284 
    285   // Create the specified number of GPU streams
    286   for (int i = 0; i < max_streams_; i++) {
    287     streams_.push_back(
    288         StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));
    289 
    290     size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
    291     void* scratch_buffer = gpu_allocator_->AllocateRaw(
    292         Allocator::kAllocatorAlignment, scratch_buffer_size);
    293     if (scratch_buffer == nullptr) {
    294       return errors::FailedPrecondition(
    295           "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
    296     }
    297     scratch_.push_back(static_cast<char*>(scratch_buffer));
    298 
    299     perftools::gputools::DeviceMemory<char> mem(
    300         perftools::gputools::DeviceMemoryBase(scratch_buffer,
    301                                               scratch_buffer_size));
    302 
    303     bool ok = executor_->SynchronousMemZero(
    304         &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
    305     if (!ok) {
    306       return errors::FailedPrecondition(
    307           "Failed to zero scratch buffer for device ",
    308           tf_gpu_id_.value());
    309     }
    310 
    311     device_contexts_.push_back(new GPUDeviceContext(
    312         i, streams_.back()->compute, streams_.back()->host_to_device,
    313         streams_.back()->device_to_host, streams_.back()->device_to_device));
    314   }
    315   gpu_device_info_ = new GpuDeviceInfo;
    316   gpu_device_info_->stream = streams_[0]->compute;
    317   gpu_device_info_->default_context = device_contexts_[0];
    318   gpu_device_info_->event_mgr = em_.get();
    319   gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value();
    320   set_tensorflow_gpu_device_info(gpu_device_info_);
    321 
    322   // Whether and how the GPU device uses its own threadpool.
    323   // This option is experimental. Once we confirm the best setting, we
    324   // may change the default behavior and completely remove this flag.
    325   // Default values might change in future releases.
    326   // Possible values:
    327   //   * global: GPU uses threads shared with CPU in the main compute
    328   //          thread-pool. This is currently the default.
    329   //   * gpu_private: GPU uses threads dedicated to this device.
    330   //   * gpu_shared: All GPUs share a dedicated thread pool.
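          // For example, to give this device two dedicated threads (the default
          // count when TF_GPU_THREAD_COUNT is unset), a job could be launched
          // with TF_GPU_THREAD_MODE=gpu_private and TF_GPU_THREAD_COUNT=2; the
          // values here are purely illustrative.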
    331   string gpu_thread_mode;
    332   TF_RETURN_IF_ERROR(
    333       ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
    334   gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
    335   if (gpu_thread_mode != "global") {
    336     int64 gpu_thread_count = -1;
    337     // Default to two threads: one for device compute and one for
    338     // memory copies.
    339     TF_RETURN_IF_ERROR(
    340         ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
    341     if (gpu_thread_mode == "gpu_private") {
    342       // TODO(zhengxq): since these threads only serve a single GPU device,
    343       //   we should set the device context once for each thread, and avoid
    344       //   setting them for each kernel.
    345       // TODO(zhengxq): pin the thread to the same socket of the target GPU.
    346       thread_pool_.reset(new thread::ThreadPool(
    347           options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
    348           static_cast<int32>(gpu_thread_count)));
    349       set_tensorflow_device_thread_pool(thread_pool_.get());
    350     } else if (gpu_thread_mode == "gpu_shared") {
    351       static thread::ThreadPool* thread_pool = new thread::ThreadPool(
    352           options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
    353       set_tensorflow_device_thread_pool(thread_pool);
    354     } else {
    355       string error_message =
    356           strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
    357       LOG(WARNING) << error_message;
    358       return errors::InvalidArgument(error_message);
    359     }
    360   }
    361 
    362   return Status::OK();
    363 }
    364 
    365 bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
    366   // With a single stream, tensor references are released at the end of the
    367   // kernel launch instead of at the end of kernel execution, so recording
    368   // accessed tensors is only needed when there are multiple streams.
    369   return streams_.size() > 1;
    370 }
    371 
    372 Status BaseGPUDevice::FillContextMap(const Graph* graph,
    373                                      DeviceContextMap* device_context_map) {
    374   VLOG(2) << "FillContextMap";
    375 
    376   const size_t num_streams = streams_.size();
    377   // Special case for single stream.
    378   if (num_streams == 1) {
    379     return Status::OK();
    380   }
    381   const int64 before = Env::Default()->NowMicros();
    382   gpu_stream_util::AssignStreamsOpts opts;
    383   opts.max_streams = static_cast<int32>(num_streams);
    384   std::unordered_map<int, int> node_to_stream_id;
    385   TF_RETURN_IF_ERROR(
    386       gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
    387   int64 elapsed = Env::Default()->NowMicros() - before;
    388   VLOG(3) << "AssignStreams took " << elapsed << "us";
    389 
    390   // Fill in the context map.  It is OK for this map to contain
    391   // duplicate DeviceContexts so long as we increment the refcount.
    392   device_context_map->resize(graph->num_node_ids());
    393   for (Node* n : graph->nodes()) {
    394     auto mapped_stream = node_to_stream_id[n->id()];
    395     CHECK_LE(mapped_stream, num_streams);
    396     auto ctx = device_contexts_[mapped_stream];
    397     VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
    398             << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
    399             << " " << n->type_string() << " " << n->name();
    400     ctx->Ref();
    401     (*device_context_map)[n->id()] = ctx;
    402   }
    403 
    404   return Status::OK();
    405 }
    406 
    407 void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
    408   // ScopedActivity is cheap when tracing is not active, but we can still
    409   // avoid computing the Hash64 unless tracing is active.
    410   // TODO(pbar) This would no longer be needed if Ops had a unique id.
    411   const uint64 id = port::Tracing::IsActive() ? Hash64(op_kernel->name()) : 0;
    412   port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
    413                                        id);
    414 
    415   // NOTE(tucker): We need to discriminate between Eigen GPU
    416   // operations and all others.  If an operation is Eigen
    417   // implemented (or otherwise tries to launch a cuda kernel
    418   // directly), we need to establish a stacked-scoped environment
    419   // that directs it to execute on the proper device.  Otherwise we
    420   // expect the Op to use StreamExecutor directly and correctly.  The
    421   // way we make this discrimination is quite hacky: At the moment
    422   // the only non-Eigen GPU Op is the recv-op, which is known to be
    423   // asynchronous.
    424   if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
    425     context->SetStatus(errors::Internal(
    426         "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
    427   } else if (port::Tracing::ScopedAnnotation::Enabled()) {
    428     port::Tracing::ScopedAnnotation annotation(op_kernel->name(),
    429                                                op_kernel->type_string());
    430     ComputeHelper(op_kernel, context);
    431   } else {
    432     ComputeHelper(op_kernel, context);
    433   }
    434 }
    435 
    436 void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
    437                                   OpKernelContext* context) {
    438   GPUDeviceContext* gpu_device_context = device_contexts_[0];
    439   if (context->op_device_context() != nullptr) {
    440     gpu_device_context =
    441         static_cast<GPUDeviceContext*>(context->op_device_context());
    442   }
    443   gpu::Stream* stream = gpu_device_context->stream();
    444   const auto stream_id = gpu_device_context->stream_id();
    445 
    446   const bool vlog_1 = VLOG_IS_ON(1);
    447   const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);
    448 
    449   if (vlog_1) {
    450     VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
    451             << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
    452             << stream_id << "]";
    453   }
    454 
    455   const auto num_streams = streams_.size();
    456   if (num_streams > 1) {
    457     // If this op's device context is different from the other contexts,
    458     // we must wait on the stream.
    459     for (int i = 0; i < context->num_inputs(); ++i) {
    460       const GPUDeviceContext* idc =
    461           static_cast<GPUDeviceContext*>(context->input_device_context(i));
    462       OP_REQUIRES(context, idc != nullptr,
    463                   errors::Internal("Input device context ", i,
    464                                    " was not set properly."));
    465       if (vlog_2) {
    466         const void* base;
    467         size_t len;
    468         if (context->has_input(i)) {
    469           if (IsRefType(context->input_dtype(i))) {
    470             Tensor tensor = context->mutable_input(i, false);
    471             base = DMAHelper::base(&tensor);
    472             len = tensor.TotalBytes();
    473           } else {
    474             const Tensor& tensor = context->input(i);
    475             base = DMAHelper::base(&tensor);
    476             len = tensor.TotalBytes();
    477           }
    478           LOG(INFO) << "Input " << i << " " << base << "  " << len;
    479           LOG(INFO) << "  stream[" << stream_id << "].ThenWaitFor(stream["
    480                     << idc->stream_id() << "])"
    481                     << ((idc->stream() == stream) ? " not needed" : "");
    482         }
    483       }
    484       if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
    485     }
    486   }
    487   gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
    488   op_kernel->Compute(context);
    489   if (context->status().ok()) {
    490     if (sync_every_op_) {
    491       // Note: GPUUtil::Sync() only syncs the default stream.
    492       // We need to either sync the stream used by this op, or
    493       // all streams.  Given that this flag is typically used for
    494       // debugging it makes more sense to sync all GPU activity.
    495       context->SetStatus(GPUUtil::SyncAll(this));
    496     }
    497   }
    498 }
    499 
    500 void BaseGPUDevice::ConsumeListOfAccessedTensors(
    501     DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
    502   GPUDeviceContext* gpu_device_context = device_contexts_[0];
    503   if (device_context != nullptr) {
    504     gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
    505   }
    506   gpu::Stream* stream = gpu_device_context->stream();
    507   em_->ThenDeleteTensors(stream, tensor_refs);
    508 }
    509 
    510 // Based on the semantics of Device::Sync this call should wait for
    511 // all streams, not just the current one.
    512 Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }
    513 
    514 void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
    515                                  OpKernelContext* context,
    516                                  AsyncOpKernel::DoneCallback done) {
    517   GPUDeviceContext* gpu_device_context = device_contexts_[0];
    518   if (context->op_device_context() != nullptr) {
    519     gpu_device_context =
    520         static_cast<GPUDeviceContext*>(context->op_device_context());
    521   }
    522   gpu::Stream* stream = gpu_device_context->stream();
    523   const auto stream_id = gpu_device_context->stream_id();
    524 
    525   VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
    526           << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
    527           << stream_id << "]";
    528 
    529   // When TraceMe profiling is off (which is the default), the
    530   // following TraceMe constructor amounts to a test of a false
    531   // condition. Measurements show that its overhead is negligible.
    532   port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
    533                                   op_kernel->IsExpensive());
    534   gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
    535   op_kernel->ComputeAsync(context, done);
    536 }
    537 
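        // Copies 'from' into a newly allocated tensor on this device and assigns
        // it to '*to', unless 'alloc_attrs' requests host memory, in which case
        // 'from' is aliased directly.  'done' is invoked when the copy (if any)
        // completes; the returned Status covers only errors detected before the
        // copy was enqueued.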
    538 Status BaseGPUDevice::MaybeCopyTensorToGPU(
    539     const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
    540     StatusCallback done) {
    541   if (alloc_attrs.on_host()) {
    542     *to = from;
    543     done(Status::OK());
    544     return Status::OK();
    545   } else {
    546     if (!DMAHelper::CanUseDMA(&from)) {
    547       Status err = errors::Internal("GPU copy from non-DMA ",
    548                                     DataTypeString(from.dtype()), " tensor");
    549       done(err);
    550       return err;
    551     }
    552     auto* copy =
    553         new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());
    554 
    555     // If the tensor is not initialized, we likely ran out of memory.
    556     if (!copy->IsInitialized()) {
    557       delete copy;
    558       Status err = errors::ResourceExhausted(
    559           "OOM when allocating tensor of shape ", from.shape().DebugString(),
    560           " and type ", DataTypeString(from.dtype()));
    561       done(err);
    562       return err;
    563     }
    564 
    565     StatusCallback wrapped_done = std::bind(
    566         [to, copy](StatusCallback done_,
    567                    // Begin unbound arguments.
    568                    const Status& s) {
    569           *to = std::move(*copy);
    570           delete copy;
    571           done_(s);
    572         },
    573         std::move(done), std::placeholders::_1);
    574 
    575     port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
    576     device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
    577                                                std::move(wrapped_done));
    578     return Status::OK();
    579   }
    580 }
    581 
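        // Parses 'tensor_proto' into a host tensor and copies it onto this GPU.
        // DT_VARIANT tensors are copied element by element via VariantDeviceCopy
        // so that each contained value can run its own host-to-device copy.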
    582 Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
    583                                           const AllocatorAttributes alloc_attrs,
    584                                           Tensor* tensor) {
    585   AllocatorAttributes attr;
    586   attr.set_on_host(true);
    587   attr.set_gpu_compatible(true);
    588   Allocator* host_alloc = GetAllocator(attr);
    589   Tensor parsed(tensor_proto.dtype());
    590   if (!parsed.FromProto(host_alloc, tensor_proto)) {
    591     return errors::InvalidArgument("Cannot parse tensor from proto: ",
    592                                    tensor_proto.DebugString());
    593   }
    594 
    595   if (parsed.dtype() == DT_VARIANT) {
    596     const Variant* from = parsed.flat<Variant>().data();
    597     Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
    598     Variant* copy_variant = copy.flat<Variant>().data();
    599 
    600     std::list<Notification> notifications;
    601     Status copy_status;
    602     auto copier = [this, &alloc_attrs, &notifications, &copy_status](
    603                       const Tensor& from, Tensor* to) {
    604       // Copier isn't run in a multithreaded environment, so we don't
    605       // have to worry about the notifications list being modified in parallel.
    606       notifications.emplace_back();
    607       Notification& n = *notifications.rbegin();
    608       return MaybeCopyTensorToGPU(alloc_attrs, from, to,
    609                                   [&n, &copy_status](const Status& s) {
    610                                     if (copy_status.ok()) {
    611                                       copy_status.Update(s);
    612                                     }
    613                                     n.Notify();
    614                                   });
    615     };
    616     Status s;
    617     for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
    618       s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
    619                             from[ix], &copy_variant[ix], copier);
    620       if (!s.ok()) {
    621         break;
    622       }
    623     }
    624     for (auto& n : notifications) {
    625       n.WaitForNotification();
    626     }
    627     if (!s.ok()) {
    628       return s;
    629     }
    630     *tensor = std::move(copy);
    631     return copy_status;
    632   } else {
    633     Notification n;
    634     Status status;
    635     TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
    636                                             [&n, &status](const Status& s) {
    637                                               status = s;
    638                                               n.Notify();
    639                                             }));
    640     n.WaitForNotification();
    641     return status;
    642   }
    643 }
    644 
    645 namespace {
    646 class ConcretePerOpGpuDevice : public PerOpGpuDevice {
    647  public:
    648   ConcretePerOpGpuDevice() : device_(&stream_device_) {}
    649 
    650   void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
    651                     TfGpuId tf_gpu_id, Allocator* base_allocator,
    652                     char* scratch) {
    653     stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id, base_allocator,
    654                                 scratch);
    655   }
    656 
    657   const Eigen::GpuDevice& device() const override { return device_; }
    658 
    659  private:
    660   EigenCudaStreamDevice stream_device_;
    661   Eigen::GpuDevice device_;
    662 };
    663 
    664 // Parse 'visible_device_list' into a list of CUDA GPU ids.
    665 Status ParseVisibleDeviceList(const string& visible_device_list,
    666                               std::vector<CudaGpuId>* visible_gpu_order) {
    667   visible_gpu_order->clear();
    668   gpu::Platform* gpu_manager = GPUMachineManager();
    669 
    670   // If the user wants to remap the visible to virtual GPU mapping,
    671   // check for that here.
    672   if (visible_device_list.empty()) {
    673     visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
    674     // By default, visible to virtual mapping is unchanged.
    675     int deviceNo = 0;
    676     std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
    677                   [&deviceNo] { return deviceNo++; });
    678   } else {
    679     const std::vector<string> order_str =
    680         str_util::Split(visible_device_list, ',');
    681     for (const string& cuda_gpu_id_str : order_str) {
    682       int32 cuda_gpu_id;
    683       if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
    684         return errors::InvalidArgument(
    685             "Could not parse entry in 'visible_device_list': '",
    686             cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
    687       }
    688       if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
    689         return errors::InvalidArgument(
    690             "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
    691             "' but visible device count is ",
    692             gpu_manager->VisibleDeviceCount());
    693       }
    694       visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
    695     }
    696   }
    697 
    698   // Validate no repeats.
    699   std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
    700                                          visible_gpu_order->end());
    701   if (visible_device_set.size() != visible_gpu_order->size()) {
    702     return errors::InvalidArgument(
    703         "visible_device_list contained a duplicate entry: ",
    704         visible_device_list);
    705   }
    706   return Status::OK();
    707 }
    708 
    709 Status VerifyVirtualDeviceSettings(
    710     const size_t num_gpus_to_use, const GPUOptions& gpu_options,
    711     const std::vector<CudaGpuId>& visible_gpu_order,
    712     const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
    713   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
    714   CHECK(!virtual_devices.empty());
    715   if (gpu_options.per_process_gpu_memory_fraction() > 0) {
    716     return errors::InvalidArgument(
    717         "It's invalid to set per_process_gpu_memory_fraction when "
    718         "virtual_devices is set.");
    719   }
    720   if (num_gpus_to_use < virtual_devices.size()) {
    721     return errors::Unknown(
    722         "Not enough GPUs to create virtual devices."
    723         " num_gpus_to_use: ",
    724         num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
    725   }
    726   if (!gpu_options.visible_device_list().empty() &&
    727       visible_gpu_order.size() != virtual_devices.size()) {
    728     return errors::InvalidArgument(
    729         "The number of GPUs in visible_device_list doesn't match the number "
    730         "of elements in the virtual_devices list.",
    731         " #GPUs in visible_device_list: ", visible_gpu_order.size(),
    732         " virtual_devices.size(): ", virtual_devices.size());
    733   }
    734   if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
    735     return errors::Unknown(
    736         "The number of valid GPUs doesn't match the number of elements in "
    737         "the virtual_devices list.",
    738         " #valid GPUs: ", valid_cuda_gpu_ids.size(),
    739         " virtual_devices.size(): ", virtual_devices.size());
    740   }
    741   return Status::OK();
    742 }
    743 
    744 int64 MinSystemMemory(int64 available_memory) {
    745   // We use the following heuristic for now:
    746   //
    747   // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
    748   // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
    749   //
    750   // In the future we could be more sophisticated by using a table of devices.
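          // For example, with 16 GiB of available memory this reserves
          // max(300 MiB, 0.05 * 16 GiB) ~= 819 MiB for the system.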
    751   int64 min_system_memory;
    752   if (available_memory < (1LL << 31)) {
    753     // 225MiB
    754     min_system_memory = 225 * 1024 * 1024;
    755   } else {
    756     // max(300 MiB, 0.05 * available_memory)
    757     min_system_memory =
    758         std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
    759   }
    760 #if defined(__GNUC__) && defined(__OPTIMIZE__)
    761 // Do nothing
    762 #elif !defined(__GNUC__) && defined(NDEBUG)
    763 // Do nothing
    764 #else
    765   // Double the reserved system memory in non-opt builds (debug
    766   // builds on Windows), because non-opt builds require more
    767   // system memory.
    768   min_system_memory *= 2;
    769 #endif
    770 
    771 #if defined(ANDROID_TEGRA)
    772   // Reserve 1 GiB of system memory on NVIDIA Tegra devices, which share
    773   // the same physical memory for system RAM and video RAM.
    774   min_system_memory = 1 << 30;
    775 #endif
    776   return min_system_memory;
    777 }
    778 
    779 // Get the memory limit for the virtual device being created on GPU with
    780 // 'cuda_gpu_id', when that virtual device is the only virtual device being
    781 // created on that GPU.
    782 Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
    783                                       CudaGpuId cuda_gpu_id,
    784                                       int64* memory_limit) {
    785   int64 total_memory = 0;
    786   int64 available_memory = 0;
    787   gpu::StreamExecutor* se =
    788       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
    789   if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
    790     return errors::Unknown("Failed to query available memory for GPU ",
    791                            cuda_gpu_id.value());
    792   }
    793 
    794   int64 allocated_memory = 0;
    795   const double per_process_gpu_memory_fraction =
    796       gpu_options.per_process_gpu_memory_fraction();
    797   if (per_process_gpu_memory_fraction == 0) {
    798     allocated_memory = available_memory;
    799     const int64 min_system_memory = MinSystemMemory(available_memory);
    800     if (min_system_memory < allocated_memory) {
    801       allocated_memory -= min_system_memory;
    802     }
    803   } else {
    804     allocated_memory = total_memory * per_process_gpu_memory_fraction;
    805   }
    806   *memory_limit = allocated_memory;
    807   return Status::OK();
    808 }
    809 }  // namespace
    810 
    811 void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
    812                                        PerOpGpuDevice* device, int stream_id,
    813                                        Allocator* allocator) {
    814   ConcretePerOpGpuDevice* concrete_device =
    815       static_cast<ConcretePerOpGpuDevice*>(device);
    816   DCHECK(concrete_device);
    817   const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
    818       streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
    819   concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
    820                                 scratch_[stream_id]);
    821 }
    822 
    823 PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
    824   return new ConcretePerOpGpuDevice();
    825 }
    826 
    827 void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
    828                                           PerOpGpuDevice* device,
    829                                           DeviceContext* dc,
    830                                           Allocator* allocator) {
    831   if (dc) {
    832     const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
    833     const int stream_id = gpu_dc->stream_id();
    834     VLOG(1) << "  eigen_gpu_device(" << dc << ") => stream[" << stream_id
    835             << "]";
    836     CHECK_LT(stream_id, streams_.size());
    837     ReinitializeDevice(context, device, stream_id, allocator);
    838   } else {
    839     ReinitializeDevice(context, device, 0, allocator);
    840   }
    841 }
    842 
    843 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
    844 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
    845 
    846 Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
    847                                            const string& name_prefix,
    848                                            std::vector<Device*>* devices) {
    849   TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
    850   gpu::Platform* gpu_manager = GPUMachineManager();
    851   if (gpu_manager == nullptr) {
    852     return Status::OK();
    853   }
    854   // If there are no GPUs visible, do nothing.
    855   if (gpu_manager->VisibleDeviceCount() <= 0) {
    856     return Status::OK();
    857   }
    858 
    859   size_t num_gpus_to_use = INT_MAX;
    860   auto iter = options.config.device_count().find("GPU");
    861   if (iter != options.config.device_count().end()) {
    862     num_gpus_to_use = iter->second;
    863   }
    864   const auto& gpu_options = options.config.gpu_options();
    865   std::vector<CudaGpuId> visible_gpu_order;
    866   TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
    867                                             &visible_gpu_order));
    868 
    869   std::vector<CudaGpuId> valid_cuda_gpu_ids;
    870   TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
    871   if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
    872     num_gpus_to_use = valid_cuda_gpu_ids.size();
    873   }
    874   if (!valid_cuda_gpu_ids.empty()) {
    875     // Save the original device.
    876     int original_device = 0;
    877     cudaError_t err = cudaGetDevice(&original_device);
    878     if (err != cudaSuccess) {
    879       return errors::Internal("cudaGetDevice() failed. Status: ",
    880                               cudaGetErrorString(err));
    881     }
    882     // Force implicit initialization of the CUDA runtime on each valid
    883     // GPU before CreateGPUDevice() is called.
    884     for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
    885       err = cudaSetDevice(cuda_gpu_id.value());
    886       if (err != cudaSuccess) {
    887         return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
    888                                 " failed. Status: ", cudaGetErrorString(err));
    889       }
    890       err = cudaFree(nullptr);
    891       if (err != cudaSuccess) {
    892         return errors::Internal(
    893             "CUDA runtime implicit initialization on GPU:", cuda_gpu_id.value(),
    894             " failed. Status: ", cudaGetErrorString(err));
    895       }
    896     }
    897     // Reset to the original device.
    898     err = cudaSetDevice(original_device);
    899     if (err != cudaSuccess) {
    900       return errors::Internal("cudaSetDevice() on GPU:", original_device,
    901                               " failed. Status: ", cudaGetErrorString(err));
    902     }
    903   }
    904 
    905   std::vector<InterconnectMap> interconnect_maps;
    906   TF_RETURN_IF_ERROR(
    907       GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));
    908 
    909   // Print each interconnect map to the log.
    910   for (const InterconnectMap& im : interconnect_maps) {
    911     LOG(INFO) << "Device interconnect " << im.name << " with strength "
    912               << im.strength << " edge matrix:";
    913     string line_buf = "     ";
    914     for (int i = 0; i < visible_gpu_order.size(); ++i) {
    915       strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
    916     }
    917     LOG(INFO) << line_buf;
    918     for (int i = 0; i < visible_gpu_order.size(); ++i) {
    919       line_buf = strings::StrCat(visible_gpu_order[i].value(), ":   ");
    920       CudaGpuId cuda_id_i = visible_gpu_order[i];
    921       for (int j = 0; j < visible_gpu_order.size(); ++j) {
    922         CudaGpuId cuda_id_j = visible_gpu_order[j];
    923         if (im.directed_links.find({cuda_id_i, cuda_id_j}) !=
    924             im.directed_links.end()) {
    925           line_buf.append("Y ");
    926         } else {
    927           line_buf.append("N ");
    928         }
    929       }
    930       LOG(INFO) << line_buf;
    931     }
    932   }
    933 
    934   const auto& virtual_devices = gpu_options.experimental().virtual_devices();
    935   if (!virtual_devices.empty()) {
    936     TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
    937         num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
    938     // We've verified that num_gpus_to_use >= virtual_devices.size().
    939     num_gpus_to_use = virtual_devices.size();
    940     CHECK(gpu_options.visible_device_list().empty() ||
    941           valid_cuda_gpu_ids == visible_gpu_order);
    942   }
    943   int next_tf_gpu_id = 0;
    944   std::vector<int64> memory_limit_bytes;
    945   for (int i = 0; i < num_gpus_to_use; ++i) {
    946     const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
    947     if (virtual_devices.empty() ||
    948         virtual_devices.Get(i).memory_limit_mb_size() == 0) {
    949       int64 single_virtual_device_memory_limit = 0;
    950       TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
    951           gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
    952       memory_limit_bytes.push_back(single_virtual_device_memory_limit);
    953     } else {
    954       const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
    955       std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
    956                      std::back_inserter(memory_limit_bytes), [](float mb) {
    957                        return static_cast<int64>(mb) * (1ll << 20);
    958                      });
    959     }
    960     while (next_tf_gpu_id < memory_limit_bytes.size()) {
    961       TfGpuId tf_gpu_id(next_tf_gpu_id);
    962       ++next_tf_gpu_id;
    963       GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
    964     }
    965   }
    966   const int num_tf_gpus = next_tf_gpu_id;
    967 
    968   LocalityMap device_localities;
    969   TF_RETURN_IF_ERROR(
    970       GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));
    971 
    972   // Build the GPUDevices
    973   CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
    974   for (int di = 0; di < num_tf_gpus; ++di) {
    975     TfGpuId tf_gpu_id(di);
    976     int64 bytes = memory_limit_bytes[di];
    977     auto it = device_localities.find(tf_gpu_id);
    978     if (it == device_localities.end()) {
    979       return errors::Internal("Failed to find DeviceLocality for GPU device ",
    980                               tf_gpu_id.value());
    981     }
    982     TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
    983                                        it->second, devices));
    984   }
    985   return Status::OK();
    986 }
    987 
    988 static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
    989                                         const gpu::DeviceDescription& desc) {
    990   int cc_major;
    991   int cc_minor;
    992   if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    993     cc_major = 0;
    994     cc_minor = 0;
    995   }
    996   // LINT.IfChange
    997   return strings::StrCat("device: ", cuda_gpu_id.value(),
    998                          ", name: ", desc.name(),
    999                          ", pci bus id: ", desc.pci_bus_id(),
   1000                          ", compute capability: ", cc_major, ".", cc_minor);
   1001   // LINT.ThenChange(//tensorflow/python/platform/test.py)
   1002 }
   1003 
   1004 Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   1005                                              const string& name_prefix,
   1006                                              TfGpuId tf_gpu_id,
   1007                                              int64 memory_limit,
   1008                                              const DeviceLocality& dev_locality,
   1009                                              std::vector<Device*>* devices) {
   1010   CHECK_GE(tf_gpu_id.value(), 0);
   1011   const string device_name =
   1012       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
   1013   GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
   1014   CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
   1015   int numa_node = dev_locality.numa_node();
   1016   Bytes allocated_bytes = static_cast<Bytes>(memory_limit);
   1017 
   1018   gpu::StreamExecutor* se =
   1019       GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
   1020   const gpu::DeviceDescription& desc = se->GetDeviceDescription();
   1021   LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
   1022             << (memory_limit >> 20) << " MB memory) -> physical GPU ("
   1023             << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
   1024   ProcessState* process_state = ProcessState::singleton();
   1025   BaseGPUDevice* gpu_device = CreateGPUDevice(
   1026       options, device_name, allocated_bytes, dev_locality, tf_gpu_id,
   1027       GetShortDeviceDescription(cuda_gpu_id, desc),
   1028       process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
   1029                                      memory_limit),
   1030       process_state->GetCPUAllocator(numa_node));
   1031   TF_RETURN_IF_ERROR(gpu_device->Init(options));
   1032   devices->push_back(gpu_device);
   1033 
   1034   return Status::OK();
   1035 }
   1036 
   1037 namespace {
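        // Builds a map recording, for each ordered pair of visible GPUs, whether
        // the first device's StreamExecutor reports it can enable peer access to
        // the second.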
   1038 std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
   1039 GetPeerAccessMap(gpu::Platform* platform,
   1040                  const std::vector<CudaGpuId>& visible_gpu_order) {
   1041   std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
   1042       new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
   1043   for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
   1044     for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
   1045       gpu::StreamExecutor* from =
   1046           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
   1047       gpu::StreamExecutor* to =
   1048           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
   1049       (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
   1050     }
   1051   }
   1052 
   1053   return map;
   1054 }
   1055 
   1056 }  // namespace
   1057 
   1058 Status BaseGPUDeviceFactory::GetInterconnectMaps(
   1059     const std::vector<CudaGpuId>& visible_gpu_order, gpu::Platform* gpu_manager,
   1060     std::vector<InterconnectMap>* maps) {
   1061   // The default interconnect map is obtained from the StreamExecutor.
   1062   auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
   1063   maps->resize(1);
   1064   InterconnectMap& imap = maps->at(0);
   1065   imap.name = "StreamExecutor";
   1066   imap.strength = InterconnectMap::kStreamExecutorStrength;
   1067   for (CudaGpuId cuda_id_i : visible_gpu_order) {
   1068     for (CudaGpuId cuda_id_j : visible_gpu_order) {
   1069       if (cuda_id_i == cuda_id_j) continue;
   1070       if ((*access_map)[{cuda_id_i, cuda_id_j}]) {
   1071         imap.directed_links.insert({cuda_id_i, cuda_id_j});
   1072       }
   1073     }
   1074   }
   1075   return Status::OK();
   1076 }
   1077 
   1078 Status BaseGPUDeviceFactory::GetDeviceLocalities(
   1079     int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
   1080     LocalityMap* localities) {
   1081   std::vector<TfGpuId> all_tf_gpu_ids;
   1082   for (int i = 0; i < num_tf_gpus; ++i) {
   1083     all_tf_gpu_ids.push_back(TfGpuId(i));
   1084   }
   1085   for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
   1086     CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
   1087     // Get GPU bus_id from its reported NUMA affinity.  Because GPUs are
   1088     // virtualized in some environments, we can't just use the GPU id.
   1089     // NUMA locales are indexed from 0, buses are indexed from 1.
   1090     gpu::StreamExecutor* se =
   1091         GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
   1092     const gpu::DeviceDescription& desc = se->GetDeviceDescription();
   1093     int numa_node = desc.numa_node();
   1094     if (numa_node < 0) {
   1095       // For some reason the StreamExecutor couldn't get the NUMA
   1096       // affinity of the GPU.  If this is not a multi-socket motherboard
   1097       // with GPUs local to different buses, it doesn't matter.  If it is, we
   1098       // may run into trouble later with data transfer operations.  The
   1099       // trouble may manifest as slower than expected performance, or
   1100       // outright failures.
   1101       LOG(INFO) << "Could not identify NUMA node of CUDA gpu id " << cuda_gpu_id
   1102                 << ", defaulting to 0.  Your kernel may not have been built "
   1103                 << "with NUMA support.";
   1104       numa_node = 0;
   1105     }
   1106     DeviceLocality dev_locality;
   1107     dev_locality.set_numa_node(numa_node);
   1108     dev_locality.set_bus_id(numa_node + 1);
   1109 
   1110     // Set LocalLinks from InterconnectMaps.
   1111     LocalLinks* links = dev_locality.mutable_links();
   1112     for (const InterconnectMap& imap : interconnects) {
   1113       for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
   1114         CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
   1115         if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
   1116             imap.directed_links.end()) {
   1117           InterconnectLink* ilink = links->add_link();
   1118           ilink->set_device_id(tf_gpu_dst.value());
   1119           ilink->set_type(imap.name);
   1120           ilink->set_strength(imap.strength);
   1121         }
   1122       }
   1123     }
   1124 
   1125     // If this is one of multiple virtual GPUs on the same physical GPU,
   1126     // add high-strength links to the others.
   1127     for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
   1128       if (tf_gpu_id == tf_gpu_dst) continue;
   1129       CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
   1130       if (cuda_gpu_id == cuda_gpu_dst) {
   1131         InterconnectLink* ilink = links->add_link();
   1132         ilink->set_device_id(tf_gpu_dst.value());
   1133         ilink->set_type("SAME_DEVICE");
   1134         ilink->set_strength(InterconnectMap::kSameDeviceStrength);
   1135       }
   1136     }
   1137 
   1138     (*localities)[tf_gpu_id] = dev_locality;
   1139     VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId " << tf_gpu_id
   1140             << " on bus " << dev_locality.bus_id() << " numa: " << numa_node
   1141             << " pci: " << desc.pci_bus_id()
   1142             << " DeviceLocality: " << dev_locality.DebugString();
   1143   }
   1144   return Status::OK();
   1145 }
   1146 
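        // Returns the default threshold: the highest multiprocessor count found
        // among the visible GPUs, capped at kDefaultMinGPUMultiprocessorCount.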
   1147 static int GetDefaultMinGPUMultiprocessorCount(
   1148     gpu::Platform* gpu_manager,
   1149     const std::vector<CudaGpuId>& visible_gpu_order) {
   1150   static const int kDefaultMinGPUMultiprocessorCount = 8;
   1151 
   1152   // Find the highest multi-processor count across all visible GPUs.
   1153   int max_count = -1;
   1154   for (int i = 0; i < visible_gpu_order.size(); ++i) {
   1155     auto exec_status =
   1156         GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
   1157     if (!exec_status.ok()) {
   1158       continue;
   1159     }
   1160 
   1161     gpu::StreamExecutor* se = exec_status.ValueOrDie();
   1162     const gpu::DeviceDescription& desc = se->GetDeviceDescription();
   1163     max_count = std::max(max_count, desc.core_count());
   1164   }
   1165 
   1166   if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
   1167     return kDefaultMinGPUMultiprocessorCount;
   1168   } else {
   1169     return max_count;
   1170   }
   1171 }
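         // Illustrative consequences of the logic above: if the most capable
         // visible GPU reports 20 multiprocessors, the threshold stays at the
         // default of 8; if the only visible GPU reports 6, the threshold is 6;
         // if no GPU could be queried, the threshold is 8.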
   1172 
   1173 static int GetMinGPUMultiprocessorCount(
   1174     gpu::Platform* gpu_manager,
   1175     const std::vector<CudaGpuId>& visible_gpu_order) {
   1176   const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");
   1177 
   1178   if (tf_min_gpu_core_count == nullptr ||
   1179       strcmp(tf_min_gpu_core_count, "") == 0) {
   1180     return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
   1181   }
   1182 
   1183   int min_gpu_core_count = -1;
   1184   if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
   1185     if (min_gpu_core_count >= 0) {
   1186       return min_gpu_core_count;
   1187     }
   1188   }
   1189 
   1190   int count =
   1191       GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
   1192   LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
   1193              << tf_min_gpu_core_count << "]. "
   1194              << "Using the default value: " << count;
   1195   return count;
   1196 }
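         // Illustrative override (the shell command and script name are
         // hypothetical, not part of TensorFlow):
         //   TF_MIN_GPU_MULTIPROCESSOR_COUNT=4 python train.py
         // A negative or non-numeric value is rejected above: the computed
         // default is used instead and an error is logged.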
   1197 
   1198 namespace {
   1199 
   1200 struct CudaVersion {
   1201   // Initialize from version_name in the form of "3.5"
   1202   explicit CudaVersion(const std::string& version_name) {
   1203     size_t dot_pos = version_name.find('.');
   1204     CHECK(dot_pos != string::npos)
   1205         << "Illegal version name: [" << version_name << "]";
   1206     string major_str = version_name.substr(0, dot_pos);
   1207     CHECK(strings::safe_strto32(major_str, &major_part))
   1208         << "Illegal version name: [" << version_name << "]";
   1209     string minor_str = version_name.substr(dot_pos + 1);
   1210     CHECK(strings::safe_strto32(minor_str, &minor_part))
   1211         << "Illegal version name: [" << version_name << "]";
   1212   }
   1213   CudaVersion() {}
   1214   bool operator<(const CudaVersion& other) const {
   1215     if (this->major_part != other.major_part) {
   1216       return this->major_part < other.major_part;
   1217     }
   1218     return this->minor_part < other.minor_part;
   1219   }
   1220   friend std::ostream& operator<<(std::ostream& os,
   1221                                   const CudaVersion& version) {
   1222     os << version.major_part << "." << version.minor_part;
   1223     return os;
   1224   }
   1225   int major_part = -1;
   1226   int minor_part = -1;
   1227 };
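         // For example, CudaVersion("3.5") yields major_part == 3 and
         // minor_part == 5, and the ordering defined above gives
         // CudaVersion("3.0") < CudaVersion("3.5") < CudaVersion("5.2").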
   1228 
   1229 std::vector<CudaVersion> supported_cuda_compute_capabilities = {
   1230     TF_CUDA_CAPABILITIES,};
   1231 
   1232 std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() {
   1233   auto cuda_caps = supported_cuda_compute_capabilities;
   1234 #ifdef TF_EXTRA_CUDA_CAPABILITIES
    1235 // TF_EXTRA_CUDA_CAPABILITIES should be defined as a comma-separated sequence,
   1236 // for example:
   1237 //   TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0
   1238 // Use two-level macro expansion for stringification.
   1239 #define TF_XSTRING(...) #__VA_ARGS__
   1240 #define TF_STRING(s) TF_XSTRING(s)
   1241   string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES);
   1242 #undef TF_STRING
   1243 #undef TF_XSTRING
   1244   auto extra_capabilities = str_util::Split(extra_cuda_caps, ',');
   1245   for (const auto& capability : extra_capabilities) {
   1246     cuda_caps.push_back(CudaVersion(capability));
   1247   }
   1248 #endif
   1249   return cuda_caps;
   1250 }
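         // In the #ifdef block above, the two-level macro expansion stringifies
         // the macro value, so a build defining, say,
         // -DTF_EXTRA_CUDA_CAPABILITIES=5.2,6.1 (hypothetical flags) yields the
         // string "5.2,6.1", which is split on ',' and appended to the list of
         // supported capabilities.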
   1251 
   1252 Status EnablePeerAccess(gpu::Platform* platform,
   1253                         const std::vector<CudaGpuId>& visible_gpu_order) {
   1254   int possible_peer_count = 0;
   1255   int enabled_peer_count = 0;
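           // Every ordered pair of visible devices, including self-pairs, is
           // checked below: N visible GPUs mean N*N CanEnablePeerAccessTo()
           // queries (self-pairs are normally reported as not peerable and are
           // therefore skipped).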
   1256   for (int i = 0; i < visible_gpu_order.size(); ++i) {
   1257     const CudaGpuId cuda_gpu_i = visible_gpu_order[i];
   1258     for (int j = 0; j < visible_gpu_order.size(); ++j) {
   1259       const CudaGpuId cuda_gpu_j = visible_gpu_order[j];
    1260       // We have already validated that ExecutorForCudaGpuId() calls return OK.
   1261       gpu::StreamExecutor* from =
   1262           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
   1263       gpu::StreamExecutor* to =
   1264           GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
   1265 
   1266       if (from->CanEnablePeerAccessTo(to)) {
   1267         ++possible_peer_count;
   1268         auto status = from->EnablePeerAccessTo(to);
   1269         if (!status.ok()) {
   1270           LOG(WARNING)
   1271               << "Unable to enable peer access between device ordinals "
   1272               << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status;
   1273         } else {
   1274           ++enabled_peer_count;
   1275         }
   1276       }
   1277     }
   1278   }
   1279 
   1280   // Return an error in the extreme failure case where the driver
   1281   // reported that peering was possible but not a single peering was
   1282   // successful.  This is to catch possible system misconfigurations
   1283   // or more fundamental issues.
   1284   if (possible_peer_count > 0 && enabled_peer_count == 0) {
   1285     return errors::Internal(possible_peer_count,
   1286                             " potential peer access pairs were reported by the "
   1287                             "driver, but no peering could be enabled.");
   1288   }
   1289   return Status::OK();
   1290 }
   1291 
   1292 }  // namespace
   1293 
   1294 Status BaseGPUDeviceFactory::GetValidDeviceIds(
   1295     const std::vector<CudaGpuId>& visible_gpu_order,
   1296     std::vector<CudaGpuId>* ids) {
   1297   gpu::Platform* gpu_manager = GPUMachineManager();
   1298   bool new_gpu_found = false;
   1299   for (int i = 0; i < visible_gpu_order.size(); ++i) {
   1300     const CudaGpuId cuda_gpu_id = visible_gpu_order[i];
   1301 
   1302     // Only perform this once per visible cuda gpu id.
   1303     if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
   1304       continue;
   1305     }
   1306 
   1307     visible_gpu_initialized_[cuda_gpu_id.value()] = true;
   1308     new_gpu_found = true;
   1309 
   1310     auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
   1311     if (!executor.ok()) {
   1312       return StreamExecutorUtil::ConvertStatus(executor.status());
   1313     }
   1314 
   1315     auto stream_exec = executor.ValueOrDie();
   1316     int64 free_bytes;
   1317     int64 total_bytes;
   1318     if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
   1319       // Logs internally on failure.
   1320       free_bytes = 0;
   1321       total_bytes = 0;
   1322     }
   1323     const auto& description = stream_exec->GetDeviceDescription();
   1324     int cc_major;
   1325     int cc_minor;
   1326     if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
   1327       // Logs internally on failure.
   1328       cc_major = 0;
   1329       cc_minor = 0;
   1330     }
   1331     LOG(INFO) << "Found device " << i << " with properties: "
   1332               << "\nname: " << description.name() << " major: " << cc_major
   1333               << " minor: " << cc_minor
   1334               << " memoryClockRate(GHz): " << description.clock_rate_ghz()
   1335               << "\npciBusID: " << description.pci_bus_id() << "\ntotalMemory: "
   1336               << strings::HumanReadableNumBytes(total_bytes)
   1337               << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
   1338   }
    1339   // Check for and enable peer access if more than one GPU was found.
   1340   if (new_gpu_found && visible_gpu_order.size() > 1) {
   1341     // Enable peer access
   1342     TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
   1343   }
   1344 
   1345   auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
   1346   if (cuda_supported_capabilities.empty()) {
   1347     return errors::FailedPrecondition(
    1348         "No supported CUDA capabilities in binary.");
   1349   }
   1350   CudaVersion min_supported_capability = *std::min_element(
   1351       cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());
   1352 
   1353   int min_gpu_core_count =
   1354       GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
   1355 
   1356   // Filter out devices that don't have the right capability or power.
   1357   for (int i = 0; i < visible_gpu_order.size(); ++i) {
   1358     const CudaGpuId visible_gpu_id = visible_gpu_order[i];
   1359     auto exec_status =
   1360         GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
   1361     if (!exec_status.ok()) {
   1362       LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
    1363                 << " whose executor is in an invalid state: "
   1364                 << exec_status.status().ToString();
   1365       continue;
   1366     }
   1367     gpu::StreamExecutor* se = exec_status.ValueOrDie();
   1368     const gpu::DeviceDescription& desc = se->GetDeviceDescription();
   1369     CudaVersion device_capability;
   1370     if (!desc.cuda_compute_capability(&device_capability.major_part,
   1371                                       &device_capability.minor_part)) {
   1372       LOG(INFO) << "Ignoring visible gpu device "
   1373                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
   1374                 << ") "
   1375                 << "whose CUDA compute capability is not available.";
   1376       continue;
   1377     }
    1378     // Only GPUs with at least the minimum supported compute capability
    1379     // are accepted.
   1380     if (device_capability < min_supported_capability) {
   1381       LOG(INFO) << "Ignoring visible gpu device "
   1382                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
   1383                 << ") "
    1384                 << "with CUDA compute capability " << device_capability
    1385                 << ". The minimum required CUDA capability is "
   1386                 << min_supported_capability << ".";
   1387       continue;
   1388     }
   1389 
   1390     // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
   1391     // count than the fastest GPU are filtered out, unless they have 8 or more
   1392     // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
    1393     // variable is set, its value is used as the threshold instead.
   1394     if (desc.core_count() < min_gpu_core_count) {
   1395       LOG(INFO) << "Ignoring visible gpu device "
   1396                 << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
   1397                 << ") "
    1398                 << "with CUDA multiprocessor count: " << desc.core_count()
   1399                 << ". The minimum required count is " << min_gpu_core_count
   1400                 << ". You can adjust this requirement with the env var "
   1401                    "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
   1402       continue;
   1403     }
   1404     ids->push_back(visible_gpu_id);
   1405   }
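           // Worked example of the filters above (hypothetical machine): with
           // TF_MIN_GPU_MULTIPROCESSOR_COUNT unset and two visible GPUs that pass
           // the compute-capability check but report 80 and 5 multiprocessors,
           // the threshold computed earlier is 8, so the 5-multiprocessor GPU is
           // ignored while the 80-multiprocessor GPU is added to "ids".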
   1406   if (!ids->empty()) {
   1407     std::vector<int> raw_ids(ids->size());
   1408     std::transform(ids->begin(), ids->end(), raw_ids.begin(),
   1409                    [](CudaGpuId id) -> int { return id.value(); });
   1410     LOG(INFO) << "Adding visible gpu devices: "
   1411               << str_util::Join(raw_ids, ", ");
   1412   }
   1413 
   1414   return Status::OK();
   1415 }
   1416 
   1417 }  // namespace tensorflow
   1418 
   1419 #endif  // GOOGLE_CUDA
   1420