/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// TODO(opensource): Use a more generic sounding preprocessor name than
// GOOGLE_CUDA
#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/common_runtime/gpu/gpu_device.h"

#include <stdlib.h>
#include <string.h>
#include <algorithm>
#include <list>
#include <map>
#include <tuple>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_manager.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/common_runtime/gpu/gpu_stream_util.h"
#include "tensorflow/core/common_runtime/gpu/gpu_util.h"
#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/device_base.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/graph/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/cuda.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/public/session_options.h"
#include "tensorflow/core/util/device_name_utils.h"
#include "tensorflow/core/util/env_var.h"
#include "tensorflow/core/util/stream_executor_util.h"

#if !defined(PLATFORM_GOOGLE)
#include "cuda/cuda_config.h"
#endif

namespace tensorflow {

// Eigen Ops directly allocate memory only for temporary buffers used
// during OpKernel::Compute(). The recommended way of allocating such
// memory is via OpKernelContext::allocate_temp(). However, Eigen Ops
// don't have access to OpKernelContext, instead they get access to
// memory directly through the device allocator.
// As an Open Source project, Eigen assumes allocator semantics similar to
// those of the CUDA memory allocator, and may not work correctly due to race
// conditions if used with some other allocator. For safety, we need to delay
// deallocation calls out of Eigen until all events on the corresponding
// stream have completed. The following two classes serve this purpose in two
// different compilation environments.

class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
 public:
  EigenCudaStreamDevice()
      : scratch_(nullptr), semaphore_(nullptr), context_(nullptr) {
    Eigen::initializeDeviceProp();
  }
  ~EigenCudaStreamDevice() override {}
  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                    TfGpuId tf_gpu_id, ::tensorflow::Allocator* alloc,
                    char* scratch) {
    if (LogMemory::IsEnabled()) {
      operation_ = context->op_kernel().name() + "/EigenAllocator";
      step_id_ = context->step_id();
    }
    context_ = context;
    scratch_ = scratch;
    semaphore_ =
        reinterpret_cast<unsigned int*>(scratch + Eigen::kCudaScratchSize);
    stream_ = cuda_stream;
    allocator_ = alloc;
    const int cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id).value();
    device_prop_ = &Eigen::m_deviceProperties[cuda_gpu_id];
  }

  const cudaStream_t& stream() const override { return *stream_; }
  const cudaDeviceProp& deviceProperties() const override {
    return *device_prop_;
  }

  void* allocate(size_t num_bytes) const override {
    void* ret = allocator_->AllocateRaw(32 /* alignment */, num_bytes);
    if (ret == nullptr) {
      if (context_) {
        context_->SetStatus(errors::ResourceExhausted(
            strings::StrCat("Ran out of GPU memory when allocating ",
                            num_bytes, " bytes for ", operation_)));
      } else {
        LOG(FATAL)
            << "EigenAllocator for GPU ran out of memory when allocating "
            << num_bytes << ". See error logs for more detailed info.";
      }
    }
    if (LogMemory::IsEnabled() && ret != nullptr) {
      LogMemory::RecordRawAllocation(operation_, step_id_, num_bytes, ret,
                                     allocator_);
    }
    return ret;
  }
  void deallocate(void* buffer) const override {
    if (LogMemory::IsEnabled() && buffer != nullptr) {
      LogMemory::RecordRawDeallocation(operation_, step_id_, buffer,
                                       allocator_, true);
    }
    AsyncFreeData* afData =
        new AsyncFreeData(allocator_, buffer, operation_, step_id_);
    cudaError_t err = cudaStreamAddCallback(*stream_, asyncFree, afData, 0);
    CHECK_EQ(err, cudaSuccess);
  }

  // Return a pointer to a per-stream scratchpad of 1024 bytes residing
  // in global memory.
  void* scratchpad() const override { return scratch_; }

  // Return a semaphore. The semaphore is initialized to 0, and each kernel
  // using it is responsible for resetting it to 0 upon completion, to
  // maintain the invariant that the semaphore is always equal to 0 at the
  // start of each kernel.
  unsigned int* semaphore() const override { return semaphore_; }

 private:
  struct AsyncFreeData {
    AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
                  const int64 s)
        : allocator_(a), address_(p), operation_(o), step_id_(s) {}
    ::tensorflow::Allocator* allocator_;
    void* address_;
    const string operation_;
    const int64 step_id_;
  };

  static void CUDART_CB asyncFree(cudaStream_t stream, cudaError_t status,
                                  void* userData) {
    AsyncFreeData* data = static_cast<AsyncFreeData*>(userData);
    if (LogMemory::IsEnabled()) {
      LogMemory::RecordRawDeallocation(data->operation_, data->step_id_,
                                       data->address_, data->allocator_,
                                       false);
    }
    data->allocator_->DeallocateRaw(data->address_);
    delete data;
  }

  string operation_;
  int64 step_id_;
  const cudaStream_t* stream_;          // Not owned.
  const cudaDeviceProp* device_prop_;   // Not owned.
  ::tensorflow::Allocator* allocator_;  // Not owned.
  mutable char* scratch_;
  mutable unsigned int* semaphore_;
  OpKernelContext* context_;

  TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};

// This factory helps to ensure that different GPU device objects that refer to
// the same physical device and stream group id use the same stream group
// object (and therefore the same CUDA streams). This is necessary since there
// is a single memory allocator per device (see ProcessState::GetGPUAllocator)
// and allocators must not be shared across streams.
class BaseGPUDevice::StreamGroupFactory {
 public:
  // Returns the unique stream group for use with the stream defined by
  // {tf_gpu_id, stream_group_within_gpu}, creating it if it does not yet
  // exist.
  // This function is thread safe.
  BaseGPUDevice::StreamGroup* GetOrCreate(TfGpuId tf_gpu_id,
                                          int stream_group_within_gpu,
                                          gpu::StreamExecutor* executor) {
    mutex_lock guard(lock_);
    StreamGroup* group =
        &streams_[key_type(tf_gpu_id.value(), stream_group_within_gpu)];
    if (!group->compute) {
      group->compute = new gpu::Stream(executor);
      group->compute->Init();
      VLOG(2) << "Created stream[" << stream_group_within_gpu
              << "] = " << group->compute;

      group->host_to_device = new gpu::Stream(executor);
      group->host_to_device->Init();
      VLOG(2) << "Created host_to_device_stream[" << stream_group_within_gpu
              << "] = " << group->host_to_device;

      group->device_to_host = new gpu::Stream(executor);
      group->device_to_host->Init();
      VLOG(2) << "Created device_to_host_stream[" << stream_group_within_gpu
              << "] = " << group->device_to_host;

      group->device_to_device = new gpu::Stream(executor);
      group->device_to_device->Init();
      VLOG(2) << "Created device_to_device_stream[" << stream_group_within_gpu
              << "] = " << group->device_to_device;
    }
    return group;
  }

  // Returns a reference to the StreamGroupFactory singleton. Note that this is
  // never destroyed, so the objects it owns are never deleted.
  static StreamGroupFactory& Global() {
    static StreamGroupFactory* instance = new StreamGroupFactory();
    return *instance;
  }

 private:
  mutex lock_;
  using key_type = std::tuple<int, int>;
  std::map<key_type, StreamGroup> streams_;

  // StreamGroupFactory cannot be created directly; call
  // StreamGroupFactory::Global() to get the global instance.
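  // Typical use, as in BaseGPUDevice::Init() below:
  //   StreamGroupFactory::Global().GetOrCreate(tf_gpu_id, i, executor);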
  StreamGroupFactory() = default;
  TF_DISALLOW_COPY_AND_ASSIGN(StreamGroupFactory);
};

BaseGPUDevice::BaseGPUDevice(const SessionOptions& options, const string& name,
                             Bytes memory_limit,
                             const DeviceLocality& locality, TfGpuId tf_gpu_id,
                             const string& physical_device_desc,
                             Allocator* gpu_allocator, Allocator* cpu_allocator,
                             bool sync_every_op, int32 max_streams)
    : LocalDevice(options, Device::BuildDeviceAttributes(name, DEVICE_GPU,
                                                         memory_limit, locality,
                                                         physical_device_desc)),
      gpu_allocator_(gpu_allocator),
      cpu_allocator_(cpu_allocator),
      tf_gpu_id_(tf_gpu_id),
      sync_every_op_(sync_every_op),
      max_streams_(max_streams) {
  ProcessState::singleton()->EnableGPUDevice();
}

BaseGPUDevice::~BaseGPUDevice() {
  delete gpu_device_info_;
  for (auto ctx : device_contexts_) ctx->Unref();
}

Status BaseGPUDevice::Init(const SessionOptions& options) {
  auto executor_status = GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id_);
  if (!executor_status.status().ok()) {
    return errors::Internal("Failed to get StreamExecutor for device ",
                            tf_gpu_id_.value());
  }

  executor_ = executor_status.ValueOrDie();
  em_.reset(new EventMgr(executor_, options.config.gpu_options()));

  if (max_streams_ < 1) {
    return errors::InvalidArgument("Invalid value for max_streams.");
  }

  // Create the specified number of GPU streams.
  for (int i = 0; i < max_streams_; i++) {
    streams_.push_back(
        StreamGroupFactory::Global().GetOrCreate(tf_gpu_id_, i, executor_));

    size_t scratch_buffer_size = Eigen::kCudaScratchSize + sizeof(unsigned int);
    void* scratch_buffer = gpu_allocator_->AllocateRaw(
        Allocator::kAllocatorAlignment, scratch_buffer_size);
    if (scratch_buffer == nullptr) {
      return errors::FailedPrecondition(
          "Failed to allocate scratch buffer for device ", tf_gpu_id_.value());
    }
    scratch_.push_back(static_cast<char*>(scratch_buffer));

    perftools::gputools::DeviceMemory<char> mem(
        perftools::gputools::DeviceMemoryBase(scratch_buffer,
                                              scratch_buffer_size));

    bool ok = executor_->SynchronousMemZero(
        &mem, Eigen::kCudaScratchSize + sizeof(unsigned int));
    if (!ok) {
      return errors::FailedPrecondition(
          "Failed to memcopy into scratch buffer for device ",
          tf_gpu_id_.value());
    }

    device_contexts_.push_back(new GPUDeviceContext(
        i, streams_.back()->compute, streams_.back()->host_to_device,
        streams_.back()->device_to_host, streams_.back()->device_to_device));
  }
  gpu_device_info_ = new GpuDeviceInfo;
  gpu_device_info_->stream = streams_[0]->compute;
  gpu_device_info_->default_context = device_contexts_[0];
  gpu_device_info_->event_mgr = em_.get();
  gpu_device_info_->gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id_).value();
  set_tensorflow_gpu_device_info(gpu_device_info_);

  // Whether and how the GPU device uses its own threadpool.
  // This option is experimental. Once we confirm the best setting, we
  // may change the default behavior and completely remove this flag.
  // Default values might change in future releases.
  // Possible values:
  //   * global: GPU uses threads shared with CPU in the main compute
  //     thread-pool. This is currently the default.
  //   * gpu_private: GPU uses threads dedicated to this device.
  //   * gpu_shared: All GPUs share a dedicated thread pool.
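  // For example (hypothetical invocation; the environment variables are the
  // ones read below):
  //   TF_GPU_THREAD_MODE=gpu_private TF_GPU_THREAD_COUNT=4 python train.py
  // gives this device a private pool of 4 threads.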
  string gpu_thread_mode;
  TF_RETURN_IF_ERROR(
      ReadStringFromEnvVar("TF_GPU_THREAD_MODE", "global", &gpu_thread_mode));
  gpu_thread_mode = str_util::Lowercase(gpu_thread_mode);
  if (gpu_thread_mode != "global") {
    int64 gpu_thread_count = -1;
    // Default to two threads: one for device compute and another for memory
    // copies.
    TF_RETURN_IF_ERROR(
        ReadInt64FromEnvVar("TF_GPU_THREAD_COUNT", 2, &gpu_thread_count));
    if (gpu_thread_mode == "gpu_private") {
      // TODO(zhengxq): since these threads only serve a single GPU device,
      // we should set the device context once for each thread, and avoid
      // setting them for each kernel.
      // TODO(zhengxq): pin the thread to the same socket of the target GPU.
      thread_pool_.reset(new thread::ThreadPool(
          options.env, strings::StrCat("gpu_private_", tf_gpu_id_.value()),
          static_cast<int32>(gpu_thread_count)));
      set_tensorflow_device_thread_pool(thread_pool_.get());
    } else if (gpu_thread_mode == "gpu_shared") {
      static thread::ThreadPool* thread_pool = new thread::ThreadPool(
          options.env, "gpu_shared", static_cast<int32>(gpu_thread_count));
      set_tensorflow_device_thread_pool(thread_pool);
    } else {
      string error_message =
          strings::StrCat("Invalid gpu_thread_mode: ", gpu_thread_mode);
      LOG(WARNING) << error_message;
      return errors::InvalidArgument(error_message);
    }
  }

  return Status::OK();
}

bool BaseGPUDevice::RequiresRecordingAccessedTensors() const {
  // When there is no more than one stream, we release the tensor reference
  // at the end of the kernel launch, instead of at the end of the kernel
  // execution.
  return streams_.size() > 1;
}

Status BaseGPUDevice::FillContextMap(const Graph* graph,
                                     DeviceContextMap* device_context_map) {
  VLOG(2) << "FillContextMap";

  const size_t num_streams = streams_.size();
  // Special case for single stream.
  if (num_streams == 1) {
    return Status::OK();
  }
  const int64 before = Env::Default()->NowMicros();
  gpu_stream_util::AssignStreamsOpts opts;
  opts.max_streams = static_cast<int32>(num_streams);
  std::unordered_map<int, int> node_to_stream_id;
  TF_RETURN_IF_ERROR(
      gpu_stream_util::AssignStreams(graph, opts, &node_to_stream_id));
  int64 elapsed = Env::Default()->NowMicros() - before;
  VLOG(3) << "AssignStreams took " << elapsed << "us";

  // Fill in the context map. It is OK for this map to contain
  // duplicate DeviceContexts so long as we increment the refcount.
  device_context_map->resize(graph->num_node_ids());
  for (Node* n : graph->nodes()) {
    auto mapped_stream = node_to_stream_id[n->id()];
    CHECK_LE(mapped_stream, num_streams);
    auto ctx = device_contexts_[mapped_stream];
    VLOG(3) << "Assigned stream " << node_to_stream_id[n->id()]
            << " ==> stream[" << ctx->stream_id() << "] for node id " << n->id()
            << " " << n->type_string() << " " << n->name();
    ctx->Ref();
    (*device_context_map)[n->id()] = ctx;
  }

  return Status::OK();
}

void BaseGPUDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) {
  // ScopedActivity is cheap when tracing is not active, but we can still
  // avoid computing the Hash64 in that case.
  // TODO(pbar) This would no longer be needed if Ops have a unique id.
  const uint64 id = port::Tracing::IsActive() ?
      Hash64(op_kernel->name()) : 0;
  port::Tracing::ScopedActivity region(port::Tracing::EventCategory::kCompute,
                                       id);

  // NOTE(tucker): We need to discriminate between Eigen GPU
  // operations and all others. If an operation is Eigen
  // implemented (or otherwise tries to launch a cuda kernel
  // directly), we need to establish a stacked-scoped environment
  // that directs it to execute on the proper device. Otherwise we
  // expect the Op to use StreamExecutor directly and correctly. The
  // way we make this discrimination is quite hacky: At the moment
  // the only non-Eigen GPU Op is the recv-op, which is known to be
  // asynchronous.
  if (op_kernel->is_internal() && op_kernel->type_string() == "_Recv") {
    context->SetStatus(errors::Internal(
        "Invalid synchronous 'Compute' on GPU for '_Recv' op"));
  } else if (port::Tracing::ScopedAnnotation::Enabled()) {
    port::Tracing::ScopedAnnotation annotation(op_kernel->name(),
                                               op_kernel->type_string());
    ComputeHelper(op_kernel, context);
  } else {
    ComputeHelper(op_kernel, context);
  }
}

void BaseGPUDevice::ComputeHelper(OpKernel* op_kernel,
                                  OpKernelContext* context) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (context->op_device_context() != nullptr) {
    gpu_device_context =
        static_cast<GPUDeviceContext*>(context->op_device_context());
  }
  gpu::Stream* stream = gpu_device_context->stream();
  const auto stream_id = gpu_device_context->stream_id();

  const bool vlog_1 = VLOG_IS_ON(1);
  const bool vlog_2 = vlog_1 && VLOG_IS_ON(2);

  if (vlog_1) {
    VLOG(1) << "GpuDevice::Compute " << op_kernel->name() << " op "
            << op_kernel->type_string() << " on GPU" << tf_gpu_id_
            << " stream[" << stream_id << "]";
  }

  const auto num_streams = streams_.size();
  if (num_streams > 1) {
    // If this op's device context is different from the other contexts,
    // we must wait on the stream.
    for (int i = 0; i < context->num_inputs(); ++i) {
      const GPUDeviceContext* idc =
          static_cast<GPUDeviceContext*>(context->input_device_context(i));
      OP_REQUIRES(context, idc != nullptr,
                  errors::Internal("Input device context ", i,
                                   " was not set properly."));
      if (vlog_2) {
        const void* base;
        size_t len;
        if (context->has_input(i)) {
          if (IsRefType(context->input_dtype(i))) {
            Tensor tensor = context->mutable_input(i, false);
            base = DMAHelper::base(&tensor);
            len = tensor.TotalBytes();
          } else {
            const Tensor& tensor = context->input(i);
            base = DMAHelper::base(&tensor);
            len = tensor.TotalBytes();
          }
          LOG(INFO) << "Input " << i << " " << base << " " << len;
          LOG(INFO) << " stream[" << stream_id << "].ThenWaitFor(stream["
                    << idc->stream_id() << "])"
                    << ((idc->stream() == stream) ? " not needed" : "");
        }
      }
      if (idc->stream() != stream) stream->ThenWaitFor(idc->stream());
    }
  }
  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
  op_kernel->Compute(context);
  if (context->status().ok()) {
    if (sync_every_op_) {
      // Note: GPUUtil::Sync() only syncs the default stream.
      // We need to either sync the stream used by this op, or
      // all streams. Given that this flag is typically used for
      // debugging, it makes more sense to sync all GPU activity.
      context->SetStatus(GPUUtil::SyncAll(this));
    }
  }
}

void BaseGPUDevice::ConsumeListOfAccessedTensors(
    DeviceContext* device_context, const TensorReferenceVector& tensor_refs) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (device_context != nullptr) {
    gpu_device_context = static_cast<GPUDeviceContext*>(device_context);
  }
  gpu::Stream* stream = gpu_device_context->stream();
  em_->ThenDeleteTensors(stream, tensor_refs);
}

// Based on the semantics of Device::Sync, this call should wait for
// all streams, not just the current one.
Status BaseGPUDevice::Sync() { return GPUUtil::SyncAll(this); }

void BaseGPUDevice::ComputeAsync(AsyncOpKernel* op_kernel,
                                 OpKernelContext* context,
                                 AsyncOpKernel::DoneCallback done) {
  GPUDeviceContext* gpu_device_context = device_contexts_[0];
  if (context->op_device_context() != nullptr) {
    gpu_device_context =
        static_cast<GPUDeviceContext*>(context->op_device_context());
  }
  gpu::Stream* stream = gpu_device_context->stream();
  const auto stream_id = gpu_device_context->stream_id();

  VLOG(1) << "GpuDevice::ComputeAsync " << op_kernel->name() << " op "
          << op_kernel->type_string() << " on GPU" << tf_gpu_id_ << " stream["
          << stream_id << "]";

  // When TraceMe profiling is off (which is the default), the
  // following TraceMe constructor is simply a conditional test of
  // a false value. Measurements show that its overhead is negligible.
  port::Tracing::TraceMe activity(op_kernel->name(), op_kernel->type_string(),
                                  op_kernel->IsExpensive());
  gpu::cuda::ScopedActivateExecutorContext scoped_activation{stream->parent()};
  op_kernel->ComputeAsync(context, done);
}

Status BaseGPUDevice::MaybeCopyTensorToGPU(
    const AllocatorAttributes& alloc_attrs, const Tensor& from, Tensor* to,
    StatusCallback done) {
  if (alloc_attrs.on_host()) {
    *to = from;
    done(Status::OK());
    return Status::OK();
  } else {
    if (!DMAHelper::CanUseDMA(&from)) {
      Status err = errors::Internal("GPU copy from non-DMA ",
                                    DataTypeString(from.dtype()), " tensor");
      done(err);
      return err;
    }
    auto* copy =
        new Tensor(GetAllocator(alloc_attrs), from.dtype(), from.shape());

    // If the tensor is not initialized, we likely ran out of memory.
    if (!copy->IsInitialized()) {
      delete copy;
      Status err = errors::ResourceExhausted(
          "OOM when allocating tensor of shape ", from.shape().DebugString(),
          " and type ", DataTypeString(from.dtype()));
      done(err);
      return err;
    }

    StatusCallback wrapped_done = std::bind(
        [to, copy](StatusCallback done_,
                   // Begin unbound arguments.
                   const Status& s) {
          *to = std::move(*copy);
          delete copy;
          done_(s);
        },
        std::move(done), std::placeholders::_1);

    port::Tracing::ScopedAnnotation annotation("MakeTensorFromProto");
    device_contexts_[0]->CopyCPUTensorToDevice(&from, this, copy,
                                               std::move(wrapped_done));
    return Status::OK();
  }
}

Status BaseGPUDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                          const AllocatorAttributes alloc_attrs,
                                          Tensor* tensor) {
  AllocatorAttributes attr;
  attr.set_on_host(true);
  attr.set_gpu_compatible(true);
  Allocator* host_alloc = GetAllocator(attr);
  Tensor parsed(tensor_proto.dtype());
  if (!parsed.FromProto(host_alloc, tensor_proto)) {
    return errors::InvalidArgument("Cannot parse tensor from proto: ",
                                   tensor_proto.DebugString());
  }

  if (parsed.dtype() == DT_VARIANT) {
    const Variant* from = parsed.flat<Variant>().data();
    Tensor copy(cpu_allocator(), DT_VARIANT, parsed.shape());
    Variant* copy_variant = copy.flat<Variant>().data();

    std::list<Notification> notifications;
    Status copy_status;
    auto copier = [this, &alloc_attrs, &notifications, &copy_status](
                      const Tensor& from, Tensor* to) {
      // Copier isn't run in a multithreaded environment, so we don't
      // have to worry about the notifications list being modified in parallel.
      notifications.emplace_back();
      Notification& n = *notifications.rbegin();
      return MaybeCopyTensorToGPU(alloc_attrs, from, to,
                                  [&n, &copy_status](const Status& s) {
                                    if (copy_status.ok()) {
                                      copy_status.Update(s);
                                    }
                                    n.Notify();
                                  });
    };
    Status s;
    for (int64 ix = 0; ix < parsed.NumElements(); ++ix) {
      s = VariantDeviceCopy(VariantDeviceCopyDirection::HOST_TO_DEVICE,
                            from[ix], &copy_variant[ix], copier);
      if (!s.ok()) {
        break;
      }
    }
    for (auto& n : notifications) {
      n.WaitForNotification();
    }
    if (!s.ok()) {
      return s;
    }
    *tensor = std::move(copy);
    return copy_status;
  } else {
    Notification n;
    Status status;
    TF_RETURN_IF_ERROR(MaybeCopyTensorToGPU(alloc_attrs, parsed, tensor,
                                            [&n, &status](const Status& s) {
                                              status = s;
                                              n.Notify();
                                            }));
    n.WaitForNotification();
    return status;
  }
}

namespace {
class ConcretePerOpGpuDevice : public PerOpGpuDevice {
 public:
  ConcretePerOpGpuDevice() : device_(&stream_device_) {}

  void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
                    TfGpuId tf_gpu_id, Allocator* base_allocator,
                    char* scratch) {
    stream_device_.Reinitialize(context, cuda_stream, tf_gpu_id,
                                base_allocator, scratch);
  }

  const Eigen::GpuDevice& device() const override { return device_; }

 private:
  EigenCudaStreamDevice stream_device_;
  Eigen::GpuDevice device_;
};

// Parse 'visible_device_list' into a list of CUDA GPU ids.
Status ParseVisibleDeviceList(const string& visible_device_list,
                              std::vector<CudaGpuId>* visible_gpu_order) {
  visible_gpu_order->clear();
  gpu::Platform* gpu_manager = GPUMachineManager();

  // If the user wants to remap the visible to virtual GPU mapping,
  // check for that here.
  if (visible_device_list.empty()) {
    visible_gpu_order->resize(gpu_manager->VisibleDeviceCount());
    // By default, visible to virtual mapping is unchanged.
    int deviceNo = 0;
    std::generate(visible_gpu_order->begin(), visible_gpu_order->end(),
                  [&deviceNo] { return deviceNo++; });
  } else {
    const std::vector<string> order_str =
        str_util::Split(visible_device_list, ',');
    for (const string& cuda_gpu_id_str : order_str) {
      int32 cuda_gpu_id;
      if (!strings::safe_strto32(cuda_gpu_id_str, &cuda_gpu_id)) {
        return errors::InvalidArgument(
            "Could not parse entry in 'visible_device_list': '",
            cuda_gpu_id_str, "'. visible_device_list = ", visible_device_list);
      }
      if (cuda_gpu_id < 0 || cuda_gpu_id >= gpu_manager->VisibleDeviceCount()) {
        return errors::InvalidArgument(
            "'visible_device_list' listed an invalid GPU id '", cuda_gpu_id,
            "' but visible device count is ",
            gpu_manager->VisibleDeviceCount());
      }
      visible_gpu_order->push_back(CudaGpuId(cuda_gpu_id));
    }
  }

  // Validate no repeats.
  std::set<CudaGpuId> visible_device_set(visible_gpu_order->begin(),
                                         visible_gpu_order->end());
  if (visible_device_set.size() != visible_gpu_order->size()) {
    return errors::InvalidArgument(
        "visible_device_list contained a duplicate entry: ",
        visible_device_list);
  }
  return Status::OK();
}

Status VerifyVirtualDeviceSettings(
    const size_t num_gpus_to_use, const GPUOptions& gpu_options,
    const std::vector<CudaGpuId>& visible_gpu_order,
    const std::vector<CudaGpuId>& valid_cuda_gpu_ids) {
  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
  CHECK(!virtual_devices.empty());
  if (gpu_options.per_process_gpu_memory_fraction() > 0) {
    return errors::InvalidArgument(
        "It's invalid to set per_process_gpu_memory_fraction when "
        "virtual_devices is set.");
  }
  if (num_gpus_to_use < virtual_devices.size()) {
    return errors::Unknown(
        "Not enough GPUs to create virtual devices."
        " num_gpus_to_use: ",
        num_gpus_to_use, " #virtual_devices: ", virtual_devices.size());
  }
  if (!gpu_options.visible_device_list().empty() &&
      visible_gpu_order.size() != virtual_devices.size()) {
    return errors::InvalidArgument(
        "The number of GPUs in visible_device_list doesn't match the number "
        "of elements in the virtual_devices list.",
        " #GPUs in visible_device_list: ", visible_gpu_order.size(),
        " virtual_devices.size(): ", virtual_devices.size());
  }
  if (valid_cuda_gpu_ids.size() != virtual_devices.size()) {
    return errors::Unknown(
        "The number of valid GPUs doesn't match the number of elements in "
        "the virtual_devices list.",
        " #valid GPUs: ", valid_cuda_gpu_ids.size(),
        " virtual_devices.size(): ", virtual_devices.size());
  }
  return Status::OK();
}

int64 MinSystemMemory(int64 available_memory) {
  // We use the following heuristic for now:
  //
  // If the available_memory is < 2GiB, we allocate 225MiB to system memory.
  // Otherwise, allocate max(300MiB, 0.05 * available_memory) to system memory.
  //
  // In the future we could be more sophisticated by using a table of devices.
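  //
  // For example, a hypothetical card with 12 GiB of available memory would
  // reserve max(300 MiB, 0.05 * 12 GiB) ~= 614 MiB for the system.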
  int64 min_system_memory;
  if (available_memory < (1LL << 31)) {
    // 225MiB
    min_system_memory = 225 * 1024 * 1024;
  } else {
    // max(300 MiB, 0.05 * available_memory)
    min_system_memory =
        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
  }
#if defined(__GNUC__) && defined(__OPTIMIZE__)
  // Do nothing
#elif !defined(__GNUC__) && defined(NDEBUG)
  // Do nothing
#else
  // Double the minimum reserved system memory in non-opt builds (debug
  // builds on Windows), because non-opt builds need more system memory.
  min_system_memory *= 2;
#endif

#if defined(ANDROID_TEGRA)
  // 1GB system mem for NVIDIA Tegra devices since they use the same mem for
  // RAM and Video RAM.
  min_system_memory = 1 << 30;
#endif
  return min_system_memory;
}

// Get the memory limit for the virtual device being created on GPU with
// 'cuda_gpu_id', when that virtual device is the only virtual device being
// created on that GPU.
Status SingleVirtualDeviceMemoryLimit(const GPUOptions& gpu_options,
                                      CudaGpuId cuda_gpu_id,
                                      int64* memory_limit) {
  int64 total_memory = 0;
  int64 available_memory = 0;
  gpu::StreamExecutor* se =
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
  if (!se->DeviceMemoryUsage(&available_memory, &total_memory)) {
    return errors::Unknown("Failed to query available memory for GPU ",
                           cuda_gpu_id.value());
  }

  int64 allocated_memory = 0;
  const double per_process_gpu_memory_fraction =
      gpu_options.per_process_gpu_memory_fraction();
  if (per_process_gpu_memory_fraction == 0) {
    allocated_memory = available_memory;
    const int64 min_system_memory = MinSystemMemory(available_memory);
    if (min_system_memory < allocated_memory) {
      allocated_memory -= min_system_memory;
    }
  } else {
    allocated_memory = total_memory * per_process_gpu_memory_fraction;
  }
  *memory_limit = allocated_memory;
  return Status::OK();
}
}  // namespace

void BaseGPUDevice::ReinitializeDevice(OpKernelContext* context,
                                       PerOpGpuDevice* device, int stream_id,
                                       Allocator* allocator) {
  ConcretePerOpGpuDevice* concrete_device =
      static_cast<ConcretePerOpGpuDevice*>(device);
  DCHECK(concrete_device);
  const cudaStream_t* cuda_stream = reinterpret_cast<const cudaStream_t*>(
      streams_[stream_id]->compute->implementation()->CudaStreamMemberHack());
  concrete_device->Reinitialize(context, cuda_stream, tf_gpu_id_, allocator,
                                scratch_[stream_id]);
}

PerOpGpuDevice* BaseGPUDevice::MakeGpuDevice() {
  return new ConcretePerOpGpuDevice();
}

void BaseGPUDevice::ReinitializeGpuDevice(OpKernelContext* context,
                                          PerOpGpuDevice* device,
                                          DeviceContext* dc,
                                          Allocator* allocator) {
  if (dc) {
    const GPUDeviceContext* gpu_dc = static_cast<GPUDeviceContext*>(dc);
    const int stream_id = gpu_dc->stream_id();
    VLOG(1) << " eigen_gpu_device(" << dc << ") => stream[" << stream_id
            << "]";
    CHECK_LT(stream_id, streams_.size());
    ReinitializeDevice(context, device, stream_id, allocator);
  } else {
    ReinitializeDevice(context, device, 0, allocator);
  }
}

const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;

Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
                                           const string& name_prefix,
                                           std::vector<Device*>* devices) {
  TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
  gpu::Platform* gpu_manager = GPUMachineManager();
  if (gpu_manager == nullptr) {
    return Status::OK();
  }
  // If there are no GPUs visible, do nothing.
  if (gpu_manager->VisibleDeviceCount() <= 0) {
    return Status::OK();
  }

  size_t num_gpus_to_use = INT_MAX;
  auto iter = options.config.device_count().find("GPU");
  if (iter != options.config.device_count().end()) {
    num_gpus_to_use = iter->second;
  }
  const auto& gpu_options = options.config.gpu_options();
  std::vector<CudaGpuId> visible_gpu_order;
  TF_RETURN_IF_ERROR(ParseVisibleDeviceList(gpu_options.visible_device_list(),
                                            &visible_gpu_order));

  std::vector<CudaGpuId> valid_cuda_gpu_ids;
  TF_RETURN_IF_ERROR(GetValidDeviceIds(visible_gpu_order, &valid_cuda_gpu_ids));
  if (num_gpus_to_use > valid_cuda_gpu_ids.size()) {
    num_gpus_to_use = valid_cuda_gpu_ids.size();
  }
  if (!valid_cuda_gpu_ids.empty()) {
    // Save the original device.
    int original_device = 0;
    cudaError_t err = cudaGetDevice(&original_device);
    if (err != cudaSuccess) {
      return errors::Internal("cudaGetDevice() failed. Status: ",
                              cudaGetErrorString(err));
    }
    // Force the CUDA runtime to be initialized on each valid GPU before
    // CreateGPUDevice().
    for (CudaGpuId cuda_gpu_id : valid_cuda_gpu_ids) {
      err = cudaSetDevice(cuda_gpu_id.value());
      if (err != cudaSuccess) {
        return errors::Internal("cudaSetDevice() on GPU:", cuda_gpu_id.value(),
                                " failed. Status: ", cudaGetErrorString(err));
      }
      err = cudaFree(nullptr);
      if (err != cudaSuccess) {
        return errors::Internal(
            "CUDA runtime implicit initialization on GPU:",
            cuda_gpu_id.value(),
            " failed. Status: ", cudaGetErrorString(err));
      }
    }
    // Reset to the original device.
    err = cudaSetDevice(original_device);
    if (err != cudaSuccess) {
      return errors::Internal("cudaSetDevice() on GPU:", original_device,
                              " failed. Status: ", cudaGetErrorString(err));
    }
  }

  std::vector<InterconnectMap> interconnect_maps;
  TF_RETURN_IF_ERROR(
      GetInterconnectMaps(visible_gpu_order, gpu_manager, &interconnect_maps));

  // Print each interconnect map to the log.
  for (const InterconnectMap& im : interconnect_maps) {
    LOG(INFO) << "Device interconnect " << im.name << " with strength "
              << im.strength << " edge matrix:";
    string line_buf = " ";
    for (int i = 0; i < visible_gpu_order.size(); ++i) {
      strings::StrAppend(&line_buf, visible_gpu_order[i].value(), " ");
    }
    LOG(INFO) << line_buf;
    for (int i = 0; i < visible_gpu_order.size(); ++i) {
      line_buf = strings::StrCat(visible_gpu_order[i].value(), ": ");
      CudaGpuId cuda_id_i = visible_gpu_order[i];
      for (int j = 0; j < visible_gpu_order.size(); ++j) {
        CudaGpuId cuda_id_j = visible_gpu_order[j];
        if (im.directed_links.find({cuda_id_i, cuda_id_j}) !=
            im.directed_links.end()) {
          line_buf.append("Y ");
        } else {
          line_buf.append("N ");
        }
      }
      LOG(INFO) << line_buf;
    }
  }

  const auto& virtual_devices = gpu_options.experimental().virtual_devices();
  if (!virtual_devices.empty()) {
    TF_RETURN_IF_ERROR(VerifyVirtualDeviceSettings(
        num_gpus_to_use, gpu_options, visible_gpu_order, valid_cuda_gpu_ids));
    // We've verified that num_gpus_to_use >= virtual_devices.size().
    num_gpus_to_use = virtual_devices.size();
    CHECK(gpu_options.visible_device_list().empty() ||
          valid_cuda_gpu_ids == visible_gpu_order);
  }
  int next_tf_gpu_id = 0;
  std::vector<int64> memory_limit_bytes;
  for (int i = 0; i < num_gpus_to_use; ++i) {
    const CudaGpuId cuda_gpu_id = valid_cuda_gpu_ids[i];
    if (virtual_devices.empty() ||
        virtual_devices.Get(i).memory_limit_mb_size() == 0) {
      int64 single_virtual_device_memory_limit = 0;
      TF_RETURN_IF_ERROR(SingleVirtualDeviceMemoryLimit(
          gpu_options, cuda_gpu_id, &single_virtual_device_memory_limit));
      memory_limit_bytes.push_back(single_virtual_device_memory_limit);
    } else {
      const auto& memory_limit_mb = virtual_devices.Get(i).memory_limit_mb();
      std::transform(memory_limit_mb.begin(), memory_limit_mb.end(),
                     std::back_inserter(memory_limit_bytes), [](float mb) {
                       return static_cast<int64>(mb) * (1ll << 20);
                     });
    }
    while (next_tf_gpu_id < memory_limit_bytes.size()) {
      TfGpuId tf_gpu_id(next_tf_gpu_id);
      ++next_tf_gpu_id;
      GpuIdManager::InsertTfCudaGpuIdPair(tf_gpu_id, cuda_gpu_id);
    }
  }
  const int num_tf_gpus = next_tf_gpu_id;

  LocalityMap device_localities;
  TF_RETURN_IF_ERROR(
      GetDeviceLocalities(num_tf_gpus, interconnect_maps, &device_localities));

  // Build the GPUDevices.
  CHECK_EQ(next_tf_gpu_id, memory_limit_bytes.size());
  for (int di = 0; di < num_tf_gpus; ++di) {
    TfGpuId tf_gpu_id(di);
    int64 bytes = memory_limit_bytes[di];
    auto it = device_localities.find(tf_gpu_id);
    if (it == device_localities.end()) {
      return errors::Internal("Failed to find DeviceLocality for GPU device ",
                              tf_gpu_id.value());
    }
    TF_RETURN_IF_ERROR(CreateGPUDevice(options, name_prefix, tf_gpu_id, bytes,
                                       it->second, devices));
  }
  return Status::OK();
}

static string GetShortDeviceDescription(CudaGpuId cuda_gpu_id,
                                        const gpu::DeviceDescription& desc) {
  int cc_major;
  int cc_minor;
  if (!desc.cuda_compute_capability(&cc_major, &cc_minor)) {
    cc_major = 0;
    cc_minor = 0;
  }
  // LINT.IfChange
  return strings::StrCat("device: ", cuda_gpu_id.value(),
                         ", name: ", desc.name(),
                         ", pci bus id: ", desc.pci_bus_id(),
                         ", compute capability: ", cc_major, ".", cc_minor);
  // LINT.ThenChange(//tensorflow/python/platform/test.py)
}

Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
                                             const string& name_prefix,
                                             TfGpuId tf_gpu_id,
                                             int64 memory_limit,
                                             const DeviceLocality& dev_locality,
                                             std::vector<Device*>* devices) {
  CHECK_GE(tf_gpu_id.value(), 0);
  const string device_name =
      strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
  GpuIdUtil::CheckValidTfGpuId(tf_gpu_id);
  CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
  int numa_node = dev_locality.numa_node();
  Bytes allocated_bytes = static_cast<Bytes>(memory_limit);

  gpu::StreamExecutor* se =
      GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
  const gpu::DeviceDescription& desc = se->GetDeviceDescription();
  LOG(INFO) << "Creating TensorFlow device (" << device_name << " with "
            << (memory_limit >> 20) << " MB memory) -> physical GPU ("
            << GetShortDeviceDescription(cuda_gpu_id, desc) << ")";
  ProcessState* process_state = ProcessState::singleton();
  BaseGPUDevice* gpu_device = CreateGPUDevice(
      options, device_name, allocated_bytes, dev_locality,
      tf_gpu_id, GetShortDeviceDescription(cuda_gpu_id, desc),
      process_state->GetGPUAllocator(options.config.gpu_options(), tf_gpu_id,
                                     memory_limit),
      process_state->GetCPUAllocator(numa_node));
  TF_RETURN_IF_ERROR(gpu_device->Init(options));
  devices->push_back(gpu_device);

  return Status::OK();
}

namespace {
std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>>
GetPeerAccessMap(gpu::Platform* platform,
                 const std::vector<CudaGpuId>& visible_gpu_order) {
  std::unique_ptr<std::map<std::pair<CudaGpuId, CudaGpuId>, bool>> map(
      new std::map<std::pair<CudaGpuId, CudaGpuId>, bool>);
  for (CudaGpuId cuda_gpu_i : visible_gpu_order) {
    for (CudaGpuId cuda_gpu_j : visible_gpu_order) {
      gpu::StreamExecutor* from =
          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie();
      gpu::StreamExecutor* to =
          GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie();
      (*map)[{cuda_gpu_i, cuda_gpu_j}] = from->CanEnablePeerAccessTo(to);
    }
  }

  return map;
}

}  // namespace

Status BaseGPUDeviceFactory::GetInterconnectMaps(
    const std::vector<CudaGpuId>& visible_gpu_order,
    gpu::Platform* gpu_manager, std::vector<InterconnectMap>* maps) {
  // The default interconnect map is obtained from the StreamExecutor.
  auto access_map = GetPeerAccessMap(gpu_manager, visible_gpu_order);
  maps->resize(1);
  InterconnectMap& imap = maps->at(0);
  imap.name = "StreamExecutor";
  imap.strength = InterconnectMap::kStreamExecutorStrength;
  for (CudaGpuId cuda_id_i : visible_gpu_order) {
    for (CudaGpuId cuda_id_j : visible_gpu_order) {
      if (cuda_id_i == cuda_id_j) continue;
      if ((*access_map)[{cuda_id_i, cuda_id_j}]) {
        imap.directed_links.insert({cuda_id_i, cuda_id_j});
      }
    }
  }
  return Status::OK();
}

Status BaseGPUDeviceFactory::GetDeviceLocalities(
    int num_tf_gpus, const std::vector<InterconnectMap>& interconnects,
    LocalityMap* localities) {
  std::vector<TfGpuId> all_tf_gpu_ids;
  for (int i = 0; i < num_tf_gpus; ++i) {
    all_tf_gpu_ids.push_back(TfGpuId(i));
  }
  for (TfGpuId tf_gpu_id : all_tf_gpu_ids) {
    CudaGpuId cuda_gpu_id = GpuIdManager::TfToCudaGpuId(tf_gpu_id);
    // Get GPU bus_id from its reported NUMA affinity. Because GPUs are
    // virtualized in some environments, we can't just use the GPU id.
    // NUMA locales are indexed from 0, buses are indexed from 1.
    gpu::StreamExecutor* se =
        GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    int numa_node = desc.numa_node();
    if (numa_node < 0) {
      // For some reason the StreamExecutor couldn't get the NUMA
      // affinity of the GPU. If this is not a multi-socket mobo with
      // GPUs local to different buses, it doesn't matter. If it is, we
      // may run into trouble later with data transfer operations. The
      // trouble may manifest as slower than expected performance, or
      // outright failures.
      LOG(INFO) << "Could not identify NUMA node of CUDA gpu id "
                << cuda_gpu_id
                << ", defaulting to 0. Your kernel may not have been built "
                << "with NUMA support.";
      numa_node = 0;
    }
    DeviceLocality dev_locality;
    dev_locality.set_numa_node(numa_node);
    dev_locality.set_bus_id(numa_node + 1);

    // Set LocalLinks from InterconnectMaps.
    LocalLinks* links = dev_locality.mutable_links();
    for (const InterconnectMap& imap : interconnects) {
      for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
        CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
        if (imap.directed_links.find({cuda_gpu_id, cuda_gpu_dst}) !=
            imap.directed_links.end()) {
          InterconnectLink* ilink = links->add_link();
          ilink->set_device_id(tf_gpu_dst.value());
          ilink->set_type(imap.name);
          ilink->set_strength(imap.strength);
        }
      }
    }

    // If this is one of multiple virtual GPUs on the same physical GPU,
    // add high-strength links to the others.
    for (TfGpuId tf_gpu_dst : all_tf_gpu_ids) {
      if (tf_gpu_id == tf_gpu_dst) continue;
      CudaGpuId cuda_gpu_dst = GpuIdManager::TfToCudaGpuId(tf_gpu_dst);
      if (cuda_gpu_id == cuda_gpu_dst) {
        InterconnectLink* ilink = links->add_link();
        ilink->set_device_id(tf_gpu_dst.value());
        ilink->set_type("SAME_DEVICE");
        ilink->set_strength(InterconnectMap::kSameDeviceStrength);
      }
    }

    (*localities)[tf_gpu_id] = dev_locality;
    VLOG(1) << "GPUDevice CudaGpuId " << cuda_gpu_id << " TfGpuId "
            << tf_gpu_id << " on bus " << dev_locality.bus_id()
            << " numa: " << numa_node << " pci: " << desc.pci_bus_id()
            << " DeviceLocality: " << dev_locality.DebugString();
  }
  return Status::OK();
}

static int GetDefaultMinGPUMultiprocessorCount(
    gpu::Platform* gpu_manager,
    const std::vector<CudaGpuId>& visible_gpu_order) {
  static const int kDefaultMinGPUMultiprocessorCount = 8;

  // Find the highest multi-processor count across all visible GPUs.
  int max_count = -1;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    auto exec_status =
        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_order[i]);
    if (!exec_status.ok()) {
      continue;
    }

    gpu::StreamExecutor* se = exec_status.ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    max_count = std::max(max_count, desc.core_count());
  }

  if (max_count < 0 || kDefaultMinGPUMultiprocessorCount < max_count) {
    return kDefaultMinGPUMultiprocessorCount;
  } else {
    return max_count;
  }
}

static int GetMinGPUMultiprocessorCount(
    gpu::Platform* gpu_manager,
    const std::vector<CudaGpuId>& visible_gpu_order) {
  const char* tf_min_gpu_core_count = getenv("TF_MIN_GPU_MULTIPROCESSOR_COUNT");

  if (tf_min_gpu_core_count == nullptr ||
      strcmp(tf_min_gpu_core_count, "") == 0) {
    return GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
  }

  int min_gpu_core_count = -1;
  if (strings::safe_strto32(tf_min_gpu_core_count, &min_gpu_core_count)) {
    if (min_gpu_core_count >= 0) {
      return min_gpu_core_count;
    }
  }

  int count =
      GetDefaultMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);
  LOG(ERROR) << "Invalid minimum GPU multiprocessor count: ["
             << tf_min_gpu_core_count << "]. "
" 1194 << "Using the default value: " << count; 1195 return count; 1196 } 1197 1198 namespace { 1199 1200 struct CudaVersion { 1201 // Initialize from version_name in the form of "3.5" 1202 explicit CudaVersion(const std::string& version_name) { 1203 size_t dot_pos = version_name.find('.'); 1204 CHECK(dot_pos != string::npos) 1205 << "Illegal version name: [" << version_name << "]"; 1206 string major_str = version_name.substr(0, dot_pos); 1207 CHECK(strings::safe_strto32(major_str, &major_part)) 1208 << "Illegal version name: [" << version_name << "]"; 1209 string minor_str = version_name.substr(dot_pos + 1); 1210 CHECK(strings::safe_strto32(minor_str, &minor_part)) 1211 << "Illegal version name: [" << version_name << "]"; 1212 } 1213 CudaVersion() {} 1214 bool operator<(const CudaVersion& other) const { 1215 if (this->major_part != other.major_part) { 1216 return this->major_part < other.major_part; 1217 } 1218 return this->minor_part < other.minor_part; 1219 } 1220 friend std::ostream& operator<<(std::ostream& os, 1221 const CudaVersion& version) { 1222 os << version.major_part << "." << version.minor_part; 1223 return os; 1224 } 1225 int major_part = -1; 1226 int minor_part = -1; 1227 }; 1228 1229 std::vector<CudaVersion> supported_cuda_compute_capabilities = { 1230 TF_CUDA_CAPABILITIES,}; 1231 1232 std::vector<CudaVersion> GetSupportedCudaComputeCapabilities() { 1233 auto cuda_caps = supported_cuda_compute_capabilities; 1234 #ifdef TF_EXTRA_CUDA_CAPABILITIES 1235 // TF_EXTRA_CUDA_CAPABILITIES should be defined a sequence separated by commas, 1236 // for example: 1237 // TF_EXTRA_CUDA_CAPABILITIES=3.0,4.0,5.0 1238 // Use two-level macro expansion for stringification. 1239 #define TF_XSTRING(...) #__VA_ARGS__ 1240 #define TF_STRING(s) TF_XSTRING(s) 1241 string extra_cuda_caps = TF_STRING(TF_EXTRA_CUDA_CAPABILITIES); 1242 #undef TF_STRING 1243 #undef TF_XSTRING 1244 auto extra_capabilities = str_util::Split(extra_cuda_caps, ','); 1245 for (const auto& capability : extra_capabilities) { 1246 cuda_caps.push_back(CudaVersion(capability)); 1247 } 1248 #endif 1249 return cuda_caps; 1250 } 1251 1252 Status EnablePeerAccess(gpu::Platform* platform, 1253 const std::vector<CudaGpuId>& visible_gpu_order) { 1254 int possible_peer_count = 0; 1255 int enabled_peer_count = 0; 1256 for (int i = 0; i < visible_gpu_order.size(); ++i) { 1257 const CudaGpuId cuda_gpu_i = visible_gpu_order[i]; 1258 for (int j = 0; j < visible_gpu_order.size(); ++j) { 1259 const CudaGpuId cuda_gpu_j = visible_gpu_order[j]; 1260 // We have already validated that ExecutorForDevice() calls return OK. 1261 gpu::StreamExecutor* from = 1262 GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_i).ValueOrDie(); 1263 gpu::StreamExecutor* to = 1264 GpuIdUtil::ExecutorForCudaGpuId(platform, cuda_gpu_j).ValueOrDie(); 1265 1266 if (from->CanEnablePeerAccessTo(to)) { 1267 ++possible_peer_count; 1268 auto status = from->EnablePeerAccessTo(to); 1269 if (!status.ok()) { 1270 LOG(WARNING) 1271 << "Unable to enable peer access between device ordinals " 1272 << cuda_gpu_i << " and " << cuda_gpu_j << ", status: " << status; 1273 } else { 1274 ++enabled_peer_count; 1275 } 1276 } 1277 } 1278 } 1279 1280 // Return an error in the extreme failure case where the driver 1281 // reported that peering was possible but not a single peering was 1282 // successful. This is to catch possible system misconfigurations 1283 // or more fundamental issues. 
  if (possible_peer_count > 0 && enabled_peer_count == 0) {
    return errors::Internal(possible_peer_count,
                            " potential peer access pairs were reported by "
                            "the driver, but no peering could be enabled.");
  }
  return Status::OK();
}

}  // namespace

Status BaseGPUDeviceFactory::GetValidDeviceIds(
    const std::vector<CudaGpuId>& visible_gpu_order,
    std::vector<CudaGpuId>* ids) {
  gpu::Platform* gpu_manager = GPUMachineManager();
  bool new_gpu_found = false;
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId cuda_gpu_id = visible_gpu_order[i];

    // Only perform this once per visible cuda gpu id.
    if (visible_gpu_initialized_[cuda_gpu_id.value()]) {
      continue;
    }

    visible_gpu_initialized_[cuda_gpu_id.value()] = true;
    new_gpu_found = true;

    auto executor = GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, cuda_gpu_id);
    if (!executor.ok()) {
      return StreamExecutorUtil::ConvertStatus(executor.status());
    }

    auto stream_exec = executor.ValueOrDie();
    int64 free_bytes;
    int64 total_bytes;
    if (!stream_exec->DeviceMemoryUsage(&free_bytes, &total_bytes)) {
      // Logs internally on failure.
      free_bytes = 0;
      total_bytes = 0;
    }
    const auto& description = stream_exec->GetDeviceDescription();
    int cc_major;
    int cc_minor;
    if (!description.cuda_compute_capability(&cc_major, &cc_minor)) {
      // Logs internally on failure.
      cc_major = 0;
      cc_minor = 0;
    }
    LOG(INFO) << "Found device " << i << " with properties: "
              << "\nname: " << description.name() << " major: " << cc_major
              << " minor: " << cc_minor
              << " memoryClockRate(GHz): " << description.clock_rate_ghz()
              << "\npciBusID: " << description.pci_bus_id()
              << "\ntotalMemory: "
              << strings::HumanReadableNumBytes(total_bytes)
              << " freeMemory: " << strings::HumanReadableNumBytes(free_bytes);
  }
  // Check peering and show the matrix if more than one GPU was found.
  if (new_gpu_found && visible_gpu_order.size() > 1) {
    // Enable peer access.
    TF_RETURN_IF_ERROR(EnablePeerAccess(gpu_manager, visible_gpu_order));
  }

  auto cuda_supported_capabilities = GetSupportedCudaComputeCapabilities();
  if (cuda_supported_capabilities.empty()) {
    return errors::FailedPrecondition(
        "No supported cuda capabilities in binary.");
  }
  CudaVersion min_supported_capability = *std::min_element(
      cuda_supported_capabilities.begin(), cuda_supported_capabilities.end());

  int min_gpu_core_count =
      GetMinGPUMultiprocessorCount(gpu_manager, visible_gpu_order);

  // Filter out devices that don't have the right capability or power.
  for (int i = 0; i < visible_gpu_order.size(); ++i) {
    const CudaGpuId visible_gpu_id = visible_gpu_order[i];
    auto exec_status =
        GpuIdUtil::ExecutorForCudaGpuId(gpu_manager, visible_gpu_id);
    if (!exec_status.ok()) {
      LOG(INFO) << "Ignoring visible gpu device " << visible_gpu_id
                << " whose executor is in invalid state: "
                << exec_status.status().ToString();
      continue;
    }
    gpu::StreamExecutor* se = exec_status.ValueOrDie();
    const gpu::DeviceDescription& desc = se->GetDeviceDescription();
    CudaVersion device_capability;
    if (!desc.cuda_compute_capability(&device_capability.major_part,
                                      &device_capability.minor_part)) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "whose CUDA compute capability is not available.";
      continue;
    }
    // Only GPUs with at least the minimum supported compute capability are
    // accepted.
    if (device_capability < min_supported_capability) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "with Cuda compute capability " << device_capability
                << ". The minimum required Cuda capability is "
                << min_supported_capability << ".";
      continue;
    }

    // Filter out slow GPUs. By default, GPUs with a lower multiprocessor
    // count than the fastest GPU are filtered out, unless they have 8 or more
    // multiprocessors. If the TF_MIN_GPU_MULTIPROCESSOR_COUNT environment
    // variable is set, its value will be used to filter out GPUs.
    if (desc.core_count() < min_gpu_core_count) {
      LOG(INFO) << "Ignoring visible gpu device "
                << "(" << GetShortDeviceDescription(visible_gpu_id, desc)
                << ") "
                << "with Cuda multiprocessor count: " << desc.core_count()
                << ". The minimum required count is " << min_gpu_core_count
                << ". You can adjust this requirement with the env var "
                   "TF_MIN_GPU_MULTIPROCESSOR_COUNT.";
      continue;
    }
    ids->push_back(visible_gpu_id);
  }
  if (!ids->empty()) {
    std::vector<int> raw_ids(ids->size());
    std::transform(ids->begin(), ids->end(), raw_ids.begin(),
                   [](CudaGpuId id) -> int { return id.value(); });
    LOG(INFO) << "Adding visible gpu devices: "
              << str_util::Join(raw_ids, ", ");
  }

  return Status::OK();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA