1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/common_runtime/gpu/gpu_util.h" 17 18 #include "tensorflow/core/common_runtime/copy_tensor.h" 19 #include "tensorflow/core/common_runtime/device.h" 20 #include "tensorflow/core/common_runtime/dma_helper.h" 21 #include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h" 22 #include "tensorflow/core/common_runtime/gpu/process_state.h" 23 #include "tensorflow/core/common_runtime/gpu_device_context.h" 24 #include "tensorflow/core/framework/tensor.h" 25 #include "tensorflow/core/framework/tensor.pb.h" 26 #include "tensorflow/core/framework/tensor_reference.h" 27 #include "tensorflow/core/framework/types.h" 28 #include "tensorflow/core/lib/core/errors.h" 29 #include "tensorflow/core/lib/core/refcount.h" 30 #include "tensorflow/core/lib/gtl/array_slice.h" 31 #include "tensorflow/core/lib/gtl/stl_util.h" 32 #include "tensorflow/core/lib/hash/hash.h" 33 #include "tensorflow/core/lib/strings/strcat.h" 34 #include "tensorflow/core/lib/strings/stringprintf.h" 35 #include "tensorflow/core/platform/logging.h" 36 #include "tensorflow/core/platform/stream_executor.h" 37 #include "tensorflow/core/platform/tensor_coding.h" 38 #include "tensorflow/core/platform/tracing.h" 39 #include "tensorflow/core/util/util.h" 40 41 // IMPLEMENTATION NOTE: 42 // 43 // 1. Within this module, we intentionally LOG(FATAL) if any stream 44 // involved in memcpy becomes !stream->ok(), because TF process 45 // today (1/2016) can not properly recover from such an error. 46 // 47 // 2. When 0-size tensor is being copied, we should not schedule a 48 // copy ThenMemcpy since there is no byte to move. However, we must 49 // ensure the causal ordering by arranging the copy done callback 50 // happens-after all activities scheduled on the given stream being 51 // finished. 52 53 // If this need to be runtime configurable, consider adding options to 54 // ConfigProto. 55 const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128; 56 extern bool FLAGS_brain_gpu_record_mem_types; 57 58 using perftools::gputools::DeviceMemoryBase; 59 using perftools::gputools::Stream; 60 61 namespace tensorflow { 62 63 namespace gpu = ::perftools::gputools; 64 65 Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src, 66 const Tensor* dst, 67 const DeviceBase::GpuDeviceInfo** dev_info, 68 gpu::Stream** stream) { 69 if (device == nullptr) { 70 return errors::Internal("Unexpected null device."); 71 } 72 auto di = device->tensorflow_gpu_device_info(); 73 if (di == nullptr) { 74 return errors::Internal("Unexpected null device info."); 75 } 76 *dev_info = di; 77 if (ctx == nullptr) { 78 return errors::Internal("Unexpected null device context."); 79 } 80 auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream(); 81 if (gs == nullptr) { 82 return errors::Internal("No gpu stream is available."); 83 } 84 *stream = gs; 85 if (dst != nullptr) { 86 if (src.dtype() != dst->dtype()) { 87 return errors::Internal("Can't copy a tensor of ", 88 DataTypeString(src.dtype()), " into a tensor of ", 89 DataTypeString(dst->dtype())); 90 } 91 if (src.TotalBytes() != dst->TotalBytes()) { 92 return errors::Internal("Can't copy ", src.TotalBytes(), 93 " bytes of a tensor into another with ", 94 dst->TotalBytes(), " bytes buffer."); 95 } 96 if ((src.TotalBytes() > 0) && !src.IsInitialized()) { 97 return errors::Internal("Src tensor is not initialized."); 98 } 99 if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) { 100 return errors::Internal("Dst tensor is not initialized."); 101 } 102 } 103 if (!DMAHelper::CanUseDMA(&src)) { 104 return errors::Internal("GPU copy from non-DMA ", 105 DataTypeString(src.dtype()), "tensor"); 106 } 107 return Status::OK(); 108 } 109 110 void* GetBase(const Tensor* src) { 111 return const_cast<void*>(DMAHelper::base(src)); 112 } 113 114 void* GetBase(Tensor* dst) { return DMAHelper::base(dst); } 115 116 /*static*/ 117 void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev, 118 const DeviceContext* device_context, 119 TensorProto* proto, bool is_dead, 120 StatusCallback done) { 121 VLOG(1) << "SetProtoFromGPU device_context " << device_context; 122 const DeviceBase::GpuDeviceInfo* dev_info = nullptr; 123 gpu::Stream* send_stream = nullptr; 124 Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info, 125 &send_stream); 126 if (!s.ok()) { 127 done(s); 128 return; 129 } 130 131 auto send_device_to_host_stream = 132 static_cast<const GPUDeviceContext*>(device_context) 133 ->device_to_host_stream(); 134 if (send_device_to_host_stream == nullptr) { 135 done(errors::Internal("No send gpu copy-out-stream is available.")); 136 return; 137 } 138 // Wait for the sender's main stream to make sure the data are available. 139 send_device_to_host_stream->ThenWaitFor(send_stream); 140 141 // Tensor values need to be copied from GPU to CPU ram so that 142 // we can build the protobuf response for a RecvTensor RPC. 143 // "device context" identifies the stream where the _Send op executed. 144 proto->set_dtype(tensor.dtype()); 145 tensor.shape().AsProto(proto->mutable_tensor_shape()); 146 147 // Prepare a proto with the right data buf size, and DMA the data 148 // over from the GPU buffer. Note that 0-size tensors do not have a 149 // backing buffer. 150 Allocator* alloc = nullptr; 151 char* buf = nullptr; 152 const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes(); 153 if (total_bytes > 0) { 154 port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU"); 155 alloc = ProcessState::singleton()->GetCUDAHostAllocator(0); 156 buf = alloc->Allocate<char>(total_bytes); 157 if (LogMemory::IsEnabled()) { 158 LogMemory::RecordRawAllocation("SetProtoFromGPU", 159 LogMemory::PROTO_BUFFER_STEP_ID, 160 total_bytes, buf, alloc); 161 } 162 void* src_ptr = GetBase(&tensor); 163 DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); 164 send_device_to_host_stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes); 165 } 166 // Use of tensor may outlive stack scope, so keep a ref. 167 TensorReference tensor_ref(tensor); 168 dev_info->event_mgr->ThenExecute( 169 send_device_to_host_stream, [send_device_to_host_stream, done, proto, buf, 170 total_bytes, alloc, tensor_ref]() { 171 if (!send_device_to_host_stream->ok()) { 172 LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed"; 173 } 174 tensor_ref.Unref(); 175 if (total_bytes > 0) { 176 port::CopyFromArray(proto->mutable_tensor_content(), buf, 177 total_bytes); 178 if (LogMemory::IsEnabled()) { 179 LogMemory::RecordRawDeallocation("SetProtoFromGPU", 180 LogMemory::PROTO_BUFFER_STEP_ID, 181 buf, alloc, false); 182 } 183 alloc->Deallocate<char>(buf, total_bytes); 184 } 185 done(Status::OK()); 186 }); 187 } 188 189 // static 190 void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context, 191 DeviceContext* recv_dev_context, Device* src, 192 Device* dst, 193 AllocatorAttributes src_alloc_attr, 194 AllocatorAttributes dst_alloc_attr, 195 const Tensor* input, Tensor* output, 196 StatusCallback done) { 197 const DeviceBase::GpuDeviceInfo* dev_info = nullptr; 198 gpu::Stream* send_stream = nullptr; 199 Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info, 200 &send_stream); 201 if (!s.ok()) { 202 done(s); 203 return; 204 } 205 auto send_device_to_device_stream = 206 static_cast<const GPUDeviceContext*>(send_dev_context) 207 ->device_to_device_stream(); 208 if (send_device_to_device_stream == nullptr) { 209 done(errors::Internal("No send gpu copy-out-stream is available.")); 210 return; 211 } 212 // Wait for the main stream on the sender to make sure the result is 213 // available. 214 send_device_to_device_stream->ThenWaitFor(send_stream); 215 216 const int64 total_bytes = input->TotalBytes(); 217 if (total_bytes > 0) { 218 void* src_ptr = GetBase(input); 219 DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); 220 void* dst_ptr = GetBase(output); 221 DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); 222 auto recv_stream = 223 static_cast<const GPUDeviceContext*>(recv_dev_context)->stream(); 224 if (recv_stream == nullptr) { 225 done(errors::Internal("No recv gpu stream is available.")); 226 return; 227 } 228 // Since we want to use the memory from recv_stream in the 229 // send_device_to_device_stream, add a dependency to make sure the memory is 230 // truly free. 231 // TODO(zhengxq): remove this dependency when we switch to a better way 232 // to make sure the memory is free. 233 send_device_to_device_stream->ThenWaitFor(recv_stream); 234 235 VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr; 236 send_device_to_device_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, 237 total_bytes); 238 } 239 240 // Use of input may outlive stack scope, so keep a ref. 241 TensorReference input_ref(*input); 242 dev_info->event_mgr->ThenExecute( 243 send_device_to_device_stream, 244 [done, send_device_to_device_stream, input_ref]() { 245 input_ref.Unref(); 246 if (!send_device_to_device_stream->ok()) { 247 LOG(FATAL) << "GPU->GPU Memcpy failed"; 248 } 249 done(Status::OK()); 250 }); 251 send_dev_context->MaintainLifetimeOnStream(input, 252 send_device_to_device_stream); 253 } 254 255 static CopyTensor::Registration register_gpu_gpu_copy( 256 DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy); 257 258 // static 259 void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device, 260 const DeviceContext* device_context, 261 const Tensor* gpu_tensor, Tensor* cpu_tensor, 262 StatusCallback done) { 263 VLOG(1) << "CopyGPUTensorToCPU"; 264 const DeviceBase::GpuDeviceInfo* dev_info = nullptr; 265 gpu::Stream* send_stream = nullptr; 266 Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor, 267 &dev_info, &send_stream); 268 if (!s.ok()) { 269 done(s); 270 return; 271 } 272 273 auto send_device_to_host_stream = 274 static_cast<const GPUDeviceContext*>(device_context) 275 ->device_to_host_stream(); 276 if (send_device_to_host_stream == nullptr) { 277 done(errors::Internal("No send gpu copy-out-stream is available.")); 278 return; 279 } 280 // Wait for the sender's main stream to make sure the data are available. 281 send_device_to_host_stream->ThenWaitFor(send_stream); 282 283 const int64 total_bytes = gpu_tensor->TotalBytes(); 284 if (total_bytes > 0) { 285 void* src_ptr = GetBase(gpu_tensor); 286 DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); 287 void* dst_ptr = GetBase(cpu_tensor); 288 send_device_to_host_stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes); 289 } 290 // Use of the input may outlive stack scope, so keep a ref. 291 TensorReference input_ref(*gpu_tensor); 292 dev_info->event_mgr->ThenExecute( 293 send_device_to_host_stream, 294 [send_device_to_host_stream, done, input_ref]() { 295 if (!send_device_to_host_stream->ok()) { 296 LOG(FATAL) << "GPU->CPU Memcpy failed"; 297 } 298 input_ref.Unref(); 299 done(Status::OK()); 300 }); 301 } 302 303 /* static */ 304 void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor, 305 const DeviceContext* device_context, 306 Device* gpu_device, Tensor* gpu_tensor, 307 StatusCallback done) { 308 VLOG(1) << "CopyCPUTensorToGPU"; 309 const DeviceBase::GpuDeviceInfo* dev_info = nullptr; 310 gpu::Stream* recv_stream = nullptr; 311 Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor, 312 &dev_info, &recv_stream); 313 if (!s.ok()) { 314 done(s); 315 return; 316 } 317 318 auto recv_host_to_device_stream = 319 static_cast<const GPUDeviceContext*>(device_context) 320 ->host_to_device_stream(); 321 if (recv_host_to_device_stream == nullptr) { 322 done(errors::Internal("No send gpu copy-out-stream is available.")); 323 return; 324 } 325 // Wait for the recv-stream to make sure the buffer is truly available. 326 recv_host_to_device_stream->ThenWaitFor(recv_stream); 327 328 const int64 total_bytes = cpu_tensor->TotalBytes(); 329 // Note that 0-size tensors have no backing buffer. 330 if (total_bytes > 0) { 331 void* src_ptr = GetBase(cpu_tensor); 332 void* dst_ptr = GetBase(gpu_tensor); 333 DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); 334 recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes); 335 } 336 // Use of cpu_tensor may outlive stack scope, so keep a ref. 337 TensorReference input_ref(*cpu_tensor); 338 dev_info->event_mgr->ThenExecute( 339 recv_host_to_device_stream, 340 [recv_host_to_device_stream, done, input_ref]() { 341 input_ref.Unref(); 342 if (!recv_host_to_device_stream->ok()) { 343 LOG(FATAL) << "CPU->GPU Memcpy failed"; 344 } 345 done(Status::OK()); 346 }); 347 } 348 349 Status GPUUtil::Sync(Device* gpu_device) { 350 VLOG(1) << "GPUUtil::Sync"; 351 auto* dev_info = gpu_device->tensorflow_gpu_device_info(); 352 if (!dev_info) { 353 return errors::Internal("Failed to find dest device GPUDeviceInfo"); 354 } 355 return dev_info->stream->BlockHostUntilDone(); 356 } 357 358 Status GPUUtil::SyncAll(Device* gpu_device) { 359 VLOG(1) << "GPUUtil::SyncAll"; 360 auto* dev_info = gpu_device->tensorflow_gpu_device_info(); 361 if (!dev_info) { 362 return errors::Internal("Failed to find dest device GPUDeviceInfo"); 363 } 364 if (!dev_info->stream->parent()->SynchronizeAllActivity() || 365 !dev_info->stream->ok()) { 366 return errors::Internal("GPU sync failed"); 367 } 368 return Status::OK(); 369 } 370 371 string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) { 372 string ret; 373 CHECK(tensor); 374 const int64 num_bytes = std::min<int64>( 375 FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes()); 376 void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr; 377 strings::Appendf(&ret, "%p:", ptr); 378 if (num_bytes > 0) { 379 auto* dev_info = device->tensorflow_gpu_device_info(); 380 if (!dev_info) { 381 strings::StrAppend( 382 &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes)); 383 } else { 384 string buf; 385 buf.resize(num_bytes); 386 DeviceMemoryBase gpu_ptr(ptr, num_bytes); 387 auto s = dev_info->stream->parent()->SynchronousMemcpyD2H( 388 gpu_ptr, num_bytes, gtl::string_as_array(&buf)); 389 strings::StrAppend(&ret, 390 PrintMemory(gtl::string_as_array(&buf), num_bytes)); 391 } 392 } 393 return ret; 394 } 395 396 // TODO(pbar) Checksum is called from places without a valid device context. 397 uint64 GPUUtil::Checksum(Device* gpu_device, 398 const DeviceContext* device_context, 399 const Tensor& tensor) { 400 Tensor copy(tensor.dtype(), tensor.shape()); 401 Status s; 402 Notification n; 403 CopyGPUTensorToCPU(gpu_device, device_context, &tensor, ©, 404 [&s, &n](Status status) { 405 s.Update(status); 406 n.Notify(); 407 }); 408 n.WaitForNotification(); 409 CHECK(s.ok()) << s; 410 return Checksum(copy); 411 } 412 413 uint64 GPUUtil::Checksum(const Tensor& tensor) { 414 const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor)); 415 size_t num_bytes = tensor.TotalBytes(); 416 size_t num_floats = num_bytes / sizeof(float); 417 for (size_t i = 0; i < num_floats; ++i) { 418 CHECK(!std::isnan(fptr[i])) << " i " << i; 419 } 420 // TODO(tucker): consider using crc32c instead. 421 return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)), 422 tensor.TotalBytes(), 0); 423 } 424 425 // static 426 void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device, 427 const DeviceContext* device_context, 428 const Tensor* src_gpu_tensor, 429 Tensor* dst_gpu_tensor, 430 StatusCallback done) { 431 VLOG(1) << "CopyGPUTensorToSameGPU"; 432 const DeviceBase::GpuDeviceInfo* dev_info = nullptr; 433 gpu::Stream* send_stream = nullptr; 434 Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor, 435 dst_gpu_tensor, &dev_info, &send_stream); 436 if (!s.ok()) { 437 done(s); 438 return; 439 } 440 441 const int64 total_bytes = src_gpu_tensor->TotalBytes(); 442 if (total_bytes > 0) { 443 void* src_ptr = GetBase(src_gpu_tensor); 444 DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes); 445 void* dst_ptr = GetBase(dst_gpu_tensor); 446 DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes); 447 send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes); 448 } 449 450 done(Status::OK()); 451 } 452 453 } // namespace tensorflow 454