/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_util.h"

#include <algorithm>
#include <cmath>

#include "tensorflow/core/common_runtime/copy_tensor.h"
#include "tensorflow/core/common_runtime/device.h"
#include "tensorflow/core/common_runtime/dma_helper.h"
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/common_runtime/gpu/process_state.h"
#include "tensorflow/core/common_runtime/gpu_device_context.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_reference.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/hash/hash.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/lib/strings/stringprintf.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/tensor_coding.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/util/util.h"

// IMPLEMENTATION NOTE:
//
// 1. Within this module, we intentionally LOG(FATAL) if any stream
//    involved in a memcpy becomes !stream->ok(), because the TF process
//    today (1/2016) cannot properly recover from such an error.
//
// 2. When a 0-size tensor is being copied, we do not schedule a
//    ThenMemcpy, since there are no bytes to move. However, we must
//    still ensure causal ordering by arranging for the copy-done
//    callback to happen after all activities previously scheduled on
//    the given stream have finished.

// If this needs to be runtime-configurable, consider adding options to
// ConfigProto.
const tensorflow::int64 FLAGS_brain_gpu_util_debug_string_maxlen = 128;
extern bool FLAGS_brain_gpu_record_mem_types;

using perftools::gputools::DeviceMemoryBase;
using perftools::gputools::Stream;

namespace tensorflow {

namespace gpu = ::perftools::gputools;

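// Validates the device, context, streams, and tensors involved in a
// pending copy and, on success, returns the GpuDeviceInfo and the main
// compute stream through the out-parameters. Failures are reported as
// errors::Internal because they indicate broken runtime invariants
// rather than user error. `dst` may be null when only the source side
// needs validation (as in SetProtoFromGPU below).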
Status PrepareCopy(Device* device, const DeviceContext* ctx, const Tensor& src,
                   const Tensor* dst,
                   const DeviceBase::GpuDeviceInfo** dev_info,
                   gpu::Stream** stream) {
  if (device == nullptr) {
    return errors::Internal("Unexpected null device.");
  }
  auto di = device->tensorflow_gpu_device_info();
  if (di == nullptr) {
    return errors::Internal("Unexpected null device info.");
  }
  *dev_info = di;
  if (ctx == nullptr) {
    return errors::Internal("Unexpected null device context.");
  }
  auto gs = static_cast<const GPUDeviceContext*>(ctx)->stream();
  if (gs == nullptr) {
    return errors::Internal("No gpu stream is available.");
  }
  *stream = gs;
  if (dst != nullptr) {
    if (src.dtype() != dst->dtype()) {
      return errors::Internal("Can't copy a tensor of ",
                              DataTypeString(src.dtype()), " into a tensor of ",
                              DataTypeString(dst->dtype()));
    }
    if (src.TotalBytes() != dst->TotalBytes()) {
      return errors::Internal("Can't copy ", src.TotalBytes(),
                              " bytes of a tensor into another with ",
                              dst->TotalBytes(), " bytes buffer.");
    }
    if ((src.TotalBytes() > 0) && !src.IsInitialized()) {
      return errors::Internal("Src tensor is not initialized.");
    }
    if ((dst->TotalBytes() > 0) && !dst->IsInitialized()) {
      return errors::Internal("Dst tensor is not initialized.");
    }
  }
  if (!DMAHelper::CanUseDMA(&src)) {
    return errors::Internal("GPU copy from non-DMA ",
                            DataTypeString(src.dtype()), " tensor");
  }
  return Status::OK();
}

void* GetBase(const Tensor* src) {
  return const_cast<void*>(DMAHelper::base(src));
}

void* GetBase(Tensor* dst) { return DMAHelper::base(dst); }

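// Serializes `tensor` into `proto`, staging the bytes through pinned
// (CUDA host) memory so the device-to-host DMA can run asynchronously
// on the copy-out stream. `done` is invoked via the EventMgr once the
// memcpy has completed and the staging buffer has been copied into the
// proto. Note that the callback is scheduled even for 0-byte tensors,
// preserving the causal ordering described in the implementation note
// above.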
/*static*/
void GPUUtil::SetProtoFromGPU(const Tensor& tensor, Device* dev,
                              const DeviceContext* device_context,
                              TensorProto* proto, bool is_dead,
                              StatusCallback done) {
  VLOG(1) << "SetProtoFromGPU device_context " << device_context;
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  gpu::Stream* send_stream = nullptr;
  Status s = PrepareCopy(dev, device_context, tensor, nullptr, &dev_info,
                         &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  auto send_device_to_host_stream =
      static_cast<const GPUDeviceContext*>(device_context)
          ->device_to_host_stream();
  if (send_device_to_host_stream == nullptr) {
    done(errors::Internal("No send gpu copy-out-stream is available."));
    return;
  }
  // Wait for the sender's main stream to make sure the data are available.
  send_device_to_host_stream->ThenWaitFor(send_stream);

  // Tensor values need to be copied from GPU to CPU ram so that
  // we can build the protobuf response for a RecvTensor RPC.
  // "device context" identifies the stream where the _Send op executed.
  proto->set_dtype(tensor.dtype());
  tensor.shape().AsProto(proto->mutable_tensor_shape());

  // Prepare a proto with the right data buf size, and DMA the data
  // over from the GPU buffer.  Note that 0-size tensors do not have a
  // backing buffer.
  Allocator* alloc = nullptr;
  char* buf = nullptr;
  const int64 total_bytes = is_dead ? 0 : tensor.TotalBytes();
  if (total_bytes > 0) {
    port::Tracing::ScopedAnnotation annotation("SetProtoFromGPU");
    alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
    buf = alloc->Allocate<char>(total_bytes);
    if (LogMemory::IsEnabled()) {
      LogMemory::RecordRawAllocation("SetProtoFromGPU",
                                     LogMemory::PROTO_BUFFER_STEP_ID,
                                     total_bytes, buf, alloc);
    }
    void* src_ptr = GetBase(&tensor);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    send_device_to_host_stream->ThenMemcpy(buf, gpu_src_ptr, total_bytes);
  }
  // Use of tensor may outlive stack scope, so keep a ref.
  TensorReference tensor_ref(tensor);
  dev_info->event_mgr->ThenExecute(
      send_device_to_host_stream, [send_device_to_host_stream, done, proto, buf,
                                   total_bytes, alloc, tensor_ref]() {
        if (!send_device_to_host_stream->ok()) {
          LOG(FATAL) << "SetProtoFromGPU: GPU Memcpy failed";
        }
        tensor_ref.Unref();
        if (total_bytes > 0) {
          port::CopyFromArray(proto->mutable_tensor_content(), buf,
                              total_bytes);
          if (LogMemory::IsEnabled()) {
            LogMemory::RecordRawDeallocation("SetProtoFromGPU",
                                             LogMemory::PROTO_BUFFER_STEP_ID,
                                             buf, alloc, false);
          }
          alloc->Deallocate<char>(buf, total_bytes);
        }
        done(Status::OK());
      });
}

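// Schedules an asynchronous GPU-to-GPU copy of `input` into `output`
// on the sender's device-to-device stream. That stream waits on the
// sender's compute stream so the source bytes are ready and, before
// the memcpy is enqueued, also on the receiver's compute stream so the
// destination buffer is truly free; `done` fires once the copy has
// completed.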
// static
void GPUUtil::DeviceToDeviceCopy(DeviceContext* send_dev_context,
                                 DeviceContext* recv_dev_context, Device* src,
                                 Device* dst,
                                 AllocatorAttributes src_alloc_attr,
                                 AllocatorAttributes dst_alloc_attr,
                                 const Tensor* input, Tensor* output,
                                 StatusCallback done) {
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  gpu::Stream* send_stream = nullptr;
  Status s = PrepareCopy(src, send_dev_context, *input, output, &dev_info,
                         &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }
  auto send_device_to_device_stream =
      static_cast<const GPUDeviceContext*>(send_dev_context)
          ->device_to_device_stream();
  if (send_device_to_device_stream == nullptr) {
    done(errors::Internal("No send gpu copy-out-stream is available."));
    return;
  }
  // Wait for the main stream on the sender to make sure the result is
  // available.
  send_device_to_device_stream->ThenWaitFor(send_stream);

  const int64 total_bytes = input->TotalBytes();
  if (total_bytes > 0) {
    void* src_ptr = GetBase(input);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    void* dst_ptr = GetBase(output);
    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
    auto recv_stream =
        static_cast<const GPUDeviceContext*>(recv_dev_context)->stream();
    if (recv_stream == nullptr) {
      done(errors::Internal("No recv gpu stream is available."));
      return;
    }
    // Since we want to use the memory from recv_stream in the
    // send_device_to_device_stream, add a dependency to make sure the
    // memory is truly free.
    // TODO(zhengxq): remove this dependency when we switch to a better way
    // to make sure the memory is free.
    send_device_to_device_stream->ThenWaitFor(recv_stream);

    VLOG(2) << "src_ptr " << src_ptr << " dst_ptr " << dst_ptr;
    send_device_to_device_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr,
                                             total_bytes);
  }

  // Use of input may outlive stack scope, so keep a ref.
  TensorReference input_ref(*input);
  dev_info->event_mgr->ThenExecute(
      send_device_to_device_stream,
      [done, send_device_to_device_stream, input_ref]() {
        input_ref.Unref();
        if (!send_device_to_device_stream->ok()) {
          LOG(FATAL) << "GPU->GPU Memcpy failed";
        }
        done(Status::OK());
      });
  send_dev_context->MaintainLifetimeOnStream(input,
                                             send_device_to_device_stream);
}

static CopyTensor::Registration register_gpu_gpu_copy(
    DEVICE_GPU, DEVICE_GPU, GPUUtil::DeviceToDeviceCopy);

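// Copies `gpu_tensor` into the pre-allocated, host-resident
// `cpu_tensor` on the sender's device-to-host stream, invoking `done`
// from the EventMgr when the transfer finishes. A caller that needs
// blocking semantics can rendezvous on a Notification, e.g. (a sketch
// mirroring what Checksum() does below):
//
//   Notification n;
//   Status copy_status;
//   GPUUtil::CopyGPUTensorToCPU(gpu_device, device_context, &gpu_tensor,
//                               &cpu_tensor, [&copy_status, &n](Status s) {
//                                 copy_status.Update(s);
//                                 n.Notify();
//                               });
//   n.WaitForNotification();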
// static
void GPUUtil::CopyGPUTensorToCPU(Device* gpu_device,
                                 const DeviceContext* device_context,
                                 const Tensor* gpu_tensor, Tensor* cpu_tensor,
                                 StatusCallback done) {
  VLOG(1) << "CopyGPUTensorToCPU";
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  gpu::Stream* send_stream = nullptr;
  Status s = PrepareCopy(gpu_device, device_context, *gpu_tensor, cpu_tensor,
                         &dev_info, &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  auto send_device_to_host_stream =
      static_cast<const GPUDeviceContext*>(device_context)
          ->device_to_host_stream();
  if (send_device_to_host_stream == nullptr) {
    done(errors::Internal("No send gpu copy-out-stream is available."));
    return;
  }
  // Wait for the sender's main stream to make sure the data are available.
  send_device_to_host_stream->ThenWaitFor(send_stream);

  const int64 total_bytes = gpu_tensor->TotalBytes();
  if (total_bytes > 0) {
    void* src_ptr = GetBase(gpu_tensor);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    void* dst_ptr = GetBase(cpu_tensor);
    send_device_to_host_stream->ThenMemcpy(dst_ptr, gpu_src_ptr, total_bytes);
  }
  // Use of the input may outlive stack scope, so keep a ref.
  TensorReference input_ref(*gpu_tensor);
  dev_info->event_mgr->ThenExecute(
      send_device_to_host_stream,
      [send_device_to_host_stream, done, input_ref]() {
        if (!send_device_to_host_stream->ok()) {
          LOG(FATAL) << "GPU->CPU Memcpy failed";
        }
        input_ref.Unref();
        done(Status::OK());
      });
}

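// Copies `cpu_tensor` into the pre-allocated, device-resident
// `gpu_tensor` on the receiver's host-to-device stream. That stream
// first waits on the device's compute stream so that the destination
// buffer is truly available; `done` fires once the memcpy completes.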
/*  static */
void GPUUtil::CopyCPUTensorToGPU(const Tensor* cpu_tensor,
                                 const DeviceContext* device_context,
                                 Device* gpu_device, Tensor* gpu_tensor,
                                 StatusCallback done) {
  VLOG(1) << "CopyCPUTensorToGPU";
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  gpu::Stream* recv_stream = nullptr;
  Status s = PrepareCopy(gpu_device, device_context, *cpu_tensor, gpu_tensor,
                         &dev_info, &recv_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  auto recv_host_to_device_stream =
      static_cast<const GPUDeviceContext*>(device_context)
          ->host_to_device_stream();
  if (recv_host_to_device_stream == nullptr) {
    done(errors::Internal("No recv gpu copy-in-stream is available."));
    return;
  }
  // Wait for the recv-stream to make sure the buffer is truly available.
  recv_host_to_device_stream->ThenWaitFor(recv_stream);

  const int64 total_bytes = cpu_tensor->TotalBytes();
  // Note that 0-size tensors have no backing buffer.
  if (total_bytes > 0) {
    void* src_ptr = GetBase(cpu_tensor);
    void* dst_ptr = GetBase(gpu_tensor);
    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
    recv_host_to_device_stream->ThenMemcpy(&gpu_dst_ptr, src_ptr, total_bytes);
  }
  // Use of cpu_tensor may outlive stack scope, so keep a ref.
  TensorReference input_ref(*cpu_tensor);
  dev_info->event_mgr->ThenExecute(
      recv_host_to_device_stream,
      [recv_host_to_device_stream, done, input_ref]() {
        input_ref.Unref();
        if (!recv_host_to_device_stream->ok()) {
          LOG(FATAL) << "CPU->GPU Memcpy failed";
        }
        done(Status::OK());
      });
}

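// Blocks the calling thread until all work enqueued so far on the
// device's main compute stream has completed.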
Status GPUUtil::Sync(Device* gpu_device) {
  VLOG(1) << "GPUUtil::Sync";
  auto* dev_info = gpu_device->tensorflow_gpu_device_info();
  if (!dev_info) {
    return errors::Internal("Failed to find dest device GPUDeviceInfo");
  }
  return dev_info->stream->BlockHostUntilDone();
}

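// Stronger form of Sync(): blocks until all activity on the device's
// StreamExecutor (every stream, not just the main one) has quiesced,
// and reports failure if the main stream is in an error state.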
Status GPUUtil::SyncAll(Device* gpu_device) {
  VLOG(1) << "GPUUtil::SyncAll";
  auto* dev_info = gpu_device->tensorflow_gpu_device_info();
  if (!dev_info) {
    return errors::Internal("Failed to find dest device GPUDeviceInfo");
  }
  if (!dev_info->stream->parent()->SynchronizeAllActivity() ||
      !dev_info->stream->ok()) {
    return errors::Internal("GPU sync failed");
  }
  return Status::OK();
}

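// Renders up to FLAGS_brain_gpu_util_debug_string_maxlen bytes of the
// tensor's buffer as a debug string, first staging the bytes on the
// host when the tensor is GPU-resident.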
string GPUUtil::MemoryDebugString(const Device* device, Tensor* tensor) {
  string ret;
  CHECK(tensor);
  const int64 num_bytes = std::min<int64>(
      FLAGS_brain_gpu_util_debug_string_maxlen, tensor->TotalBytes());
  void* ptr = (num_bytes > 0) ? GetBase(tensor) : nullptr;
  strings::Appendf(&ret, "%p:", ptr);
  if (num_bytes > 0) {
    auto* dev_info = device->tensorflow_gpu_device_info();
    if (!dev_info) {
      strings::StrAppend(
          &ret, PrintMemory(reinterpret_cast<const char*>(ptr), num_bytes));
    } else {
      string buf;
      buf.resize(num_bytes);
      DeviceMemoryBase gpu_ptr(ptr, num_bytes);
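      // The status of this memcpy is not checked: this is best-effort
      // debug output, so a failed copy simply prints whatever ended up
      // in the staging buffer.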
      auto s = dev_info->stream->parent()->SynchronousMemcpyD2H(
          gpu_ptr, num_bytes, gtl::string_as_array(&buf));
      strings::StrAppend(&ret,
                         PrintMemory(gtl::string_as_array(&buf), num_bytes));
    }
  }
  return ret;
}

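// Debugging aid: synchronously copies `tensor` back to the host and
// returns a checksum over its bytes, CHECK-failing if the copy reports
// an error.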
// TODO(pbar) Checksum is called from places without a valid device context.
uint64 GPUUtil::Checksum(Device* gpu_device,
                         const DeviceContext* device_context,
                         const Tensor& tensor) {
  Tensor copy(tensor.dtype(), tensor.shape());
  Status s;
  Notification n;
  CopyGPUTensorToCPU(gpu_device, device_context, &tensor, &copy,
                     [&s, &n](Status status) {
                       s.Update(status);
                       n.Notify();
                     });
  n.WaitForNotification();
  CHECK(s.ok()) << s;
  return Checksum(copy);
}

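// Host-side checksum. Note that this reinterprets the buffer as float
// data in order to CHECK for NaNs (a debugging heuristic that assumes
// float tensors), then hashes the raw bytes with Hash64.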
uint64 GPUUtil::Checksum(const Tensor& tensor) {
  const float* fptr = reinterpret_cast<const float*>(GetBase(&tensor));
  size_t num_bytes = tensor.TotalBytes();
  size_t num_floats = num_bytes / sizeof(float);
  for (size_t i = 0; i < num_floats; ++i) {
    CHECK(!std::isnan(fptr[i])) << " i " << i;
  }
  // TODO(tucker): consider using crc32c instead.
  return Hash64(reinterpret_cast<const char*>(GetBase(&tensor)),
                tensor.TotalBytes(), 0);
}

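// Schedules a same-device copy on the device's main compute stream.
// Unlike the cross-device paths above, no EventMgr callback is needed:
// the memcpy is ordered with respect to all other work on that stream,
// so `done` is invoked as soon as the copy has been enqueued rather
// than when it has finished.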
// static
void GPUUtil::CopyGPUTensorToSameGPU(Device* gpu_device,
                                     const DeviceContext* device_context,
                                     const Tensor* src_gpu_tensor,
                                     Tensor* dst_gpu_tensor,
                                     StatusCallback done) {
  VLOG(1) << "CopyGPUTensorToSameGPU";
  const DeviceBase::GpuDeviceInfo* dev_info = nullptr;
  gpu::Stream* send_stream = nullptr;
  Status s = PrepareCopy(gpu_device, device_context, *src_gpu_tensor,
                         dst_gpu_tensor, &dev_info, &send_stream);
  if (!s.ok()) {
    done(s);
    return;
  }

  const int64 total_bytes = src_gpu_tensor->TotalBytes();
  if (total_bytes > 0) {
    void* src_ptr = GetBase(src_gpu_tensor);
    DeviceMemoryBase gpu_src_ptr(src_ptr, total_bytes);
    void* dst_ptr = GetBase(dst_gpu_tensor);
    DeviceMemoryBase gpu_dst_ptr(dst_ptr, total_bytes);
    send_stream->ThenMemcpy(&gpu_dst_ptr, gpu_src_ptr, total_bytes);
  }

  done(Status::OK());
}

}  // namespace tensorflow