/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h"

#include <cmath>
#include <cstddef>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"
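
// Each allocation handed out by GPUDebugAllocator is bracketed by two guard
// regions of MASK_WORDS 64-bit words each: a header filled with the 0xab
// pattern and a footer filled with the 0xcd pattern. A damaged pattern at
// deallocation time indicates an out-of-bounds write by the client.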
#define MASK_WORDS 2
#define MASK_BYTES (MASK_WORDS * sizeof(int64))

namespace tensorflow {
namespace {
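
// Returns a heap-allocated host-side mask of MASK_WORDS copies of `word`,
// used as the reference pattern for the device-side guard regions.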
int64* NewMask(int64 word) {
  int64* m = new int64[MASK_WORDS];
  for (int i = 0; i < MASK_WORDS; ++i) {
    m[i] = word;
  }
  return m;
}

int64* before_mask = NewMask(0xabababababababab);
int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd);
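
// Copies the MASK_BYTES guard region at device address `ptr` back to the host
// and compares it word-by-word against the expected `mask` pattern. Returns
// true iff the guard region is intact.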
bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr,
               int64* mask) {
  gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
  int64 tmp[MASK_WORDS];

  if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) {
    LOG(FATAL) << "Could not copy debug mask";
  }

  bool ok = true;
  for (int i = 0; i < MASK_WORDS; ++i) {
    // Log only the words that actually differ from the expected pattern.
    if (mask[i] != tmp[i]) {
      ok = false;
      LOG(ERROR) << "i=" << i
                 << " mask=" << reinterpret_cast<const void*>(mask[i])
                 << " field=" << reinterpret_cast<const void*>(tmp[i]);
    }
  }

  return ok;
}
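
// Writes the host-side `mask` pattern into the guard region at device
// address `ptr`.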
void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr,
              int64* mask) {
  gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}};
  if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) {
    LOG(FATAL) << "Could not copy debug mask";
  }
}

}  // namespace

// -----------------------------------------------------------------------------
// GPUDebugAllocator
// -----------------------------------------------------------------------------
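// GPUDebugAllocator wraps another VisitableAllocator, padding each client
// allocation with the guard regions described above and verifying them when
// the buffer is deallocated.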
GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator,
                                     CudaGpuId cuda_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
}

GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; }
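
// Layout of each underlying allocation:
//   [ before_mask | client bytes | after_mask ]
// The pointer returned to the client points just past the header mask.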
void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  num_bytes += (2 * MASK_BYTES);

  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Return the pointer after the header.
  void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES;

  // Write the header at allocated_ptr.
  InitMask(stream_exec_, allocated_ptr, before_mask);

  // Write the footer at the end.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  InitMask(stream_exec_,
           static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES,
           after_mask);
  return rv;
}

void GPUDebugAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    CHECK(CheckHeader(ptr)) << "before_mask has been overwritten";
    CHECK(CheckFooter(ptr)) << "after_mask has been overwritten";

    // Backtrack to the beginning of the header.
    ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES);
  }
  // Deallocate the memory.
  base_allocator_->DeallocateRaw(ptr);
}

void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) {
  return base_allocator_->AddAllocVisitor(visitor);
}

void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) {
  return base_allocator_->AddFreeVisitor(visitor);
}

bool GPUDebugAllocator::TracksAllocationSizes() { return true; }
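
// Sizes reported to the client exclude the two guard regions, while lookups
// into the base allocator use the true start of the allocation.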
size_t GPUDebugAllocator::RequestedSize(const void* ptr) {
  auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) -
                                                 MASK_BYTES);
  return req_size - 2 * MASK_BYTES;
}

size_t GPUDebugAllocator::AllocatedSize(const void* ptr) {
  return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) -
                                        MASK_BYTES);
}

int64 GPUDebugAllocator::AllocationId(const void* ptr) {
  return base_allocator_->AllocationId(static_cast<const char*>(ptr) -
                                       MASK_BYTES);
}

void GPUDebugAllocator::GetStats(AllocatorStats* stats) {
  base_allocator_->GetStats(stats);
}

void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); }
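
// Verifies that the header guard just before `ptr` still holds the 0xab
// pattern written at allocation time.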
bool GPUDebugAllocator::CheckHeader(void* ptr) {
  return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES,
                   before_mask);
}
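
// Verifies that the footer guard at the end of the allocation still holds the
// 0xcd pattern written at allocation time.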
bool GPUDebugAllocator::CheckFooter(void* ptr) {
  char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES;
  size_t req_size = base_allocator_->RequestedSize(original_ptr);
  return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES,
                   after_mask);
}

// -----------------------------------------------------------------------------
// GPUNanResetAllocator
// -----------------------------------------------------------------------------
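// GPUNanResetAllocator fills each buffer with quiet NaNs on allocation and
// again on deallocation, so reads of uninitialized or freed float memory show
// up as NaNs in the computation instead of silent garbage.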
GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator,
                                           CudaGpuId cuda_gpu_id)
    : base_allocator_(allocator) {
  stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie();
}

GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; }

void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) {
  void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes);
  if (allocated_ptr == nullptr) return allocated_ptr;

  // Initialize the buffer to NaNs.
  size_t req_size = base_allocator_->RequestedSize(allocated_ptr);
  std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
  gpu::DeviceMemory<float> nan_ptr{
      gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}};

  if (!stream_exec_->SynchronousMemcpy(&nan_ptr, nans.data(), req_size)) {
    LOG(ERROR) << "Could not initialize to NaNs";
  }

  return allocated_ptr;
}

void GPUNanResetAllocator::DeallocateRaw(void* ptr) {
  if (ptr != nullptr) {
    // Reset the buffer to NaNs before handing it back.
    size_t req_size = base_allocator_->RequestedSize(ptr);
    std::vector<float> nans(req_size / sizeof(float), std::nanf(""));
    gpu::DeviceMemory<float> nan_ptr{
        gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}};
    if (!stream_exec_->SynchronousMemcpy(&nan_ptr, nans.data(), req_size)) {
      LOG(ERROR) << "Could not reset to NaNs";
    }
  }
  // Deallocate the memory.
  base_allocator_->DeallocateRaw(ptr);
}

void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) {
  return base_allocator_->AddAllocVisitor(visitor);
}

void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) {
  return base_allocator_->AddFreeVisitor(visitor);
}

size_t GPUNanResetAllocator::RequestedSize(const void* ptr) {
  return base_allocator_->RequestedSize(ptr);
}

size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) {
  return base_allocator_->AllocatedSize(ptr);
}

void GPUNanResetAllocator::GetStats(AllocatorStats* stats) {
  base_allocator_->GetStats(stats);
}

void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); }

}  // namespace tensorflow