1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/common_runtime/gpu/gpu_debug_allocator.h" 17 18 #include <cstddef> 19 #include <vector> 20 21 #include "tensorflow/core/common_runtime/gpu/gpu_id.h" 22 #include "tensorflow/core/common_runtime/gpu/gpu_id_utils.h" 23 #include "tensorflow/core/common_runtime/gpu/gpu_init.h" 24 #include "tensorflow/core/platform/stream_executor.h" 25 26 #define MASK_WORDS 2 27 #define MASK_BYTES (MASK_WORDS * sizeof(int64)) 28 29 namespace tensorflow { 30 namespace { 31 32 int64* NewMask(int64 word) { 33 int64* m = new int64[MASK_WORDS]; 34 for (int i = 0; i < MASK_WORDS; ++i) { 35 m[i] = word; 36 } 37 return m; 38 } 39 40 int64* before_mask = NewMask(0xabababababababab); 41 int64* after_mask = NewMask(0xcdcdcdcdcdcdcdcd); 42 43 bool CheckMask(perftools::gputools::StreamExecutor* exec, void* ptr, 44 int64* mask) { 45 gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}}; 46 int64 tmp[MASK_WORDS]; 47 48 if (!exec->SynchronousMemcpy(&tmp, gpu_ptr, MASK_BYTES)) { 49 LOG(FATAL) << "Could not copy debug mask"; 50 } 51 52 bool ok = true; 53 for (int i = 0; i < MASK_WORDS; ++i) { 54 ok &= (mask[i] == tmp[i]); 55 if (!ok) { 56 LOG(ERROR) << "i=" << i 57 << " mask=" << reinterpret_cast<const void*>(mask[i]) 58 << " field=" << reinterpret_cast<const void*>(tmp[i]); 59 } 60 } 61 62 return ok; 63 } 64 65 void InitMask(perftools::gputools::StreamExecutor* exec, void* ptr, 66 int64* mask) { 67 gpu::DeviceMemory<int64> gpu_ptr{gpu::DeviceMemoryBase{ptr, MASK_BYTES}}; 68 if (!exec->SynchronousMemcpy(&gpu_ptr, mask, MASK_BYTES)) { 69 LOG(FATAL) << "Could not copy debug mask"; 70 } 71 } 72 73 } // namespace 74 75 // ----------------------------------------------------------------------------- 76 // GPUDebugAllocator 77 // ----------------------------------------------------------------------------- 78 GPUDebugAllocator::GPUDebugAllocator(VisitableAllocator* allocator, 79 CudaGpuId cuda_gpu_id) 80 : base_allocator_(allocator) { 81 stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); 82 } 83 84 GPUDebugAllocator::~GPUDebugAllocator() { delete base_allocator_; } 85 86 void* GPUDebugAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { 87 num_bytes += (2 * MASK_BYTES); 88 89 void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes); 90 91 // Return the pointer after the header 92 void* rv = static_cast<char*>(allocated_ptr) + MASK_BYTES; 93 94 // Write the header at allocated_ptr 95 InitMask(stream_exec_, allocated_ptr, before_mask); 96 97 // Write the footer at the end. 98 size_t req_size = base_allocator_->RequestedSize(allocated_ptr); 99 InitMask(stream_exec_, 100 static_cast<char*>(allocated_ptr) + req_size - MASK_BYTES, 101 after_mask); 102 return rv; 103 } 104 void GPUDebugAllocator::DeallocateRaw(void* ptr) { 105 CHECK(CheckHeader(ptr)) << "before_mask has been overwritten"; 106 CHECK(CheckFooter(ptr)) << "after_mask has been overwritten"; 107 108 // Backtrack to the beginning of the header. 109 ptr = static_cast<void*>(static_cast<char*>(ptr) - MASK_BYTES); 110 // Deallocate the memory 111 base_allocator_->DeallocateRaw(ptr); 112 } 113 114 void GPUDebugAllocator::AddAllocVisitor(Visitor visitor) { 115 return base_allocator_->AddAllocVisitor(visitor); 116 } 117 118 void GPUDebugAllocator::AddFreeVisitor(Visitor visitor) { 119 return base_allocator_->AddFreeVisitor(visitor); 120 } 121 122 bool GPUDebugAllocator::TracksAllocationSizes() { return true; } 123 124 size_t GPUDebugAllocator::RequestedSize(const void* ptr) { 125 auto req_size = base_allocator_->RequestedSize(static_cast<const char*>(ptr) - 126 MASK_BYTES); 127 return req_size - 2 * MASK_BYTES; 128 } 129 130 size_t GPUDebugAllocator::AllocatedSize(const void* ptr) { 131 return base_allocator_->AllocatedSize(static_cast<const char*>(ptr) - 132 MASK_BYTES); 133 } 134 135 int64 GPUDebugAllocator::AllocationId(const void* ptr) { 136 return base_allocator_->AllocationId(static_cast<const char*>(ptr) - 137 MASK_BYTES); 138 } 139 140 void GPUDebugAllocator::GetStats(AllocatorStats* stats) { 141 base_allocator_->GetStats(stats); 142 } 143 144 void GPUDebugAllocator::ClearStats() { base_allocator_->ClearStats(); } 145 146 bool GPUDebugAllocator::CheckHeader(void* ptr) { 147 return CheckMask(stream_exec_, static_cast<char*>(ptr) - MASK_BYTES, 148 before_mask); 149 } 150 151 bool GPUDebugAllocator::CheckFooter(void* ptr) { 152 char* original_ptr = static_cast<char*>(ptr) - MASK_BYTES; 153 size_t req_size = base_allocator_->RequestedSize(original_ptr); 154 return CheckMask(stream_exec_, original_ptr + req_size - MASK_BYTES, 155 after_mask); 156 } 157 158 // ----------------------------------------------------------------------------- 159 // GPUNanResetAllocator 160 // ----------------------------------------------------------------------------- 161 GPUNanResetAllocator::GPUNanResetAllocator(VisitableAllocator* allocator, 162 CudaGpuId cuda_gpu_id) 163 : base_allocator_(allocator) { 164 stream_exec_ = GpuIdUtil::ExecutorForCudaGpuId(cuda_gpu_id).ValueOrDie(); 165 } 166 167 GPUNanResetAllocator::~GPUNanResetAllocator() { delete base_allocator_; } 168 169 void* GPUNanResetAllocator::AllocateRaw(size_t alignment, size_t num_bytes) { 170 void* allocated_ptr = base_allocator_->AllocateRaw(alignment, num_bytes); 171 172 // Initialize the buffer to Nans 173 size_t req_size = base_allocator_->RequestedSize(allocated_ptr); 174 std::vector<float> nans(req_size / sizeof(float), std::nanf("")); 175 gpu::DeviceMemory<float> nan_ptr{ 176 gpu::DeviceMemoryBase{static_cast<float*>(allocated_ptr), req_size}}; 177 178 if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) { 179 LOG(ERROR) << "Could not initialize to NaNs"; 180 } 181 182 return allocated_ptr; 183 } 184 void GPUNanResetAllocator::DeallocateRaw(void* ptr) { 185 // Reset the buffer to Nans 186 size_t req_size = base_allocator_->RequestedSize(ptr); 187 std::vector<float> nans(req_size / sizeof(float), std::nanf("")); 188 gpu::DeviceMemory<float> nan_ptr{ 189 gpu::DeviceMemoryBase{static_cast<float*>(ptr), req_size}}; 190 if (!stream_exec_->SynchronousMemcpy(&nan_ptr, &nans[0], req_size)) { 191 LOG(ERROR) << "Could not initialize to NaNs"; 192 } 193 194 // Deallocate the memory 195 base_allocator_->DeallocateRaw(ptr); 196 } 197 198 void GPUNanResetAllocator::AddAllocVisitor(Visitor visitor) { 199 return base_allocator_->AddAllocVisitor(visitor); 200 } 201 202 void GPUNanResetAllocator::AddFreeVisitor(Visitor visitor) { 203 return base_allocator_->AddFreeVisitor(visitor); 204 } 205 206 size_t GPUNanResetAllocator::RequestedSize(const void* ptr) { 207 return base_allocator_->RequestedSize(ptr); 208 } 209 210 size_t GPUNanResetAllocator::AllocatedSize(const void* ptr) { 211 return base_allocator_->AllocatedSize(ptr); 212 } 213 214 void GPUNanResetAllocator::GetStats(AllocatorStats* stats) { 215 base_allocator_->GetStats(stats); 216 } 217 218 void GPUNanResetAllocator::ClearStats() { base_allocator_->ClearStats(); } 219 220 } // namespace tensorflow 221