/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Unit tests and micro-benchmarks for GPUBFCAllocator, the best-fit-with-
// coalescing (BFC) allocator used for GPU memory. The whole file is compiled
// only when CUDA support is enabled.

#if GOOGLE_CUDA

#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"

#include <algorithm>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {
namespace {

// Fetches the allocator's stats and verifies the four counters the tests
// care about. Argument order: total number of allocations performed,
// bytes currently in use, high-water mark of bytes in use, and the size of
// the largest single allocation observed.
static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
                       int64 max_bytes_in_use, int64 max_alloc_size) {
  AllocatorStats stats;
  a->GetStats(&stats);
  LOG(INFO) << "Alloc stats: " << std::endl << stats.DebugString();
  EXPECT_EQ(stats.bytes_in_use, bytes_in_use);
  EXPECT_EQ(stats.max_bytes_in_use, max_bytes_in_use);
  EXPECT_EQ(stats.num_allocs, num_allocs);
  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
}

// Verifies that distinct live allocations never alias: no duplicate pointers
// and no overlapping ranges, and that the stats counters track alloc/free.
TEST(GPUBFCAllocatorTest, NoDups) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  CheckStats(&a, 0, 0, 0, 0);

  // Allocate a lot of raw pointers
  std::vector<void*> ptrs;
  for (int s = 1; s < 1024; s++) {
    void* raw = a.AllocateRaw(1, s);
    ptrs.push_back(raw);
  }
  // NOTE(review): 654336 is the byte total for the 1023 requests after the
  // allocator's internal size rounding; 1024 is the largest request above.
  // The exact rounding rule lives in BFCAllocator, not in this file.
  CheckStats(&a, 1023, 654336, 654336, 1024);

  // Sorting by address lets adjacent elements be checked for overlap.
  std::sort(ptrs.begin(), ptrs.end());

  // Make sure none of them are equal, and that none of them overlap.
  for (size_t i = 1; i < ptrs.size(); i++) {
    ASSERT_NE(ptrs[i], ptrs[i - 1]);  // No dups
    size_t req_size = a.RequestedSize(ptrs[i - 1]);
    ASSERT_GT(req_size, 0);
    // The gap between consecutive pointers must cover the earlier
    // allocation's requested size, i.e. the regions are disjoint.
    ASSERT_GE(static_cast<char*>(ptrs[i]) - static_cast<char*>(ptrs[i - 1]),
              req_size);
  }

  for (size_t i = 0; i < ptrs.size(); i++) {
    a.DeallocateRaw(ptrs[i]);
  }
  // After freeing everything: zero bytes in use, but the historical
  // high-water marks are unchanged.
  CheckStats(&a, 1023, 0, 654336, 1024);
}

// Mixed alloc/free workload with randomized sizes, including a deliberate
// out-of-memory request, verifying the allocator stays consistent.
TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  // Allocate 256 raw pointers of sizes between 100 bytes and about
  // a meg
  random::PhiloxRandom philox(123, 17);  // fixed seed => deterministic test
  random::SimplePhilox rand(&philox);

  std::vector<void*> initial_ptrs;
  for (int s = 1; s < 256; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576);
    void* raw = a.AllocateRaw(1, size);

    initial_ptrs.push_back(raw);
  }

  // Deallocate half of the memory, and keep track of the others.
  std::vector<void*> existing_ptrs;
  for (size_t i = 0; i < initial_ptrs.size(); i++) {
    if (i % 2 == 1) {
      a.DeallocateRaw(initial_ptrs[i]);
    } else {
      existing_ptrs.push_back(initial_ptrs[i]);
    }
  }

  // Ensure out of memory errors work and do not prevent future allocations
  // from working.  (1 << 30) + 1 exceeds the 1 GiB pool configured above.
  void* out_of_memory_ptr = a.AllocateRaw(1, (1 << 30) + 1);
  CHECK_EQ(out_of_memory_ptr, nullptr);

  // Allocate a lot of raw pointers
  for (int s = 1; s < 256; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576);
    void* raw = a.AllocateRaw(1, size);
    existing_ptrs.push_back(raw);
  }

  std::sort(existing_ptrs.begin(), existing_ptrs.end());
  // Make sure none of them are equal
  for (size_t i = 1; i < existing_ptrs.size(); i++) {
    CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]);  // No dups

    size_t req_size = a.RequestedSize(existing_ptrs[i - 1]);
    ASSERT_GT(req_size, 0);

    // Check that they don't overlap.
    ASSERT_GE(static_cast<char*>(existing_ptrs[i]) -
                  static_cast<char*>(existing_ptrs[i - 1]),
              req_size);
  }

  for (size_t i = 0; i < existing_ptrs.size(); i++) {
    a.DeallocateRaw(existing_ptrs[i]);
  }
}

// Repeatedly allocates and frees a fixed set of buffers and verifies that
// freed chunks coalesce back into one region, so the pool neither grows nor
// fragments across iterations.
TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  CheckStats(&a, 0, 0, 0, 0);

  // Remember where the first allocation lands so we can check, at the end,
  // that a fresh allocation reuses the very same (coalesced) region.
  float* first_ptr = a.Allocate<float>(1024);
  a.DeallocateRaw(first_ptr);
  CheckStats(&a, 1, 0, 4096, 4096);  // 1024 floats == 4096 bytes
  for (int i = 0; i < 1024; ++i) {
    // Allocate several buffers of different sizes, and then clean them
    // all up.  We should be able to repeat this endlessly without
    // causing fragmentation and growth.
    float* t1 = a.Allocate<float>(1024);

    int64* t2 = a.Allocate<int64>(1048576);
    double* t3 = a.Allocate<double>(2048);
    float* t4 = a.Allocate<float>(10485760);

    a.DeallocateRaw(t1);
    a.DeallocateRaw(t2);
    a.DeallocateRaw(t3);
    a.DeallocateRaw(t4);
  }
  // 4097 = the initial allocation plus 4 per loop iteration; the byte
  // expectations are the element counts scaled by element size.
  CheckStats(&a, 4097, 0,
             1024 * sizeof(float) + 1048576 * sizeof(int64) +
                 2048 * sizeof(double) + 10485760 * sizeof(float),
             10485760 * sizeof(float));

  // At the end, we should have coalesced all memory into one region
  // starting at the beginning, so validate that allocating a pointer
  // starts from this region.
  float* first_ptr_after = a.Allocate<float>(1024);
  EXPECT_EQ(first_ptr, first_ptr_after);
  a.DeallocateRaw(first_ptr_after);
}

// A zero-element request yields nullptr rather than a valid allocation.
TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  float* ptr = a.Allocate<float>(0);
  EXPECT_EQ(nullptr, ptr);
}

// The BFC allocator advertises that it tracks per-allocation sizes,
// which is what makes RequestedSize/AllocatedSize usable in other tests.
TEST(GPUBFCAllocatorTest, TracksSizes) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  EXPECT_EQ(true, a.TracksAllocationSizes());
}

// RequestedSize reports what the caller asked for; AllocatedSize reports
// the (rounded-up) chunk actually handed out.
TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  float* t1 = a.Allocate<float>(1);
  EXPECT_EQ(4, a.RequestedSize(t1));    // one float
  EXPECT_EQ(256, a.AllocatedSize(t1));  // minimum chunk granularity
  a.DeallocateRaw(t1);
}

// A request larger than the configured pool fails cleanly with nullptr
// while smaller requests still succeed.
TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
  // Configure a 1MiB byte limit
  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");

  float* first_ptr = a.Allocate<float>(1 << 6);
  float* second_ptr = a.Allocate<float>(1 << 20);  // 4 MiB of floats > limit

  EXPECT_NE(nullptr, first_ptr);
  EXPECT_EQ(nullptr, second_ptr);
  a.DeallocateRaw(first_ptr);
}

// Same mixed workload as AllocationsAndDeallocations, but with
// allow_growth=true so the allocator starts small and extends its
// region(s) on demand instead of grabbing the full limit up front.
TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
  GPUOptions options;
  options.set_allow_growth(true);

  // Max of 2GiB, but starts out small.
  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");

  // Allocate 10 raw pointers of sizes between 100 bytes and about
  // 64 megs.
  random::PhiloxRandom philox(123, 17);  // fixed seed => deterministic test
  random::SimplePhilox rand(&philox);

  const int32 max_mem = 1 << 27;

  std::vector<void*> initial_ptrs;
  for (int s = 1; s < 10; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % max_mem, 100), max_mem);
    void* raw = a.AllocateRaw(1, size);

    initial_ptrs.push_back(raw);
  }

  // Deallocate half of the memory, and keep track of the others.
  std::vector<void*> existing_ptrs;
  for (size_t i = 0; i < initial_ptrs.size(); i++) {
    if (i % 2 == 1) {
      a.DeallocateRaw(initial_ptrs[i]);
    } else {
      existing_ptrs.push_back(initial_ptrs[i]);
    }
  }

  const int32 max_mem_2 = 1 << 26;
  // Allocate a lot of raw pointers between 100 bytes and 64 megs.
  for (int s = 1; s < 10; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % max_mem_2, 100), max_mem_2);
    void* raw = a.AllocateRaw(1, size);
    existing_ptrs.push_back(raw);
  }

  std::sort(existing_ptrs.begin(), existing_ptrs.end());
  // Make sure none of them are equal
  for (size_t i = 1; i < existing_ptrs.size(); i++) {
    CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]);  // No dups

    size_t req_size = a.RequestedSize(existing_ptrs[i - 1]);
    ASSERT_GT(req_size, 0);

    // Check that they don't overlap.
    ASSERT_GE(static_cast<char*>(existing_ptrs[i]) -
                  static_cast<char*>(existing_ptrs[i - 1]),
              req_size);
  }

  for (size_t i = 0; i < existing_ptrs.size(); i++) {
    a.DeallocateRaw(existing_ptrs[i]);
  }

  AllocatorStats stats;
  a.GetStats(&stats);
  LOG(INFO) << "Alloc stats: \n" << stats.DebugString();
}

// NOTE(review): disabled test. It constructs two allocators over the same
// GPU with an effectively unbounded (1 << 60) limit; presumably it probes
// behavior when one allocator has starved the other of memory — confirm
// intent before re-enabling.
TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
  void* amem = a.AllocateRaw(1, 1);
  void* bmem = b.AllocateRaw(1, 1 << 30);
  a.DeallocateRaw(amem);
  b.DeallocateRaw(bmem);
}

// Benchmark: single-threaded alloc/free round trips over a spread of sizes
// from 256 bytes up to ~2 GB.
static void BM_Allocation(int iters) {
  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
  // Exercise a few different allocation sizes
  std::vector<size_t> sizes = {256,     4096,      16384,    524288,
                               512,     1048576,   10485760, 104857600,
                               1048576000, 2048576000};
  int size_index = 0;

  while (--iters > 0) {
    size_t bytes = sizes[size_index++ % sizes.size()];
    void* p = a.AllocateRaw(1, bytes);
    a.DeallocateRaw(p);
  }
}
BENCHMARK(BM_Allocation);

// Benchmark: the same alloc/free loop driven concurrently from num_threads
// pool threads against a single shared allocator. The shared `count` tracks
// remaining iterations across all threads; the thread that drains it signals
// completion through the condvar so the main thread can return.
static void BM_AllocationThreaded(int iters, int num_threads) {
  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
  thread::ThreadPool pool(Env::Default(), "test", num_threads);
  std::atomic_int_fast32_t count(iters);
  mutex done_lock;
  condition_variable done;
  bool done_flag = false;

  for (int t = 0; t < num_threads; t++) {
    pool.Schedule([&a, &count, &done_lock, &done, &done_flag, iters]() {
      // Exercise a few different allocation sizes
      std::vector<int> sizes = {256, 4096,    16384,    524288,
                                512, 1048576, 10485760, 104857600};
      int size_index = 0;
      for (int i = 0; i < iters; i++) {
        int bytes = sizes[size_index++ % sizes.size()];
        void* p = a.AllocateRaw(1, bytes);
        a.DeallocateRaw(p);
        if (count.fetch_sub(1) == 1) {
          // This thread consumed the last global iteration: wake the
          // benchmark thread waiting below.
          mutex_lock l(done_lock);
          done_flag = true;
          done.notify_all();
          break;
        }
      }
    });
  }
  // Block until some worker observes the iteration count hit zero.
  mutex_lock l(done_lock);
  if (!done_flag) {
    done.wait(l);
  }
}
BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);

// A more complex benchmark that defers deallocation of an object for
// "delay" allocations.
static void BM_AllocationDelayed(int iters, int delay) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  // Exercise a few different allocation sizes
  std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
  int size_index = 0;

  // Ring buffer of `delay` outstanding pointers; each slot is freed just
  // before it is reused.
  std::vector<void*> ptrs;
  ptrs.reserve(delay);
  for (int i = 0; i < delay; i++) {
    ptrs.push_back(nullptr);
  }
  int pindex = 0;
  while (--iters > 0) {
    if (ptrs[pindex] != nullptr) {
      a.DeallocateRaw(ptrs[pindex]);
      ptrs[pindex] = nullptr;
    }
    int bytes = sizes[size_index++ % sizes.size()];
    void* p = a.AllocateRaw(1, bytes);
    ptrs[pindex] = p;
    pindex = (pindex + 1) % ptrs.size();
  }
  // Release whatever is still outstanding in the ring.
  for (int i = 0; i < ptrs.size(); i++) {
    if (ptrs[i] != nullptr) {
      a.DeallocateRaw(ptrs[i]);
    }
  }
}
BENCHMARK(BM_AllocationDelayed)->Arg(1)->Arg(10)->Arg(100)->Arg(1000);

}  // namespace

class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
 protected:
  // The following test methods are called from tests. The reason for this is
  // that this class is a friend class to BFCAllocator, but tests are not, so
  // only methods inside this class can access private members of BFCAllocator.

  // Exercises BFCAllocator's per-bin debug accounting: allocates two chunks
  // in each of the five smallest bins, checks the bin counters, then frees
  // one chunk per bin and re-checks (loosely, since freed chunks may
  // coalesce into larger bins).
  void TestBinDebugInfo() {
    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");

    // Two allocations per size class: 256, 512, 1024, 2048, 4096 bytes.
    std::vector<void*> initial_ptrs;
    std::vector<size_t> initial_ptrs_allocated_sizes;
    for (int i = 0; i < 5; i++) {
      for (int j = 0; j < 2; j++) {
        size_t size = 256 << i;
        void* raw = a.AllocateRaw(1, size);
        ASSERT_NE(raw, nullptr);
        initial_ptrs.push_back(raw);
        initial_ptrs_allocated_sizes.push_back(a.AllocatedSize(raw));
      }
    }

    // get_bin_debug_info() reads internal state, so take the allocator's
    // private lock (accessible because this class is a friend).
    std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins> bin_infos;
    {
      mutex_lock l(a.lock_);
      bin_infos = a.get_bin_debug_info();
    }

    for (int i = 0; i < BFCAllocator::kNumBins; i++) {
      const BFCAllocator::BinDebugInfo& bin_info = bin_infos[i];
      if (i < 5) {
        // Bins 0..4 each hold exactly the two chunks allocated above.
        const size_t requested_size = 2 * (256 << i);
        EXPECT_EQ(requested_size, a.RequestedSize(initial_ptrs[2 * i]) +
                                      a.RequestedSize(initial_ptrs[2 * i + 1]));
        size_t allocated_size = initial_ptrs_allocated_sizes[2 * i] +
                                initial_ptrs_allocated_sizes[2 * i + 1];
        EXPECT_EQ(bin_info.total_bytes_in_use, allocated_size);
        EXPECT_EQ(bin_info.total_bytes_in_bin, allocated_size);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, requested_size);
        EXPECT_EQ(bin_info.total_chunks_in_use, 2);
        EXPECT_EQ(bin_info.total_chunks_in_bin, 2);
      } else {
        // Larger bins are empty except the last, which holds the single
        // free chunk covering the remainder of the region.
        EXPECT_EQ(bin_info.total_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_chunks_in_use, 0);
        if (i == BFCAllocator::kNumBins - 1) {
          EXPECT_GT(bin_info.total_bytes_in_bin, 0);
          EXPECT_EQ(bin_info.total_chunks_in_bin, 1);
        } else {
          EXPECT_EQ(bin_info.total_bytes_in_bin, 0);
          EXPECT_EQ(bin_info.total_chunks_in_bin, 0);
        }
      }
    }

    // Free the second chunk of each pair (odd indices), leaving one live
    // chunk per small bin.
    for (size_t i = 1; i < initial_ptrs.size(); i += 2) {
      a.DeallocateRaw(initial_ptrs[i]);
      initial_ptrs[i] = nullptr;
    }
    {
      mutex_lock l(a.lock_);
      bin_infos = a.get_bin_debug_info();
    }
    for (int i = 0; i < BFCAllocator::kNumBins; i++) {
      const BFCAllocator::BinDebugInfo& bin_info = bin_infos[i];
      if (i < 5) {
        // We cannot assert the exact number of bytes or chunks in the bin,
        // because it depends on what chunks were coalesced.
        size_t requested_size = 256 << i;
        EXPECT_EQ(requested_size, a.RequestedSize(initial_ptrs[2 * i]));
        EXPECT_EQ(bin_info.total_bytes_in_use,
                  initial_ptrs_allocated_sizes[2 * i]);
        EXPECT_GE(bin_info.total_bytes_in_bin,
                  initial_ptrs_allocated_sizes[2 * i]);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, requested_size);
        EXPECT_EQ(bin_info.total_chunks_in_use, 1);
        EXPECT_GE(bin_info.total_chunks_in_bin, 1);
      } else {
        EXPECT_EQ(bin_info.total_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_chunks_in_use, 0);
      }
    }
  }

  // Spot-checks the private Log2FloorNonZeroSlow helper, including its
  // 0 -> -1 convention and values around power-of-two boundaries.
  void TestLog2FloorNonZeroSlow() {
    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
    EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
    EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
    EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
    EXPECT_EQ(1, a.Log2FloorNonZeroSlow(3));
    EXPECT_EQ(9, a.Log2FloorNonZeroSlow(1023));
    EXPECT_EQ(10, a.Log2FloorNonZeroSlow(1024));
    EXPECT_EQ(10, a.Log2FloorNonZeroSlow(1025));
  }
};

TEST_F(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }

TEST_F(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
  TestLog2FloorNonZeroSlow();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA