/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#include "tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.h"

#include <algorithm>
#include <array>
#include <vector>

#include "tensorflow/core/common_runtime/gpu/gpu_id.h"
#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/random/simple_philox.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {
namespace {

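// Verifies that the allocator's stats match the expected allocation count,
// current bytes in use, peak bytes in use, and largest single allocation.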
static void CheckStats(Allocator* a, int64 num_allocs, int64 bytes_in_use,
                       int64 max_bytes_in_use, int64 max_alloc_size) {
  AllocatorStats stats;
  a->GetStats(&stats);
  LOG(INFO) << "Alloc stats:\n" << stats.DebugString();
  EXPECT_EQ(stats.bytes_in_use, bytes_in_use);
  EXPECT_EQ(stats.max_bytes_in_use, max_bytes_in_use);
  EXPECT_EQ(stats.num_allocs, num_allocs);
  EXPECT_EQ(stats.max_alloc_size, max_alloc_size);
}

TEST(GPUBFCAllocatorTest, NoDups) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  CheckStats(&a, 0, 0, 0, 0);

  // Allocate a lot of raw pointers
  std::vector<void*> ptrs;
  for (int s = 1; s < 1024; s++) {
    void* raw = a.AllocateRaw(1, s);
    ptrs.push_back(raw);
  }
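  // 1023 allocations were requested. BFC rounds each request up to a multiple
  // of its 256-byte minimum allocation size, so bytes_in_use is the sum of
  // the rounded sizes for s = 1..1023 (654336), and the largest rounded
  // request (s = 1023) is 1024 bytes.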
  CheckStats(&a, 1023, 654336, 654336, 1024);

  std::sort(ptrs.begin(), ptrs.end());

  // Make sure none of them are equal, and that none of them overlap.
  for (size_t i = 1; i < ptrs.size(); i++) {
    ASSERT_NE(ptrs[i], ptrs[i - 1]);  // No dups
    size_t req_size = a.RequestedSize(ptrs[i - 1]);
    ASSERT_GT(req_size, 0);
    ASSERT_GE(static_cast<char*>(ptrs[i]) - static_cast<char*>(ptrs[i - 1]),
              req_size);
  }

  for (size_t i = 0; i < ptrs.size(); i++) {
    a.DeallocateRaw(ptrs[i]);
  }
  CheckStats(&a, 1023, 0, 654336, 1024);
}

TEST(GPUBFCAllocatorTest, AllocationsAndDeallocations) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  // Allocate 255 raw pointers of sizes between 100 bytes and about
  // a meg.
  random::PhiloxRandom philox(123, 17);
  random::SimplePhilox rand(&philox);

  std::vector<void*> initial_ptrs;
  for (int s = 1; s < 256; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576);
    void* raw = a.AllocateRaw(1, size);

    initial_ptrs.push_back(raw);
  }

  // Deallocate half of the memory, and keep track of the rest.
  std::vector<void*> existing_ptrs;
  for (size_t i = 0; i < initial_ptrs.size(); i++) {
    if (i % 2 == 1) {
      a.DeallocateRaw(initial_ptrs[i]);
    } else {
      existing_ptrs.push_back(initial_ptrs[i]);
    }
  }

  // Ensure that a failed (out-of-memory) allocation returns nullptr and does
  // not prevent future allocations from succeeding.
  void* out_of_memory_ptr = a.AllocateRaw(1, (1 << 30) + 1);
  CHECK_EQ(out_of_memory_ptr, nullptr);

  // Allocate a lot of raw pointers
  for (int s = 1; s < 256; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % 1048576, 100), 1048576);
    void* raw = a.AllocateRaw(1, size);
    existing_ptrs.push_back(raw);
  }

  std::sort(existing_ptrs.begin(), existing_ptrs.end());
  // Make sure none of them are equal
  for (size_t i = 1; i < existing_ptrs.size(); i++) {
    CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]);  // No dups

    size_t req_size = a.RequestedSize(existing_ptrs[i - 1]);
    ASSERT_GT(req_size, 0);

    // Check that they don't overlap.
    ASSERT_GE(static_cast<char*>(existing_ptrs[i]) -
                  static_cast<char*>(existing_ptrs[i - 1]),
              req_size);
  }

  for (size_t i = 0; i < existing_ptrs.size(); i++) {
    a.DeallocateRaw(existing_ptrs[i]);
  }
}

TEST(GPUBFCAllocatorTest, ExerciseCoalescing) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  CheckStats(&a, 0, 0, 0, 0);

  float* first_ptr = a.Allocate<float>(1024);
  a.DeallocateRaw(first_ptr);
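  // One allocation so far: 1024 floats = 4096 bytes. It has been freed, so
  // current usage is 0 while peak usage and max allocation size are 4096.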
  CheckStats(&a, 1, 0, 4096, 4096);
  for (int i = 0; i < 1024; ++i) {
    // Allocate several buffers of different sizes, and then clean them
    // all up.  We should be able to repeat this endlessly without
    // causing fragmentation and growth.
    float* t1 = a.Allocate<float>(1024);

    int64* t2 = a.Allocate<int64>(1048576);
    double* t3 = a.Allocate<double>(2048);
    float* t4 = a.Allocate<float>(10485760);

    a.DeallocateRaw(t1);
    a.DeallocateRaw(t2);
    a.DeallocateRaw(t3);
    a.DeallocateRaw(t4);
  }
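  // 4097 allocations in total (1 + 4 * 1024). Peak usage is the moment all
  // four buffers were live at once; the largest single allocation is the
  // 10485760-float buffer.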
  CheckStats(&a, 4097, 0,
             1024 * sizeof(float) + 1048576 * sizeof(int64) +
                 2048 * sizeof(double) + 10485760 * sizeof(float),
             10485760 * sizeof(float));

  // At the end, we should have coalesced all memory into one region
  // starting at the beginning, so validate that allocating a pointer
  // starts from this region.
  float* first_ptr_after = a.Allocate<float>(1024);
  EXPECT_EQ(first_ptr, first_ptr_after);
  a.DeallocateRaw(first_ptr_after);
}

TEST(GPUBFCAllocatorTest, AllocateZeroBufSize) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  float* ptr = a.Allocate<float>(0);
  EXPECT_EQ(nullptr, ptr);
}

TEST(GPUBFCAllocatorTest, TracksSizes) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  EXPECT_TRUE(a.TracksAllocationSizes());
}

TEST(GPUBFCAllocatorTest, AllocatedVsRequested) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  float* t1 = a.Allocate<float>(1);
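  // A single float is only a 4-byte request, but BFC pads each chunk up to
  // its 256-byte minimum allocation size.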
  EXPECT_EQ(4, a.RequestedSize(t1));
  EXPECT_EQ(256, a.AllocatedSize(t1));
  a.DeallocateRaw(t1);
}

TEST(GPUBFCAllocatorTest, TestCustomMemoryLimit) {
  // Configure a 1MiB limit.
  GPUBFCAllocator a(CudaGpuId(0), 1 << 20, "GPU_0_bfc");

  float* first_ptr = a.Allocate<float>(1 << 6);
  float* second_ptr = a.Allocate<float>(1 << 20);

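  // The second request is (1 << 20) floats = 4MiB, which exceeds the 1MiB
  // limit, so it should fail while the small first request succeeds.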
  EXPECT_NE(nullptr, first_ptr);
  EXPECT_EQ(nullptr, second_ptr);
  a.DeallocateRaw(first_ptr);
}

TEST(GPUBFCAllocatorTest, AllocationsAndDeallocationsWithGrowth) {
  GPUOptions options;
  options.set_allow_growth(true);

  // Max of 2GiB, but starts out small.
  GPUBFCAllocator a(CudaGpuId(0), 1LL << 31, options, "GPU_0_bfc");

  // Allocate 9 raw pointers of sizes between 100 bytes and about
  // 128 megs.
  random::PhiloxRandom philox(123, 17);
  random::SimplePhilox rand(&philox);

  const int32 max_mem = 1 << 27;

  std::vector<void*> initial_ptrs;
  for (int s = 1; s < 10; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % max_mem, 100), max_mem);
    void* raw = a.AllocateRaw(1, size);

    initial_ptrs.push_back(raw);
  }

  // Deallocate half of the memory, and keep track of the rest.
  std::vector<void*> existing_ptrs;
  for (size_t i = 0; i < initial_ptrs.size(); i++) {
    if (i % 2 == 1) {
      a.DeallocateRaw(initial_ptrs[i]);
    } else {
      existing_ptrs.push_back(initial_ptrs[i]);
    }
  }

  const int32 max_mem_2 = 1 << 26;
  // Allocate more raw pointers with sizes between 100 bytes and 64 megs.
  for (int s = 1; s < 10; s++) {
    size_t size = std::min<size_t>(
        std::max<size_t>(rand.Rand32() % max_mem_2, 100), max_mem_2);
    void* raw = a.AllocateRaw(1, size);
    existing_ptrs.push_back(raw);
  }

  std::sort(existing_ptrs.begin(), existing_ptrs.end());
  // Make sure none of them are equal
  for (size_t i = 1; i < existing_ptrs.size(); i++) {
    CHECK_NE(existing_ptrs[i], existing_ptrs[i - 1]);  // No dups

    size_t req_size = a.RequestedSize(existing_ptrs[i - 1]);
    ASSERT_GT(req_size, 0);

    // Check that they don't overlap.
    ASSERT_GE(static_cast<char*>(existing_ptrs[i]) -
                  static_cast<char*>(existing_ptrs[i - 1]),
              req_size);
  }

  for (size_t i = 0; i < existing_ptrs.size(); i++) {
    a.DeallocateRaw(existing_ptrs[i]);
  }

  AllocatorStats stats;
  a.GetStats(&stats);
  LOG(INFO) << "Alloc stats:\n" << stats.DebugString();
}

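// NOTE: disabled. Both allocators claim the same device with an effectively
// unbounded (1UL << 60 bytes) limit, so one of them may be handed little or
// no memory; allocations from both should presumably still succeed. (Intent
// inferred from the test name.)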
TEST(GPUBFCAllocatorTest, DISABLED_AllocatorReceivesZeroMemory) {
  GPUBFCAllocator a(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
  GPUBFCAllocator b(CudaGpuId(0), 1UL << 60, "GPU_0_bfc");
  void* amem = a.AllocateRaw(1, 1);
  void* bmem = b.AllocateRaw(1, 1 << 30);
  a.DeallocateRaw(amem);
  b.DeallocateRaw(bmem);
}

static void BM_Allocation(int iters) {
  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
  // Exercise a few different allocation sizes
  std::vector<size_t> sizes = {256,        4096,      16384,    524288,
                               512,        1048576,   10485760, 104857600,
                               1048576000, 2048576000};
  int size_index = 0;

  while (--iters > 0) {
    size_t bytes = sizes[size_index++ % sizes.size()];
    void* p = a.AllocateRaw(1, bytes);
    a.DeallocateRaw(p);
  }
}
BENCHMARK(BM_Allocation);

static void BM_AllocationThreaded(int iters, int num_threads) {
  GPUBFCAllocator a(CudaGpuId(0), 1uLL << 33, "GPU_0_bfc");
  thread::ThreadPool pool(Env::Default(), "test", num_threads);
  std::atomic_int_fast32_t count(iters);
  mutex done_lock;
  condition_variable done;
  bool done_flag = false;

  for (int t = 0; t < num_threads; t++) {
    pool.Schedule([&a, &count, &done_lock, &done, &done_flag, iters]() {
      // Exercise a few different allocation sizes
      std::vector<int> sizes = {256, 4096,    16384,    524288,
                                512, 1048576, 10485760, 104857600};
      int size_index = 0;
      for (int i = 0; i < iters; i++) {
        int bytes = sizes[size_index++ % sizes.size()];
        void* p = a.AllocateRaw(1, bytes);
        a.DeallocateRaw(p);
        if (count.fetch_sub(1) == 1) {
          mutex_lock l(done_lock);
          done_flag = true;
          done.notify_all();
          break;
        }
      }
    });
  }
  mutex_lock l(done_lock);
  // Re-check the flag in a loop to guard against spurious wakeups.
  while (!done_flag) {
    done.wait(l);
  }
}
BENCHMARK(BM_AllocationThreaded)->Arg(1)->Arg(4)->Arg(16);


// A more complex benchmark that defers deallocation of an object for
// "delay" allocations.
static void BM_AllocationDelayed(int iters, int delay) {
  GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");
  // Exercise a few different allocation sizes
  std::vector<int> sizes = {256, 4096, 16384, 4096, 512, 1024, 1024};
  int size_index = 0;

  std::vector<void*> ptrs;
  ptrs.reserve(delay);
  for (int i = 0; i < delay; i++) {
    ptrs.push_back(nullptr);
  }
  int pindex = 0;
  while (--iters > 0) {
    if (ptrs[pindex] != nullptr) {
      a.DeallocateRaw(ptrs[pindex]);
      ptrs[pindex] = nullptr;
    }
    int bytes = sizes[size_index++ % sizes.size()];
    void* p = a.AllocateRaw(1, bytes);
    ptrs[pindex] = p;
    pindex = (pindex + 1) % ptrs.size();
  }
  for (size_t i = 0; i < ptrs.size(); i++) {
    if (ptrs[i] != nullptr) {
      a.DeallocateRaw(ptrs[i]);
    }
  }
}
BENCHMARK(BM_AllocationDelayed)->Arg(1)->Arg(10)->Arg(100)->Arg(1000);

}  // namespace

class GPUBFCAllocatorPrivateMethodsTest : public ::testing::Test {
 protected:
  // The test bodies below are defined as methods of this fixture because the
  // fixture is a friend of BFCAllocator while the TEST_Fs themselves are not;
  // only methods of this class can access BFCAllocator's private members.

  void TestBinDebugInfo() {
    GPUBFCAllocator a(CudaGpuId(0), 1 << 30, "GPU_0_bfc");

    std::vector<void*> initial_ptrs;
    std::vector<size_t> initial_ptrs_allocated_sizes;
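    // Allocate two chunks in each of the first five bins (a request of
    // 256 << i bytes falls in bin i).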
    for (int i = 0; i < 5; i++) {
      for (int j = 0; j < 2; j++) {
        size_t size = 256 << i;
        void* raw = a.AllocateRaw(1, size);
        ASSERT_NE(raw, nullptr);
        initial_ptrs.push_back(raw);
        initial_ptrs_allocated_sizes.push_back(a.AllocatedSize(raw));
      }
    }

    std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins> bin_infos;
    {
      mutex_lock l(a.lock_);
      bin_infos = a.get_bin_debug_info();
    }

    for (int i = 0; i < BFCAllocator::kNumBins; i++) {
      const BFCAllocator::BinDebugInfo& bin_info = bin_infos[i];
      if (i < 5) {
        const size_t requested_size = 2 * (256 << i);
        EXPECT_EQ(requested_size, a.RequestedSize(initial_ptrs[2 * i]) +
                                      a.RequestedSize(initial_ptrs[2 * i + 1]));
        size_t allocated_size = initial_ptrs_allocated_sizes[2 * i] +
                                initial_ptrs_allocated_sizes[2 * i + 1];
        EXPECT_EQ(bin_info.total_bytes_in_use, allocated_size);
        EXPECT_EQ(bin_info.total_bytes_in_bin, allocated_size);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, requested_size);
        EXPECT_EQ(bin_info.total_chunks_in_use, 2);
        EXPECT_EQ(bin_info.total_chunks_in_bin, 2);
      } else {
        EXPECT_EQ(bin_info.total_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_chunks_in_use, 0);
        if (i == BFCAllocator::kNumBins - 1) {
          EXPECT_GT(bin_info.total_bytes_in_bin, 0);
          EXPECT_EQ(bin_info.total_chunks_in_bin, 1);
        } else {
          EXPECT_EQ(bin_info.total_bytes_in_bin, 0);
          EXPECT_EQ(bin_info.total_chunks_in_bin, 0);
        }
      }
    }

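    // Free one of the two chunks in each bin.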
    for (size_t i = 1; i < initial_ptrs.size(); i += 2) {
      a.DeallocateRaw(initial_ptrs[i]);
      initial_ptrs[i] = nullptr;
    }
    {
      mutex_lock l(a.lock_);
      bin_infos = a.get_bin_debug_info();
    }
    for (int i = 0; i < BFCAllocator::kNumBins; i++) {
      const BFCAllocator::BinDebugInfo& bin_info = bin_infos[i];
      if (i < 5) {
        // We cannot assert the exact number of bytes or chunks in the bin,
        // because it depends on what chunks were coalesced.
        size_t requested_size = 256 << i;
        EXPECT_EQ(requested_size, a.RequestedSize(initial_ptrs[2 * i]));
        EXPECT_EQ(bin_info.total_bytes_in_use,
                  initial_ptrs_allocated_sizes[2 * i]);
        EXPECT_GE(bin_info.total_bytes_in_bin,
                  initial_ptrs_allocated_sizes[2 * i]);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, requested_size);
        EXPECT_EQ(bin_info.total_chunks_in_use, 1);
        EXPECT_GE(bin_info.total_chunks_in_bin, 1);
      } else {
        EXPECT_EQ(bin_info.total_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_requested_bytes_in_use, 0);
        EXPECT_EQ(bin_info.total_chunks_in_use, 0);
      }
    }
  }

  void TestLog2FloorNonZeroSlow() {
    GPUBFCAllocator a(CudaGpuId(0), 1 /* total_memory */, "GPU_0_bfc");
    EXPECT_EQ(-1, a.Log2FloorNonZeroSlow(0));
    EXPECT_EQ(0, a.Log2FloorNonZeroSlow(1));
    EXPECT_EQ(1, a.Log2FloorNonZeroSlow(2));
    EXPECT_EQ(1, a.Log2FloorNonZeroSlow(3));
    EXPECT_EQ(9, a.Log2FloorNonZeroSlow(1023));
    EXPECT_EQ(10, a.Log2FloorNonZeroSlow(1024));
    EXPECT_EQ(10, a.Log2FloorNonZeroSlow(1025));
  }
};

TEST_F(GPUBFCAllocatorPrivateMethodsTest, BinDebugInfo) { TestBinDebugInfo(); }

TEST_F(GPUBFCAllocatorPrivateMethodsTest, Log2FloorNonZeroSlow) {
  TestLog2FloorNonZeroSlow();
}

}  // namespace tensorflow

#endif  // GOOGLE_CUDA