/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"

#include <atomic>

#include "tensorflow/core/common_runtime/gpu/gpu_init.h"
#include "tensorflow/core/platform/stream_executor.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/protobuf/config.pb.h"

namespace gpu = ::perftools::gputools;

namespace tensorflow {

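// Test-only helper that exposes EventMgr internals (the used/free event
// queue sizes and manual polling) so the tests below can observe event
// accounting directly.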
class TEST_EventMgrHelper {
 public:
  explicit TEST_EventMgrHelper(EventMgr* em) : em_(em) {
    // The polling loop can interfere with the measurements made here, and
    // isn't needed since the member PollEvents() always clears the queue.
    // The tested behavior is slightly different from what may occur in
    // ordinary execution.
    StopPollingLoop();
  }

  size_t queue_size() {
    mutex_lock l(em_->mu_);
    return em_->used_events_.size();
  }

  size_t free_size() {
    mutex_lock l(em_->mu_);
    return em_->free_events_.size();
  }

  void QueueTensors(perftools::gputools::Stream* stream,
                    TensorReferenceVector* tensors) {
    mutex_lock l(em_->mu_);
    em_->QueueTensors(stream, tensors);
  }

  void PollEvents(bool is_dedicated_poller) {
    while (queue_size() > 0) {
      // For ordinary tensor frees, this function should synchronously
      // harvest all complete events and execute the corresponding memory
      // frees.
      EventMgr::ToFreeVector to_free;
      {
        mutex_lock l(em_->mu_);
        em_->PollEvents(is_dedicated_poller, &to_free);
      }
      em_->FreeMemory(to_free);
    }
  }

  void StopPollingLoop() { em_->StopPollingLoop(); }

  void StartPollingLoop() { em_->StartPollingLoop(); }

 private:
  EventMgr* em_;
};

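// Running total of bytes held by live TestTensorBuffers.  The tests below
// assert on this counter to detect when tensor memory is actually freed.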
static std::atomic_int_fast64_t live_tensor_bytes(0);

// A TensorBuffer that counts live memory usage for testing.
class TestTensorBuffer : public TensorBuffer {
 public:
  explicit TestTensorBuffer(size_t bytes) : bytes_(bytes) {
    live_tensor_bytes += bytes_;
  }
  ~TestTensorBuffer() override { live_tensor_bytes -= bytes_; }

  size_t size() const override { return bytes_; }

  // Not used in this test.
  void* data() const override { return nullptr; }
  TensorBuffer* root_buffer() override { return nullptr; }
  void FillAllocationDescription(AllocationDescription* arg) const override {}

 private:
  size_t bytes_;
};

namespace {

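// A freshly constructed EventMgr should have no used or free events.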
TEST(EventMgr, Empty) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, th.queue_size());
  EXPECT_EQ(0, th.free_size());
}

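// Appends to *v a TensorReference to a new counted TestTensorBuffer of
// `size` bytes.  Unref() drops the creation reference, so the
// TensorReference holds the only remaining ref to the buffer.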
static void AddTensorReference(TensorReferenceVector* v, int64 size) {
  TestTensorBuffer* buf = new TestTensorBuffer(size);
  v->push_back(TensorReference(buf));
  buf->Unref();
}

// Delaying polling until after several enqueuings should grow the
// total number of allocated events.  Once we have enough events for
// the maximum number simultaneously pending, we should not allocate any more.
TEST(EventMgr, DelayedPolling) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, th.queue_size());
  TensorReferenceVector* v = nullptr;
  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
  CHECK(stream.get());
  stream->Init();
  for (int i = 0; i < 5; ++i) {
    v = new TensorReferenceVector;
    AddTensorReference(v, 100 * 1048576);  // 100 MiB
    th.QueueTensors(stream.get(), v);
    EXPECT_EQ(i + 1, th.queue_size());
    EXPECT_EQ(0, th.free_size());
  }
  th.PollEvents(false);
  EXPECT_EQ(0, th.queue_size());
  EXPECT_EQ(5, th.free_size());
  // Subsequent rounds should reuse the five freed events rather than
  // allocating new ones.
  for (int j = 0; j < 2; ++j) {
    for (int i = 0; i < 5; ++i) {
      v = new TensorReferenceVector;
      AddTensorReference(v, 100 * 1048576);  // 100 MiB
      th.QueueTensors(stream.get(), v);
      EXPECT_EQ(i + 1, th.queue_size());
      EXPECT_EQ(4 - i, th.free_size());
    }
    th.PollEvents(false);
    EXPECT_EQ(0, th.queue_size());
    EXPECT_EQ(5, th.free_size());
  }
}

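// A single large tensor deleted via ThenDeleteTensors should be freed as
// soon as its event is polled, leaving no live test-buffer bytes behind.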
TEST(EventMgr, FlushLargeTensorImmediately) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, live_tensor_bytes);
  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
  CHECK(stream.get());
  stream->Init();
  for (int i = 0; i < 5; ++i) {
    TensorReferenceVector v;
    AddTensorReference(&v, 100 * 1048576);
    em.ThenDeleteTensors(stream.get(), v);
    th.PollEvents(false);  // Ensure things get registered to be freed by Poll
    EXPECT_EQ(0, live_tensor_bytes);
  }
}

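// A large batch of small tensors deleted in one ThenDeleteTensors call
// should likewise all be freed once the corresponding event is polled.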
TEST(EventMgr, ManySmallTensorsFlushedImmediately) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, live_tensor_bytes);
  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
  CHECK(stream.get());
  stream->Init();
  for (int i = 0; i < 5; ++i) {
    TensorReferenceVector v;
    for (int j = 0; j < 1000; ++j) {
      AddTensorReference(&v, 100 * 1024);
    }
    em.ThenDeleteTensors(stream.get(), v);
    th.PollEvents(false);  // Harvest the tensors ready to be freed.
    EXPECT_EQ(0, live_tensor_bytes);
  }
}

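// Queuing a deletion on a second stream should flush the tensor still
// pending on the first stream.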
TEST(EventMgr, StreamSwitchingFlushesImmediately) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, live_tensor_bytes);
  std::unique_ptr<gpu::Stream> stream1(new gpu::Stream(stream_exec));
  std::unique_ptr<gpu::Stream> stream2(new gpu::Stream(stream_exec));
  stream1->Init();
  stream2->Init();
  TensorReferenceVector v1;
  AddTensorReference(&v1, 1024);
  em.ThenDeleteTensors(stream1.get(), v1);

  TensorReferenceVector v2;
  AddTensorReference(&v2, 1024);
  int64 initial_live_bytes = live_tensor_bytes;
  em.ThenDeleteTensors(stream2.get(), v2);
  th.PollEvents(false);  // Ensure things get registered to be freed by Poll
  // Switching to a different stream should cause the first tensor to be
  // deleted, so live bytes should drop below the pre-poll total.
  EXPECT_GT(initial_live_bytes, live_tensor_bytes);
}

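// Many small tensors deleted through separate ThenDeleteTensors calls
// should still be flushed in batches rather than accumulating without
// bound.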
TEST(EventMgr, ManySmallTensorsSeparateCallsFlushed) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, live_tensor_bytes);
  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
  CHECK(stream.get());
  stream->Init();
  for (int i = 0; i < 5; ++i) {
    for (int j = 0; j < 1000; ++j) {
      TensorReferenceVector v;
      AddTensorReference(&v, 100 * 1024);
      em.ThenDeleteTensors(stream.get(), v);
    }
    th.PollEvents(false);  // Ensure things get registered to be freed by Poll
    // At least some of the tensors should have been flushed by now.
    EXPECT_GT(1000 * 100 * 1024, live_tensor_bytes);
  }
}

// Deleting the EventMgr when events are still pending should shut
// down gracefully.
TEST(EventMgr, NonEmptyShutdown) {
  auto stream_exec = GPUMachineManager()->ExecutorForDevice(0).ValueOrDie();
  EventMgr em(stream_exec, GPUOptions());
  TEST_EventMgrHelper th(&em);
  EXPECT_EQ(0, th.queue_size());
  EXPECT_EQ(0, th.free_size());
  std::unique_ptr<gpu::Stream> stream(new gpu::Stream(stream_exec));
  CHECK(stream.get());
  stream->Init();
  for (int i = 0; i < 5; ++i) {
    TensorReferenceVector* v = new TensorReferenceVector;
    AddTensorReference(v, 100 * 1048576);
    th.QueueTensors(stream.get(), v);
    EXPECT_EQ(1 + i, th.queue_size());
    EXPECT_EQ(0, th.free_size());
  }
}

}  // namespace
}  // namespace tensorflow

#endif  // GOOGLE_CUDA