/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {
namespace {

// A helper class for profiling HLO in the course of GPU program execution.
// All of the profiling is guarded internally, so the caller does not need to
// sprinkle conditionals around its call sites.
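//
// Typical usage mirrors GpuExecutable::ExecuteThunks below: bracket each
// thunk launch with StartOperation()/FinishOperation(hlo), then call
// FinishExecution() once after the last thunk:
//
//   HloExecutionProfiler profiler(do_profile, profile, stream, computation);
//   for (Thunk* thunk : thunk_schedule->TotalOrder()) {
//     profiler.StartOperation();
//     // ... enqueue the thunk's work on a stream ...
//     profiler.FinishOperation(thunk->hlo_instruction());
//   }
//   profiler.FinishExecution();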
class HloExecutionProfiler {
 public:
  // If profiling is enabled, start an execution timer running.
  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
                                se::Stream* stream,
                                const HloComputation* computation)
      : do_profile_(do_profile),
        profile_(profile),
        stream_(stream),
        computation_(computation) {
    if (do_profile_) {
      clock_rate_ghz_ =
          stream->parent()->GetDeviceDescription().clock_rate_ghz();
      execution_timer_.reset(new se::Timer(stream->parent()));
      per_op_timer_.reset(new se::Timer(stream->parent()));
      stream->InitTimer(execution_timer_.get())
          .ThenStartTimer(execution_timer_.get());
      stream->InitTimer(per_op_timer_.get());
    }
  }

  // If profiling is enabled, sets the total cycle count on the profile from the
  // execution timer.
  void FinishExecution() {
    CHECK(!finished_execution_) << "Call FinishExecution only once!";
    finished_execution_ = true;
    if (do_profile_) {
      stream_->ThenStopTimer(execution_timer_.get());
      stream_->BlockHostUntilDone().IgnoreError();
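      // The timer reports nanoseconds; multiplying by the clock rate in GHz
      // (i.e. cycles per nanosecond) converts the elapsed time to cycles.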
      profile_->set_total_cycles_executed(
          *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
    }
  }

  // If profiling is enabled, starts the per-operation timer.
  void StartOperation() {
    if (do_profile_) {
      stream_->ThenStartTimer(per_op_timer_.get());
    }
  }

  // If profiling is enabled, stops the per-operation timer and records the time
  // that the hlo_instruction took to execute in the profile.
  void FinishOperation(const HloInstruction* hlo_instruction) {
    if (do_profile_) {
      stream_->ThenStopTimer(per_op_timer_.get());
      stream_->BlockHostUntilDone().IgnoreError();
      profile_->SetCyclesTakenBy(
          hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
    }
  }

 private:
  const bool do_profile_;
  double clock_rate_ghz_;
  HloExecutionProfile* profile_;
  se::Stream* stream_;
  const HloComputation* computation_;
  std::unique_ptr<se::Timer> execution_timer_;
  std::unique_ptr<se::Timer> per_op_timer_;
  bool finished_execution_ = false;
};

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(
    const string& ptx, const std::vector<uint8>& cubin,
    std::pair<int, int> compute_capability,
    std::unique_ptr<const ThunkSchedule> thunk_schedule,
    std::unique_ptr<const HloModule> hlo_module,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      ptx_(ptx),
      cubin_(cubin),
      compute_capability_(compute_capability),
      thunk_schedule_(std::move(thunk_schedule)),
      assignment_(std::move(assignment)) {}

Status GpuExecutable::ExecuteThunks(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done,
    HloExecutionProfile* hlo_execution_profile) {
  se::Stream* main_stream = run_options->stream();

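  // This executable's PTX/cubin was generated for `compute_capability_`, so
  // the device backing `main_stream` must report the same compute capability.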
  std::pair<int, int> stream_compute_capability;
  main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
      &stream_compute_capability.first, &stream_compute_capability.second);
  TF_RET_CHECK(stream_compute_capability == compute_capability_)
      << "Compute capability mismatch; expected {" << compute_capability_.first
      << ", " << compute_capability_.second << "}, but was {"
      << stream_compute_capability.first << ", "
      << stream_compute_capability.second << "}";

  bool do_profile = hlo_execution_profile != nullptr;
  if (do_profile) {
    LOG(WARNING) << "PROFILING: profiling is enabled";
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                hlo_module_->entry_computation());

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
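  // `StreamCount()` counts `main_stream` as stream 0, so only
  // `StreamCount() - 1` substreams need to be borrowed via
  // run_options->BorrowStream().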
  std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
  while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(
        sub_streams.back(),
        run_options->BorrowStream(main_stream->parent()->device_ordinal()));
  }

  // The next event enqueued on stream N must not run until the thunk at
  // last_blocking_thunk_for_stream[N] completes.
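  // `thunk_to_finish_event` maps each thunk that other thunks must wait on to
  // the se::Event recorded on its stream when it finishes.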
  std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    TF_RETURN_IF_ERROR(thunk->Initialize(*this));
    int32 stream_no =
        thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    if (last_blocking_thunk_for_stream.count(stream_no)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event,
                                    last_blocking_thunk_for_stream[stream_no])
                              .get());
      last_blocking_thunk_for_stream.erase(stream_no);
    }

    // If this thunk requests it, wait for all currently-executing thunks to
    // finish.  This is useful e.g. if the thunk is about to perform autotuning.
    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
      TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
      last_blocking_thunk_for_stream.clear();
    }

    profiler.StartOperation();
    VLOG(2) << "Executing the thunk for "
            << thunk->hlo_instruction()->ToString() << " on stream "
            << stream_no;
    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
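    // If other thunks will need to wait for this one (either because they
    // explicitly depend on it or because it blocks future thunks), record an
    // event on its stream that they can wait on.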
    if (thunk_schedule_->Depended(thunk) || thunk->ShouldBlockFutureThunks()) {
      auto finish_event = MakeUnique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk] = std::move(finish_event);

      if (thunk->ShouldBlockFutureThunks()) {
        // Set last_blocking_thunk_for_stream on all streams other than this one
        // so that all other streams will wait for this thunk to complete before
        // executing any events that occur later in the total order.
        for (int32 i = 0; i < sub_streams.size() + 1; ++i) {
          if (i != stream_no) {
            last_blocking_thunk_for_stream[i] = thunk;
          }
        }
      }
    }
    profiler.FinishOperation(thunk->hlo_instruction());
  }

  main_stream->ThenWaitFor(&sub_streams);
  // Make sure kernels are completed before deallocating temporary buffers.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (block_host_until_done) {
    Status block_status = main_stream->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          main_stream, block_status.error_message().c_str());
    }
  }

  profiler.FinishExecution();
  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  {
    tensorflow::mutex_lock lock(mutex_);
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
      execution_profile_.set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  DeviceMemoryAllocator* memory_allocator = run_options->allocator();

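  // If the root's points-to set is ambiguous, some element of the output
  // could have been produced by more than one instruction, so we would not
  // know which buffer to wire into the result below.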
  if (GetRootPointsToSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  BufferAllocations::Builder buffer_allocations_builder;
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    if (allocation.is_entry_computation_parameter()) {
      // The caller must give us a buffer for ShapeIndex {} of every parameter.
      // It can optionally give us a buffer for other ShapeIndices, but we
      // ignore them: Because we can't rely on these sub-buffers' addresses
      // being available, our generated code can't use them.  Instead, it must
      // chase pointers starting at the tuple root.
      if (allocation.param_shape_index().empty()) {
        auto param_no = allocation.parameter_number();
        buffer_allocations_builder.RegisterBuffer(
            i, arguments[param_no]->root_buffer());
      }
    }
  }
  se::StreamExecutor* executor = run_options->stream()->parent();
  TF_ASSIGN_OR_RETURN(
      auto buffer_allocations,
      buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(),
                                       memory_allocator));

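  // If the allocator cannot defer deallocation until the device has finished
  // with the buffers, ExecuteThunks must block the host before anything is
  // torn down.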
  bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();
  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  auto device_ordinal = executor->device_ordinal();
  auto shaped_buffer = MakeUnique<ShapedBuffer>(
      root->shape(), root->shape(), executor->platform(), device_ordinal);

  // Copy the DeviceMemoryBase values holding the result's array(s) into their
  // respective locations in the ShapedBuffer.
  std::set<se::DeviceMemoryBase> buffers_in_result;
  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
      [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
        const auto& sources = this->GetRootPointsToSet().element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.size());
        auto src_hlo = sources[0]->instruction();

        VLOG(4) << "Looking at: " << sources[0];

        // The source instruction should have a non-parameter buffer
        // assigned.
        TF_ASSIGN_OR_RETURN(
            const BufferAllocation::Slice slice,
            this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
        CHECK(!slice.allocation()->is_entry_computation_parameter());

        perftools::gputools::DeviceMemoryBase src_base =
            buffer_allocations->GetDeviceAddress(slice.index());
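        // A null base address is only legitimate for a zero-sized buffer.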
        CHECK(!src_base.is_null() || src_base.size() == 0);
        *device_memory = src_base;
        buffers_in_result.insert(src_base);
        return Status::OK();
      }));
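  // Tear down the temporary buffers now that execution has finished; the
  // addresses in `buffers_in_result` are aliased into the returned
  // ShapedBuffer, so TearDown is expected to leave them alive.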
  TF_RETURN_IF_ERROR(
      buffer_allocations->TearDown(buffers_in_result, *assignment_));

  return std::move(shaped_buffer);
}

StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
  // TODO(b/30671675): Implement asynchronous execution mode.
  return Unimplemented(
      "Asynchronous execution on stream is not yet supported on GPU.");
}

const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
  return assignment_->points_to_analysis().GetPointsToSet(
      module().entry_computation()->root_instruction());
}

}  // namespace gpu
}  // namespace xla