/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/ptr_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"

namespace se = ::perftools::gputools;

namespace xla {
namespace gpu {
namespace {

// A helper class for profiling HLO in the course of GPU program execution.
// All of the profiling is guarded internally, to avoid the caller needing to
// have lots of conditionals sprinkled around.
class HloExecutionProfiler {
 public:
  // If profiling is enabled, start an execution timer running.
  explicit HloExecutionProfiler(bool do_profile, HloExecutionProfile* profile,
                                se::Stream* stream,
                                const HloComputation* computation)
      : do_profile_(do_profile),
        profile_(profile),
        stream_(stream),
        computation_(computation) {
    if (do_profile_) {
      clock_rate_ghz_ =
          stream->parent()->GetDeviceDescription().clock_rate_ghz();
      execution_timer_.reset(new se::Timer(stream->parent()));
      per_op_timer_.reset(new se::Timer(stream->parent()));
      stream->InitTimer(execution_timer_.get())
          .ThenStartTimer(execution_timer_.get());
      stream->InitTimer(per_op_timer_.get());
    }
  }

  // If profiling is enabled, sets the total cycle count on the profile from
  // the execution timer.
  void FinishExecution() {
    CHECK(!finished_execution_) << "Call FinishExecution only once!";
    finished_execution_ = true;
    if (do_profile_) {
      stream_->ThenStopTimer(execution_timer_.get());
      stream_->BlockHostUntilDone().IgnoreError();
      profile_->set_total_cycles_executed(
          *computation_, execution_timer_->Nanoseconds() * clock_rate_ghz_);
    }
  }

  // If profiling is enabled, starts the per-operation timer.
  void StartOperation() {
    if (do_profile_) {
      stream_->ThenStartTimer(per_op_timer_.get());
    }
  }

  // If profiling is enabled, stops the per-operation timer and records the
  // time that the hlo_instruction took to execute in the profile.
  void FinishOperation(const HloInstruction* hlo_instruction) {
    if (do_profile_) {
      stream_->ThenStopTimer(per_op_timer_.get());
      stream_->BlockHostUntilDone().IgnoreError();
      profile_->SetCyclesTakenBy(
          hlo_instruction, per_op_timer_->Nanoseconds() * clock_rate_ghz_);
    }
  }

 private:
  const bool do_profile_;
  double clock_rate_ghz_;
  HloExecutionProfile* profile_;
  se::Stream* stream_;
  const HloComputation* computation_;
  std::unique_ptr<se::Timer> execution_timer_;
  std::unique_ptr<se::Timer> per_op_timer_;
  bool finished_execution_ = false;
};

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks.
GpuExecutable::GpuExecutable(
    const string& ptx, const std::vector<uint8>& cubin,
    std::pair<int, int> compute_capability,
    std::unique_ptr<const ThunkSchedule> thunk_schedule,
    std::unique_ptr<const HloModule> hlo_module,
    std::unique_ptr<const BufferAssignment> assignment,
    std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
    std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
    : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
                 std::move(hlo_profile_index_map)),
      ptx_(ptx),
      cubin_(cubin),
      compute_capability_(compute_capability),
      thunk_schedule_(std::move(thunk_schedule)),
      assignment_(std::move(assignment)) {}

Status GpuExecutable::ExecuteThunks(
    const ServiceExecutableRunOptions* run_options,
    const BufferAllocations& buffer_allocations, bool block_host_until_done,
    HloExecutionProfile* hlo_execution_profile) {
  se::Stream* main_stream = run_options->stream();

  std::pair<int, int> stream_compute_compatibility;
  main_stream->parent()->GetDeviceDescription().cuda_compute_capability(
      &stream_compute_compatibility.first,
      &stream_compute_compatibility.second);
  TF_RET_CHECK(stream_compute_compatibility == compute_capability_)
      << "Compute capability mismatch; expected {" << compute_capability_.first
      << ", " << compute_capability_.second << "}, but was {"
      << stream_compute_compatibility.first << ", "
      << stream_compute_compatibility.second << "}";

  bool do_profile = hlo_execution_profile != nullptr;
  if (do_profile) {
    LOG(WARNING) << "PROFILING: profiling is enabled";
  }

  HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream,
                                hlo_module_->entry_computation());

  uint64 start_micros = tensorflow::Env::Default()->NowMicros();

  // Stream 0 indicates `main_stream` and substreams start from stream 1.
  std::vector<Pool<se::Stream>::SmartPtr> sub_streams;
  while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) {
    sub_streams.emplace_back();
    TF_ASSIGN_OR_RETURN(
        sub_streams.back(),
        run_options->BorrowStream(main_stream->parent()->device_ordinal()));
  }

  // The next event enqueued on stream N must not run until the thunk at
  // last_blocking_thunk_for_stream[N] completes.
  std::map<int32, const Thunk*> last_blocking_thunk_for_stream;
  std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event;
  for (Thunk* thunk : thunk_schedule_->TotalOrder()) {
    TF_RETURN_IF_ERROR(thunk->Initialize(*this));
    int32 stream_no =
        thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction());
    se::Stream* stream =
        (stream_no == 0 ? main_stream : sub_streams[stream_no - 1].get());

    for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get());
    }

    if (last_blocking_thunk_for_stream.count(stream_no)) {
      stream->ThenWaitFor(FindOrDie(thunk_to_finish_event,
                                    last_blocking_thunk_for_stream[stream_no])
                              .get());
      last_blocking_thunk_for_stream.erase(stream_no);
    }

    // If this thunk requests it, wait for all currently-executing thunks to
    // finish. This is useful e.g. if the thunk is about to perform autotuning.
    if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) {
      TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone());
      last_blocking_thunk_for_stream.clear();
    }

    profiler.StartOperation();
    VLOG(2) << "Executing the thunk for "
            << thunk->hlo_instruction()->ToString() << " on stream "
            << stream_no;
    TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream));
    if (thunk_schedule_->Depended(thunk) || thunk->ShouldBlockFutureThunks()) {
      auto finish_event = MakeUnique<se::Event>(main_stream->parent());
      finish_event->Init();
      stream->ThenRecordEvent(finish_event.get());
      thunk_to_finish_event[thunk] = std::move(finish_event);

      if (thunk->ShouldBlockFutureThunks()) {
        // Set last_blocking_thunk_for_stream on all streams other than this
        // one so that all other streams will wait for this thunk to complete
        // before executing any events that occur later in the total order.
        for (int32 i = 0; i < sub_streams.size() + 1; ++i) {
          if (i != stream_no) {
            last_blocking_thunk_for_stream[i] = thunk;
          }
        }
      }
    }
    profiler.FinishOperation(thunk->hlo_instruction());
  }

  main_stream->ThenWaitFor(&sub_streams);
  // Make sure kernels are completed before deallocating temporary buffers.
  // TODO(b/30100571): we could potentially postpone deallocating the temp
  // buffers until a different computation is executed.
  if (block_host_until_done) {
    Status block_status = main_stream->BlockHostUntilDone();
    if (!block_status.ok()) {
      return InternalError(
          "Failed to complete all kernels launched on stream %p: %s",
          main_stream, block_status.error_message().c_str());
    }
  }

  profiler.FinishExecution();
  uint64 end_micros = tensorflow::Env::Default()->NowMicros();

  {
    tensorflow::mutex_lock lock(mutex_);
    const double nanoseconds = (end_micros - start_micros) * 1000.0;
    execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0));

    // If hlo profiling was disabled then the cycle count is left empty.
    if (do_profile) {
      execution_profile_.set_compute_cycle_count(
          hlo_execution_profile->total_cycles_executed(
              *module().entry_computation()));
    }
  }

  return Status::OK();
}

StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
    HloExecutionProfile* hlo_execution_profile) {
  DeviceMemoryAllocator* memory_allocator = run_options->allocator();

  if (GetRootPointsToSet().IsAmbiguous()) {
    return Unimplemented("Points-to set of root instruction is ambiguous");
  }

  BufferAllocations::Builder buffer_allocations_builder;
  for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size();
       ++i) {
    const BufferAllocation& allocation = assignment_->GetAllocation(i);
    if (allocation.is_entry_computation_parameter()) {
      // The caller must give us a buffer for ShapeIndex {} of every parameter.
      // It can optionally give us a buffer for other ShapeIndices, but we
      // ignore them: Because we can't rely on these sub-buffers' addresses
      // being available, our generated code can't use them. Instead, it must
      // chase pointers starting at the tuple root.
      if (allocation.param_shape_index().empty()) {
        auto param_no = allocation.parameter_number();
        buffer_allocations_builder.RegisterBuffer(
            i, arguments[param_no]->root_buffer());
      }
    }
  }
  se::StreamExecutor* executor = run_options->stream()->parent();
  TF_ASSIGN_OR_RETURN(
      auto buffer_allocations,
      buffer_allocations_builder.Build(*assignment_,
                                       executor->device_ordinal(),
                                       memory_allocator));

  bool block_host_until_done =
      !memory_allocator->AllowsAsynchronousDeallocation();
  TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations,
                                   block_host_until_done,
                                   hlo_execution_profile));

  HloInstruction* root = hlo_module_->entry_computation()->root_instruction();
  auto device_ordinal = executor->device_ordinal();
  auto shaped_buffer = MakeUnique<ShapedBuffer>(
      root->shape(), root->shape(), executor->platform(), device_ordinal);

  // Copy DeviceMemoryBase values which contain the array(s) of the result into
  // the respective location in ShapedBuffer.
  std::set<se::DeviceMemoryBase> buffers_in_result;
  TF_RETURN_IF_ERROR(shaped_buffer->buffers().ForEachMutableElementWithStatus(
      [&buffer_allocations, &buffers_in_result, &shaped_buffer, this](
          const ShapeIndex& index, se::DeviceMemoryBase* device_memory) {
        const auto& sources = this->GetRootPointsToSet().element(index);
        // The points-to set is unambiguous so the set should be a
        // singleton. That is, we know exactly which instruction
        // produced the array at this element.
        CHECK_EQ(1, sources.size());
        auto src_hlo = sources[0]->instruction();

        VLOG(4) << "Looking at: " << sources[0];

        // The source instruction should have a non-parameter buffer
        // assigned.
        TF_ASSIGN_OR_RETURN(
            const BufferAllocation::Slice slice,
            this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index()));
        CHECK(!slice.allocation()->is_entry_computation_parameter());

        perftools::gputools::DeviceMemoryBase src_base =
            buffer_allocations->GetDeviceAddress(slice.index());
        CHECK(!src_base.is_null() || src_base.size() == 0);
        *device_memory = src_base;
        buffers_in_result.insert(src_base);
        return Status::OK();
      }));
  TF_RETURN_IF_ERROR(
      buffer_allocations->TearDown(buffers_in_result, *assignment_));

  return std::move(shaped_buffer);
}

StatusOr<std::unique_ptr<ShapedBuffer>> GpuExecutable::ExecuteAsyncOnStream(
    const ServiceExecutableRunOptions* run_options,
    tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
  // TODO(b/30671675): Implement asynchronous execution mode.
  return Unimplemented(
      "Asynchronous execution on stream is not yet supported on GPU.");
}

const PointsToSet& GpuExecutable::GetRootPointsToSet() const {
  return assignment_->points_to_analysis().GetPointsToSet(
      module().entry_computation()->root_instruction());
}

}  // namespace gpu
}  // namespace xla