/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h"

#include <set>
#include <utility>
#include <vector>

#include "absl/container/flat_hash_map.h"
#include "absl/memory/memory.h"
#include "tensorflow/compiler/xla/map_util.h"
#include "tensorflow/compiler/xla/service/gpu/buffer_allocations.h"
#include "tensorflow/compiler/xla/service/gpu/hlo_execution_profiler.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/transfer_manager.h"
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/util.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/tracing.h"
#include "tensorflow/core/platform/types.h"

namespace xla {
namespace gpu {
namespace {

using tensorflow::tracing::ScopedAnnotation;

}  // namespace

// Implementation note: HLO profiling is always enabled for GPU executables,
// since we can use timers around thunks. 50 GpuExecutable::GpuExecutable( 51 const string& ptx, const std::vector<uint8>& cubin, 52 std::pair<int, int> compute_capability, 53 std::unique_ptr<const ThunkSchedule> thunk_schedule, 54 std::unique_ptr<HloModule> hlo_module, 55 std::unique_ptr<const BufferAssignment> assignment, 56 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data, 57 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map) 58 : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data), 59 std::move(hlo_profile_index_map)), 60 ptx_(ptx), 61 cubin_(cubin), 62 compute_capability_(compute_capability), 63 thunk_schedule_(std::move(thunk_schedule)), 64 assignment_(std::move(assignment)) {} 65 66 Status GpuExecutable::ExecuteThunks( 67 const ServiceExecutableRunOptions* run_options, 68 const BufferAllocations& buffer_allocations, bool block_host_until_done, 69 HloExecutionProfile* hlo_execution_profile) { 70 se::Stream* main_stream = run_options->stream(); 71 se::StreamExecutor* executor = main_stream->parent(); 72 73 std::pair<int, int> stream_compute_compatibility; 74 executor->GetDeviceDescription().cuda_compute_capability( 75 &stream_compute_compatibility.first, 76 &stream_compute_compatibility.second); 77 TF_RET_CHECK(stream_compute_compatibility == compute_capability_) 78 << "Compute capability mismatch; expected {" << compute_capability_.first 79 << ", " << compute_capability_.second << "}, but was {" 80 << stream_compute_compatibility.first << ", " 81 << stream_compute_compatibility.second << "}"; 82 83 bool do_profile = hlo_execution_profile != nullptr; 84 if (do_profile) { 85 LOG(WARNING) << "PROFILING: profiling is enabled"; 86 } 87 88 // Stream 0 indicates `main_stream` and substreams start from stream 1. 
89 std::vector<StreamPool::Ptr> sub_streams; 90 sub_streams.reserve(thunk_schedule_->StreamCount() - 1); 91 while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) { 92 sub_streams.emplace_back(); 93 TF_ASSIGN_OR_RETURN(sub_streams.back(), 94 run_options->BorrowStream(executor->device_ordinal())); 95 } 96 97 HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, 98 sub_streams, hlo_module_->entry_computation()); 99 uint64 start_micros = tensorflow::Env::Default()->NowMicros(); 100 101 // This top-level trace serves two purposes: 102 // 1) It marks the scope of the whole XLA module. 103 // 2) It tells us whether tracing is enabled. We use this to avoid the 104 // expensive HloInstruction::ToString() calls inside the loop below if 105 // tracing is disabled. 106 ScopedAnnotation top_level_annotation(hlo_module_->name(), "XLA GPU module"); 107 108 std::map<const Thunk*, std::unique_ptr<se::Event>> thunk_to_finish_event; 109 for (Thunk* thunk : thunk_schedule_->TotalOrder()) { 110 // Annotate execution of this op if tracing was enabled when we started 111 // running this module. If tracing is enabled *while* we're running the 112 // module, we won't get any data, but that's probably an OK trade-off. 113 // 114 // TODO(jlebar): Should we cache the results of HloInstruction::ToString(), 115 // since we expect it to be an expensive call? 116 absl::optional<ScopedAnnotation> op_annotation; 117 if (top_level_annotation.IsEnabled()) { 118 op_annotation.emplace( 119 thunk->hlo_instruction() != nullptr 120 ? thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical()) 121 : "<unknown>", 122 "XLA op"); 123 } 124 125 TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); 126 int32 stream_no = 127 thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); 128 se::Stream* stream = 129 (stream_no == 0 ? 
main_stream : sub_streams[stream_no - 1].get()); 130 131 for (const Thunk* dependency : thunk_schedule_->DependsOn(thunk)) { 132 stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get()); 133 } 134 135 // If this thunk is about to autotune then wait for all currently executing 136 // thunks to finish. This reduces noise and thus the probability of 137 // choosing a suboptimal algorithm. 138 if (thunk->WillAutotuneKernel(stream)) { 139 TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone()); 140 } 141 142 VLOG(2) << "Executing the thunk for " 143 << thunk->hlo_instruction()->ToString() << " on stream " 144 << stream_no; 145 TF_RETURN_IF_ERROR( 146 thunk->ExecuteOnStream(buffer_allocations, stream, &profiler)); 147 if (thunk_schedule_->Depended(thunk)) { 148 auto finish_event = absl::make_unique<se::Event>(main_stream->parent()); 149 finish_event->Init(); 150 stream->ThenRecordEvent(finish_event.get()); 151 thunk_to_finish_event[thunk] = std::move(finish_event); 152 } 153 } 154 155 main_stream->ThenWaitFor(&sub_streams); 156 // Make sure kernels are completed before deallocating temporary buffers. 157 // TODO(b/30100571): we could potentially postpone deallocating the temp 158 // buffers until a different computation is executed. 159 if (block_host_until_done) { 160 Status block_status = main_stream->BlockHostUntilDone(); 161 if (!block_status.ok()) { 162 return InternalError( 163 "Failed to complete all kernels launched on stream %p: %s", 164 main_stream, block_status.error_message()); 165 } 166 } 167 168 profiler.FinishExecution(); 169 uint64 end_micros = tensorflow::Env::Default()->NowMicros(); 170 171 { 172 tensorflow::mutex_lock lock(mutex_); 173 const double nanoseconds = (end_micros - start_micros) * 1000.0; 174 execution_profile_.set_compute_time_ns(std::max(nanoseconds, 1.0)); 175 176 // If hlo profiling was disabled then the cycle count is left empty. 
177 if (do_profile) { 178 execution_profile_.set_compute_cycle_count( 179 hlo_execution_profile->total_cycles_executed( 180 *module().entry_computation())); 181 } 182 } 183 184 return Status::OK(); 185 } 186 187 StatusOr<const GpuExecutable::BufferAllocToDeviceMemoryMap*> 188 GpuExecutable::ResolveConstantGlobals(se::StreamExecutor* executor) { 189 tensorflow::mutex_lock lock(module_handle_mutex_); 190 auto it = module_globals_.find(executor); 191 if (it != module_globals_.end()) { 192 return &it->second; 193 } 194 195 se::MultiModuleLoaderSpec module_spec; 196 if (!cubin().empty()) { 197 module_spec.AddCudaCubinInMemory(cubin()); 198 } 199 module_spec.AddCudaPtxInMemory(ptx().c_str()); 200 201 absl::flat_hash_map<int64, se::DeviceMemoryBase> globals; 202 se::ModuleHandle module_handle; 203 executor->LoadModule(module_spec, &module_handle); 204 205 for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); 206 ++i) { 207 const BufferAllocation& allocation = assignment_->GetAllocation(i); 208 if (allocation.is_constant()) { 209 TF_ASSIGN_OR_RETURN( 210 se::DeviceMemoryBase global, 211 executor->GetUntypedSymbol( 212 llvm_ir::ConstantBufferAllocationToGlobalName(allocation), 213 module_handle)); 214 VLOG(3) << "Resolved global " 215 << llvm_ir::ConstantBufferAllocationToGlobalName(allocation) 216 << " to " << global.opaque(); 217 InsertOrDie(&globals, i, global); 218 219 const Literal& literal = 220 llvm_ir::LiteralForConstantAllocation(allocation); 221 CHECK(literal.shape().IsArray()); 222 if (!ShouldEmitLiteralInLlvmIr(literal)) { 223 VLOG(3) << "H2D memcpy for constant with shape " 224 << ShapeUtil::HumanString(literal.shape()); 225 TF_RETURN_IF_ERROR(executor->SynchronousMemcpyH2D( 226 literal.untyped_data(), allocation.size(), &global)); 227 } 228 } 229 } 230 231 module_handles_.emplace(executor, 232 se::ScopedModuleHandle(executor, module_handle)); 233 return &module_globals_.emplace(executor, std::move(globals)).first->second; 234 } 235 236 
StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteOnStream( 237 const ServiceExecutableRunOptions* run_options, 238 absl::Span<const ShapedBuffer* const> arguments, 239 HloExecutionProfile* hlo_execution_profile) { 240 DeviceMemoryAllocator* memory_allocator = run_options->allocator(); 241 242 if (GetRootPointsToSet().IsAmbiguous()) { 243 return Unimplemented("Points-to set of root instruction is ambiguous"); 244 } 245 246 BufferAllocations::Builder buffer_allocations_builder; 247 se::StreamExecutor* executor = run_options->stream()->parent(); 248 249 TF_ASSIGN_OR_RETURN(auto* const globals, ResolveConstantGlobals(executor)); 250 251 for (BufferAllocation::Index i = 0; i < assignment_->Allocations().size(); 252 ++i) { 253 const BufferAllocation& allocation = assignment_->GetAllocation(i); 254 if (allocation.is_entry_computation_parameter()) { 255 auto param_no = allocation.parameter_number(); 256 se::DeviceMemoryBase buffer = 257 arguments[param_no]->buffer(allocation.param_shape_index()); 258 259 // All top-level buffers and sub-buffers must have an explicit, non-null 260 // pointer, except for zero-sized buffers, which may be null. 261 if (buffer.is_null() && buffer.size() > 0) { 262 return FailedPrecondition( 263 "Cannot run XLA computation because pointer to (sub-)buffer at " 264 "index %s of parameter %d was null. 
All pointers to (sub-)buffers " 265 "must not be null, unless the (sub-)buffer has zero elements.", 266 allocation.param_shape_index().ToString(), param_no); 267 } 268 269 buffer_allocations_builder.RegisterBuffer(i, buffer); 270 } 271 272 if (allocation.is_constant()) { 273 buffer_allocations_builder.RegisterBuffer(i, FindOrDie(*globals, i)); 274 } 275 } 276 277 TF_ASSIGN_OR_RETURN( 278 auto buffer_allocations, 279 buffer_allocations_builder.Build( 280 assignment_.get(), executor->device_ordinal(), memory_allocator)); 281 282 bool block_host_until_done = 283 !memory_allocator->AllowsAsynchronousDeallocation(); 284 TF_RETURN_IF_ERROR(ExecuteThunks(run_options, *buffer_allocations, 285 block_host_until_done, 286 hlo_execution_profile)); 287 288 HloInstruction* root = hlo_module_->entry_computation()->root_instruction(); 289 auto device_ordinal = executor->device_ordinal(); 290 ScopedShapedBuffer shaped_buffer(root->shape(), root->shape(), 291 memory_allocator, device_ordinal); 292 293 // Copy DeviceMemoryBase values which contain the array(s) of the result into 294 // the respective location in ShapedBuffer. 295 std::set<se::DeviceMemoryBase> buffers_in_result; 296 TF_RETURN_IF_ERROR(shaped_buffer.buffers().ForEachMutableElementWithStatus( 297 [&buffer_allocations, &buffers_in_result, this]( 298 const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { 299 const auto& sources = this->GetRootPointsToSet().element(index); 300 // The points-to set is unambiguous so the set should be a 301 // singleton. That is, we know exactly which instruction 302 // produced the array at this element. 303 CHECK_EQ(1, sources.size()); 304 auto src_hlo = sources[0]->instruction(); 305 306 VLOG(4) << "Looking at: " << sources[0]; 307 308 // The source instruction should have a non-parameter buffer 309 // assigned. 
310 TF_ASSIGN_OR_RETURN( 311 const BufferAllocation::Slice slice, 312 this->assignment_->GetUniqueSlice(src_hlo, sources[0]->index())); 313 314 se::DeviceMemoryBase src_base = 315 buffer_allocations->GetDeviceAddress(slice.index()); 316 CHECK(!src_base.is_null() || src_base.size() == 0); 317 if (!slice.allocation()->is_entry_computation_parameter()) { 318 // If the buffer coming out of the result is from a parameter, it 319 // means the caller aliased some parameter buffer to an output one 320 // (via the HloInputOutputAliasConfig API). If that is the case, the 321 // caller will receive a partially complete scoped shaped buffer, 322 // which they will have to fill up on return. 323 // Unfortunately the interface to the execute APIs are ShapedBuffer 324 // pointer based, which assumes caller ownership, and hence a buffer 325 // coming from there cannot be part of the new ScopedShapedBuffer we 326 // create for the result (which assumes ownership). 327 *device_memory = src_base; 328 } else { 329 const HloInputOutputAliasConfig& input_output_alias = 330 module().input_output_alias_config(); 331 auto output_alias = input_output_alias.GetAliasedOutput( 332 slice.allocation()->parameter_number(), 333 slice.allocation()->param_shape_index()); 334 CHECK(output_alias) 335 << "Ouput buffer is coming from parameter " 336 << slice.allocation()->parameter_number() << " at index " 337 << slice.allocation()->param_shape_index() 338 << ", but no alias exists"; 339 CHECK_EQ(*output_alias, index); 340 } 341 buffers_in_result.insert(src_base); 342 return Status::OK(); 343 })); 344 TF_RETURN_IF_ERROR(buffer_allocations->TearDown(buffers_in_result)); 345 346 return std::move(shaped_buffer); 347 } 348 349 StatusOr<ScopedShapedBuffer> GpuExecutable::ExecuteAsyncOnStream( 350 const ServiceExecutableRunOptions* run_options, 351 absl::Span<const ShapedBuffer* const> arguments) { 352 // TODO(b/30671675): Implement asynchronous execution mode. 
353 return Unimplemented( 354 "Asynchronous execution on stream is not yet supported on GPU."); 355 } 356 357 const PointsToSet& GpuExecutable::GetRootPointsToSet() const { 358 return assignment_->points_to_analysis().GetPointsToSet( 359 module().entry_computation()->root_instruction()); 360 } 361 362 } // namespace gpu 363 } // namespace xla 364