/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_EXECUTABLE_H_
#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_EXECUTABLE_H_

#include <cstddef>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

#include "tensorflow/compiler/xla/service/buffer_assignment.h"
#include "tensorflow/compiler/xla/service/cpu/simple_orc_jit.h"
#include "tensorflow/compiler/xla/service/device_memory_allocator.h"
#include "tensorflow/compiler/xla/service/executable.h"
#include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/shaped_buffer.h"
#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/compiler/xla/types.h"
#include "tensorflow/core/lib/gtl/array_slice.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/stream_executor_no_cuda.h"
#include "tensorflow/core/platform/types.h"

namespace xla {
namespace cpu {

// CPU-targeting implementation of the XLA Executable interface.
//
// Wraps a JIT-ed object that can be executed "on device". We JIT for the host
// architecture, so JIT-ed code and host code share the same ABI.
class CpuExecutable : public Executable {
 public:
  CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
                std::unique_ptr<const BufferAssignment> assignment,
                std::unique_ptr<const HloModule> hlo_module,
                const string& entry_function_name,
                std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
  ~CpuExecutable() override {}

  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      HloExecutionProfile* hlo_execution_profile) override;

  StatusOr<std::unique_ptr<ShapedBuffer>> ExecuteAsyncOnStream(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) override;

  // This should be called after set_ir_module_string.
  const string& ir_module_string() const { return ir_module_string_; }

  void set_ir_module_string(const string& ir_module_string) {
    ir_module_string_ = ir_module_string;
  }

  const Status EqualOrFail(const Executable& executable) {
    // TODO(b/62952745) Implement equality test on CPU executable.
    return Unimplemented("Equality test on CPU executable is not implemented.");
  }

  static int64 ShapeSizeBytes(const Shape& shape);

  // Type of the computation function we expect in the JIT.
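  //
  // As suggested by the parameter annotations below: `result` receives the
  // computation's output, `args` carries the entry-computation argument
  // buffers, `temps` carries the buffers allocated per BufferAssignment, and
  // `profile_counters`, when profiling, receives per-instruction counters.
  // A hypothetical invocation (names are illustrative, not defined in this
  // header) might look like:
  //
  //   ComputeFunctionType fn = executable.compute_function();
  //   fn(result_ptr, &run_options, args_array, temps_array,
  //      profile_counters);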
  using ComputeFunctionType = void (*)(
      void* /*result*/, const ExecutableRunOptions* /*run_options*/,
      const void** /*args*/, void** /*temps*/, int64* /*profile_counters*/);

  const ComputeFunctionType& compute_function() const {
    return compute_function_;
  }

  const BufferAssignment& buffer_assignment() const { return *assignment_; }

 private:
  // (An illustrative sketch of how the helpers below combine appears at the
  // end of this file.)

  // Allocates buffers required for execution and assigns them to the elements
  // of "buffers". "buffers" should be sized to the number of buffers in the
  // buffer assignment. Each vector element corresponds to the
  // BufferAllocation with the matching index. If a vector element already
  // contains a non-null DeviceMemoryBase, then no buffer is assigned for this
  // element.
  Status AllocateBuffers(
      DeviceMemoryAllocator* memory_allocator, int device_ordinal,
      std::vector<perftools::gputools::DeviceMemoryBase>* buffers);

  // Calls the generated function performing the computation with the given
  // arguments using the supplied buffers.
  Status ExecuteComputeFunction(
      const ExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
          buffers,
      HloExecutionProfile* hlo_execution_profile);

  // Creates a ShapedBuffer for holding the result of the computation. The
  // addresses (DeviceMemoryBases) are set according to buffer assignment.
  // 'buffers_in_result' should point to a vector of the same size as
  // 'allocated_buffers'. An element in buffers_in_result is set to true if
  // the corresponding buffer is live out of the computation (and thus
  // contained in the returned ShapedBuffer).
  StatusOr<std::unique_ptr<ShapedBuffer>> CreateResultShapedBuffer(
      const ServiceExecutableRunOptions* run_options,
      tensorflow::gtl::ArraySlice<perftools::gputools::DeviceMemoryBase>
          allocated_buffers,
      std::vector<bool>* buffers_in_result);

  // Returns the points-to set of the root instruction of the entry
  // computation. Uses points-to analysis from buffer assignment.
  const PointsToSet& GetRootPointsToSet() const;

  // The JIT containing compiled modules.
  const std::unique_ptr<SimpleOrcJIT> jit_;

  // Buffer assignment for the buffers we need to allocate.
  const std::unique_ptr<const BufferAssignment> assignment_;

  // The LLVM IR, in string format, of the unoptimized module generated for
  // this CpuExecutable. We save a string instead of an llvm::Module* because
  // leaving llvm::Module* in a singleton can cause the heap checker to emit
  // false positives.
  string ir_module_string_;

  ComputeFunctionType compute_function_;

  // Entry function name for the computation.
  const string entry_function_name_;

  TF_DISALLOW_COPY_AND_ASSIGN(CpuExecutable);
};

}  // namespace cpu
}  // namespace xla

#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_EXECUTABLE_H_
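
// Illustrative sketch: one plausible way the private helpers of CpuExecutable
// combine inside ExecuteOnStream. The names `executable`, `allocator`,
// `device_ordinal`, `arguments`, `run_options`, and `profile` are
// hypothetical stand-ins, not declarations from this header.
//
//   std::vector<perftools::gputools::DeviceMemoryBase> buffers(
//       executable.buffer_assignment().Allocations().size());
//   TF_RETURN_IF_ERROR(
//       executable.AllocateBuffers(allocator, device_ordinal, &buffers));
//   TF_RETURN_IF_ERROR(executable.ExecuteComputeFunction(
//       &run_options->run_options(), arguments, buffers, profile));
//   std::vector<bool> buffers_in_result(buffers.size(), false);
//   TF_ASSIGN_OR_RETURN(
//       std::unique_ptr<ShapedBuffer> result,
//       executable.CreateResultShapedBuffer(run_options, buffers,
//                                           &buffers_in_result));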