1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EXECUTION_PROFILE_H_ 17 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EXECUTION_PROFILE_H_ 18 19 #include <unordered_map> 20 21 #include "tensorflow/compiler/xla/map_util.h" 22 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" 23 #include "tensorflow/compiler/xla/service/hlo_profile_printer.h" 24 #include "tensorflow/compiler/xla/types.h" 25 #include "tensorflow/core/platform/stream_executor_no_cuda.h" 26 #include "tensorflow/core/platform/types.h" 27 28 namespace xla { 29 30 class HloInstruction; 31 32 // Maps all HloInstructions and HloComputations in an HloModule to integers. 33 // These integers form the contiguous range [0, total_count()). 34 class HloProfileIndexMap { 35 public: 36 // Scans `module` to populate this instance of HloProfileIndexMap. 37 explicit HloProfileIndexMap(const HloModule& module); 38 39 HloProfileIndexMap(const HloProfileIndexMap&) = default; 40 HloProfileIndexMap(HloProfileIndexMap&&) = default; 41 42 HloProfileIndexMap& operator=(const HloProfileIndexMap&) = default; 43 HloProfileIndexMap& operator=(HloProfileIndexMap&&) = default; 44 45 size_t GetProfileIndexFor(const HloInstruction& instruction) const { 46 return FindOrDie(instruction_to_profile_idx(), &instruction); 47 } 48 49 size_t GetProfileIndexFor(const HloComputation& computation) const { 50 return FindOrDie(computation_to_profile_idx(), &computation); 51 } 52 53 size_t instruction_count() const { 54 return instruction_to_profile_idx().size(); 55 } 56 57 size_t computation_count() const { 58 return computation_to_profile_idx().size(); 59 } 60 61 size_t total_count() const { 62 return instruction_count() + computation_count(); 63 } 64 65 const std::unordered_map<const HloInstruction*, int64>& 66 instruction_to_profile_idx() const { 67 return instruction_to_profile_idx_; 68 } 69 70 const std::unordered_map<const HloComputation*, int64>& 71 computation_to_profile_idx() const { 72 return computation_to_profile_idx_; 73 } 74 75 private: 76 std::unordered_map<const HloInstruction*, int64> instruction_to_profile_idx_; 77 std::unordered_map<const HloComputation*, int64> computation_to_profile_idx_; 78 }; 79 80 // Create an instance of `HloProfilePrinterData`. 81 std::unique_ptr<HloProfilePrinterData> CreateHloProfilePrinterData( 82 const HloProfileIndexMap& hlo_profile_index_map, 83 const HloCostAnalysis& cost_analysis); 84 85 // Describes how much time each HLO operation took. 86 // 87 // Each HloComputation takes a certain number of cycles. This class helps break 88 // down how much time each HLO took. 89 class HloExecutionProfile { 90 public: 91 using DeviceDescription = perftools::gputools::DeviceDescription; 92 93 HloExecutionProfile(const HloProfilePrinterData* hlo_profile_printer_data, 94 const HloProfileIndexMap* hlo_profile_index_map); 95 96 // Record how many cycles this HLO took to execute. 97 void SetCyclesTakenBy(const HloInstruction* hlo, uint64 cycles_taken); 98 99 // Returns how many cycles this HLO took to execute. Profiling information 100 // may not be available for some instructions in which case zero is returned. 101 uint64 GetCyclesTakenBy(const HloInstruction& hlo) const; 102 103 // Return the number of cycles this computation took to execute. 104 uint64 total_cycles_executed(const HloComputation& computation) const { 105 return profile_counters_[hlo_profile_index_map_.GetProfileIndexFor( 106 computation)]; 107 } 108 109 // Record how many cycles a computation took to execute. 110 void set_total_cycles_executed(const HloComputation& computation, 111 uint64 total_cycles_executed) { 112 profile_counters_[hlo_profile_index_map_.GetProfileIndexFor(computation)] = 113 total_cycles_executed; 114 } 115 116 // Returns a version of the execution profile suitable for performance 117 // debugging; e.g. emits cycle counts, execution time at the nominal device 118 // frequency, and the effective throughput given the provided cost_analysis 119 // for the operations in a given computation. Returns an empty string if it 120 // wasn't possible to generate a printable version. 121 string ToString(const DeviceDescription& device_description) const { 122 return PrintHloProfile(hlo_profile_printer_data_, profile_counters_.data(), 123 device_description.clock_rate_ghz()); 124 } 125 126 std::vector<int64>* mutable_profile_counters() { return &profile_counters_; } 127 const std::vector<int64>& profile_counters() const { 128 return profile_counters_; 129 } 130 131 private: 132 const HloProfilePrinterData& hlo_profile_printer_data_; 133 const HloProfileIndexMap& hlo_profile_index_map_; 134 135 // Stores per-Hlo profile counters. This is the only thing that changes when 136 // we execute an XLA computation. 137 std::vector<int64> profile_counters_; 138 }; 139 140 } // namespace xla 141 142 #endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EXECUTION_PROFILE_H_ 143