1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.h" 17 18 #include <memory> 19 #include <set> 20 #include <vector> 21 22 #include "tensorflow/compiler/xla/service/call_graph.h" 23 #include "tensorflow/compiler/xla/service/copy_insertion.h" 24 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" 25 #include "tensorflow/compiler/xla/service/hlo_computation.h" 26 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" 27 #include "tensorflow/compiler/xla/service/hlo_instruction.h" 28 #include "tensorflow/compiler/xla/service/hlo_opcode.h" 29 #include "tensorflow/core/lib/core/status.h" 30 #include "tensorflow/core/lib/gtl/flatset.h" 31 #include "tensorflow/core/platform/logging.h" 32 33 namespace xla { 34 35 namespace gpu { 36 37 StatusOr<HloInstruction*> GpuCopyInsertion::FindOrInsertCopy( 38 HloInstruction* hlo) { 39 HloInstruction*& copy = hlo_to_copy_map_[hlo]; 40 if (copy == nullptr) { 41 TF_ASSIGN_OR_RETURN(copy, hlo->parent()->DeepCopyInstruction(hlo)); 42 } 43 return copy; 44 } 45 46 StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) { 47 CopyInsertion generic_copy_insertion; 48 49 TF_ASSIGN_OR_RETURN(bool changed, generic_copy_insertion.Run(module)); 50 51 TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDataflowAnalysis> dataflow, 52 HloDataflowAnalysis::Run(*module)); 53 54 // Make sure all operands of a library call are in memory instead of constants 55 // in IR. 56 for (HloInstruction* hlo : 57 module->entry_computation()->MakeInstructionPostOrder()) { 58 // Inserts a copy of hlo->operand(n) if it's a constant. 59 auto copy_operand_if_constant = [&](int64 n) -> Status { 60 HloInstruction* operand = hlo->mutable_operand(n); 61 TF_RET_CHECK(ShapeUtil::IsArray(operand->shape())); 62 const auto& values = dataflow->GetValueSet(operand).values(); 63 if (std::any_of(values.begin(), values.end(), [](const HloValue* value) { 64 return value->defining_instruction()->opcode() == 65 HloOpcode::kConstant; 66 })) { 67 TF_ASSIGN_OR_RETURN(HloInstruction * copy, FindOrInsertCopy(operand)); 68 TF_RETURN_IF_ERROR(hlo->ReplaceOperandWith(n, copy)); 69 changed = true; 70 } 71 return Status::OK(); 72 }; 73 74 if (IsCustomCallToDnnBatchNorm(*hlo)) { 75 // The epsilon and feature_index operands to a CUDNN batchnorm op don't 76 // need to be materialized in memory -- in fact, they must be constants. 77 // These are the last two operands of all three batchnorm ops. 78 for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { 79 TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); 80 } 81 } else if (IsCustomCallToDnnConvolution(*hlo)) { 82 // The last two arguments to a CUDNN convolution are two HLO constants for 83 // cudnn algorithm and tensor_ops_enabled flag, which shouldn't be copied. 84 for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { 85 TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); 86 } 87 } else if (ImplementedAsLibraryCall(*hlo)) { 88 // For all other library calls, materialize all the operands into memory. 89 for (int64 i = 0; i < hlo->operand_count(); ++i) { 90 TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); 91 } 92 } 93 } 94 95 // Init values of while and conditional nodes cannot be constants. Insert 96 // copies for any constants found at the operands of these nodes. 97 tensorflow::gtl::FlatSet<HloInstruction*> inserted_copies; 98 for (HloComputation* computation : module->computations()) { 99 for (HloInstruction* instruction : computation->instructions()) { 100 if (instruction->opcode() != HloOpcode::kWhile && 101 instruction->opcode() != HloOpcode::kConditional) { 102 continue; 103 } 104 for (auto operand : instruction->operands()) { 105 // Skip the operands that have already been replaced with a copy in a 106 // previous iteration (which is possible when a constant is used as an 107 // operand in multiple places). 108 if (ContainsKey(inserted_copies, operand)) { 109 continue; 110 } 111 for (auto& pair : dataflow->GetInstructionValueSet(operand)) { 112 const HloValueSet& value_set = pair.second; 113 for (const HloValue* value : value_set.values()) { 114 if (value->defining_instruction()->IsConstant() && 115 !ContainsKey(hlo_to_copy_map_, value->defining_instruction())) { 116 HloInstruction* constant = value->defining_instruction(); 117 TF_ASSIGN_OR_RETURN(HloInstruction * copy, 118 FindOrInsertCopy(constant)); 119 TF_RETURN_IF_ERROR(constant->ReplaceAllUsesWith(copy)); 120 inserted_copies.insert(copy); 121 changed = true; 122 } 123 } 124 } 125 } 126 } 127 } 128 129 // The GPU backend needs additional copies added due to deficiencies in 130 // buffer assignment. 131 TF_ASSIGN_OR_RETURN(bool buffer_assignment_changed, 132 CopyInsertion::AddCopiesForBufferAssignment(module)); 133 134 return changed || buffer_assignment_changed; 135 } 136 137 } // namespace gpu 138 } // namespace xla 139