1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" 17 18 #include <memory> 19 20 #include "tensorflow/compiler/xla/layout_util.h" 21 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" 22 #include "tensorflow/compiler/xla/service/hlo_computation.h" 23 #include "tensorflow/compiler/xla/service/hlo_instruction.h" 24 #include "tensorflow/compiler/xla/status_macros.h" 25 #include "tensorflow/compiler/xla/xla_data.pb.h" 26 #include "tensorflow/core/lib/core/errors.h" 27 28 namespace xla { 29 namespace gpu { 30 31 // cuDNN convolutions are called with specific layouts on the input, output, 32 // and filter: 33 // 34 // input: DataLayout::kBatchDepthYX 35 // output: DataLayout::kBatchDepthYX 36 // filter: FilterLayout::kOutputInputYX 37 // 38 // The order dimensions in the constant name is major-to-minor (eg, the 39 // most-major dimension of the input is batch, most-minor is X). The 40 // specific dimension numbers these named dimensions correspond to is 41 // determined by the ConvolutionDimensionNumbers argument. Y is spatial 42 // dimension 0, and X is spatial dimension 1. 43 // 44 // TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls. 45 static Status AddBackendConstraintsToDnnConvCustomCall( 46 HloInstruction* instr, LayoutConstraints* constraints) { 47 CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString(); 48 Shape input_shape; 49 Shape filter_shape; 50 Shape output_shape; 51 const auto& target = instr->custom_call_target(); 52 if (target == kCudnnConvForwardCallTarget) { 53 input_shape = instr->operand(0)->shape(); 54 filter_shape = instr->operand(1)->shape(); 55 output_shape = instr->shape().tuple_shapes(0); 56 } else if (target == kCudnnConvBackwardInputCallTarget) { 57 input_shape = instr->shape().tuple_shapes(0); 58 filter_shape = instr->operand(1)->shape(); 59 output_shape = instr->operand(0)->shape(); 60 } else if (target == kCudnnConvBackwardFilterCallTarget) { 61 input_shape = instr->operand(0)->shape(); 62 filter_shape = instr->shape().tuple_shapes(0); 63 output_shape = instr->operand(1)->shape(); 64 } else { 65 LOG(FATAL) << "Unexpected custom call target: " 66 << instr->custom_call_target(); 67 } 68 69 // Construct minor-to-major dimension orders for operands and result. 70 // cuDNN's convolution APIs support the BDYX layout for activations/output 71 // and the OIYX layout for weights. 72 // TODO(b/29399649): Be more flexible about handling layouts of cuDNN 73 // calls after we switch to cuDNN v5. 74 const ConvolutionDimensionNumbers& dimension_numbers = 75 instr->convolution_dimension_numbers(); 76 std::vector<int64> input_layout; 77 for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0; 78 --i) { 79 input_layout.push_back(dimension_numbers.input_spatial_dimensions(i)); 80 } 81 input_layout.push_back(dimension_numbers.input_feature_dimension()); 82 input_layout.push_back(dimension_numbers.input_batch_dimension()); 83 *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); 84 85 std::vector<int64> filter_layout; 86 for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0; 87 --i) { 88 filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); 89 } 90 filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension()); 91 filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension()); 92 *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); 93 94 std::vector<int64> output_layout; 95 for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0; 96 --i) { 97 output_layout.push_back(dimension_numbers.output_spatial_dimensions(i)); 98 } 99 output_layout.push_back(dimension_numbers.output_feature_dimension()); 100 output_layout.push_back(dimension_numbers.output_batch_dimension()); 101 *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); 102 103 // The custom call returns a tuple of (actual_result, scratch_buffer); 104 // call_result_buf is the logical buffer for actual_result, the thing that 105 // contains the result of the conv call. 106 TF_ASSIGN_OR_RETURN(const LogicalBuffer* call_result_buf, 107 constraints->points_to_analysis().GetBufferDefinedAt( 108 instr, /*index=*/{0})); 109 110 // Set layouts of the instructions' shapes. 111 if (target == kCudnnConvForwardCallTarget) { 112 TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0)); 113 TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1)); 114 TF_RETURN_IF_ERROR( 115 constraints->SetBufferLayout(output_shape.layout(), *call_result_buf)); 116 } else if (target == kCudnnConvBackwardInputCallTarget) { 117 TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 0)); 118 TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1)); 119 TF_RETURN_IF_ERROR( 120 constraints->SetBufferLayout(input_shape.layout(), *call_result_buf)); 121 } else if (target == kCudnnConvBackwardFilterCallTarget) { 122 TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0)); 123 TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 1)); 124 TF_RETURN_IF_ERROR( 125 constraints->SetBufferLayout(filter_shape.layout(), *call_result_buf)); 126 } else { 127 LOG(FATAL) << "Unexpected custom call target: " 128 << instr->custom_call_target(); 129 } 130 return Status::OK(); 131 } 132 133 Status GpuLayoutAssignment::AddBackendConstraints( 134 LayoutConstraints* constraints) { 135 for (auto* instruction : constraints->computation()->instructions()) { 136 if (IsCustomCallToDnnConvolution(*instruction)) { 137 TF_RETURN_IF_ERROR( 138 AddBackendConstraintsToDnnConvCustomCall(instruction, constraints)); 139 } 140 } 141 return Status::OK(); 142 } 143 144 bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout( 145 const HloInstruction* instruction) { 146 // - Inputs to cudnn batchnorm custom calls don't need the major-first layout 147 // (i.e. {n, n-1, ...0}) -- we can handle any layout. 148 // - Inputs to cudnn convolution require custom layouts handled in 149 // AddBackendConstraints. 150 return !IsCustomCallToDnnBatchNorm(*instruction) && 151 !IsCustomCallToDnnConvolution(*instruction); 152 } 153 154 Status GpuLayoutAssignment::PropagateOperandConstraint( 155 const OperandLayoutConstraint& layout_constraint, 156 LayoutConstraints* constraints) { 157 const HloInstruction* instruction = layout_constraint.instruction(); 158 159 // cudnn batchnorm forward inference's result must have the same layout as its 160 // operand 0. 161 if (instruction->opcode() == HloOpcode::kCustomCall && 162 instruction->custom_call_target() == 163 kCudnnBatchNormForwardInferenceCallTarget && 164 layout_constraint.operand_no() == 0) { 165 TF_RETURN_IF_ERROR(constraints->SetInstructionLayout( 166 layout_constraint.shape_layout().shape(), instruction)); 167 } 168 169 // cudnn batchnorm forward training returns a tuple {output, mean, 170 // inverse-stddev}. mean and inverse-stddev are rank 1 and so have only one 171 // possible layout, but output is not (necessarily) rank 1, and, like in 172 // batchnorm forward inference, must have the same layout as operand 0. 173 if (instruction->opcode() == HloOpcode::kCustomCall && 174 instruction->custom_call_target() == 175 kCudnnBatchNormForwardTrainingCallTarget && 176 layout_constraint.operand_no() == 0) { 177 TF_ASSIGN_OR_RETURN(const LogicalBuffer* out_buf, 178 constraints->points_to_analysis().GetBufferDefinedAt( 179 instruction, /*index=*/{0})); 180 TF_RETURN_IF_ERROR(constraints->SetBufferLayout( 181 layout_constraint.shape_layout().layout(), *out_buf)); 182 } 183 184 // Like forward training, cudnn batchnorm backward returns a tuple {output, 185 // mean, inverse-stddev}, and its operand 0 and 'output' must have the same 186 // layout. In addition, its operand 0 and operand 4 -- the 'operand' and 187 // 'grad_output' parameters -- must have the same layout. 188 if (instruction->opcode() == HloOpcode::kCustomCall && 189 instruction->custom_call_target() == kCudnnBatchNormBackwardCallTarget && 190 (layout_constraint.operand_no() == 0 || 191 layout_constraint.operand_no() == 4)) { 192 TF_ASSIGN_OR_RETURN(const LogicalBuffer* out_buf, 193 constraints->points_to_analysis().GetBufferDefinedAt( 194 instruction, /*index=*/{0})); 195 TF_RETURN_IF_ERROR(constraints->SetBufferLayout( 196 layout_constraint.shape_layout().layout(), *out_buf)); 197 198 int64 operand_to_set = layout_constraint.operand_no() == 0 ? 4 : 0; 199 TF_RETURN_IF_ERROR(constraints->SetOperandLayout( 200 layout_constraint.shape_layout().shape(), instruction, operand_to_set)); 201 } 202 203 return LayoutAssignment::PropagateOperandConstraint(layout_constraint, 204 constraints); 205 } 206 207 Status GpuLayoutAssignment::PropagateBufferConstraint( 208 const BufferLayoutConstraint& buffer_constraint, 209 LayoutConstraints* constraints) { 210 const LogicalBuffer& buf = buffer_constraint.buffer(); 211 const HloInstruction* instruction = buf.instruction(); 212 213 Shape shape_with_layout = buf.shape(); 214 *shape_with_layout.mutable_layout() = buffer_constraint.layout(); 215 216 // Propagate output constraints to the operands of cudnn batchnorm ops. This 217 // is the same as PropagateOperandConstraint, just in the other direction. We 218 // need to both to fulfill our contract to LayoutAssignment. 219 if (instruction->opcode() == HloOpcode::kCustomCall && 220 instruction->custom_call_target() == 221 kCudnnBatchNormForwardInferenceCallTarget) { 222 TF_RETURN_IF_ERROR(constraints->SetOperandLayout( 223 shape_with_layout, instruction, /*operand_no=*/0)); 224 } 225 226 if (instruction->opcode() == HloOpcode::kCustomCall && 227 instruction->custom_call_target() == 228 kCudnnBatchNormForwardTrainingCallTarget && 229 buf.index() == ShapeIndex({0})) { 230 TF_RETURN_IF_ERROR(constraints->SetOperandLayout( 231 shape_with_layout, instruction, /*operand_no=*/0)); 232 } 233 if (instruction->opcode() == HloOpcode::kCustomCall && 234 instruction->custom_call_target() == kCudnnBatchNormBackwardCallTarget && 235 buf.index() == ShapeIndex({0})) { 236 // batchnorm backward has two operands, "operand" and "grad_output" whose 237 // layouts must both match that of the result at tuple-index 0. 238 TF_RETURN_IF_ERROR(constraints->SetOperandLayout( 239 shape_with_layout, instruction, /*operand_no=*/0)); 240 TF_RETURN_IF_ERROR(constraints->SetOperandLayout( 241 shape_with_layout, instruction, /*operand_no=*/4)); 242 } 243 244 return LayoutAssignment::PropagateBufferConstraint(buffer_constraint, 245 constraints); 246 } 247 248 } // namespace gpu 249 } // namespace xla 250