/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h"

#include <memory>

#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/status_macros.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/errors.h"

namespace xla {
namespace gpu {

// cuDNN convolutions are called with specific layouts on the input, output,
// and filter:
//
//   input: DataLayout::kBatchDepthYX
//   output: DataLayout::kBatchDepthYX
//   filter: FilterLayout::kOutputInputYX
//
// The order of the dimensions in the constant name is major-to-minor (e.g.,
// the most-major dimension of the input is batch and the most-minor is X).
// The specific dimension numbers these named dimensions correspond to are
// determined by the ConvolutionDimensionNumbers argument. Y is spatial
// dimension 0, and X is spatial dimension 1.
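//
// For example, with the common NCHW/OIHW numbering (batch and output-feature
// are dimension 0, feature and input-feature are dimension 1, Y is 2, and X
// is 3), each of these named layouts corresponds to the minor-to-major order
// {3, 2, 1, 0}.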
//
// TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls.
static Status AddBackendConstraintsToDnnConvCustomCall(
    HloInstruction* instr, LayoutConstraints* constraints) {
  CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString();
  Shape input_shape;
  Shape filter_shape;
  Shape output_shape;
  const auto& target = instr->custom_call_target();
  if (target == kCudnnConvForwardCallTarget) {
    input_shape = instr->operand(0)->shape();
    filter_shape = instr->operand(1)->shape();
    output_shape = instr->shape().tuple_shapes(0);
  } else if (target == kCudnnConvBackwardInputCallTarget) {
    input_shape = instr->shape().tuple_shapes(0);
    filter_shape = instr->operand(1)->shape();
    output_shape = instr->operand(0)->shape();
  } else if (target == kCudnnConvBackwardFilterCallTarget) {
    input_shape = instr->operand(0)->shape();
    filter_shape = instr->shape().tuple_shapes(0);
    output_shape = instr->operand(1)->shape();
  } else {
    LOG(FATAL) << "Unexpected custom call target: "
               << instr->custom_call_target();
  }

  // Construct minor-to-major dimension orders for operands and result.
  // cuDNN's convolution APIs support the BDYX layout for activations/output
  // and the OIYX layout for weights.
  // TODO(b/29399649): Be more flexible about handling layouts of cuDNN
  // calls after we switch to cuDNN v5.
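  //
  // For example, if input_batch_dimension = 0, input_feature_dimension = 1,
  // and input_spatial_dimensions = {2, 3} (i.e. NCHW), the loop below
  // produces input_layout = {3, 2, 1, 0}: X is minor-most and batch is
  // major-most.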
  const ConvolutionDimensionNumbers& dimension_numbers =
      instr->convolution_dimension_numbers();
  std::vector<int64> input_layout;
  for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0;
       --i) {
    input_layout.push_back(dimension_numbers.input_spatial_dimensions(i));
  }
  input_layout.push_back(dimension_numbers.input_feature_dimension());
  input_layout.push_back(dimension_numbers.input_batch_dimension());
  *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout);

  std::vector<int64> filter_layout;
  for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0;
       --i) {
    filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i));
  }
  filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension());
  filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension());
  *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout);

  std::vector<int64> output_layout;
  for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0;
       --i) {
    output_layout.push_back(dimension_numbers.output_spatial_dimensions(i));
  }
  output_layout.push_back(dimension_numbers.output_feature_dimension());
  output_layout.push_back(dimension_numbers.output_batch_dimension());
  *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout);

  // The custom call returns a tuple of (actual_result, scratch_buffer);
  // call_result_buf is the logical buffer for actual_result, the thing that
  // contains the result of the conv call.
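  // (For a forward conv, for instance, the call's shape is a tuple along the
  // lines of (conv_result_shape, u8[scratch_bytes]), and index {0} below
  // picks out the conv result rather than the scratch allocation.)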
  TF_ASSIGN_OR_RETURN(const LogicalBuffer* call_result_buf,
                      constraints->points_to_analysis().GetBufferDefinedAt(
                          instr, /*index=*/{0}));

  // Set layouts of the instruction's shapes.
  if (target == kCudnnConvForwardCallTarget) {
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0));
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1));
    TF_RETURN_IF_ERROR(
        constraints->SetBufferLayout(output_shape.layout(), *call_result_buf));
  } else if (target == kCudnnConvBackwardInputCallTarget) {
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 0));
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(filter_shape, instr, 1));
    TF_RETURN_IF_ERROR(
        constraints->SetBufferLayout(input_shape.layout(), *call_result_buf));
  } else if (target == kCudnnConvBackwardFilterCallTarget) {
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(input_shape, instr, 0));
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(output_shape, instr, 1));
    TF_RETURN_IF_ERROR(
        constraints->SetBufferLayout(filter_shape.layout(), *call_result_buf));
  } else {
    LOG(FATAL) << "Unexpected custom call target: "
               << instr->custom_call_target();
  }
  return Status::OK();
}

Status GpuLayoutAssignment::AddBackendConstraints(
    LayoutConstraints* constraints) {
  for (auto* instruction : constraints->computation()->instructions()) {
    if (IsCustomCallToDnnConvolution(*instruction)) {
      TF_RETURN_IF_ERROR(
          AddBackendConstraintsToDnnConvCustomCall(instruction, constraints));
    }
  }
  return Status::OK();
}

bool GpuLayoutAssignment::CustomCallRequiresMajorFirstLayout(
    const HloInstruction* instruction) {
  // - Inputs to cudnn batchnorm custom calls don't need the major-first layout
  //   (i.e. {n, n-1, ...0}) -- we can handle any layout.
  // - Inputs to cudnn convolution require custom layouts handled in
  //   AddBackendConstraints.
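  //
  // (A major-first layout on a rank-4 shape, for example, is the
  // minor-to-major order {3, 2, 1, 0}.)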
  return !IsCustomCallToDnnBatchNorm(*instruction) &&
         !IsCustomCallToDnnConvolution(*instruction);
}

Status GpuLayoutAssignment::PropagateOperandConstraint(
    const OperandLayoutConstraint& layout_constraint,
    LayoutConstraints* constraints) {
  const HloInstruction* instruction = layout_constraint.instruction();

  // cudnn batchnorm forward inference's result must have the same layout as
  // its operand 0.
  if (instruction->opcode() == HloOpcode::kCustomCall &&
      instruction->custom_call_target() ==
          kCudnnBatchNormForwardInferenceCallTarget &&
      layout_constraint.operand_no() == 0) {
    TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(
        layout_constraint.shape_layout().shape(), instruction));
  }

  // cudnn batchnorm forward training returns a tuple {output, mean,
  // inverse-stddev}.  mean and inverse-stddev are rank 1 and so have only one
  // possible layout, but output is not (necessarily) rank 1, and, like in
  // batchnorm forward inference, must have the same layout as operand 0.
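  //
  // (For example, a forward-training call whose operand 0 is f32[8,32,20,20]
  // with feature dimension 1 produces a tuple along the lines of
  // (f32[8,32,20,20], f32[32], f32[32]); only the first element's layout is
  // in question here.)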
  if (instruction->opcode() == HloOpcode::kCustomCall &&
      instruction->custom_call_target() ==
          kCudnnBatchNormForwardTrainingCallTarget &&
      layout_constraint.operand_no() == 0) {
    TF_ASSIGN_OR_RETURN(const LogicalBuffer* out_buf,
                        constraints->points_to_analysis().GetBufferDefinedAt(
                            instruction, /*index=*/{0}));
    TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
        layout_constraint.shape_layout().layout(), *out_buf));
  }

  // Like forward training, cudnn batchnorm backward returns a tuple {output,
  // mean, inverse-stddev}, and its operand 0 and 'output' must have the same
  // layout.  In addition, its operand 0 and operand 4 -- the 'operand' and
  // 'grad_output' parameters -- must have the same layout.
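  // In other words, a layout constraint on operand 0 is mirrored onto operand
  // 4 below, and a constraint on operand 4 is mirrored onto operand 0.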
  if (instruction->opcode() == HloOpcode::kCustomCall &&
      instruction->custom_call_target() == kCudnnBatchNormBackwardCallTarget &&
      (layout_constraint.operand_no() == 0 ||
       layout_constraint.operand_no() == 4)) {
    TF_ASSIGN_OR_RETURN(const LogicalBuffer* out_buf,
                        constraints->points_to_analysis().GetBufferDefinedAt(
                            instruction, /*index=*/{0}));
    TF_RETURN_IF_ERROR(constraints->SetBufferLayout(
        layout_constraint.shape_layout().layout(), *out_buf));

    int64 operand_to_set = layout_constraint.operand_no() == 0 ? 4 : 0;
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
        layout_constraint.shape_layout().shape(), instruction, operand_to_set));
  }

  return LayoutAssignment::PropagateOperandConstraint(layout_constraint,
                                                      constraints);
}

Status GpuLayoutAssignment::PropagateBufferConstraint(
    const BufferLayoutConstraint& buffer_constraint,
    LayoutConstraints* constraints) {
  const LogicalBuffer& buf = buffer_constraint.buffer();
  const HloInstruction* instruction = buf.instruction();

  Shape shape_with_layout = buf.shape();
  *shape_with_layout.mutable_layout() = buffer_constraint.layout();

  // Propagate output constraints to the operands of cudnn batchnorm ops.  This
  // is the same as PropagateOperandConstraint, just in the other direction.  We
  // need to do both to fulfill our contract to LayoutAssignment.
  if (instruction->opcode() == HloOpcode::kCustomCall &&
      instruction->custom_call_target() ==
          kCudnnBatchNormForwardInferenceCallTarget) {
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
        shape_with_layout, instruction, /*operand_no=*/0));
  }

  if (instruction->opcode() == HloOpcode::kCustomCall &&
      instruction->custom_call_target() ==
          kCudnnBatchNormForwardTrainingCallTarget &&
      buf.index() == ShapeIndex({0})) {
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
        shape_with_layout, instruction, /*operand_no=*/0));
  }
  if (instruction->opcode() == HloOpcode::kCustomCall &&
      instruction->custom_call_target() == kCudnnBatchNormBackwardCallTarget &&
      buf.index() == ShapeIndex({0})) {
    // batchnorm backward has two operands, "operand" and "grad_output", whose
    // layouts must both match that of the result at tuple-index 0.
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
        shape_with_layout, instruction, /*operand_no=*/0));
    TF_RETURN_IF_ERROR(constraints->SetOperandLayout(
        shape_with_layout, instruction, /*operand_no=*/4));
  }

  return LayoutAssignment::PropagateBufferConstraint(buffer_constraint,
                                                     constraints);
}

}  // namespace gpu
}  // namespace xla