/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// Neural Net operation support for StreamExecutor instances.
//
// This is an abstract interface for a platform to optionally support common
// neural net operations; it accommodates implementations such as the cudnn
// library operations.

#ifndef TENSORFLOW_STREAM_EXECUTOR_DNN_H_
#define TENSORFLOW_STREAM_EXECUTOR_DNN_H_

#include <functional>
#include <limits>
#include <memory>
#include <tuple>

#include "absl/types/optional.h"
#include "absl/types/span.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/stream_executor/device_memory.h"
#include "tensorflow/stream_executor/dnn.pb.h"
#include "tensorflow/stream_executor/lib/array_slice.h"
#include "tensorflow/stream_executor/lib/status.h"
#include "tensorflow/stream_executor/lib/statusor.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/platform/port.h"

namespace Eigen {
struct half;
}  // namespace Eigen

namespace stream_executor {

class HostBuffer;
class Stream;
class ScratchAllocator;

namespace dnn {

// Specifies an index to use when accessing specific spatial dimensions.
enum class DimIndex : int {
  X = 0,
  Y = 1,
  Z = 2,
};

// Helper functions to make methods more readable.
inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
  return data.rbegin()[static_cast<int64>(dim)];
}

inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
  data.rbegin()[static_cast<int64>(dim)] = value;
}

inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
  return SetDim(absl::MakeSpan(*data), dim, value);
}
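
// Note that the helpers above index spatial dimensions from the minor
// (innermost) end of `data`, so DimIndex::X always refers to the last
// element. A minimal illustrative sketch (not part of the API):
//
//   std::vector<int64> spatial = {4, 8};   // stored as ..., y, x
//   GetDim(spatial, DimIndex::X);          // returns 8
//   SetDim(&spatial, DimIndex::Y, 16);     // spatial is now {16, 8}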

// tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
// open-source. Wrapper function that gives an int64 array slice view of a
// repeated int64 protobuf field.
inline absl::Span<const int64> AsInt64Slice(
    const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
  return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
                                 v.size());
}

inline absl::Span<int64> AsInt64Slice(
    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
  return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
                           v->size());
}

// Returns a string representation of the given data layout.
string DataLayoutString(DataLayout layout);

// Specifies a quantization for activations in a given BatchDescriptor.
enum class QuantizedActivationMode {
  k8Bit = 1,
  k16Bit = 2,
  k32Bit = 4,
};

// A helper class to convert C/C++ types to the proper enums.
template <typename T>
struct ToDataType;
template <>
struct ToDataType<float> {
  static constexpr DataType value = DataType::kFloat;
};
template <>
struct ToDataType<double> {
  static constexpr DataType value = DataType::kDouble;
};
template <>
struct ToDataType<Eigen::half> {
  static constexpr DataType value = DataType::kHalf;
};
template <>
struct ToDataType<int8> {
  static constexpr DataType value = DataType::kInt8;
};
template <>
struct ToDataType<int32> {
  static constexpr DataType value = DataType::kInt32;
};

// Specifies the types of a RNN model.
enum class RnnMode {
  kRnnRelu = 0,
  kRnnTanh = 1,
  kRnnLstm = 2,
  kRnnGru = 3,
};

// Specifies the input model and whether there is a linear transformation
// between the input state and the first layer hidden state.
enum class RnnInputMode {
  kRnnLinearSkip = 0,
  kRnnSkipInput = 1,
};

// Specifies the number of directions used in a RNN model. When bidirectional
// mode is used, the input states and output sequence contain data for both
// directions.
enum class RnnDirectionMode {
  kRnnUnidirectional = 0,
  kRnnBidirectional = 1,
};

// Relevant to DepthToSpace and SpaceToDepth. This is the write layout when
// performing depth to space and the read layout when performing space to
// depth. It's specified with most-major dimension first and most-minor
// dimension last. In DepthToSpace, the D*M values are read in and then, for
// DepthHeightWidth, written out to the output patch, by varying first width,
// then height, then depth. In C array format, it looks like
// [depth][height][width]. See DepthToSpace comment for more information.
enum class DepthToSpaceLayout { DepthHeightWidth };

// Specifies the descriptor for a RNN model.
//
// An example use case:
//   * The user first creates a model through createRnnDescriptor.
//   * The user queries the size of the underlying opaque parameter buffer.
//   * The user creates and initializes a parameter buffer of the proper size.
//   * The user runs forward and backward operations using this RNN descriptor.
//   * Every once in a while, the user queries maintainable weights and bias
//     regions from the underlying parameter buffer. They are more likely to
//     be forward compatible and should be used in saving and restoring a
//     model.
//   * The user releases the RNN descriptor when the model is no longer in use.
class RnnDescriptor {
 public:
  struct ParamsRegion {
    int64 offset;
    int64 size;
  };
  typedef std::vector<ParamsRegion> ParamsRegions;
  virtual ~RnnDescriptor() {}
  virtual int64 ParamsSizeInBytes() const { return -1; }
  virtual ParamsRegions ParamsWeightRegions() const { return ParamsRegions(); }
  virtual ParamsRegions ParamsBiasRegions() const { return ParamsRegions(); }
};
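
// An illustrative sketch of inspecting the opaque parameter buffer layout
// once a descriptor has been created (`rnn_desc` is assumed to be a
// std::unique_ptr<RnnDescriptor> obtained from the platform; the names are
// hypothetical):
//
//   int64 params_bytes = rnn_desc->ParamsSizeInBytes();
//   for (const RnnDescriptor::ParamsRegion& region :
//        rnn_desc->ParamsWeightRegions()) {
//     // Each region is a byte range [offset, offset + size) within the
//     // opaque parameter buffer that holds one weight matrix.
//   }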

// Specifies the sequence in a RNN model.
//
// The user is responsible for releasing this descriptor when it is no longer
// in use. The destructor releases the underlying descriptors.
class RnnSequenceTensorDescriptor {
 public:
  virtual ~RnnSequenceTensorDescriptor() {}
};

// Specifies either the input or hidden state in a RNN model.
//
// The user is responsible for releasing this descriptor when it is no longer
// in use. The destructor releases the underlying descriptors.
class RnnStateTensorDescriptor {
 public:
  virtual ~RnnStateTensorDescriptor() {}
};

// Returns a string representation of the given quantization mode.
string QuantizedActivationModeString(QuantizedActivationMode mode);

// Describes the dimensions that a layer consumes/produces.
//
// This is a matrix (height, width), its "depth" (feature_map_count),
// how many of these matrices are present (count),
// and the maximum and minimum values expected in the matrix (value_max,
// value_min).
// If input is quantized, all values greater
// than value_max will be clipped to value_max and all values less than
// value_min will be clipped to value_min.
// When quantized output is dequantized no value will be greater than
// value_max or less than value_min.
//
// Uses the named argument construction form:
//
//  auto input_batch_dimensions =
//      BatchDescriptor().set_count(42).set_feature_map_count(7)...
//
// Details:
//
// For a convolutional layer, a single inference takes a 3-dimensional matrix
// of input and produces a 3-dimensional matrix of output. We call the three
// dimensions height, width and feature_map_count, where for an image, the
// height and width correspond to the Y and X pixel indices, respectively, and
// the feature_map_count corresponds to the RGB dimension of the input data.
// Then the count indicates how many 3D matrices are being presented to be
// processed at once; this corresponds to the neural network concept of
// minibatch size.
//
// For a fully connected layer, it's better to put the nodes of the layer in
// the feature_map_count, and leave the height and width as degenerate (== 1).
// Count indicates how many input vectors (degenerate 3D matrices) are to be
// processed.
//
// If unspecified, value_max and value_min default to 0.0.
// If value_max == value_min the Stream will attempt to derive valid values -
// for example the output of Relu6 activation will always be in the range
// [0.0, 6.0].
//
// If unspecified, layout defaults to kYXDepthBatch.
class BatchDescriptor {
 public:
  // Creates a "blank" batch descriptor, which should be initialized via the
  // named argument helpers.
  BatchDescriptor();
  explicit BatchDescriptor(int ndims);

  // Clones values from 'other' for initialization.
  void CloneFrom(const BatchDescriptor& other);

  string ToString() const;
  string ToShortString() const;

  // Pre-condition:
  //   value_max_ == 0
  //   value_min_ == 0
  //   quantized_activation_mode_ == QuantizedActivationMode::k8Bit
  TensorDescriptorProto ToProto(DataType data_type) const;

  // Accessors.
  int64 count() const { return tensor_.dimensions(0); }
  int64 feature_map_count() const { return tensor_.dimensions(1); }
  int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
  int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
  int ndims() const { return spatial_size().size(); }
  float value_max() const { return value_max_; }
  float value_min() const { return value_min_; }
  DataLayout layout() const { return tensor_.data_layout(); }
  QuantizedActivationMode quantized_activation_mode() const {
    return quantized_activation_mode_;
  }
  // Full dimensions of the underlying data, ordered according to a specific
  // layout.
  std::vector<int64> full_dims(const DataLayout& layout) const;

  // Full strides of the underlying data, ordered according to a specific
  // layout.
  std::vector<int64> full_strides(const DataLayout& layout) const;

  // Named-argument helpers for avoiding user error during construction.
  BatchDescriptor& set_count(int64 value) {
    tensor_.set_dimensions(0, value);
    return *this;
  }
  BatchDescriptor& set_feature_map_count(int64 value) {
    tensor_.set_dimensions(1, value);
    return *this;
  }
  BatchDescriptor& set_height(int64 value) {
    SetDim(spatial_size(), DimIndex::Y, value);
    return *this;
  }
  BatchDescriptor& set_width(int64 value) {
    SetDim(spatial_size(), DimIndex::X, value);
    return *this;
  }
  BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
    SetDim(spatial_size(), dim, value);
    return *this;
  }
  BatchDescriptor& set_value_max(float value) {
    value_max_ = value;
    return *this;
  }
  BatchDescriptor& set_value_min(float value) {
    value_min_ = value;
    return *this;
  }
  BatchDescriptor& set_layout(DataLayout layout) {
    tensor_.set_data_layout(layout);
    return *this;
  }
  BatchDescriptor& set_quantized_activation_mode(
      QuantizedActivationMode quantized_activation_mode) {
    quantized_activation_mode_ = quantized_activation_mode;
    return *this;
  }

  // Return the number of nodes in a single feature map.
  int64 NodesPerFeatureMap() const;

  // Return the number of nodes across all feature maps. Note that this is not
  // affected by the batch count.
  int64 NodesAcrossFeatureMaps() const;

  // Returns the number of elements (e.g. RGB pixel values) required to hold a
  // given batch descriptor, given a no-padding assumption. Note that this is
  // affected by the batch count.
  int64 ElementCount() const;

  // Return the number of weights required to fully connect a layer with
  // dimensions given by the 'input' descriptor with a layer with dimensions
  // given by the 'output' descriptor.
  static int64 FullyConnectedWeightCount(const BatchDescriptor& input,
                                         const BatchDescriptor& output);

  // Return the number of biases required to fully connect to an output layer
  // with dimensions given the 'output' descriptor.
  static int64 FullyConnectedBiasCount(const BatchDescriptor& output);

  // Return a BatchDescriptor for the output of a depth concatenation
  // with the given input descriptors. The inputs should have the same
  // dimensions, except possibly for feature_map_count(), though this
  // function does not verify that.
  static BatchDescriptor DepthConcatenateOutputDescriptor(
      port::ArraySlice<dnn::BatchDescriptor> inputs);

 private:
  absl::Span<const int64> spatial_size() const {
    return AsInt64Slice(tensor_.dimensions()).subspan(2);
  }

  absl::Span<int64> spatial_size() {
    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
  }

  TensorDescriptorProto tensor_;
  float value_max_;
  float value_min_;
  QuantizedActivationMode quantized_activation_mode_;
};
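
// As a concrete (purely illustrative) sketch of the named-argument form, a
// minibatch of 32 NCHW images with 3 feature maps of 224x224 pixels could be
// described as:
//
//   BatchDescriptor input;
//   input.set_count(32)
//       .set_feature_map_count(3)
//       .set_height(224)
//       .set_width(224)
//       .set_layout(DataLayout::kBatchDepthYX);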

// Returns a string representation of the given filter layout.
string FilterLayoutString(FilterLayout layout);

// Describes a filter for the convolution. This is the "window" from
// height-by-width patches of each of the feature maps in the input layer to
// the cells within the output feature map.
//
// Uses the named argument construction form:
//
//  FilterDescriptor filter_dimensions;
//  filter_dimensions
//      .set_output_feature_map_count(42)
//      .set_input_feature_map_count(7)
//      ...
//
// Arguments:
// - output_feature_map_count: number of feature maps in the output layer.
// - input_feature_map_count: number of feature maps in the input layer (from
//   which the filter patch is taken).
// - input_filter_height: "height" number of neurons used in the sliding
//   window over the input layer.
// - input_filter_width: "width" number of neurons used in the sliding window
//   over the input layer.
//
// Sometimes names like "filter input height" are referred to by synonymous
// terminology, such as "kernel y size".
//
// If unspecified, layout defaults to kOutputInputYX.
class FilterDescriptor {
 public:
  // By default construction, all dimensions are set to zero, so they should
  // all be populated by the user via the named-argument helpers below. (See
  // class comment for details.)
  FilterDescriptor();
  explicit FilterDescriptor(int ndims);
  ~FilterDescriptor();

  // Named-argument helpers for avoiding user error during construction.
  FilterDescriptor& set_output_feature_map_count(int64 value) {
    tensor_.set_dimensions(0, value);
    return *this;
  }
  FilterDescriptor& set_input_feature_map_count(int64 value) {
    tensor_.set_dimensions(1, value);
    return *this;
  }
  FilterDescriptor& set_input_filter_height(int64 value) {
    SetDim(input_filter_dims(), DimIndex::Y, value);
    return *this;
  }
  FilterDescriptor& set_input_filter_width(int64 value) {
    SetDim(input_filter_dims(), DimIndex::X, value);
    return *this;
  }
  FilterDescriptor& set_layout(FilterLayout layout) {
    tensor_.set_filter_layout(layout);
    return *this;
  }
  FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
    SetDim(input_filter_dims(), dim, value);
    return *this;
  }
  int ndims() const { return input_filter_dims().size(); }

  void CloneFrom(const FilterDescriptor& other);

  string ToString() const;
  string ToShortString() const;
  TensorDescriptorProto ToProto(DataType data_type) const;

  // Returns the number of weights required as parameters for a convolution
  // using this filter descriptor.
  int64 ComputeWeightCount() const;

  // Returns the number of biases required as parameters for a convolution
  // using this filter descriptor.
  int64 bias_count() const { return output_feature_map_count(); }

  int64 output_feature_map_count() const { return tensor_.dimensions(0); }
  int64 input_feature_map_count() const { return tensor_.dimensions(1); }
  int64 input_filter_height() const {
    return GetDim(input_filter_dims(), DimIndex::Y);
  }
  int64 input_filter_width() const {
    return GetDim(input_filter_dims(), DimIndex::X);
  }
  int64 input_filter_dim(DimIndex dim) const {
    return GetDim(input_filter_dims(), dim);
  }

  FilterLayout layout() const { return tensor_.filter_layout(); }

  absl::Span<const int64> input_filter_dims() const {
    return AsInt64Slice(tensor_.dimensions()).subspan(2);
  }

 private:
  absl::Span<int64> input_filter_dims() {
    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
  }

  TensorDescriptorProto tensor_;
};
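
// For instance, a 3x3 filter mapping 3 input feature maps to 64 output
// feature maps could be described as follows (values are illustrative only):
//
//   FilterDescriptor filter;
//   filter.set_output_feature_map_count(64)
//       .set_input_feature_map_count(3)
//       .set_input_filter_height(3)
//       .set_input_filter_width(3)
//       .set_layout(FilterLayout::kOutputInputYX);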

// Describes how padding should be aligned when the total number of pad
// elements is odd.
enum class PadAlignment : int64 {
  kDefault = 0,        // default padding for the device.
  kCudnnPadding,       // cuDNN padding - prefer to pad at the start.
  kTensorFlowPadding,  // TensorFlow padding - prefer to pad at the end.
};

// Returns a string representation of the given padding alignment.
string PadAlignmentString(PadAlignment alignment);

// Print alignment to str. Needed to use CHECK_EQ between two PadAlignments.
std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);

// Describes a convolution.
//
// Uses the named argument construction form:
//
//  ConvolutionDescriptor convolution_dimensions;
//  convolution_dimensions
//      .set_vertical_filter_stride(2)
//      .set_horizontal_filter_stride(2)
//      ...
//
// Arguments:
// - zero_padding_height: padding of the "y dimension" of the input data. Note
//   that this is different from the height of the filter.
// - zero_padding_width: analogous to the height above, but in the "x
//   dimension".
// - vertical_filter_stride: the convolution slides a 2-dimensional window of
//   filter-height-by-filter-width over the input layer -- the center of that
//   window is moved in the "y dimension" according to this stride value.
// - horizontal_filter_stride: analogous to the vertical stride above, but in
//   the "x dimension".
// - vertical_dilation_rate: there will be (vertical_dilation_rate - 1)
//   skipped cells between each filter element in the "y dimension".
// - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
//   skipped cells between each filter element in the "x dimension".
// - convolution_not_crosscorr: By default (convolution_not_crosscorr ==
//   false), we perform cross correlation rather than convolution. With the
//   flag set, we perform convolution. Convolution and cross correlation are
//   related by rotating the filter by 180 degrees (or equivalently flipping
//   all spatial dimensions).
class ConvolutionDescriptor {
 public:
  // By default construction, there is no zero-padding and the filter stride
  // is 1x1 (centering the filter on every cell in the input layer's
  // width-by-height area).
  ConvolutionDescriptor();
  explicit ConvolutionDescriptor(int ndims);
  ~ConvolutionDescriptor();

  string ToString() const;
  string ToShortString() const;
  ConvolutionDescriptorProto ToProto() const { return proto_; }

  ConvolutionDescriptor& set_zero_padding_height(int64 value) {
    SetDim(padding(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_zero_padding_width(int64 value) {
    SetDim(padding(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
    SetDim(padding(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
    SetDim(strides(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
    SetDim(strides(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
    SetDim(strides(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
    SetDim(dilations(), DimIndex::Y, value);
    return *this;
  }
  ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
    SetDim(dilations(), DimIndex::X, value);
    return *this;
  }
  ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
    SetDim(dilations(), dim, value);
    return *this;
  }
  ConvolutionDescriptor& set_group_count(int group_count) {
    proto_.set_group_count(group_count);
    return *this;
  }
  ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
    proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
                                     : ConvolutionMode::CROSS_CORRELATION);
    return *this;
  }
  int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
  int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
  int64 vertical_filter_stride() const {
    return GetDim(strides(), DimIndex::Y);
  }
  int64 horizontal_filter_stride() const {
    return GetDim(strides(), DimIndex::X);
  }
  int64 vertical_dilation_rate() const {
    return GetDim(dilations(), DimIndex::Y);
  }
  int64 horizontal_dilation_rate() const {
    return GetDim(dilations(), DimIndex::X);
  }

  int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
  int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
  int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
  // TODO(timshen): remove this function. No users of this class are setting a
  // non-default pad alignment.
  PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
  int group_count() const { return proto_.group_count(); }
  int ndims() const { return padding().size(); }
  bool convolution_not_crosscorr() const {
    return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
  }

  absl::Span<const int64> strides() const {
    return AsInt64Slice(proto_.strides());
  }

  absl::Span<const int64> dilations() const {
    return AsInt64Slice(proto_.dilations());
  }

  absl::Span<const int64> padding() const {
    return AsInt64Slice(proto_.paddings());
  }

 private:
  absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }

  absl::Span<int64> dilations() {
    return AsInt64Slice(proto_.mutable_dilations());
  }

  absl::Span<int64> padding() {
    return AsInt64Slice(proto_.mutable_paddings());
  }

  ConvolutionDescriptorProto proto_;

  // TODO(leary) cudnn provides these fields, but need to characterize what
  // their effect is -- they may be boolean rather than integral.
  // int64 upscale_input_x;
  // int64 upscale_input_y;
};
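
// For example, a stride-2 convolution with one pixel of zero padding on each
// spatial edge could be described as (values are illustrative only):
//
//   ConvolutionDescriptor conv;
//   conv.set_zero_padding_height(1)
//       .set_zero_padding_width(1)
//       .set_vertical_filter_stride(2)
//       .set_horizontal_filter_stride(2);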

// A patch of values in the input can be pooled via either a max or an average
// operation.
// Specify int64 so there's no padding in PoolingDescriptor.
enum class PoolingMode : int64 {
  kMaximum,
  kAverage,
};

// Specify the dimension in which to concatenate inputs in space.
// Specify int64 so there's no padding in SpaceConcatenateMode.
enum class SpaceConcatenateMode : int64 {
  XDirection,
  YDirection,
};

// Returns a short name for the pooling mode, e.g. "Avg".
string ShortPoolingModeString(PoolingMode mode);

// Describes a pooling operation to be enqueued onto a stream via a platform's
// DnnSupport.
//
// TODO(broune): describe how padding works and what happens if the
// window height/width is not divisible by the vertical/horizontal
// stride.
//
// Arguments:
//  pooling_mode: pooling operator to use on the input patch
//  window_height: height of input window
//  window_width: width of input window
//  vertical_stride: vertical delta for center of the input patch
//  horizontal_stride: horizontal delta for center of the input patch
class PoolingDescriptor {
 public:
  PoolingDescriptor();
  explicit PoolingDescriptor(int ndims);

  PoolingDescriptor& set_pooling_mode(PoolingMode value) {
    mode_ = value;
    return *this;
  }
  PoolingDescriptor& set_window_height(int64 value) {
    SetDim(&window_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_window_width(int64 value) {
    SetDim(&window_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_window(DimIndex dim, int64 value) {
    SetDim(&window_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_vertical_padding(int64 value) {
    SetDim(&padding_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_horizontal_padding(int64 value) {
    SetDim(&padding_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_padding(DimIndex dim, int64 value) {
    SetDim(&padding_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_vertical_stride(int64 value) {
    SetDim(&strides_, DimIndex::Y, value);
    return *this;
  }
  PoolingDescriptor& set_horizontal_stride(int64 value) {
    SetDim(&strides_, DimIndex::X, value);
    return *this;
  }
  PoolingDescriptor& set_stride(DimIndex dim, int64 value) {
    SetDim(&strides_, dim, value);
    return *this;
  }
  PoolingDescriptor& set_propagate_nans(bool value) {
    propagate_nans_ = value;
    return *this;
  }

  int ndims() const { return ndims_; }
  void CloneFrom(const PoolingDescriptor& other);

  string ToString() const;
  string ToShortString() const;

  PoolingMode mode() const { return mode_; }
  int64 window_height() const { return GetDim(window_, DimIndex::Y); }
  int64 window_width() const { return GetDim(window_, DimIndex::X); }
  int64 window(DimIndex dim) const { return GetDim(window_, dim); }
  int64 vertical_padding() const { return GetDim(padding_, DimIndex::Y); }
  int64 horizontal_padding() const { return GetDim(padding_, DimIndex::X); }
  int64 padding(DimIndex dim) const { return GetDim(padding_, dim); }
  int64 vertical_stride() const { return GetDim(strides_, DimIndex::Y); }
  int64 horizontal_stride() const { return GetDim(strides_, DimIndex::X); }
  int64 stride(DimIndex dim) const { return GetDim(strides_, dim); }
  absl::Span<const int64> window() const { return window_; }
  absl::Span<const int64> padding() const { return padding_; }
  absl::Span<const int64> strides() const { return strides_; }
  bool propagate_nans() const { return propagate_nans_; }

 private:
  PoolingMode mode_;
  int ndims_;
  bool propagate_nans_;

  // Stored as: ..., y, x.
  std::vector<int64> window_;
  std::vector<int64> padding_;
  std::vector<int64> strides_;
};
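
// For example, a 2x2 max pooling with stride 2 and no padding could be
// described as (values are illustrative only):
//
//   PoolingDescriptor pool;
//   pool.set_pooling_mode(PoolingMode::kMaximum)
//       .set_window_height(2)
//       .set_window_width(2)
//       .set_vertical_stride(2)
//       .set_horizontal_stride(2);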

// Collects parameters for DNN algorithms.
class AlgorithmDesc {
 public:
  typedef int64 Index;
  AlgorithmDesc() : AlgorithmDesc(0, false) {}
  AlgorithmDesc(Index a, bool use_tensor_ops) {
    proto_.set_algo_id(a);
    proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
                                        : AlgorithmProto::DEFAULT_MATH);
  }
  bool tensor_ops_enabled() const {
    return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
  }
  Index algo_id() const { return proto_.algo_id(); }
  bool operator==(const AlgorithmDesc& other) const {
    return algo_id() == other.algo_id() &&
           tensor_ops_enabled() == other.tensor_ops_enabled();
  }
  uint64 hash() const;

  AlgorithmProto ToProto() const { return proto_; }

 private:
  AlgorithmProto proto_;
};

// Describes the result from a perf experiment.
//
// Arguments:
//  algorithm: returns the exact algorithm that was used.
//  elapsed_time_in_ms: returns the measured elapsed time in milliseconds.
class ProfileResult {
 public:
  bool is_valid() const {
    return algorithm_.has_value() &&
           elapsed_time_in_ms() != std::numeric_limits<float>::max();
  }

  AlgorithmDesc algorithm() const { return *algorithm_; }
  void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }

  float elapsed_time_in_ms() const { return elapsed_time_in_ms_; }
  void set_elapsed_time_in_ms(float val) { elapsed_time_in_ms_ = val; }

  size_t scratch_size() const { return scratch_size_; }
  void set_scratch_size(size_t val) { scratch_size_ = val; }

 private:
  absl::optional<AlgorithmDesc> algorithm_;
  float elapsed_time_in_ms_ = std::numeric_limits<float>::max();
  // The scratch size algorithm_ requires. Currently it's only populated by
  // convolutions.
  size_t scratch_size_ = 0;
};

// Describes the configuration for the algorithms that will be used.
//
// Arguments:
//  algorithm: the primary algorithm that should be used.
//  algorithm_no_scratch: a secondary algorithm that should be used, if the
//    allocation for the scratch memory fails.
class AlgorithmConfig {
 public:
  AlgorithmConfig() {}
  explicit AlgorithmConfig(AlgorithmDesc algorithm) : algorithm_(algorithm) {}
  AlgorithmConfig(AlgorithmDesc algorithm, AlgorithmDesc algorithm_no_scratch)
      : algorithm_(algorithm), algorithm_no_scratch_(algorithm_no_scratch) {}
  absl::optional<AlgorithmDesc> algorithm() const { return algorithm_; }
  void set_algorithm(AlgorithmDesc val) { algorithm_ = val; }
  absl::optional<AlgorithmDesc> algorithm_no_scratch() const {
    return algorithm_no_scratch_;
  }
  void set_algorithm_no_scratch(AlgorithmDesc val) {
    algorithm_no_scratch_ = val;
  }
  bool operator==(const AlgorithmConfig& other) const {
    return this->algorithm_ == other.algorithm_ &&
           this->algorithm_no_scratch_ == other.algorithm_no_scratch_;
  }
  bool operator!=(const AlgorithmConfig& other) const {
    return !(*this == other);
  }
  string ToString() const;

 private:
  absl::optional<AlgorithmDesc> algorithm_;
  absl::optional<AlgorithmDesc> algorithm_no_scratch_;
};
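
// An illustrative sketch of pairing a primary algorithm with a fallback that
// needs no scratch space (the algorithm ids below are placeholders, not real
// backend algorithm ids):
//
//   AlgorithmDesc primary(/*a=*/7, /*use_tensor_ops=*/true);
//   AlgorithmDesc no_scratch(/*a=*/0, /*use_tensor_ops=*/false);
//   AlgorithmConfig config(primary, no_scratch);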

// Describes a local response normalization (LRN). LRN is used e.g. in
// dist_belief.
//
// Let V be the vector of feature maps at some (batch, y, x)
// coordinate. LRN applies independently to each vector V in the
// input, across all coordinates (batch, y, x), by mapping each V to
// another vector U of the same size using the formula
//
//   U_i = V_i / ((bias + alpha * (sum_j V_j^2)) ^ beta)
//
// where the sum is taken over j in the closed range [i - range, i + range].
//
// When calculating U_i the j in the sum can extend beyond the bounds
// of V. If wrap_around is true, then V_j = V_{j mod F} where F is the
// size of V, which is the number of feature maps. If wrap_around is
// false, then V_j = 0 for j outside [0, F-1].
//
// If segment_size <= F, where F is the number of feature_maps, then
// segment_size has no effect. Otherwise, each consecutive segment of
// segment_size entries in V is normalized separately.
//
// Not all StreamExecutors allow wrap_around == true or segment_size
// != 64. Some do not implement normalization at all.
class NormalizeDescriptor {
 public:
  NormalizeDescriptor();

  NormalizeDescriptor& set_bias(float bias) {
    bias_ = bias;
    return *this;
  }

  NormalizeDescriptor& set_range(int32 range) {
    range_ = range;
    return *this;
  }

  NormalizeDescriptor& set_alpha(float alpha) {
    alpha_ = alpha;
    return *this;
  }

  NormalizeDescriptor& set_beta(float beta) {
    beta_ = beta;
    return *this;
  }

  NormalizeDescriptor& set_wrap_around(bool wrap_around) {
    wrap_around_ = wrap_around;
    return *this;
  }

  NormalizeDescriptor& set_segment_size(int32 segment_size) {
    segment_size_ = segment_size;
    return *this;
  }

  void CloneFrom(const NormalizeDescriptor& other);

  string ToString() const;
  string ToShortString() const;

  float bias() const { return bias_; }
  int32 range() const { return range_; }
  float alpha() const { return alpha_; }
  float beta() const { return beta_; }
  bool wrap_around() const { return wrap_around_; }
  int32 segment_size() const { return segment_size_; }

 private:
  float bias_;
  int32 range_;
  float alpha_;
  float beta_;
  bool wrap_around_;
  int32 segment_size_;
};
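
// A brief illustrative sketch using AlexNet-style LRN parameters (the values
// are examples only, not defaults of this class):
//
//   NormalizeDescriptor lrn;
//   lrn.set_bias(2.0f).set_range(2).set_alpha(1e-4f).set_beta(0.75f);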

// Returns a string representation of the given activation mode.
string ActivationModeString(ActivationMode mode);

// Describes the operation that DoElementwiseOperation should perform on its
// inputs.
enum class ElementwiseOperation { kAdd, kMultiply };

string ElementwiseOperationString(ElementwiseOperation op);

// A simple class representing the version of the backing library, to
// work around the "too perfect forwarding" issue in gcc6+ compilers.
// See PR#16309 and issue #18402 for links discussing the issue.
class VersionInfo {
 public:
  VersionInfo(int major = 0, int minor = 0, int patch = 0)
      : major_(major), minor_(minor), patch_(patch) {}
  int major_version() const { return major_; }
  int minor_version() const { return minor_; }
  int patch() const { return patch_; }

 private:
  int major_;
  int minor_;
  int patch_;
};

// Suite of operations typically used for implementing Deep/Convolutional
// Neural Nets. Note: A false return value of an operation indicates the
// implementation is not available.
//
// TODO(b/118763918): this class (or rather dispatch table) has several
// problems:
// * Some overloads are missing. Ideally we want to have template virtual
//   functions where the template arguments form a closed set. However, we
//   don't get that from the language.
// * The API is a union of cuDNN and another private backend. Only 10% of the
//   functions are actually implemented by both backends; the rest are
//   backend-specific. The massive interface creates extra mental burden.
// * Poor error handling: the API should return Status objects.
//
// PrepareForConvolution is an example for how new APIs should be written.
class DnnSupport {
 public:
  DnnSupport() {}
  virtual ~DnnSupport() {}

  virtual port::Status Init() = 0;

  // Gets the version of the backing library, as a VersionInfo object.
  virtual port::StatusOr<VersionInfo> GetVersion() {
    return port::UnimplementedError(
        "DnnSupport::GetVersion not implemented on this platform.");
  }

  // Performs a single-precision forward batch normalization operation onto
  // the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the batch normalization
  //    operation should be enqueued onto.
  //  x: input data.
  //  scale: scaling parameters.
  //  offset: offset parameters.
  //  estimated_mean: population mean estimated during training.
  //    Used for inference only; empty for training.
  //  estimated_variance: population variance estimated during training,
  //    used for inference only; empty for training.
  //  x_desc: dimensions of the input data, which is the same as the
  //    dimensions of the output.
  //  scale_offset_desc: dimensions of scale and offset.
  //  epsilon: a small floating point number added to the variance of x.
  //  y: output data.
  //  batch_mean: batch mean, to be used to compute the running mean.
  //  batch_variance: batch variance, to be used to compute
  //    the running variance.
  //  reserve_space_1: saved mean, to be reused in the backward gradient
  //    computation.
  //  reserve_space_2: saved inv_var (1/sqrt(epsilon + variance)), to be
  //    reused in the backward gradient computation.
  //  is_training: Set to true for training, false for inference.
  //  var_to_inv_var: a function to convert the variance to inverted variance
  //    for cuDNN v4 forward inference.
  //  inv_var_to_var: a function to convert the inverted variance to
  //    variance for cuDNN v4 forward training, to be used for TensorFlow
  //    to calculate the running variance.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<float>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* y, DeviceMemory<float>* batch_mean,
      DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      std::function<const DeviceMemory<float>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var) {
    return false;
  }

  // Performs a half-precision forward batch normalization operation onto the
  // stream. See DoBatchNormalizationForward above for argument details.
  virtual bool DoBatchNormalizationForward(
      Stream* stream, const DeviceMemory<Eigen::half>& x,
      const DeviceMemory<float>& scale, const DeviceMemory<float>& offset,
      const DeviceMemory<float>& estimated_mean,
      const DeviceMemory<float>& estimated_variance,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* y, DeviceMemory<float>* batch_mean,
      DeviceMemory<float>* batch_var, DeviceMemory<float>* reserve_space_1,
      DeviceMemory<float>* reserve_space_2, bool is_training,
      std::function<const DeviceMemory<float>&()> var_to_inv_var,
      std::function<void()> inv_var_to_var) {
    return false;
  }

  // Performs a single-precision backward batch normalization gradient
  // computation operation onto the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the batch normalization
  //    gradient computation operation should be enqueued onto.
  //  y_backprop: gradient with regard to output y.
  //  x: input data.
  //  scale: scaling parameters.
  //  inv_var: 1/sqrt(epsilon + variance) of x.
  //  x_desc: dimensions of the input data, which is the same as the
  //    dimensions of the output.
  //  scale_offset_desc: dimensions of scale and offset.
  //  epsilon: a small floating point number added to the variance of x.
  //  x_backprop: gradient with respect to input x.
  //  scale_backprop: gradient with respect to scale.
  //  offset_backprop: gradient with respect to offset.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<float>& y_backprop,
      const DeviceMemory<float>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop) {
    return false;
  }

  // Performs a half-precision backward batch normalization gradient
  // computation operation onto the stream. See DoBatchNormalizationBackward
  // above for argument details.
  virtual bool DoBatchNormalizationBackward(
      Stream* stream, const DeviceMemory<Eigen::half>& y_backprop,
      const DeviceMemory<Eigen::half>& x, const DeviceMemory<float>& scale,
      const DeviceMemory<float>& mean, const DeviceMemory<float>& inv_var,
      const dnn::BatchDescriptor& x_desc,
      const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
      DeviceMemory<Eigen::half>* x_backprop,
      DeviceMemory<float>* scale_backprop,
      DeviceMemory<float>* offset_backprop) {
    return false;
  }

  // Enqueues a fused convolution operation onto the stream.
  // We provide several variants with different types for inputs, biases and
  // scaling parameters.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'convolve' operation
  //    should be enqueued onto.
  //  conv_input_descriptor: dimensions of the convolution input layer.
  //  conv_input_data: un-owned device memory region which contains the
  //    convolution input.
  //  conv_input_scale: a floating point scale to multiply with each element
  //    of conv_input_data.
  //  filter_descriptor: dimensions of the convolution filter.
  //  filter_data: un-owned device memory region which contains the
  //    convolution filter weights.
  //  convolution_descriptor: stride of the convolution filter.
  //  biases: un-owned device memory region containing biases to add to the
  //    convolution result before the activation is applied.
  //  activation_mode: Type of activation to perform.
  //  side_input_data: un-owned device memory region which contains optional
  //    side input data. If 'side_input_scale' is non-zero, then this must
  //    point to data in the tensor shape specified by output_shape.
  //    It will be scaled by 'side_input_scale' and added to the convolution
  //    result and bias prior to applying the activation function.
  //  side_input_scale: a floating point scale to multiply with each element
  //    of side_input_data.
  //  output_descriptor: dimensions of the output layer.
  //  output_data: un-owned device memory region in which to place the
  //    convolution result.
  //  scratch_allocator: un-owned, may-be-null object that may allocate
  //    scratch space in order to speed up the convolution operation.
  //  algorithm_config: specifies which algorithm should be used for the
  //    operation.
  //  output_profile_result: the output profile result for this call. The
  //    profiling is only enabled when this is not nullptr.
  //
  // conv_input_descriptor, filter_descriptor, convolution_descriptor and
  // output_descriptor together specify exactly how the convolution is aligned
  // with the input data:
  //
  // * (input dimensions - filter size + 1) / filter stride == output
  //   dimensions corresponds to dist_belief padding = VALID, i.e. the input
  //   is not padded.
  // * input dimensions / filter stride == output dimensions
  //   corresponds to dist_belief padding = SAME, i.e. input and output are
  //   the same size - this requires padding the input.
  // * (input dimensions + filter size - 1) / filter stride == output
  //   dimensions corresponds to dist_belief padding = FULL, i.e. the output
  //   is sized so that if the inverse of the filter is applied to the output
  //   in VALID mode the result is the same size as the input - this requires
  //   even more padding of the input.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<double>& conv_input_data, double conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<double>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<double>& side_input_data, double side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<double>& biases, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }

  // This is the float version of DoFusedConvolve.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<float>& conv_input_data, float conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<float>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<float>& side_input_data, float side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }

  // This is the Eigen::half version of DoFusedConvolve.
  // The scaling parameters are still floats.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<Eigen::half>& conv_input_data, float conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<Eigen::half>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<Eigen::half>& side_input_data, float side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<Eigen::half>& biases,
      dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<Eigen::half>* output_data,
      ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }

  // This is the int8 version of DoFusedConvolve.
  // The bias input and scaling parameters are floats.
  virtual bool DoFusedConvolve(
      Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
      const DeviceMemory<int8>& conv_input_data, float conv_input_scale,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<int8>& filter_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const DeviceMemory<int8>& side_input_data, float side_input_scale,
      const dnn::BatchDescriptor& bias_descriptor,
      const DeviceMemory<float>& biases, dnn::ActivationMode activation_mode,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<int8>* output_data, ScratchAllocator* scratch_allocator,
      const dnn::AlgorithmConfig& algorithm_config,
      dnn::ProfileResult* output_profile_result) {
    return false;
  }
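
  // As an illustrative sketch (not a prescribed usage), a float fused
  // convolution computing
  //   activation(conv_input_scale * conv(input, filter) +
  //              side_input_scale * side_input + biases)
  // might be invoked as below. All of the descriptor and memory variables
  // here are hypothetical and assumed to be set up by the caller, and `dnn`
  // is a DnnSupport pointer:
  //
  //   bool launched = dnn->DoFusedConvolve(
  //       stream, input_desc, input, /*conv_input_scale=*/1.0f, filter_desc,
  //       filter, conv_desc, side_input, /*side_input_scale=*/0.0f, bias_desc,
  //       biases, ActivationMode::kRelu, output_desc, &output,
  //       scratch_allocator, algorithm_config,
  //       /*output_profile_result=*/nullptr);
  //   // A false return value means this variant is not implemented by the
  //   // backend.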

  template <typename ElementType>
  port::Status PrepareForConvolution(
      ConvolutionKind kind, Stream* stream,
      const BatchDescriptor& batch_descriptor,
      DeviceMemory<ElementType> input_data,
      const FilterDescriptor& filter_descriptor,
      DeviceMemory<ElementType> filter_data,
      const BatchDescriptor& output_descriptor,
      DeviceMemory<ElementType> output_data,
      const ConvolutionDescriptor& convolution_descriptor,
      const AlgorithmConfig& algorithm_config,
      ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc,
      DeviceMemory<uint8>* scratch_memory) {
    return DoPrepareForConvolution(
        kind, ToDataType<ElementType>::value, stream, batch_descriptor,
        input_data, filter_descriptor, filter_data, output_descriptor,
        output_data, convolution_descriptor, algorithm_config,
        scratch_allocator, algorithm_desc, scratch_memory);
  }

  // Enqueues a convolution operation onto the stream.
  //
  // Arguments (all borrowed):
  //  stream: borrowed pointer to the stream that the 'convolve' operation
  //    should be enqueued onto.
  //  input_descriptor: dimensions of the input layer.
  //  input_data: un-owned device memory region which contains the
  //    convolution input.
  //  filter_descriptor: dimensions of the convolution filter.
  //  convolution_descriptor: stride of the convolution filter.
  //  output_descriptor: dimensions of the output layer.
  //  output_data: un-owned device memory region in which to place the
  //    convolution result.
  //  algorithm_desc: specifies which algorithm should be used for the
  //    operation.
  //  scratch: un-owned device memory for scratch space in order to speed up
  //    the convolution operation.
  //  output_profile_result: the output profile result for this call. The
  //    profiling is only enabled when this is not nullptr.
  //
  // input_descriptor, filter_descriptor, convolution_descriptor and
  // output_descriptor together specify exactly how the convolution is aligned
  // with the input data:
  //
  // * (input dimensions - filter size + 1) / filter stride == output
  //   dimensions corresponds to dist_belief padding = VALID, i.e. the input
  //   is not padded.
  // * input dimensions / filter stride == output dimensions
  //   corresponds to dist_belief padding = SAME, i.e. input and output are
  //   the same size - this requires padding the input.
  // * (input dimensions + filter size - 1) / filter stride == output
  //   dimensions corresponds to dist_belief padding = FULL, i.e. the output
  //   is sized so that if the inverse of the filter is applied to the output
  //   in VALID mode the result is the same size as the input - this requires
  //   even more padding of the input.
  virtual port::Status DoConvolve(
      ConvolutionKind kind, DataType element_type, Stream* stream,
      const BatchDescriptor& input_descriptor, DeviceMemoryBase input_data,
      const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data,
      const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data,
      const ConvolutionDescriptor& convolution_descriptor,
      AlgorithmDesc algorithm_desc, DeviceMemory<uint8> scratch_memory,
      ProfileResult* output_profile_result) = 0;

  template <typename ElementType>
  bool DoConvolve(Stream* stream, const dnn::BatchDescriptor& input_descriptor,
                  const DeviceMemory<ElementType>& input_data,
                  const dnn::FilterDescriptor& filter_descriptor,
                  const DeviceMemory<ElementType>& filter_data,
                  const dnn::ConvolutionDescriptor& convolution_descriptor,
                  const dnn::BatchDescriptor& output_descriptor,
                  DeviceMemory<ElementType>* output_data,
                  const dnn::AlgorithmDesc& algorithm_desc,
                  DeviceMemory<uint8>* scratch_memory,
                  ProfileResult* output_profile_result) {
    return IsStatusOk(
        DoConvolve(ConvolutionKind::FORWARD, ToDataType<ElementType>::value,
                   stream, input_descriptor, input_data, filter_descriptor,
                   filter_data, output_descriptor, *output_data,
                   convolution_descriptor, algorithm_desc, *scratch_memory,
                   output_profile_result),
        !output_profile_result);
  }
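
  // A minimal sketch of the intended two-step flow (status/error handling
  // elided; `dnn` is a DnnSupport*, and the stream, descriptors, device
  // memory and allocator are assumed to be set up by the caller):
  //
  //   AlgorithmDesc algorithm_desc;
  //   DeviceMemory<uint8> scratch;
  //   port::Status prep = dnn->PrepareForConvolution(
  //       ConvolutionKind::FORWARD, stream, input_desc, input, filter_desc,
  //       filter, output_desc, output, conv_desc, algorithm_config,
  //       scratch_allocator, &algorithm_desc, &scratch);
  //   port::Status run = dnn->DoConvolve(
  //       ConvolutionKind::FORWARD, ToDataType<float>::value, stream,
  //       input_desc, input, filter_desc, filter, output_desc, output,
  //       conv_desc, algorithm_desc, scratch,
  //       /*output_profile_result=*/nullptr);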

  // Return a list of algorithms supported by the forward convolution pass.
  // cc_major and cc_minor are the compute capabilities of the device.
  virtual bool GetConvolveAlgorithms(
      bool with_winograd_nonfused, int cc_major, int cc_minor,
      std::vector<AlgorithmDesc>* out_algorithms);

  // Returns a list of supported rnn algorithms.
  virtual bool GetRnnAlgorithms(std::vector<AlgorithmDesc>* out_algorithms);

  // Version of DoConvolve that uses pre-quantized 8 bit coefficients.
  // coefficient_scales specifies the scaling of each column of coefficients:
  // original float coefficient[row * num_columns + column] =
  //     quantized coefficient[row * num_columns + column] *
  //     coefficient_scales[column].
  virtual bool DoConvolveQuantized(
      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
      const DeviceMemory<float>& input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<int8>& filter_coefficients,
      const DeviceMemory<float>& coefficient_scales,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data) = 0;

  // Same as DoConvolveQuantized above, but with int16 filter coefficients.
  virtual bool DoConvolveQuantized(
      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
      const DeviceMemory<float>& input_data,
      const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<int16>& filter_coefficients,
      const DeviceMemory<float>& coefficient_scales,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data) = 0;

  // Variation of the above with the weight matrix split into two matrices.
  //  first_weights: Coefficients of the first matrix.
  //  second_weights: Coefficients of the second matrix.
  //  depth_multiplier: specifies the columns of the first matrix and rows
  //    of the second one - first_weights columns = depth_multiplier,
  //    second_weights rows = depth_multiplier *
  //                          filter_descriptor.input_feature_map_count().
  //  see go/separable for documentation on separable convolutions.
  virtual bool DoSeparableConvolve(
      Stream* stream, const BatchDescriptor& input_descriptor,
      const DeviceMemory<float>& input_data,
      const FilterDescriptor& filter_descriptor, int depth_multiplier,
      const DeviceMemory<float>& first_weights,
      const DeviceMemory<float>& second_weights,
      const ConvolutionDescriptor& convolution_descriptor,
      const BatchDescriptor& output_descriptor,
      DeviceMemory<float>* output_data) = 0;

  // Enqueues a single-precision backward convolution (for data) operation
  // onto the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the 'convolve' operation
  //    should be enqueued onto.
  //  filter_descriptor: dimensions of the convolution filter.
  //  filter_data: coefficients for the convolution filter.
  //  output_descriptor: dimensions of the output gradients, which is the same
  //    as the dimensions of the output.
  //  backward_output_data: un-owned device memory region which contains the
  //    backprop of the output.
  //  convolution_descriptor: stride of the convolution filter.
  //  input_descriptor: dimensions of the input layer.
  //  backward_input_data: un-owned device memory region in which to place the
  //    backprop of the input.
  //  scratch_allocator: un-owned, may-be-null object that may allocate
  //    scratch space in order to speed up the convolution operation.
  template <typename ElementType>
  bool DoConvolveBackwardData(
      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
      const DeviceMemory<ElementType>& filter_data,
      const dnn::BatchDescriptor& output_descriptor,
      const DeviceMemory<ElementType>& backward_output_data,
      const dnn::ConvolutionDescriptor& convolution_descriptor,
      const dnn::BatchDescriptor& input_descriptor,
      DeviceMemory<ElementType>* backward_input_data,
      const dnn::AlgorithmDesc& algorithm_desc,
      DeviceMemory<uint8>* scratch_memory,
      ProfileResult* output_profile_result) {
    return IsStatusOk(
        DoConvolve(ConvolutionKind::BACKWARD_DATA,
                   ToDataType<ElementType>::value, stream, input_descriptor,
                   *backward_input_data, filter_descriptor, filter_data,
                   output_descriptor, backward_output_data,
                   convolution_descriptor, algorithm_desc, *scratch_memory,
                   output_profile_result),
        !output_profile_result);
  }

  // Return a list of algorithms supported by the backward convolution pass
  // for data.
  virtual bool GetConvolveBackwardDataAlgorithms(
      bool with_winograd_nonfused, int cc_major, int cc_minor,
      std::vector<AlgorithmDesc>* out_algorithms);

  // Enqueues a single-precision backward convolution (for filter) operation
  // onto the stream.
  //
  // Arguments:
  //  stream: borrowed pointer to the stream that the 'convolve' operation
  //    should be enqueued onto.
  //  input_descriptor: dimensions of the input layer.
  //  input_data: un-owned device memory region which contains the
  //    convolution input.
1377 // output_descriptor: dimensions of the output gradients, which is the same
1378 // as the dimensions of the output.
1379 // backward_output_data: un-owned device memory region which contains the
1380 // backprop of the output.
1381 // convolution_descriptor: stride of the convolution filter.
1382 // filter_descriptor: dimensions of the convolution filter.
1383 // backward_filter_data: un-owned device memory region in which to place the
1384 // backprop of the filter.
1385 // scratch_allocator: un-owned, may-be-null object that may allocate scratch
1386 // space in order to speed up the convolution operation.
1387 template <typename ElementType>
1388 bool DoConvolveBackwardFilter(
1389 Stream* stream, const BatchDescriptor& input_descriptor,
1390 const DeviceMemory<ElementType>& input_data,
1391 const BatchDescriptor& output_descriptor,
1392 const DeviceMemory<ElementType>& backward_output_data,
1393 const ConvolutionDescriptor& convolution_descriptor,
1394 const FilterDescriptor& filter_descriptor,
1395 DeviceMemory<ElementType>* backward_filter_data,
1396 const dnn::AlgorithmDesc& algorithm_desc,
1397 DeviceMemory<uint8>* scratch_memory,
1398 ProfileResult* output_profile_result) {
1399 return IsStatusOk(
1400 DoConvolve(ConvolutionKind::BACKWARD_FILTER,
1401 ToDataType<ElementType>::value, stream, input_descriptor,
1402 input_data, filter_descriptor, *backward_filter_data,
1403 output_descriptor, backward_output_data,
1404 convolution_descriptor, algorithm_desc, *scratch_memory,
1405 output_profile_result),
1406 !output_profile_result);
1407 }
1408
1409 // Return a list of algorithms supported by the backward convolution pass for
1410 // filters.
1411 virtual bool GetConvolveBackwardFilterAlgorithms(
1412 bool with_winograd_nonfused, int cc_major, int cc_minor,
1413 std::vector<AlgorithmDesc>* out_algorithms);
1414
1415 // Enqueues a single-precision backward convolution (for bias) operation onto
1416 // the stream.
1417 //
1418 // Arguments:
1419 // stream: borrowed pointer to the stream that the 'convolve' operation
1420 // should be enqueued onto.
1421 // input_descriptor: dimensions of the input layer.
1422 // input_data: un-owned device memory region which contains the
1423 // convolution input.
1424 // bias_descriptor: dimensions of the bias tensor. Should be the same as the
1425 // input dimensions, but with the spatial dimensions set to 1.
1426 // backward_bias_data: un-owned device memory region in which to place the
1427 // backprop of the bias.
1428 virtual bool DoConvolveBackwardBias(Stream* stream,
1429 const BatchDescriptor& input_descriptor,
1430 const DeviceMemory<float>& input_data,
1431 const BatchDescriptor& bias_descriptor,
1432 DeviceMemory<float>* backward_bias_data) {
1433 return false;
1434 }
1435
1436 virtual bool DoConvolveBackwardBias(
1437 Stream* stream, const BatchDescriptor& input_descriptor,
1438 const DeviceMemory<double>& input_data,
1439 const BatchDescriptor& bias_descriptor,
1440 DeviceMemory<double>* backward_bias_data) {
1441 return false;
1442 }
1443
1444 virtual bool DoConvolveBackwardBias(
1445 Stream* stream, const BatchDescriptor& input_descriptor,
1446 const DeviceMemory<Eigen::half>& input_data,
1447 const BatchDescriptor& bias_descriptor,
1448 DeviceMemory<Eigen::half>* backward_bias_data) {
1449 return false;
1450 }
1451
1452 // Fully connects the "nodes" (float values) in input_data with
1453 // shape input_dimensions to output_data with output_dimensions
1454 // using provided weights. This is equivalent to computing a matrix
1455 // product, hence the name MatMul.
1456 //
1457 // A BatchDescriptor has four dimensions: batch, y, x, depth. Matrix products
1458 // happen in two dimensions. To get down to two dimensions, we consider the
1459 // input y, x and depth dimension as one combined dimension T. For now,
1460 // assume that the output height and width are 1 and let OD be the output
1461 // depth.
1462 //
1463 // There are three device memory buffers passed in to this
1464 // function. We can now view all three as matrices:
1465 //
1466 // input_data: A batch x T matrix
1467 // weights: A T x OD matrix
1468 // output_data: A batch x OD matrix
1469 //
1470 // This function then computes the matrix product of input_data and
1471 // weights and writes the result into output_data.
1472 //
1473 // Here the weights buffer is in row major order, i.e. the first OD
1474 // entries in weights are the first row, the second OD entries in
1475 // weights are the second row and so on.
1476 //
1477 // The case for output width*height > 1 is more complicated. Let K =
1478 // OY * OX where OY is the output height and OX is the output
1479 // width. Then weights is divided into K sub-arrays W_i, for
1480 // i=0,...,K-1, that each represent a T x OD matrix. This function
1481 // then computes the K matrix multiplications of input_data with
1482 // each W_i. This creates K matrices with dimensions batch x
1483 // OD. These K matrices are concatenated horizontally to form one
1484 // larger matrix with dimensions batch x (K*OD); note that this is
1485 // not the same as concatenating the bytes of the matrices. The
1486 // combined matrix can then be interpreted as a tensor with
1487 // dimensions (batch, OY, OX, OD). If the output tensor format is
1488 // not kBatchYXDepth, this function would then need to arrange for
1489 // the output to be in the requested layout, if that is
1490 // supported. Note that the case K=1 is equivalent to the
1491 // description above. It is recommended to prefer the case K=1.
1492 //
1493 // Arguments (all borrowed):
1494 // stream: borrowed pointer to the stream that the 'fully connect' operation
1495 // should be enqueued onto.
1496 // output_data: un-owned device memory region in which to place the
1497 // fully connected result.
1498 virtual bool DoMatMul(Stream* stream, const DeviceMemory<float>& input_data,
1499 const DeviceMemory<float>& weights,
1500 const dnn::BatchDescriptor& input_dimensions,
1501 const dnn::BatchDescriptor& output_dimensions,
1502 DeviceMemory<float>* output_data) = 0;
1503
1504 // Version of DoMatMul that uses pre-quantized 8 bit weights.
1505 // weight_scales specifies the scaling of each column of weights:
1506 // original float weight[row * num_columns + column] =
1507 // quantized_weight[row * num_columns + column] * weight_scales[column].
1508 virtual bool DoMatMulQuantized(Stream* stream,
1509 const DeviceMemory<float>& input_data,
1510 const DeviceMemory<int8>& quantized_weights,
1511 const DeviceMemory<float>& weight_scales,
1512 const dnn::BatchDescriptor& input_dimensions,
1513 const dnn::BatchDescriptor& output_dimensions,
1514 DeviceMemory<float>* output_data) = 0;
1515
1516 // Version of DoMatMul that uses pre-quantized 16 bit weights.
1517 // weight_scales specifies the scaling of each column of weights:
1518 // original float weight[row * num_columns + column] =
1519 // quantized_weight[row * num_columns + column] * weight_scales[column].
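//
// For clarity, the column-wise scaling above corresponds to the following
// host-side dequantization. This is only an illustrative sketch; the helper
// name and the std::vector buffers are hypothetical and not part of this
// interface:
//
//   std::vector<float> DequantizeWeights(const std::vector<int16>& quantized,
//                                        const std::vector<float>& scales,
//                                        int num_rows, int num_columns) {
//     std::vector<float> weights(quantized.size());
//     for (int row = 0; row < num_rows; ++row) {
//       for (int column = 0; column < num_columns; ++column) {
//         // original float weight = quantized weight * per-column scale.
//         weights[row * num_columns + column] =
//             static_cast<float>(quantized[row * num_columns + column]) *
//             scales[column];
//       }
//     }
//     return weights;
//   }
//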
1520 virtual bool DoMatMulQuantized(Stream* stream, 1521 const DeviceMemory<float>& input_data, 1522 const DeviceMemory<int16>& quantized_weights, 1523 const DeviceMemory<float>& weight_scales, 1524 const dnn::BatchDescriptor& input_dimensions, 1525 const dnn::BatchDescriptor& output_dimensions, 1526 DeviceMemory<float>* output_data) = 0; 1527 1528 // Adds biases to the feature maps in input_data producing 1529 // output_data. input_data can equal output_data, but must not 1530 // partially overlap it. 1531 // 1532 // Let K = count() * height() * width() and N = feature_map_count() 1533 // on dimensions. Then input_value contains K*N values and biases 1534 // contains N values. We can thus logically consider input_value to 1535 // contain K vectors of N elements each. This function adds biases 1536 // to each of those N vectors. 1537 // 1538 // TODO(broune): This works differently when width() * height() > 1 1539 // and the call to ThenBiasAdd() follows a call to ThenMatMul(). In 1540 // that case there should be width() * height() * 1541 // feature_map_count() biases, but this is not implemented on all 1542 // StreamExecutors. 1543 // 1544 // Arguments (all borrowed): 1545 // stream: borrowed pointer to the stream that the 'bias add' operation 1546 // should be enqueued onto. 1547 // input_data: un-owned device memory region containing the input. 1548 // biases: un-owned device memory region containing biases to add to the 1549 // input. 1550 // dimensions: dimensions of input_data and output_data. 1551 // output_data: un-owned device memory region in which to place the result. 1552 virtual bool DoBiasAdd(Stream* stream, const DeviceMemory<float>& input_data, 1553 const DeviceMemory<float>& biases, 1554 const dnn::BatchDescriptor& dimensions, 1555 DeviceMemory<float>* output_data) = 0; 1556 1557 // Performs a forward pooling operation on input_data, writing to 1558 // output_data. See PoolingDescriptor for how to configure the 1559 // pooling operation. 1560 // 1561 // Pooling happens as a window that moves across the Y and X 1562 // dimensions of input_data, where each position of the window 1563 // yields one output value. E.g. for max pooling, the computed value 1564 // is the maximum element in the window. The operation is applied 1565 // independently to each batch and at each feature map (depth), so 1566 // that the output depth and feature_map_count are the same as for 1567 // the input. The output width and height can be different. 1568 // 1569 // See PoolingDescriptor for how to configure the pooling operation. 
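//
// A minimal call sketch (illustrative only): assuming `dnn` points at a
// concrete DnnSupport implementation, `stream` is a valid Stream, and the
// pooling/batch descriptors and device buffers have been configured
// elsewhere, a forward pooling pass would be enqueued as:
//
//   bool ok = dnn->DoPoolForward(stream, pooling_dimensions, input_dimensions,
//                                input_data, output_dimensions, &output_data,
//                                /*workspace_allocator=*/nullptr);
//   if (!ok) { /* handle the failed enqueue */ }
//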
1570 virtual bool DoPoolForward(Stream* stream, 1571 const dnn::PoolingDescriptor& pooling_dimensions, 1572 const dnn::BatchDescriptor& input_dimensions, 1573 const DeviceMemory<float>& input_data, 1574 const dnn::BatchDescriptor& output_dimensions, 1575 DeviceMemory<float>* output_data, 1576 ScratchAllocator* workspace_allocator) = 0; 1577 1578 virtual bool DoPoolForward(Stream* stream, 1579 const dnn::PoolingDescriptor& pooling_dimensions, 1580 const dnn::BatchDescriptor& input_dimensions, 1581 const DeviceMemory<double>& input_data, 1582 const dnn::BatchDescriptor& output_dimensions, 1583 DeviceMemory<double>* output_data, 1584 ScratchAllocator* workspace_allocator) { 1585 LOG(FATAL) << "DoPoolForward not implemented for double."; 1586 return false; 1587 } 1588 1589 virtual bool DoPoolForward(Stream* stream, 1590 const dnn::PoolingDescriptor& pooling_dimensions, 1591 const dnn::BatchDescriptor& input_dimensions, 1592 const DeviceMemory<Eigen::half>& input_data, 1593 const dnn::BatchDescriptor& output_dimensions, 1594 DeviceMemory<Eigen::half>* output_data, 1595 ScratchAllocator* workspace_allocator) { 1596 LOG(FATAL) << "DoPoolForward not implemented for float16."; 1597 return false; 1598 } 1599 1600 virtual bool DoPoolForward(Stream* stream, 1601 const dnn::PoolingDescriptor& pooling_dimensions, 1602 const dnn::BatchDescriptor& input_dimensions, 1603 const DeviceMemory<int8>& input_data, 1604 const dnn::BatchDescriptor& output_dimensions, 1605 DeviceMemory<int8>* output_data, 1606 ScratchAllocator* workspace_allocator) { 1607 LOG(FATAL) << "DoPoolForward not implemented for int8."; 1608 return false; 1609 } 1610 1611 // Performs differentiation of the pooling operation. 1612 virtual bool DoPoolBackward(Stream* stream, 1613 const dnn::PoolingDescriptor& pooling_dimensions, 1614 const dnn::BatchDescriptor& input_dimensions, 1615 const DeviceMemory<double>& input_data, 1616 const dnn::BatchDescriptor& output_dimensions, 1617 const DeviceMemory<double>& output_data, 1618 const DeviceMemory<double>& input_diff_data, 1619 DeviceMemory<double>* output_diff_data, 1620 ScratchAllocator* workspace_allocator) { 1621 LOG(FATAL) << "DoPoolBackward not implemented."; 1622 return false; 1623 } 1624 1625 virtual bool DoPoolBackward(Stream* stream, 1626 const dnn::PoolingDescriptor& pooling_dimensions, 1627 const dnn::BatchDescriptor& input_dimensions, 1628 const DeviceMemory<float>& input_data, 1629 const dnn::BatchDescriptor& output_dimensions, 1630 const DeviceMemory<float>& output_data, 1631 const DeviceMemory<float>& input_diff_data, 1632 DeviceMemory<float>* output_diff_data, 1633 ScratchAllocator* workspace_allocator) { 1634 LOG(FATAL) << "DoPoolBackward not implemented."; 1635 return false; 1636 } 1637 1638 virtual bool DoPoolBackward(Stream* stream, 1639 const dnn::PoolingDescriptor& pooling_dimensions, 1640 const dnn::BatchDescriptor& input_dimensions, 1641 const DeviceMemory<Eigen::half>& input_data, 1642 const dnn::BatchDescriptor& output_dimensions, 1643 const DeviceMemory<Eigen::half>& output_data, 1644 const DeviceMemory<Eigen::half>& input_diff_data, 1645 DeviceMemory<Eigen::half>* output_diff_data, 1646 ScratchAllocator* workspace_allocator) { 1647 LOG(FATAL) << "DoPoolBackward not implemented."; 1648 return false; 1649 } 1650 1651 // Applies local response normalization to the values from input_data and 1652 // writes the result to output_data. 1653 // 1654 // See comments on NormalizeDescriptor for a description of local response 1655 // normalization. 
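//
// Sketch of a corresponding call (illustrative only; the NormalizeDescriptor,
// dimensions and device buffers are assumed to be set up elsewhere):
//
//   bool ok = dnn->DoNormalizeWithDimensions(
//       stream, normalize_descriptor, dimensions, input_data, &output_data);
//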
1656 virtual bool DoNormalizeWithDimensions( 1657 Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, 1658 const dnn::BatchDescriptor& dimensions, 1659 const DeviceMemory<float>& input_data, DeviceMemory<float>* output_data) { 1660 return false; 1661 } 1662 1663 // Performs backpropagation for the normalization operation 1664 // 1665 // Given raw data, its corresponding normalized output, and a gradient of some 1666 // unspecified function with respect to the normalized variables, computes the 1667 // gradient of that unspecified function with respect to the raw variables. 1668 // 1669 // The normalized data input array is expected to match the output that would 1670 // be obtained by running the raw data input array through the DoNormalize 1671 // method above. 1672 // 1673 // See comments on NormalizeDescriptor for a description of local response 1674 // normalization. 1675 virtual bool DoNormalizeBackwardWithDimensions( 1676 Stream* stream, const dnn::NormalizeDescriptor& normalize_descriptor, 1677 const dnn::BatchDescriptor& dimensions, 1678 const DeviceMemory<float>& raw_data, 1679 const DeviceMemory<float>& normalized_data, 1680 const DeviceMemory<float>& normalized_variable_gradient, 1681 DeviceMemory<float>* raw_variable_gradient, 1682 ScratchAllocator* workspace_allocator) { 1683 return false; 1684 } 1685 1686 // Applies an activation function (see ActivationMode) to all of the values 1687 // held on the device in 'input_data', whose dimensions are described by 1688 // 'dimensions'. 1689 // 1690 // Arguments (all borrowed): 1691 // stream: borrowed pointer to the stream that the 'activate' operation 1692 // should be enqueued onto. 1693 // activation_mode: Type of activation to perform. 1694 // input_data: un-owned device memory region which contains the 1695 // activate input. 1696 // output_data: un-owned device memory region in which to place the 1697 // activate result. 1698 virtual bool DoActivate(Stream* stream, ActivationMode activation_mode, 1699 const BatchDescriptor& dimensions, 1700 const DeviceMemory<float>& input_data, 1701 DeviceMemory<float>* output_data, uint64 options) { 1702 return false; 1703 } 1704 1705 // Concatenates several layers into one, by concatenating the depth of each 1706 // layer at matching x and y coordinates. 1707 // The inputs must all have the same width and height, the output will have 1708 // the same width and height as the inputs and its depth will be the sum of 1709 // the input depths. 1710 // 1711 // Arguments (all borrowed): 1712 // stream: borrowed pointer to the stream that the 'depth concatenate' 1713 // operation should be enqueued onto. 1714 // input_dimensions: The dimensions of each input. 1715 // input_data: un-owned device memory region which contains the 1716 // input data for each input layer. 1717 // output_data: un-owned device memory region in which to place the 1718 // depth concatenate result. 1719 virtual bool DoDepthConcatenate( 1720 Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions, 1721 port::ArraySlice<const DeviceMemory<float>*> input_data, 1722 DeviceMemory<float>* output_data) = 0; 1723 1724 // Concatenates several layers into one, by concatenating each in the 1725 // x-dimension or y-dimension, based on a user-specified flag. 1726 // For x-concatenation, layers are aligned at matching y and depth 1727 // coordinates, and for y-concatenation, they are aligned at matching x and 1728 // depth coordinates. The inputs must all have the same depth and batch size. 
1729 // For x-concatenation, the inputs must have the same height (y-size), and the
1730 // output will have the same depth and height as the inputs and its width (x-
1731 // size) will be the sum of the input widths. For y-concatenation, the inputs
1732 // must have the same width, and the output will have the same depth and width
1733 // as the inputs, and its height will be the sum of the input heights.
1734 //
1735 // Arguments:
1736 // stream: borrowed pointer to the stream that the 'space concatenate'
1737 // operation should be enqueued onto.
1738 // input_dimensions: the dimensions of each input.
1739 // input_data: un-owned device memory region which contains the input data
1740 // for each input layer.
1741 // output_data: un-owned device memory region in which to place the space
1742 // concatenate result.
1743 // concat_direction: either dnn::SpaceConcatenateMode::XDirection or
1744 // dnn::SpaceConcatenateMode::YDirection.
1745 virtual bool DoSpaceConcatenate(
1746 Stream* stream, port::ArraySlice<dnn::BatchDescriptor> input_dimensions,
1747 port::ArraySlice<const DeviceMemory<float>*> input_data,
1748 DeviceMemory<float>* output_data,
1749 dnn::SpaceConcatenateMode concat_direction) {
1750 return false;
1751 }
1752
1753 // Change the layout of the data by shrinking one dimension (or set of
1754 // dimensions) and growing another dimension (or set of dimensions), while
1755 // keeping the total number of data elements constant, and maintaining the
1756 // current data ordering.
1757 //
1758 // Currently, the only supported operation is depth into space by a power of
1759 // 2. E.g. (y, x, z) -> (y*2, x*2, z/4)
1760 //
1761 // Note that Reshape may not be a no-op, depending on the platform and which
1762 // dimensions are being changed.
1763 //
1764 // Example: forgetting about batch for the moment, let's take a tensor that's
1765 // 2x1x8 (y by x by z) and reshape to a tensor that's 4x2x2. The memory layout
1766 // is row-major order: y,x,z. I.e. z changes the fastest, then x, then y. The
1767 // elements of the tensor range from 0 to 15. The x,y,z indices are below each
1768 // element.
1769 //
1770 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1771 // y0 y0 y0 y0 y0 y0 y0 y0 y1 y1 y1 y1 y1 y1 y1 y1
1772 // x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0 x0
1773 // z0 z1 z2 z3 z4 z5 z6 z7 z0 z1 z2 z3 z4 z5 z6 z7
1774 //
1775 // reshape to 4x2x2
1776 //
1777 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1778 // y0 y0 y0 y0 y1 y1 y1 y1 y2 y2 y2 y2 y3 y3 y3 y3
1779 // x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1 x0 x0 x1 x1
1780 // z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1 z0 z1
1781 virtual bool DoReshape(Stream* stream,
1782 const dnn::BatchDescriptor& input_dimensions,
1783 const DeviceMemory<float>& input_data,
1784 const dnn::BatchDescriptor& output_dimensions,
1785 DeviceMemory<float>* output_data) {
1786 return false;
1787 }
1788
1789 // Depth to space takes an X by Y image with depth D*M and changes it to an
1790 // MX x MY image with depth D. Each input location (x,y) with depth D*M in
1791 // the input image is changed to an MxM contiguous area in the output image,
1792 // with the values being laid out in the raster order by DepthToSpaceLayout,
1793 // and will have a new depth of D.
1794 //
1795 // Example.
1796 // M=2, Din=8, Xin=2, Yin=2. Xout=4, Yout=4, Dout=2
1797 // DepthHeightWidth layout
1798 // Values within a 'cell' are at different depths and same x & y.
1799 // Input: 1800 // abcdefgh ijklmnop 1801 // qrstuvwx yz012345 1802 // Output: 1803 // ae bf im jn 1804 // cg dh ko lp 1805 // qu rv y2 z3 1806 // sw tx 04 15 1807 // 1808 // sqrt_depth_reduction: 'M' in the comment above 1809 virtual bool DoDepthToSpace(Stream* stream, 1810 const dnn::BatchDescriptor& input_dimensions, 1811 const DeviceMemory<float>& input_data, 1812 const DepthToSpaceLayout& depth_to_space_layout, 1813 const int& sqrt_depth_reduction, 1814 DeviceMemory<float>* output_data) { 1815 return false; 1816 } 1817 1818 // Space to depth is the inverse of depth to space. Space to depth takes each 1819 // non-overlapping M by M patch (in the X and Y dimensions) with depth D of 1820 // the input, and transforms it to a 1 by 1 patch with depth D*M. If the 1821 // input has size (MX, MY, D), the output has size (X, Y, D*M). The number of 1822 // data elements is not changed. 1823 // 1824 // Example. 1825 // M=2, Din =2, Xin=4, Yin=4, Dout=8 1826 // DepthHeightWidth layout 1827 // Values within a 'cell' are at different depths and same x & y. 1828 // Input: 1829 // ae bf im jn 1830 // cg dh ko lp 1831 // qu rv y2 z3 1832 // sw tx 04 15 1833 // Output: 1834 // abcdefgh ijklmnop 1835 // qrstuvwx yz012345 1836 // 1837 // sqrt_depth_increase: 'M' in the comment above 1838 virtual bool DoSpaceToDepth(Stream* stream, 1839 const dnn::BatchDescriptor& input_dimensions, 1840 const DeviceMemory<float>& input_data, 1841 const DepthToSpaceLayout& space_to_depth_layout, 1842 const int& sqrt_depth_increase, 1843 DeviceMemory<float>* output_data) { 1844 return false; 1845 } 1846 1847 // Computes the specified operation (e.g. addition or multiplication) 1848 // between corresponding elements in the inputs and stores the result in the 1849 // output element. 1850 // The inputs and output must all have the same dimensions, but may have 1851 // different quantization parameters (min_value and max_value). 1852 // 1853 // Arguments (all borrowed): 1854 // stream: borrowed pointer to the stream that the 'elementwise operation' 1855 // should be enqueued onto. 1856 // operation: The operation to perform. 1857 // input_dimensions: The dimensions of each input. 1858 // input_data: un-owned device memory region which contains the 1859 // input data for each input layer. 1860 // output_dimensions: The dimensions of the output. 1861 // output_data: un-owned device memory region in which to place the 1862 // operation result. 1863 virtual bool DoElementwiseOperate( 1864 Stream* stream, ElementwiseOperation operation, 1865 port::ArraySlice<dnn::BatchDescriptor> input_dimensions, 1866 port::ArraySlice<const DeviceMemory<float>*> input_data, 1867 const dnn::BatchDescriptor& output_dimensions, 1868 DeviceMemory<float>* output_data) = 0; 1869 1870 // Computes the specified operation (e.g. addition or multiplication) 1871 // between corresponding elements in the inputs and stores the result in the 1872 // output element. Each input is multiplied by a scalar constant and the 1873 // result is divided by a scalar constant. 1874 // e.g. To perform Z = 0.9*X + 1.1*Y, set the input multiplicands to 9 and 11 1875 // and the output divisor to 10. 1876 // The inputs and output must all have the same dimensions, but may have 1877 // different quantization parameters (min_value and max_value). 1878 // 1879 // Arguments (all borrowed): 1880 // stream: borrowed pointer to the stream that the 'elementwise operation' 1881 // should be enqueued onto. 1882 // operation: The operation to perform. 
1883 // input_multiplicands: Amount to scale each input. 1884 // output_divisor: Amount to divide the output. 1885 // input_dimensions: The dimensions of each input. 1886 // input_data: un-owned device memory region which contains the 1887 // input data for each input layer. 1888 // output_dimensions: The dimensions of the output. 1889 // output_data: un-owned device memory region in which to place the 1890 // operation result. 1891 virtual bool DoElementwiseOperateScaledQuantized( 1892 Stream* stream, ElementwiseOperation operation, 1893 port::ArraySlice<int> input_multiplicands, int output_divisor, 1894 port::ArraySlice<dnn::BatchDescriptor> input_dimensions, 1895 port::ArraySlice<const DeviceMemory<float>*> input_data, 1896 const dnn::BatchDescriptor& output_dimensions, 1897 DeviceMemory<float>* output_data) { 1898 return false; 1899 } 1900 1901 // Pads the input with zeros in the X and Y dimensions. The feature_map 1902 // dimension is unchanged. 1903 // 1904 // Arguments (all borrowed): 1905 // stream: borrowed pointer to the stream that the 'elementwise operation' 1906 // should be enqueued onto. 1907 // dimensions: The dimensions of the input. 1908 // input_data: un-owned device memory region which contains the 1909 // input data for the input layer. 1910 // left_pad: Amount to pad the input on the left. 1911 // right_pad: Amount to pad the input on the right. 1912 // top_pad: Amount to pad the input at the top (low Y). 1913 // bottom_pad: Amount to pad the input at the bottom (high Y). 1914 // output_data: un-owned device memory region in which to place the 1915 // padded result. 1916 virtual bool DoXYPad(Stream* stream, const dnn::BatchDescriptor &dimensions, 1917 const DeviceMemory<float> &input_data, 1918 int64 left_pad, int64 right_pad, int64 top_pad, 1919 int64 bottom_pad, DeviceMemory<float> *output_data) = 0; 1920 1921 // Extracts a slice of the input in the X and Y dimensions. The feature_map 1922 // dimension is unchanged. 1923 // 1924 // Arguments (all borrowed): 1925 // stream: borrowed pointer to the stream that the 'elementwise operation' 1926 // should be enqueued onto. 1927 // dimensions: The dimensions of the input. 1928 // input_data: un-owned device memory region which contains the 1929 // input data for the input layer. 1930 // left_trim: Amount to cut off the input on the left. 1931 // right_trim: Amount to cut off the input on the right. 1932 // top_trim: Amount to cut off the input at the top (low y). 1933 // bottom_trim: Amount to cut off the input at the bottom (high Y). 1934 // output_data: un-owned device memory region in which to place the 1935 // padded result. 1936 virtual bool DoXYSlice(Stream* stream, const dnn::BatchDescriptor &dimensions, 1937 const DeviceMemory<float> &input_data, 1938 int64 left_trim, int64 right_trim, int64 top_trim, 1939 int64 bottom_trim, DeviceMemory<float> *output_data) = 0; 1940 1941 // Grows the input tensor by replicating the X and Y dimensions. The batch and 1942 // depth/feature_map dimensions are unchanged. Currently, the input tensor is 1943 // limited to X=1 and Y=1. 1944 // 1945 // For example, the input has dimensions x=2, y=3, and replicate_x=3, 1946 // replicate_y=2. The diagonal elements of the output would be: [x0y0, x1y1, 1947 // x0y2, x1y0, x0y1, x1y2]. 1948 // Here is the example as a picture. 
input: 1949 // AB 1950 // CD 1951 // EF 1952 // broadcast result: 1953 // ABABAB 1954 // CDCDCD 1955 // EFEFEF 1956 // ABABAB 1957 // CDCDCD 1958 // EFEFEF 1959 // 1960 // Arguments (all borrowed): 1961 // stream: borrowed pointer to the stream that the 'elementwise operation' 1962 // should be enqueued onto. 1963 // dimensions: The dimensions of the input. 1964 // input_data: un-owned device memory region which contains the 1965 // input data for the input layer. 1966 // replicate_x: Amount to replicate the input's X dimension. 1967 // replicate_y: Amount to replicate the input's Y dimension. 1968 // output_data: un-owned device memory region in which to place the 1969 // padded result. 1970 virtual bool DoXYBroadcast(Stream* stream, 1971 const dnn::BatchDescriptor& dimensions, 1972 const DeviceMemory<float>& input_data, 1973 int64 replicate_x, int64 replicate_y, 1974 DeviceMemory<float>* output_data) { 1975 return false; 1976 } 1977 1978 // Enqueues an asynchronous memcpy of the *quantized* output of a layer (that 1979 // is, bytes instead of scaled floats) into 'host_dst' if they are available 1980 // for the underlying DNN implementation. If this quantized output is not 1981 // available, false is returned, which will place 'stream' into an error 1982 // state. 1983 // 1984 // Arguments (all borrowed): 1985 // stream: borrowed pointer to the stream that the 'quantized memcpy' 1986 // operation should be enqueued onto. 1987 // gpu_unquantized_src: the device memory that contains the unquantized data 1988 // -- this data should also have a corresponding quantized representation 1989 // on the device for this operation to succeed. 1990 // mode: Type of quantization of the data to write into host_dst. 1991 // host_dst: un-owned host memory region that is mutated in place, 1992 // it is clobbered by the values in 'gpu_unquantized_src' when the enqueued 1993 // (asynchronous) memcpy operation is performed. 1994 // size: size in bytes of the host_dst host memory region. 1995 virtual bool DoMemcpyD2HQuantized( 1996 Stream* stream, const DeviceMemory<float>& gpu_unquantized_src, 1997 QuantizedActivationMode mode, void* host_dst, int64 size) = 0; 1998 1999 // Enqueues an asynchronous memcpy of 'host_dst' into the *quantized* input 2000 // of a layer (that is, bytes instead of scaled floats) if they are supported 2001 // by the underlying DNN implementation. If this quantized input is not 2002 // supported, false is returned, which will place 'stream' into an error 2003 // state. 2004 // 2005 // Arguments (all borrowed): 2006 // stream: borrowed pointer to the stream that the 'quantized memcpy' 2007 // operation should be enqueued onto. 2008 // host_src: un-owned host memory region that contains the quantized data. 2009 // size: size in bytes of the host_src host memory region. 2010 // mode: Type of quantization of the data to read from host_src. 2011 // gpu_unquantized_dst: the device memory that is clobbered by the values in 2012 // 'host_src' when the enqueued (asynchronous) memcpy operation is 2013 // performed. -- this data should also have a corresponding quantized 2014 // representation on the device for this operation to 2015 // succeed. 2016 virtual bool DoMemcpyH2DQuantized( 2017 Stream* stream, const void* host_src, int64 size, 2018 QuantizedActivationMode mode, 2019 DeviceMemory<float>* gpu_unquantized_dst) = 0; 2020 2021 // Create an RNN descriptor based on model shapes and configurations. 2022 // The caller retains the ownership of the descriptor. 
2023 //
2024 // Arguments:
2025 // num_layers: the number of layers for an RNN model.
2026 // hidden_size: the size of the hidden state.
2027 // input_size: the size of the input state.
2028 // input_mode: an enum to specify whether a linear transformation is added
2029 // after the input state. If input_size is different from hidden_size, this
2030 // is required.
2031 // direction_mode: an enum to specify whether this model is unidirectional or
2032 // bidirectional.
2033 // rnn_mode: an enum to specify the type of model to build.
2034 // data_type: an enum to specify the data types used in this model.
2035 // dropout: the dropout threshold between layers. When it is 0., no dropout
2036 // is added.
2037 // seed: a seed for initializing the dropout layers.
2038 // state_allocator: a memory allocator that will be used to store the state
2039 // for the dropout layer. The user has to maintain the memory until the model
2040 // is no longer in use.
2041 virtual port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
2042 createRnnDescriptor(int num_layers, int hidden_size, int input_size,
2043 int batch_size, dnn::RnnInputMode input_mode,
2044 dnn::RnnDirectionMode direction_mode,
2045 dnn::RnnMode rnn_mode, dnn::DataType data_type,
2046 const dnn::AlgorithmConfig& algorithm_config,
2047 float dropout, uint64 seed,
2048 ScratchAllocator* state_allocator) {
2049 return port::Status(port::error::UNIMPLEMENTED,
2050 "createRnnDescriptor is unimplemented");
2051 }
2052
2053 // Create an RNN sequence descriptor that specifies either the input or output
2054 // sequence. The caller retains the ownership of the returned descriptor.
2055 //
2056 // Arguments:
2057 // max_seq_length: the max length of the sequences.
2058 // batch_size: the size of a minibatch.
2059 // data_size: the size of the state.
2060 // seq_lengths: the lengths of sequences in a batch.
2061 // data_type: an enum to specify the type for the underlying data.
2062 virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
2063 createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
2064 int data_size, dnn::DataType data_type) {
2065 return port::Status(port::error::UNIMPLEMENTED,
2066 "createRnnSequenceTensorDescriptor is unimplemented");
2067 }
2068
2069 virtual port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
2070 createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size,
2071 int data_size,
2072 const absl::Span<const int>& seq_lengths,
2073 bool time_major, dnn::DataType data_type) {
2074 return port::Status(port::error::UNIMPLEMENTED,
2075 "createRnnSequenceTensorDescriptor is unimplemented");
2076 }
2077
2078 // Create an RNN state descriptor that specifies the input or hidden state.
2079 // The caller retains the ownership of the returned descriptor.
2080 virtual port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
2081 createRnnStateTensorDescriptor(int num_layer, int batch_size, int data_size,
2082 dnn::DataType data_type) {
2083 return port::Status(port::error::UNIMPLEMENTED,
2084 "createRnnStateTensorDescriptor is unimplemented");
2085 }
2086
2087 // Enqueue a forward operation of the RNN model onto the stream.
2088 //
2089 // Arguments:
2090 // stream: pointer to the stream where this operation should be enqueued to.
2091 // rnn_desc: an RNN descriptor created by createRnnDescriptor.
2092 // input_desc: descriptor for the input sequence.
2093 // input_data: the device memory region that contains the input data.
2094 // input_h_desc: descriptor for the input "h" state.
2095 // input_h_data: the device memory region that contains the input "h" data.
2096 // input_c_desc: descriptor for the input "c" state.
2097 // input_c_data: the device memory region that contains the input "c" data.
2098 // This must be specified for LSTM models.
2099 // params: the device memory region that contains the parameters used in this
2100 // model.
2101 // output_desc: descriptor for the output sequence.
2102 // output_data: the memory region that stores the output sequence data.
2103 // output_h_desc: descriptor for the output "h" state.
2104 // output_h_data: the memory region that stores the output "h" data.
2105 // output_c_desc: descriptor for the output "c" state.
2106 // output_c_data: the memory region that stores the output "c" data. This
2107 // must be specified for LSTM models.
2108 // is_training: whether this is used in training or inference. That decides
2109 // whether reserve_space data needs to be produced.
2110 // reserve_space_allocator: if "is_training" is true, a memory allocator
2111 // to create memory that holds the produced reserve_space. The caller
2112 // retains the data and feeds it to the backward pass.
2113 // workspace_allocator: an allocator to create temporary workspace used in
2114 // this kernel. The caller is responsible for retaining the memory long
2115 // enough for the lifespan of this operation, and recycles it afterwards.
2116 virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2117 const dnn::RnnSequenceTensorDescriptor& input_desc,
2118 const DeviceMemory<Eigen::half>& input_data,
2119 const dnn::RnnStateTensorDescriptor& input_h_desc,
2120 const DeviceMemory<Eigen::half>& input_h_data,
2121 const dnn::RnnStateTensorDescriptor& input_c_desc,
2122 const DeviceMemory<Eigen::half>& input_c_data,
2123 const DeviceMemory<Eigen::half>& params,
2124 const dnn::RnnSequenceTensorDescriptor& output_desc,
2125 DeviceMemory<Eigen::half>* output_data,
2126 const dnn::RnnStateTensorDescriptor& output_h_desc,
2127 DeviceMemory<Eigen::half>* output_h_data,
2128 const dnn::RnnStateTensorDescriptor& output_c_desc,
2129 DeviceMemory<Eigen::half>* output_c_data,
2130 bool is_training,
2131 ScratchAllocator* reserve_space_allocator,
2132 ScratchAllocator* workspace_allocator,
2133 dnn::ProfileResult* output_profile_result) {
2134 return false;
2135 }
2136
2137 virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2138 const dnn::RnnSequenceTensorDescriptor& input_desc,
2139 const DeviceMemory<float>& input_data,
2140 const dnn::RnnStateTensorDescriptor& input_h_desc,
2141 const DeviceMemory<float>& input_h_data,
2142 const dnn::RnnStateTensorDescriptor& input_c_desc,
2143 const DeviceMemory<float>& input_c_data,
2144 const DeviceMemory<float>& params,
2145 const dnn::RnnSequenceTensorDescriptor& output_desc,
2146 DeviceMemory<float>* output_data,
2147 const dnn::RnnStateTensorDescriptor& output_h_desc,
2148 DeviceMemory<float>* output_h_data,
2149 const dnn::RnnStateTensorDescriptor& output_c_desc,
2150 DeviceMemory<float>* output_c_data,
2151 bool is_training,
2152 ScratchAllocator* reserve_space_allocator,
2153 ScratchAllocator* workspace_allocator,
2154 dnn::ProfileResult* output_profile_result) {
2155 return false;
2156 }
2157
2158 virtual bool DoRnnForward(Stream* stream, const dnn::RnnDescriptor& rnn_desc,
2159 const dnn::RnnSequenceTensorDescriptor& input_desc,
2160 const DeviceMemory<double>& input_data,
2161 const dnn::RnnStateTensorDescriptor& input_h_desc,
2162 const DeviceMemory<double>& input_h_data,
2163 const dnn::RnnStateTensorDescriptor& input_c_desc,
2164 const DeviceMemory<double>& input_c_data,
2165 const DeviceMemory<double>& params,
2166 const dnn::RnnSequenceTensorDescriptor& output_desc,
2167 DeviceMemory<double>* output_data,
2168 const dnn::RnnStateTensorDescriptor& output_h_desc,
2169 DeviceMemory<double>* output_h_data,
2170 const dnn::RnnStateTensorDescriptor& output_c_desc,
2171 DeviceMemory<double>* output_c_data,
2172 bool is_training,
2173 ScratchAllocator* reserve_space_allocator,
2174 ScratchAllocator* workspace_allocator,
2175 dnn::ProfileResult* output_profile_result) {
2176 return false;
2177 }
2178 // Enqueue a backward operation of the RNN model onto the stream.
2179 //
2180 // Arguments:
2181 // stream: pointer to the stream where this operation should be enqueued to.
2182 // rnn_desc: an RNN descriptor created by createRnnDescriptor.
2183 // input_desc: descriptor for the input sequence.
2184 // input_data: the device memory region that contains the input data.
2185 // input_h_desc: descriptor for the input "h" state.
2186 // input_h_data: the device memory region that contains the input "h" data.
2187 // input_c_desc: descriptor for the input "c" state.
2188 // input_c_data: the device memory region that contains the input "c" data.
2189 // This must be specified for LSTM models.
2190 // params: the device memory region that contains the parameters used in this
2191 // model.
2192 // output_desc: descriptor for the output sequence.
2193 // output_data: the memory region that stores the output sequence data.
2194 // output_h_desc: descriptor for the output "h" state.
2195 // output_h_data: the memory region that stores the output "h" data.
2196 // output_c_desc: descriptor for the output "c" state.
2197 // output_c_data: the memory region that stores the output "c" data. This
2198 // must be specified for LSTM models.
2199 // output_backprop_data: the device memory region that contains the backprop
2200 // to the output sequence.
2201 // output_h_backprop_data: the device memory region that contains the
2202 // backprop to the output "h" state.
2203 // output_c_backprop_data: the device memory region that contains the
2204 // backprop to the output "c" state.
2205 // input_backprop_data: the device memory region that stores the backprop
2206 // to the input sequence.
2207 // input_h_backprop_data: the device memory region that stores the backprop
2208 // to the input "h" state.
2209 // input_c_backprop_data: the device memory region that stores the backprop
2210 // to the input "c" state.
2211 // params_backprop_data: the device memory region that stores the backprop
2212 // to the parameters.
2213 // reserve_space_data: the reserve_space data that is produced by the forward
2214 // operation. This memory region could be modified by this operation.
2215 // workspace_allocator: a memory allocator that creates the temporary
2216 // workspace memory used by this operation. The caller is responsible for
2217 // keeping the memory alive long enough for this operation, and recycles it
2218 // afterwards.
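//
// Putting the RNN pieces together, a typical sequence of calls looks roughly
// like the sketch below. This is illustrative only: the sizes are arbitrary,
// error handling is elided, and `dnn`, `state_allocator` and the device
// buffers are assumed to exist elsewhere.
//
//   dnn::AlgorithmConfig algorithm_config;  // default algorithm selection
//   auto rnn_desc = dnn->createRnnDescriptor(
//                          /*num_layers=*/2, /*hidden_size=*/128,
//                          /*input_size=*/64, /*batch_size=*/32,
//                          dnn::RnnInputMode::kRnnLinearSkip,
//                          dnn::RnnDirectionMode::kRnnUnidirectional,
//                          dnn::RnnMode::kRnnLstm, dnn::DataType::kFloat,
//                          algorithm_config, /*dropout=*/0.f, /*seed=*/0,
//                          state_allocator)
//                       .ConsumeValueOrDie();
//   auto input_desc = dnn->createRnnSequenceTensorDescriptor(
//                            /*max_seq_length=*/20, /*batch_size=*/32,
//                            /*data_size=*/64, dnn::DataType::kFloat)
//                         .ConsumeValueOrDie();
//   auto state_desc = dnn->createRnnStateTensorDescriptor(
//                            /*num_layer=*/2, /*batch_size=*/32,
//                            /*data_size=*/128, dnn::DataType::kFloat)
//                         .ConsumeValueOrDie();
//   // A DoRnnForward call with is_training=true then produces the reserve
//   // space that the DoRnnBackward overloads below consume.
//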
2219 virtual bool DoRnnBackward( 2220 Stream* stream, const dnn::RnnDescriptor& rnn_desc, 2221 const dnn::RnnSequenceTensorDescriptor& input_desc, 2222 const DeviceMemory<Eigen::half>& input_data, 2223 const dnn::RnnStateTensorDescriptor& input_h_desc, 2224 const DeviceMemory<Eigen::half>& input_h_data, 2225 const dnn::RnnStateTensorDescriptor& input_c_desc, 2226 const DeviceMemory<Eigen::half>& input_c_data, 2227 const DeviceMemory<Eigen::half>& params, 2228 const dnn::RnnSequenceTensorDescriptor& output_desc, 2229 const DeviceMemory<Eigen::half>& output_data, 2230 const dnn::RnnStateTensorDescriptor& output_h_desc, 2231 const DeviceMemory<Eigen::half>& output_h_data, 2232 const dnn::RnnStateTensorDescriptor& output_c_desc, 2233 const DeviceMemory<Eigen::half>& output_c_data, 2234 const DeviceMemory<Eigen::half>& output_backprop_data, 2235 const DeviceMemory<Eigen::half>& output_h_backprop_data, 2236 const DeviceMemory<Eigen::half>& output_c_backprop_data, 2237 DeviceMemory<Eigen::half>* input_backprop_data, 2238 DeviceMemory<Eigen::half>* input_h_backprop_data, 2239 DeviceMemory<Eigen::half>* input_c_backprop_data, 2240 DeviceMemory<Eigen::half>* params_backprop_data, 2241 DeviceMemory<uint8>* reserve_space_data, 2242 ScratchAllocator* workspace_allocator, 2243 dnn::ProfileResult* output_profile_result) { 2244 return false; 2245 } 2246 2247 virtual bool DoRnnBackward( 2248 Stream* stream, const dnn::RnnDescriptor& rnn_desc, 2249 const dnn::RnnSequenceTensorDescriptor& input_desc, 2250 const DeviceMemory<float>& input_data, 2251 const dnn::RnnStateTensorDescriptor& input_h_desc, 2252 const DeviceMemory<float>& input_h_data, 2253 const dnn::RnnStateTensorDescriptor& input_c_desc, 2254 const DeviceMemory<float>& input_c_data, 2255 const DeviceMemory<float>& params, 2256 const dnn::RnnSequenceTensorDescriptor& output_desc, 2257 const DeviceMemory<float>& output_data, 2258 const dnn::RnnStateTensorDescriptor& output_h_desc, 2259 const DeviceMemory<float>& output_h_data, 2260 const dnn::RnnStateTensorDescriptor& output_c_desc, 2261 const DeviceMemory<float>& output_c_data, 2262 const DeviceMemory<float>& output_backprop_data, 2263 const DeviceMemory<float>& output_h_backprop_data, 2264 const DeviceMemory<float>& output_c_backprop_data, 2265 DeviceMemory<float>* input_backprop_data, 2266 DeviceMemory<float>* input_h_backprop_data, 2267 DeviceMemory<float>* input_c_backprop_data, 2268 DeviceMemory<float>* params_backprop_data, 2269 DeviceMemory<uint8>* reserve_space_data, 2270 ScratchAllocator* workspace_allocator, 2271 dnn::ProfileResult* output_profile_result) { 2272 return false; 2273 } 2274 2275 virtual bool DoRnnBackward( 2276 Stream* stream, const dnn::RnnDescriptor& rnn_desc, 2277 const dnn::RnnSequenceTensorDescriptor& input_desc, 2278 const DeviceMemory<double>& input_data, 2279 const dnn::RnnStateTensorDescriptor& input_h_desc, 2280 const DeviceMemory<double>& input_h_data, 2281 const dnn::RnnStateTensorDescriptor& input_c_desc, 2282 const DeviceMemory<double>& input_c_data, 2283 const DeviceMemory<double>& params, 2284 const dnn::RnnSequenceTensorDescriptor& output_desc, 2285 const DeviceMemory<double>& output_data, 2286 const dnn::RnnStateTensorDescriptor& output_h_desc, 2287 const DeviceMemory<double>& output_h_data, 2288 const dnn::RnnStateTensorDescriptor& output_c_desc, 2289 const DeviceMemory<double>& output_c_data, 2290 const DeviceMemory<double>& output_backprop_data, 2291 const DeviceMemory<double>& output_h_backprop_data, 2292 const DeviceMemory<double>& 
output_c_backprop_data, 2293 DeviceMemory<double>* input_backprop_data, 2294 DeviceMemory<double>* input_h_backprop_data, 2295 DeviceMemory<double>* input_c_backprop_data, 2296 DeviceMemory<double>* params_backprop_data, 2297 DeviceMemory<uint8>* reserve_space_data, 2298 ScratchAllocator* workspace_allocator, 2299 dnn::ProfileResult* output_profile_result) { 2300 return false; 2301 } 2302 2303 // Transforms a tensor into another tensor with a different layout and/or data 2304 // type. 2305 // 2306 // Arguments: 2307 // stream: pointer to the stream where this operation should be enqueued to. 2308 // input_desc: specifies the shape and the data layout of the input tensor. 2309 // input_type: the data type of the input tensor. 2310 // input_data: the device memory region that contains the input tensor. 2311 // output_desc: specifies the shape and the data layout of the output tensor. 2312 // output_type: the data type of the output tensor. 2313 // scale: an element-wise scaling factor to apply. 2314 // output_data: the device memory region that contains the output tensor. 2315 virtual bool DoTransformTensor(Stream* stream, 2316 const dnn::BatchDescriptor& input_desc, 2317 dnn::DataType input_type, 2318 const DeviceMemoryBase& input_data, 2319 const dnn::BatchDescriptor& output_desc, 2320 dnn::DataType output_type, float scale, 2321 DeviceMemoryBase* output_data) { 2322 return false; 2323 } 2324 2325 // Enqueues a fused convolution+bias+activation operation onto the stream. 2326 // 2327 // Arguments (all borrowed): 2328 // 2329 // stream: borrowed pointer to the stream that the 'fusion' operation should 2330 // be enqueued onto. 2331 // 2332 // conv_input_descriptor: dimensions of the convolution input layer. 2333 // conv_input_data: device memory which contains the convolution input. 2334 // 2335 // filter_descriptor: dimensions of the convolution filter. 2336 // filter_data: device memory which contains the convolution filter weights. 2337 // 2338 // convolution_descriptor: stride of the convolution filter. 2339 // 2340 // bias_descriptor: dimensions of the bias layer 2341 // biases: device memory region containing biases to add to the convolution 2342 // output 2343 // 2344 // activation_mode: Type of activation to perform. 2345 // 2346 // output_descriptor: dimensions of the output layer. 2347 // output_data: device memory region in which to place the fusion result. 2348 // 2349 // output_profile_result: the output profile result for this call. 2350 // The profiling is only enabled when this is not nullptr. 2351 // 2352 virtual bool DoFusedConvolutionBiasActivation( 2353 Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor, 2354 const DeviceMemory<float>& conv_input_data, 2355 const dnn::FilterDescriptor& filter_descriptor, 2356 const DeviceMemory<float>& filter_data, 2357 const dnn::ConvolutionDescriptor& convolution_descriptor, 2358 const dnn::BatchDescriptor& bias_descriptor, 2359 const DeviceMemory<float>& bias_data, dnn::ActivationMode activation_mode, 2360 const dnn::BatchDescriptor& output_descriptor, 2361 DeviceMemory<float>* output_data, 2362 dnn::ProfileResult* output_profile_result) { 2363 return false; 2364 } 2365 2366 // Enqueues a fused batchnorm+activation (inference) operation onto the 2367 // stream. 2368 // 2369 // Arguments (all borrowed): 2370 // 2371 // stream: borrowed pointer to the stream that the 'fusion' operation should 2372 // be enqueued onto. 2373 // 2374 // x_descriptor: dimensions of the batchnorm input layer. 
2375 // x_data: device memory which contains the batchnorm input. 2376 // 2377 // scale_offset_mean_variance_descriptor: 2378 // dimensions of the scale/offset/mean/variance tensor. 2379 // scale_data: device memory which contains the scale input. 2380 // offset_data: device memory which contains the offset input. 2381 // mean_data: device memory which contains the mean input. 2382 // variance_data: device memory which contains the variance input. 2383 // epsilon : the epsilon value to use in batchnorm calculation 2384 // 2385 // activation_mode: Type of activation to perform. 2386 // 2387 // y_data: device memory region in which to place the fusion result. 2388 // 2389 // output_profile_result: the output profile result for this call. 2390 // The profiling is only enabled when this is not nullptr. 2391 // 2392 virtual bool DoFusedBatchNormActivationInference( 2393 Stream* stream, const dnn::BatchDescriptor& x_descriptor, 2394 const DeviceMemory<float>& x_data, 2395 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 2396 const DeviceMemory<float>& scale_data, 2397 const DeviceMemory<float>& offset_data, 2398 const DeviceMemory<float>& mean_data, 2399 const DeviceMemory<float>& variance_data, double epsilon, 2400 dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data, 2401 dnn::ProfileResult* output_profile_result) { 2402 return false; 2403 } 2404 2405 virtual bool DoFusedBatchNormActivationInference( 2406 Stream* stream, const dnn::BatchDescriptor& x_descriptor, 2407 const DeviceMemory<Eigen::half>& x_data, 2408 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 2409 const DeviceMemory<float>& scale_data, 2410 const DeviceMemory<float>& offset_data, 2411 const DeviceMemory<float>& mean_data, 2412 const DeviceMemory<float>& variance_data, double epsilon, 2413 dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data, 2414 dnn::ProfileResult* output_profile_result) { 2415 return false; 2416 } 2417 2418 // Enqueues a fused batchnorm+activation (training-fwd) operation onto the 2419 // stream. 2420 // 2421 // Arguments (all borrowed): 2422 // 2423 // stream: borrowed pointer to the stream that the 'fusion' operation should 2424 // be enqueued onto. 2425 // 2426 // x_descriptor: dimensions of the batchnorm input layer. 2427 // x_data: device memory which contains the batchnorm input. 2428 // 2429 // scale_offset_mean_variance_descriptor: 2430 // dimensions of the scale/offset/mean/variance tensor. 2431 // scale_data: device memory which contains the scale input. 2432 // offset_data: device memory which contains the offset input. 2433 // epsilon : the epsilon value to use in batchnorm calculation 2434 // 2435 // activation_mode: Type of activation to perform. 2436 // 2437 // y_data: device memory region in which to place the fusion result. 2438 // batch_mean_data: device memory in which to place the batch mean output. 2439 // batch_var_data: device memory in which to place the batch variance output. 2440 // saved_mean_data: device memory in which to save the mean for bwd pass. 2441 // saved_var_data: device memory in which to save the variance for bwd pass. 2442 // 2443 // output_profile_result: the output profile result for this call. 2444 // The profiling is only enabled when this is not nullptr. 
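//
// A call sketch for the float overload below (illustrative only; all
// descriptors and device buffers are assumed to be created and sized
// elsewhere):
//
//   bool ok = dnn->DoFusedBatchNormActivationForward(
//       stream, x_descriptor, x_data, scale_offset_mean_variance_descriptor,
//       scale_data, offset_data, /*epsilon=*/1e-5,
//       dnn::ActivationMode::kRelu, &y_data, &batch_mean_data,
//       &batch_var_data, &saved_mean_data, &saved_var_data,
//       /*output_profile_result=*/nullptr);
//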
2445 //
2446 virtual bool DoFusedBatchNormActivationForward(
2447 Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2448 const DeviceMemory<float>& x_data,
2449 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2450 const DeviceMemory<float>& scale_data,
2451 const DeviceMemory<float>& offset_data, double epsilon,
2452 dnn::ActivationMode activation_mode, DeviceMemory<float>* y_data,
2453 DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
2454 DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
2455 dnn::ProfileResult* output_profile_result) {
2456 return false;
2457 }
2458
2459 virtual bool DoFusedBatchNormActivationForward(
2460 Stream* stream, const dnn::BatchDescriptor& x_descriptor,
2461 const DeviceMemory<Eigen::half>& x_data,
2462 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor,
2463 const DeviceMemory<float>& scale_data,
2464 const DeviceMemory<float>& offset_data, double epsilon,
2465 dnn::ActivationMode activation_mode, DeviceMemory<Eigen::half>* y_data,
2466 DeviceMemory<float>* batch_mean_data, DeviceMemory<float>* batch_var_data,
2467 DeviceMemory<float>* saved_mean_data, DeviceMemory<float>* saved_var_data,
2468 dnn::ProfileResult* output_profile_result) {
2469 return false;
2470 }
2471
2472 // Enqueues a fused batchnorm+activation (training-bwd) operation onto the
2473 // stream.
2474 //
2475 // Arguments (all borrowed):
2476 //
2477 // stream: borrowed pointer to the stream that the 'fusion' operation should
2478 // be enqueued onto.
2479 //
2480 // y_act_backprop_descriptor: dimensions of the backprop input from the
2481 // previous layer. y_act_backprop_data: device memory which contains the
2482 // backprop input.
2483 //
2484 // y_act_data: device memory which contains the actv-fwd output data.
2485 //
2486 // activation_mode: actv-fwd type.
2487 //
2488 // scale_offset_mean_variance_descriptor:
2489 // dimensions of the scale/offset/mean/variance tensor.
2490 // scale_data: device memory which contains the scale input.
2491 // offset_data: device memory which contains the offset input.
2492 // saved_mean_data: device memory which contains the saved mean from fwd
2493 // pass. saved_var_data: device memory which contains the saved variance from
2494 // fwd pass.
2495 //
2496 // x_bn_backprop_data: device memory region in which to place the backprop
2497 // data from this layer. scale_backprop_data: device memory in which to place
2498 // the scale backprop output. offset_backprop_data: device memory in which to
2499 // place the offset backprop output.
2500 //
2501 // output_profile_result: the output profile result for this call.
2502 // The profiling is only enabled when this is not nullptr.
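//
// A matching backward-pass sketch (illustrative only): it reuses the
// saved_mean_data/saved_var_data produced by the forward call and assumes the
// remaining descriptors and buffers exist elsewhere.
//
//   bool ok = dnn->DoFusedBatchNormActivationBackward(
//       stream, y_act_backprop_descriptor, y_act_backprop_data, y_act_data,
//       dnn::ActivationMode::kRelu, x_bn_data,
//       scale_offset_mean_variance_descriptor, scale_data, offset_data,
//       saved_mean_data, saved_var_data, &x_bn_backprop_data,
//       &scale_backprop_data, &offset_backprop_data,
//       /*output_profile_result=*/nullptr);
//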
2503 // 2504 virtual bool DoFusedBatchNormActivationBackward( 2505 Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor, 2506 const DeviceMemory<float>& y_act_backprop_data, 2507 const DeviceMemory<float>& y_act_data, 2508 dnn::ActivationMode activation_mode, const DeviceMemory<float>& x_bn_data, 2509 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 2510 const DeviceMemory<float>& scale_data, 2511 const DeviceMemory<float>& offset_data, 2512 const DeviceMemory<float>& saved_mean_data, 2513 const DeviceMemory<float>& saved_var_data, 2514 DeviceMemory<float>* x_bn_backprop_data, 2515 DeviceMemory<float>* scale_backprop_data, 2516 DeviceMemory<float>* offset_backprop_data, 2517 dnn::ProfileResult* output_profile_result) { 2518 return false; 2519 } 2520 2521 virtual bool DoFusedBatchNormActivationBackward( 2522 Stream* stream, const dnn::BatchDescriptor& y_act_backprop_descriptor, 2523 const DeviceMemory<Eigen::half>& y_act_backprop_data, 2524 const DeviceMemory<Eigen::half>& y_act_data, 2525 dnn::ActivationMode activation_mode, 2526 const DeviceMemory<Eigen::half>& x_bn_data, 2527 const dnn::BatchDescriptor& scale_offset_mean_variance_descriptor, 2528 const DeviceMemory<float>& scale_data, 2529 const DeviceMemory<float>& offset_data, 2530 const DeviceMemory<float>& saved_mean_data, 2531 const DeviceMemory<float>& saved_var_data, 2532 DeviceMemory<Eigen::half>* x_bn_backprop_data, 2533 DeviceMemory<float>* scale_backprop_data, 2534 DeviceMemory<float>* offset_backprop_data, 2535 dnn::ProfileResult* output_profile_result) { 2536 return false; 2537 } 2538 2539 protected: 2540 // Returns whether status is 'ok', and potentially logs the error. 2541 static bool IsStatusOk(const port::Status& status, bool report_error); 2542 2543 private: 2544 virtual port::Status DoPrepareForConvolution( 2545 ConvolutionKind kind, DataType element_type, Stream* stream, 2546 const BatchDescriptor& batch_descriptor, DeviceMemoryBase input_data, 2547 const FilterDescriptor& filter_descriptor, DeviceMemoryBase filter_data, 2548 const BatchDescriptor& output_descriptor, DeviceMemoryBase output_data, 2549 const ConvolutionDescriptor& convolution_descriptor, 2550 const AlgorithmConfig& algorithm_config, 2551 ScratchAllocator* scratch_allocator, AlgorithmDesc* algorithm_desc, 2552 DeviceMemory<uint8>* scratch_memory) { 2553 *algorithm_desc = {}; 2554 *scratch_memory = {}; 2555 return port::Status::OK(); 2556 } 2557 2558 SE_DISALLOW_COPY_AND_ASSIGN(DnnSupport); 2559 }; 2560 2561 } // namespace dnn 2562 } // namespace stream_executor 2563 2564 #endif // TENSORFLOW_STREAM_EXECUTOR_DNN_H_ 2565