/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA

#include <algorithm>
#include <cmath>
#include <tuple>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"

namespace tensorflow {

// Gets the cuDNN workspace limit from the named environment variable, which is
// specified in MB. Returns the limit in bytes. If the variable is not set,
// returns the given default value (also in bytes).
int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
                             int64 default_value_in_bytes);

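// Example (a sketch; the environment variable name and default below mirror
// what the conv kernels conventionally pass, but callers may use any pair):
//
//   int64 workspace_limit =
//       GetCudnnWorkspaceLimit("TF_CUDNN_WORKSPACE_LIMIT_IN_MB",
//                              1LL << 32 /* 4GB */);
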
// A scratch-space allocator passed to StreamExecutor cuDNN callbacks.
// TensorFlow is responsible for releasing the temporary buffers after the
// kernel finishes.
class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator {
 public:
  virtual ~CudnnScratchAllocator() {}
  CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context)
      : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {}
  int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override {
    return memory_limit_;
  }
  perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>>
  AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override {
    Tensor temporary_memory;
    if (byte_size < 0) {
      return perftools::gputools::port::Status{
          perftools::gputools::port::error::INVALID_ARGUMENT,
          "Requested negative byte size!"};
    }
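    // A default-constructed StatusOr signals that no scratch memory could be
    // provided for this request (here: the request exceeds the configured
    // memory limit).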
    if (byte_size > memory_limit_) {
      return perftools::gputools::port::StatusOr<
          perftools::gputools::DeviceMemory<uint8>>();
    }
    AllocationAttributes allocation_attr;
    allocation_attr.no_retry_on_failure = true;
    Status allocation_status(context_->allocate_temp(
        DT_UINT8, TensorShape({byte_size}), &temporary_memory,
        AllocatorAttributes(), allocation_attr));
    if (!allocation_status.ok()) {
      return perftools::gputools::port::StatusOr<
          perftools::gputools::DeviceMemory<uint8>>();
    }
    // Hold a reference to the allocated tensors until the allocator is
    // destroyed, so the backing memory stays valid while cuDNN uses it.
    allocated_tensors_.push_back(temporary_memory);
    total_byte_size_ += byte_size;
    return perftools::gputools::port::StatusOr<
        perftools::gputools::DeviceMemory<uint8>>(
        AsDeviceMemory(temporary_memory.flat<uint8>().data(),
                       temporary_memory.flat<uint8>().size()));
  }
  int64 TotalByteSize() { return total_byte_size_; }

 private:
  int64 memory_limit_;
  int64 total_byte_size_;
  OpKernelContext* context_;
  std::vector<Tensor> allocated_tensors_;
};

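// Example usage (an illustrative sketch; `ctx` stands for the calling kernel's
// OpKernelContext and `workspace_limit` comes from GetCudnnWorkspaceLimit):
//
//   CudnnScratchAllocator scratch_allocator(workspace_limit, ctx);
//   // Pass &scratch_allocator to the StreamExecutor convolution call; the
//   // temporary tensors it allocates via AllocateBytes() are held until the
//   // allocator is destroyed after the kernel finishes.
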
// Encapsulates all the shape information used by both forward and backward
// conv operations.
class ConvParameters {
 public:
  using SpatialArray = gtl::InlinedVector<int64, 3>;
  ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in,
                 int64 out_depths, const SpatialArray& filter,
                 const SpatialArray& dilation, const SpatialArray& stride,
                 const SpatialArray& padding, DataType dtype, int device_id)
      : batch_(batch),
        in_depths_(in_depths),
        out_depths_(out_depths),
        in_(in),
        filter_(filter),
        dilation_(dilation),
        stride_(stride),
        padding_(padding),
        dtype_(dtype),
        device_id_(device_id) {
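    // Fold every parameter into a single 64-bit hash so that instances can be
    // used directly as keys in hash-based autotune maps.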
    hash_code_ = batch;
    hash_code_ = Hash64Combine(hash_code_, in_depths);
    for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val);
    hash_code_ = Hash64Combine(hash_code_, out_depths);
    for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val);
    for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val);
    hash_code_ = Hash64Combine(hash_code_, dtype);
    hash_code_ = Hash64Combine(hash_code_, device_id);
  }
  bool operator==(const ConvParameters& other) const {
    return this->get_data_as_tuple() == other.get_data_as_tuple();
  }

  bool operator!=(const ConvParameters& other) const {
    return !(*this == other);
  }
  uint64 hash() const { return hash_code_; }

  string ToString() const {
    // clang-format off
    return strings::StrCat(
        batch_, ", ", in_depths_, ", ",
        "(", str_util::Join(in_, ", "), "), ",
        out_depths_, ", ",
        "(", str_util::Join(filter_, ", "), "), ",
        "(", str_util::Join(dilation_, ", "), "), ",
        "(", str_util::Join(stride_, ", "), "), ",
        "(", str_util::Join(padding_, ", "), "), ",
        dtype_, ", ",
        device_id_);
    // clang-format on
  }

  // TODO(yangzihao): This function disables the Winograd nonfused conv
  // algorithm for certain input parameters to avoid a bug in cuDNN v5 and
  // cuDNN v6. Remove it once we switch to cuDNN v7.
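  // Illustrative numbers (not from the original comment): with batch_ = 32,
  // in_depths_ = out_depths_ = 64, a 256x256 input, and T = float, total_size
  // is 32 * 64 * 256 * 256 * 4 bytes = 512MB, which is below the 2GB
  // (1LL << 31) threshold, so the nonfused Winograd algorithm stays enabled.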
  template <typename T>
  bool ShouldIncludeWinogradNonfusedAlgo() const {
    int64 total_size = 16 * std::ceil(batch_ / 16.0) *
                       std::max(in_depths_, out_depths_) * in_[0] * in_[1] *
                       sizeof(T);
    int64 threshold = 1LL << 31;
    if (total_size >= threshold) {
      return false;
    } else {
      return true;
    }
  }

 protected:
  using ParameterDataType =
      std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray,
                 SpatialArray, SpatialArray, DataType, int>;

  ParameterDataType get_data_as_tuple() const {
    return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_,
                           dilation_, stride_, padding_, dtype_, device_id_);
  }

  uint64 hash_code_;

 private:
  int64 batch_;
  int64 in_depths_;
  int64 out_depths_;
  SpatialArray in_;
  SpatialArray filter_;
  SpatialArray dilation_;
  SpatialArray stride_;
  SpatialArray padding_;
  DataType dtype_;
  int device_id_;
};

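// Example usage (an illustrative sketch with made-up shapes; in the conv
// kernels an instance like this typically serves as the key of an autotune
// map from ConvParameters to the best-performing cuDNN algorithm):
//
//   ConvParameters conv_parameters(
//       /*batch=*/32, /*in_depths=*/64, /*in=*/{256, 256},
//       /*out_depths=*/128, /*filter=*/{3, 3}, /*dilation=*/{1, 1},
//       /*stride=*/{1, 1}, /*padding=*/{1, 1}, DT_FLOAT, /*device_id=*/0);
//   if (conv_parameters.ShouldIncludeWinogradNonfusedAlgo<float>()) {
//     // The Winograd nonfused algorithm may be considered during autotuning.
//   }
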
typedef Eigen::GpuDevice GPUDevice;

}  // namespace tensorflow

#endif  // GOOGLE_CUDA

#endif  // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_