/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_
#define TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_

#if GOOGLE_CUDA

#include <algorithm>
#include <cmath>
#include <tuple>
#include <unordered_map>
#include <vector>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/kernels/gpu_utils.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/lib/hash/hash.h"

namespace tensorflow {

// Get the Cudnn workspace limit from the environment variable, which is in MB.
// Return the workspace memory limit in bytes. If no value is set, return the
// default value.
int64 GetCudnnWorkspaceLimit(const string& envvar_in_mb,
                             int64 default_value_in_bytes);

// A class to provide scratch-space allocator for Stream-Executor Cudnn
// callback. TensorFlow is responsible for releasing the temporary buffers after
// the kernel finishes.
39 class CudnnScratchAllocator : public perftools::gputools::ScratchAllocator { 40 public: 41 virtual ~CudnnScratchAllocator() {} 42 CudnnScratchAllocator(int64 memory_limit, OpKernelContext* context) 43 : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} 44 int64 GetMemoryLimitInBytes(perftools::gputools::Stream* stream) override { 45 return memory_limit_; 46 } 47 perftools::gputools::port::StatusOr<perftools::gputools::DeviceMemory<uint8>> 48 AllocateBytes(perftools::gputools::Stream* stream, int64 byte_size) override { 49 Tensor temporary_memory; 50 if (byte_size < 0) { 51 return perftools::gputools::port::Status{ 52 perftools::gputools::port::error::INVALID_ARGUMENT, 53 "Requested negative byte size!"}; 54 } 55 if (byte_size > memory_limit_) { 56 return perftools::gputools::port::StatusOr< 57 perftools::gputools::DeviceMemory<uint8>>(); 58 } 59 AllocationAttributes allocation_attr; 60 allocation_attr.no_retry_on_failure = true; 61 Status allocation_status(context_->allocate_temp( 62 DT_UINT8, TensorShape({byte_size}), &temporary_memory, 63 AllocatorAttributes(), allocation_attr)); 64 if (!allocation_status.ok()) { 65 return perftools::gputools::port::StatusOr< 66 perftools::gputools::DeviceMemory<uint8>>(); 67 } 68 // Hold the reference of the allocated tensors until the end of the 69 // allocator. 70 allocated_tensors_.push_back(temporary_memory); 71 total_byte_size_ += byte_size; 72 return perftools::gputools::port::StatusOr< 73 perftools::gputools::DeviceMemory<uint8>>( 74 AsDeviceMemory(temporary_memory.flat<uint8>().data(), 75 temporary_memory.flat<uint8>().size())); 76 } 77 int64 TotalByteSize() { return total_byte_size_; } 78 79 private: 80 int64 memory_limit_; 81 int64 total_byte_size_; 82 OpKernelContext* context_; 83 std::vector<Tensor> allocated_tensors_; 84 }; 85 86 // Encapsulate all the shape information that is used in both forward and 87 // backward conv operations. 
88 class ConvParameters { 89 public: 90 using SpatialArray = gtl::InlinedVector<int64, 3>; 91 ConvParameters(int64 batch, int64 in_depths, const SpatialArray& in, 92 int64 out_depths, const SpatialArray& filter, 93 const SpatialArray& dilation, const SpatialArray& stride, 94 const SpatialArray& padding, DataType dtype, int device_id) 95 : batch_(batch), 96 in_depths_(in_depths), 97 out_depths_(out_depths), 98 in_(in), 99 filter_(filter), 100 dilation_(dilation), 101 stride_(stride), 102 padding_(padding), 103 dtype_(dtype), 104 device_id_(device_id) { 105 hash_code_ = batch; 106 hash_code_ = Hash64Combine(hash_code_, in_depths); 107 for (int64 val : in) hash_code_ = Hash64Combine(hash_code_, val); 108 hash_code_ = Hash64Combine(hash_code_, out_depths); 109 for (int64 val : filter) hash_code_ = Hash64Combine(hash_code_, val); 110 for (int64 val : dilation) hash_code_ = Hash64Combine(hash_code_, val); 111 for (int64 val : stride) hash_code_ = Hash64Combine(hash_code_, val); 112 for (int64 val : padding) hash_code_ = Hash64Combine(hash_code_, val); 113 hash_code_ = Hash64Combine(hash_code_, dtype); 114 hash_code_ = Hash64Combine(hash_code_, device_id); 115 } 116 bool operator==(const ConvParameters& other) const { 117 return this->get_data_as_tuple() == other.get_data_as_tuple(); 118 } 119 120 bool operator!=(const ConvParameters& other) const { 121 return !(*this == other); 122 } 123 uint64 hash() const { return hash_code_; } 124 125 string ToString() const { 126 // clang-format off 127 return strings::StrCat( 128 batch_, ", ", in_depths_, ", ", 129 "(", str_util::Join(in_, ", "), "), ", 130 out_depths_, ", ", 131 "(", str_util::Join(filter_, ", "), "), ", 132 "(", str_util::Join(dilation_, ", "), "), ", 133 "(", str_util::Join(stride_, ", "), "), ", 134 "(", str_util::Join(padding_, ", "), "), ", 135 dtype_, ", ", 136 device_id_); 137 // clang-format on 138 } 139 140 // TODO(yangzihao): The purpose of this function is to disable winograd 141 // nonfused conv 
algorithm for certain input parameters so as to avoid a bug 142 // in cuDNNv5 and cuDNNv6. Remove this once switch to cuDNNv7. 143 template <typename T> 144 bool ShouldIncludeWinogradNonfusedAlgo() const { 145 int64 total_size = 16 * std::ceil(batch_ / 16.0) * 146 std::max(in_depths_, out_depths_) * in_[0] * in_[1] * 147 sizeof(T); 148 int64 threshold = 1LL << 31; 149 if (total_size >= threshold) { 150 return false; 151 } else { 152 return true; 153 } 154 } 155 156 protected: 157 using ParameterDataType = 158 std::tuple<int64, int64, SpatialArray, int64, SpatialArray, SpatialArray, 159 SpatialArray, SpatialArray, DataType, int>; 160 161 ParameterDataType get_data_as_tuple() const { 162 return std::make_tuple(batch_, in_depths_, in_, out_depths_, filter_, 163 dilation_, stride_, padding_, dtype_, device_id_); 164 } 165 166 uint64 hash_code_; 167 168 private: 169 int64 batch_; 170 int64 in_depths_; 171 int64 out_depths_; 172 SpatialArray in_; 173 SpatialArray filter_; 174 SpatialArray dilation_; 175 SpatialArray stride_; 176 SpatialArray padding_; 177 DataType dtype_; 178 int device_id_; 179 }; 180 181 typedef Eigen::GpuDevice GPUDevice; 182 183 } // namespace tensorflow 184 185 #endif // GOOGLE_CUDA 186 187 #endif // TENSORFLOW_CORE_KERNELS_CONV_OPS_GPU_H_ 188