1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" 17 #include "tensorflow/core/framework/tensor.h" 18 #include "tensorflow/core/graph/node_builder.h" 19 #include "tensorflow/core/kernels/ops_util.h" 20 #include "tensorflow/core/platform/test.h" 21 #include "tensorflow/core/platform/test_benchmark.h" 22 #include "tensorflow/core/util/tensor_format.h" 23 24 namespace tensorflow { 25 namespace { 26 27 // Creates a Graph which applies a unary "func" on a 3D tensor of 28 // type T with "num" elements. 29 template <typename T> 30 static Graph* Unary(const string& func, int num, DataType dtype) { 31 Graph* g = new Graph(OpRegistry::Global()); 32 Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)})); 33 CHECK_GT(data.NumElements(), 0); 34 data.flat<T>().setRandom(); 35 test::graph::Unary(g, func, test::graph::Constant(g, data), 0); 36 return g; 37 } 38 39 const int kRows = 100000; 40 41 int RowsAndColsArg(int r, int c) { return r * kRows + c; } 42 int RowsFromArg(int arg) { return (arg / kRows); } 43 int ColsFromArg(int arg) { return (arg % kRows); } 44 45 #define BM_UNARY(DEVICE, FUNC, T, TYPE) \ 46 void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) { \ 47 const int64 tot = static_cast<int64>(iters) * num; \ 48 testing::ItemsProcessed(tot); \ 49 testing::BytesProcessed(tot * sizeof(T)); \ 50 test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \ 51 } \ 52 BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20); 53 54 BM_UNARY(cpu, Floor, float, DT_FLOAT); 55 #if GOOGLE_CUDA 56 BM_UNARY(gpu, Floor, float, DT_FLOAT); 57 #endif // GOOGLE_CUDA 58 #ifdef TENSORFLOW_USE_SYCL 59 BM_UNARY(sycl, Floor, float, DT_FLOAT); 60 #endif // TENSORFLOW_USE_SYCL 61 62 BM_UNARY(cpu, Floor, double, DT_DOUBLE); 63 #if GOOGLE_CUDA 64 BM_UNARY(gpu, Floor, double, DT_DOUBLE); 65 #endif // GOOGLE_CUDA 66 #ifdef TENSORFLOW_USE_SYCL 67 BM_UNARY(sycl, Floor, double, DT_DOUBLE); 68 #endif // TENSORFLOW_USE_SYCL 69 70 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64); 71 #if GOOGLE_CUDA 72 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64); 73 #endif // GOOGLE_CUDA 74 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128); 75 #if GOOGLE_CUDA 76 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128); 77 #endif // GOOGLE_CUDA 78 79 BM_UNARY(cpu, Rint, double, DT_DOUBLE); 80 #if GOOGLE_CUDA 81 BM_UNARY(gpu, Rint, double, DT_DOUBLE); 82 #endif // GOOGLE_CUDA 83 BM_UNARY(cpu, Rint, float, DT_FLOAT); 84 #if GOOGLE_CUDA 85 BM_UNARY(gpu, Rint, float, DT_FLOAT); 86 #endif // GOOGLE_CUDA 87 88 // data func scalar. 89 Graph* BinaryScalar(int num, const string& func) { 90 Graph* g = new Graph(OpRegistry::Global()); 91 Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)})); 92 lhs.flat<float>().setRandom(); 93 Tensor rhs(DT_FLOAT, TensorShape({})); 94 rhs.flat<float>().setRandom(); 95 test::graph::Binary(g, func, test::graph::Constant(g, lhs), 96 test::graph::Constant(g, rhs)); 97 return g; 98 } 99 100 #define BM_BINARY_SCALAR(DEVICE, FUNC) \ 101 void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) { \ 102 const int64 tot = static_cast<int64>(iters) * num; \ 103 testing::ItemsProcessed(tot); \ 104 testing::BytesProcessed(tot * sizeof(float)); \ 105 test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \ 106 } \ 107 BENCHMARK(BM_##DEVICE##_##FUNC##_scalar) \ 108 ->Arg(4096) /* must >= 4096 */ \ 109 ->Arg(32768) \ 110 ->Arg(131072) \ 111 ->Arg(1048576); 112 113 BM_BINARY_SCALAR(cpu, Less); 114 #if GOOGLE_CUDA 115 BM_BINARY_SCALAR(gpu, Less); 116 #endif // GOOGLE_CUDA 117 #ifdef TENSORFLOW_USE_SYCL 118 BM_BINARY_SCALAR(sycl, Less); 119 #endif // TENSORFLOW_USE_SYCL 120 121 BM_BINARY_SCALAR(cpu, Add); 122 #if GOOGLE_CUDA 123 BM_BINARY_SCALAR(gpu, Add); 124 #endif // GOOGLE_CUDA 125 #ifdef TENSORFLOW_USE_SYCL 126 BM_BINARY_SCALAR(sycl, Add); 127 #endif // TENSORFLOW_USE_SYCL 128 #undef BM_BINARY_SCALAR 129 130 template <class T> 131 Graph* BiasAdd(int rows, int cols, DataType type) { 132 Graph* g = new Graph(OpRegistry::Global()); 133 Tensor lhs(type, TensorShape({rows, cols})); 134 lhs.template flat<T>().setRandom(); 135 TensorShape rhs_shape; 136 rhs_shape = TensorShape({cols}); 137 Tensor rhs(type, rhs_shape); 138 rhs.template flat<T>().setRandom(); 139 test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs), 140 test::graph::Constant(g, rhs)); 141 return g; 142 } 143 144 #define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C) \ 145 void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(int iters, int arg) { \ 146 const int rows = RowsFromArg(arg); \ 147 const int cols = ColsFromArg(arg); \ 148 const int64 tot = static_cast<int64>(iters) * rows * cols; \ 149 testing::ItemsProcessed(tot); \ 150 testing::BytesProcessed(tot * sizeof(C_TYPE)); \ 151 test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \ 152 } \ 153 BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C) \ 154 ->Arg(RowsAndColsArg(R, C)); 155 156 #define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE) \ 157 BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 2048); \ 158 BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 4096); \ 159 BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 2048, 512); \ 160 BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 4096, 512); 161 162 using Eigen::half; 163 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT); 164 #if GOOGLE_CUDA 165 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT); 166 #endif // GOOGLE_CUDA 167 BM_BIAS_ADD_ALL(cpu, half, DT_HALF); 168 #if GOOGLE_CUDA 169 BM_BIAS_ADD_ALL(gpu, half, DT_HALF); 170 #endif // GOOGLE_CUDA 171 #undef BM_BIAS_ADD_ALL 172 #undef BM_BIAS_ADD 173 174 template <class T> 175 Graph* BiasAddGrad(int rows, int cols, int channels, DataType type, 176 TensorFormat format) { 177 Graph* g = new Graph(OpRegistry::Global()); 178 TensorShape lhs_shape; 179 if (format == FORMAT_NCHW) { 180 lhs_shape = TensorShape({channels, rows, cols}); 181 } else { 182 lhs_shape = TensorShape({rows, cols, channels}); 183 } 184 Tensor lhs(type, lhs_shape); 185 lhs.template flat<T>().setRandom(); 186 Node* n; 187 TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BiasAddGrad") 188 .Attr("data_format", ToString(format)) 189 .Input(test::graph::Constant(g, lhs), /*src_index=*/0) 190 .Finalize(g, &n)); 191 return g; 192 } 193 194 #define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH) \ 195 void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH( \ 196 int iters, int arg, int channels) { \ 197 const int rows = RowsFromArg(arg); \ 198 const int cols = ColsFromArg(arg); \ 199 const int64 tot = static_cast<int64>(iters) * rows * cols * channels; \ 200 testing::ItemsProcessed(tot); \ 201 testing::BytesProcessed(tot * sizeof(C_TYPE)); \ 202 test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels, \ 203 TF_TYPE, FORMAT_##FMT)) \ 204 .Run(iters); \ 205 } \ 206 BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \ 207 ->ArgPair(RowsAndColsArg(R, C), CH); 208 209 #define BM_BIAS_ADD_GRAD_ALL(DEVICE, FORMAT, C_TYPE, TF_TYPE) \ 210 BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 64, 64, 64); \ 211 BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 4); \ 212 BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 1); \ 213 BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 4); \ 214 BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1); 215 216 using Eigen::half; 217 #if GOOGLE_CUDA 218 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT); 219 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF); 220 #endif // GOOGLE_CUDA 221 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT); 222 #if GOOGLE_CUDA 223 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT); 224 #endif // GOOGLE_CUDA 225 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF); 226 #if GOOGLE_CUDA 227 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF); 228 #endif // GOOGLE_CUDA 229 #undef BM_BIAS_ADD_GRAD_ALL 230 #undef BM_BIAS_ADD_GRAD 231 232 Graph* BcastAdd(int rows, int cols, int dim) { 233 Graph* g = new Graph(OpRegistry::Global()); 234 Tensor lhs(DT_FLOAT, TensorShape({rows, cols})); 235 lhs.flat<float>().setRandom(); 236 TensorShape rhs_shape; 237 if (dim == 0) { 238 rhs_shape = TensorShape({rows, 1}); 239 } else { 240 rhs_shape = TensorShape({cols}); 241 } 242 Tensor rhs(DT_FLOAT, rhs_shape); 243 rhs.flat<float>().setRandom(); 244 test::graph::Binary(g, "Add", test::graph::Constant(g, lhs), 245 test::graph::Constant(g, rhs)); 246 return g; 247 } 248 249 #define BM_BCAST_ADD_ROW(DEVICE, R, C) \ 250 void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \ 251 const int rows = RowsFromArg(arg); \ 252 const int cols = ColsFromArg(arg); \ 253 const int64 tot = static_cast<int64>(iters) * rows * cols; \ 254 testing::ItemsProcessed(tot); \ 255 testing::BytesProcessed(tot * sizeof(float)); \ 256 test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters); \ 257 } \ 258 BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); 259 260 #define BM_BCAST_ADD_ROW_ALL(DEVICE) \ 261 BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \ 262 BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \ 263 BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \ 264 BM_BCAST_ADD_ROW(DEVICE, 4096, 512); 265 BM_BCAST_ADD_ROW_ALL(cpu); 266 #if GOOGLE_CUDA 267 BM_BCAST_ADD_ROW_ALL(gpu); 268 #endif // GOOGLE_CUDA 269 #ifdef TENSORFLOW_USE_SYCL 270 BM_BCAST_ADD_ROW_ALL(sycl); 271 #endif // TENSORFLOW_USE_SYCL 272 #undef BM_BCAST_ADD_ROW_ALL 273 #undef BM_BCAST_ADD_ROW 274 275 #define BM_BCAST_ADD_COL(DEVICE, R, C) \ 276 void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \ 277 const int rows = RowsFromArg(arg); \ 278 const int cols = ColsFromArg(arg); \ 279 const int64 tot = static_cast<int64>(iters) * rows * cols; \ 280 testing::ItemsProcessed(tot); \ 281 testing::BytesProcessed(tot * sizeof(float)); \ 282 test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters); \ 283 } \ 284 BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C)); 285 286 #define BM_BCAST_ADD_COL_ALL(DEVICE) \ 287 BM_BCAST_ADD_COL(DEVICE, 512, 2048); \ 288 BM_BCAST_ADD_COL(DEVICE, 512, 4096); \ 289 BM_BCAST_ADD_COL(DEVICE, 2048, 512); \ 290 BM_BCAST_ADD_COL(DEVICE, 4096, 512); 291 BM_BCAST_ADD_COL_ALL(cpu); 292 #if GOOGLE_CUDA 293 BM_BCAST_ADD_COL_ALL(gpu); 294 #endif // GOOGLE_CUDA 295 #ifdef TENSORFLOW_USE_SYCL 296 BM_BCAST_ADD_COL_ALL(sycl); 297 #endif // TENSORFLOW_USE_SYCL 298 #undef BM_BCAST_ADD_COL_ALL 299 #undef BM_BCAST_ADD_COL 300 301 } // namespace 302 } // namespace tensorflow 303