Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
     17 #include "tensorflow/core/framework/tensor.h"
     18 #include "tensorflow/core/graph/node_builder.h"
     19 #include "tensorflow/core/kernels/ops_util.h"
     20 #include "tensorflow/core/platform/test.h"
     21 #include "tensorflow/core/platform/test_benchmark.h"
     22 #include "tensorflow/core/util/tensor_format.h"
     23 
     24 namespace tensorflow {
     25 namespace {
     26 
     27 // Creates a Graph which applies a unary "func" on a 3D tensor of
     28 // type T with "num" elements.
     29 template <typename T>
     30 static Graph* Unary(const string& func, int num, DataType dtype) {
     31   Graph* g = new Graph(OpRegistry::Global());
     32   Tensor data(dtype, TensorShape({64, 64, num / (64 * 64)}));
     33   CHECK_GT(data.NumElements(), 0);
     34   data.flat<T>().setRandom();
     35   test::graph::Unary(g, func, test::graph::Constant(g, data), 0);
     36   return g;
     37 }
     38 
     39 const int kRows = 100000;
     40 
     41 int RowsAndColsArg(int r, int c) { return r * kRows + c; }
     42 int RowsFromArg(int arg) { return (arg / kRows); }
     43 int ColsFromArg(int arg) { return (arg % kRows); }
     44 
     45 #define BM_UNARY(DEVICE, FUNC, T, TYPE)                              \
     46   void BM_##DEVICE##_##FUNC##_##TYPE(int iters, int num) {           \
     47     const int64 tot = static_cast<int64>(iters) * num;               \
     48     testing::ItemsProcessed(tot);                                    \
     49     testing::BytesProcessed(tot * sizeof(T));                        \
     50     test::Benchmark(#DEVICE, Unary<T>(#FUNC, num, TYPE)).Run(iters); \
     51   }                                                                  \
     52   BENCHMARK(BM_##DEVICE##_##FUNC##_##TYPE)->Range(4 << 10, 1 << 20);
     53 
     54 BM_UNARY(cpu, Floor, float, DT_FLOAT);
     55 #if GOOGLE_CUDA
     56 BM_UNARY(gpu, Floor, float, DT_FLOAT);
     57 #endif  // GOOGLE_CUDA
     58 #ifdef TENSORFLOW_USE_SYCL
     59 BM_UNARY(sycl, Floor, float, DT_FLOAT);
     60 #endif  // TENSORFLOW_USE_SYCL
     61 
     62 BM_UNARY(cpu, Floor, double, DT_DOUBLE);
     63 #if GOOGLE_CUDA
     64 BM_UNARY(gpu, Floor, double, DT_DOUBLE);
     65 #endif  // GOOGLE_CUDA
     66 #ifdef TENSORFLOW_USE_SYCL
     67 BM_UNARY(sycl, Floor, double, DT_DOUBLE);
     68 #endif  // TENSORFLOW_USE_SYCL
     69 
     70 BM_UNARY(cpu, Conj, std::complex<float>, DT_COMPLEX64);
     71 #if GOOGLE_CUDA
     72 BM_UNARY(gpu, Conj, std::complex<float>, DT_COMPLEX64);
     73 #endif  // GOOGLE_CUDA
     74 BM_UNARY(cpu, Conj, std::complex<double>, DT_COMPLEX128);
     75 #if GOOGLE_CUDA
     76 BM_UNARY(gpu, Conj, std::complex<double>, DT_COMPLEX128);
     77 #endif  // GOOGLE_CUDA
     78 
     79 BM_UNARY(cpu, Rint, double, DT_DOUBLE);
     80 #if GOOGLE_CUDA
     81 BM_UNARY(gpu, Rint, double, DT_DOUBLE);
     82 #endif  // GOOGLE_CUDA
     83 BM_UNARY(cpu, Rint, float, DT_FLOAT);
     84 #if GOOGLE_CUDA
     85 BM_UNARY(gpu, Rint, float, DT_FLOAT);
     86 #endif  // GOOGLE_CUDA
     87 
     88 // data func scalar.
     89 Graph* BinaryScalar(int num, const string& func) {
     90   Graph* g = new Graph(OpRegistry::Global());
     91   Tensor lhs(DT_FLOAT, TensorShape({64, 64, num / (64 * 64)}));
     92   lhs.flat<float>().setRandom();
     93   Tensor rhs(DT_FLOAT, TensorShape({}));
     94   rhs.flat<float>().setRandom();
     95   test::graph::Binary(g, func, test::graph::Constant(g, lhs),
     96                       test::graph::Constant(g, rhs));
     97   return g;
     98 }
     99 
    100 #define BM_BINARY_SCALAR(DEVICE, FUNC)                             \
    101   void BM_##DEVICE##_##FUNC##_scalar(int iters, int num) {         \
    102     const int64 tot = static_cast<int64>(iters) * num;             \
    103     testing::ItemsProcessed(tot);                                  \
    104     testing::BytesProcessed(tot * sizeof(float));                  \
    105     test::Benchmark(#DEVICE, BinaryScalar(num, #FUNC)).Run(iters); \
    106   }                                                                \
    107   BENCHMARK(BM_##DEVICE##_##FUNC##_scalar)                         \
    108       ->Arg(4096) /* must >= 4096 */                               \
    109       ->Arg(32768)                                                 \
    110       ->Arg(131072)                                                \
    111       ->Arg(1048576);
    112 
    113 BM_BINARY_SCALAR(cpu, Less);
    114 #if GOOGLE_CUDA
    115 BM_BINARY_SCALAR(gpu, Less);
    116 #endif  // GOOGLE_CUDA
    117 #ifdef TENSORFLOW_USE_SYCL
    118 BM_BINARY_SCALAR(sycl, Less);
    119 #endif  // TENSORFLOW_USE_SYCL
    120 
    121 BM_BINARY_SCALAR(cpu, Add);
    122 #if GOOGLE_CUDA
    123 BM_BINARY_SCALAR(gpu, Add);
    124 #endif  // GOOGLE_CUDA
    125 #ifdef TENSORFLOW_USE_SYCL
    126 BM_BINARY_SCALAR(sycl, Add);
    127 #endif  // TENSORFLOW_USE_SYCL
    128 #undef BM_BINARY_SCALAR
    129 
    130 template <class T>
    131 Graph* BiasAdd(int rows, int cols, DataType type) {
    132   Graph* g = new Graph(OpRegistry::Global());
    133   Tensor lhs(type, TensorShape({rows, cols}));
    134   lhs.template flat<T>().setRandom();
    135   TensorShape rhs_shape;
    136   rhs_shape = TensorShape({cols});
    137   Tensor rhs(type, rhs_shape);
    138   rhs.template flat<T>().setRandom();
    139   test::graph::Binary(g, "BiasAdd", test::graph::Constant(g, lhs),
    140                       test::graph::Constant(g, rhs));
    141   return g;
    142 }
    143 
    144 #define BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, R, C)                             \
    145   void BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C(int iters, int arg) {      \
    146     const int rows = RowsFromArg(arg);                                         \
    147     const int cols = ColsFromArg(arg);                                         \
    148     const int64 tot = static_cast<int64>(iters) * rows * cols;                 \
    149     testing::ItemsProcessed(tot);                                              \
    150     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
    151     test::Benchmark(#DEVICE, BiasAdd<C_TYPE>(rows, cols, TF_TYPE)).Run(iters); \
    152   }                                                                            \
    153   BENCHMARK(BM_##DEVICE##_##C_TYPE##_BiasAdd_R##R##_C##C)                      \
    154       ->Arg(RowsAndColsArg(R, C));
    155 
    156 #define BM_BIAS_ADD_ALL(DEVICE, C_TYPE, TF_TYPE)   \
    157   BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 2048); \
    158   BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 512, 4096); \
    159   BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 2048, 512); \
    160   BM_BIAS_ADD(DEVICE, C_TYPE, TF_TYPE, 4096, 512);
    161 
    162 using Eigen::half;
    163 BM_BIAS_ADD_ALL(cpu, float, DT_FLOAT);
    164 #if GOOGLE_CUDA
    165 BM_BIAS_ADD_ALL(gpu, float, DT_FLOAT);
    166 #endif  // GOOGLE_CUDA
    167 BM_BIAS_ADD_ALL(cpu, half, DT_HALF);
    168 #if GOOGLE_CUDA
    169 BM_BIAS_ADD_ALL(gpu, half, DT_HALF);
    170 #endif  // GOOGLE_CUDA
    171 #undef BM_BIAS_ADD_ALL
    172 #undef BM_BIAS_ADD
    173 
    174 template <class T>
    175 Graph* BiasAddGrad(int rows, int cols, int channels, DataType type,
    176                    TensorFormat format) {
    177   Graph* g = new Graph(OpRegistry::Global());
    178   TensorShape lhs_shape;
    179   if (format == FORMAT_NCHW) {
    180     lhs_shape = TensorShape({channels, rows, cols});
    181   } else {
    182     lhs_shape = TensorShape({rows, cols, channels});
    183   }
    184   Tensor lhs(type, lhs_shape);
    185   lhs.template flat<T>().setRandom();
    186   Node* n;
    187   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "BiasAddGrad")
    188                   .Attr("data_format", ToString(format))
    189                   .Input(test::graph::Constant(g, lhs), /*src_index=*/0)
    190                   .Finalize(g, &n));
    191   return g;
    192 }
    193 
    194 #define BM_BIAS_ADD_GRAD(DEVICE, FMT, C_TYPE, TF_TYPE, R, C, CH)               \
    195   void BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH(      \
    196       int iters, int arg, int channels) {                                      \
    197     const int rows = RowsFromArg(arg);                                         \
    198     const int cols = ColsFromArg(arg);                                         \
    199     const int64 tot = static_cast<int64>(iters) * rows * cols * channels;      \
    200     testing::ItemsProcessed(tot);                                              \
    201     testing::BytesProcessed(tot * sizeof(C_TYPE));                             \
    202     test::Benchmark(#DEVICE, BiasAddGrad<C_TYPE>(rows, cols, channels,         \
    203                                                  TF_TYPE, FORMAT_##FMT))       \
    204         .Run(iters);                                                           \
    205   }                                                                            \
    206   BENCHMARK(BM_##DEVICE##_##FMT##_##C_TYPE##_BiasAddGrad_R##R##_C##C##_CH##CH) \
    207       ->ArgPair(RowsAndColsArg(R, C), CH);
    208 
    209 #define BM_BIAS_ADD_GRAD_ALL(DEVICE, FORMAT, C_TYPE, TF_TYPE)       \
    210   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 64, 64, 64);    \
    211   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 4);   \
    212   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 512, 512, 1);   \
    213   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 4); \
    214   BM_BIAS_ADD_GRAD(DEVICE, FORMAT, C_TYPE, TF_TYPE, 4096, 4096, 1);
    215 
    216 using Eigen::half;
    217 #if GOOGLE_CUDA
    218 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, float, DT_FLOAT);
    219 BM_BIAS_ADD_GRAD_ALL(gpu, NCHW, half, DT_HALF);
    220 #endif  // GOOGLE_CUDA
    221 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, float, DT_FLOAT);
    222 #if GOOGLE_CUDA
    223 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, float, DT_FLOAT);
    224 #endif  // GOOGLE_CUDA
    225 BM_BIAS_ADD_GRAD_ALL(cpu, NHWC, half, DT_HALF);
    226 #if GOOGLE_CUDA
    227 BM_BIAS_ADD_GRAD_ALL(gpu, NHWC, half, DT_HALF);
    228 #endif  // GOOGLE_CUDA
    229 #undef BM_BIAS_ADD_GRAD_ALL
    230 #undef BM_BIAS_ADD_GRAD
    231 
    232 Graph* BcastAdd(int rows, int cols, int dim) {
    233   Graph* g = new Graph(OpRegistry::Global());
    234   Tensor lhs(DT_FLOAT, TensorShape({rows, cols}));
    235   lhs.flat<float>().setRandom();
    236   TensorShape rhs_shape;
    237   if (dim == 0) {
    238     rhs_shape = TensorShape({rows, 1});
    239   } else {
    240     rhs_shape = TensorShape({cols});
    241   }
    242   Tensor rhs(DT_FLOAT, rhs_shape);
    243   rhs.flat<float>().setRandom();
    244   test::graph::Binary(g, "Add", test::graph::Constant(g, lhs),
    245                       test::graph::Constant(g, rhs));
    246   return g;
    247 }
    248 
    249 #define BM_BCAST_ADD_ROW(DEVICE, R, C)                             \
    250   void BM_##DEVICE##_BcastAddRow_R##R##_C##C(int iters, int arg) { \
    251     const int rows = RowsFromArg(arg);                             \
    252     const int cols = ColsFromArg(arg);                             \
    253     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
    254     testing::ItemsProcessed(tot);                                  \
    255     testing::BytesProcessed(tot * sizeof(float));                  \
    256     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 0)).Run(iters);  \
    257   }                                                                \
    258   BENCHMARK(BM_##DEVICE##_BcastAddRow_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
    259 
    260 #define BM_BCAST_ADD_ROW_ALL(DEVICE)   \
    261   BM_BCAST_ADD_ROW(DEVICE, 512, 2048); \
    262   BM_BCAST_ADD_ROW(DEVICE, 512, 4096); \
    263   BM_BCAST_ADD_ROW(DEVICE, 2048, 512); \
    264   BM_BCAST_ADD_ROW(DEVICE, 4096, 512);
    265 BM_BCAST_ADD_ROW_ALL(cpu);
    266 #if GOOGLE_CUDA
    267 BM_BCAST_ADD_ROW_ALL(gpu);
    268 #endif  // GOOGLE_CUDA
    269 #ifdef TENSORFLOW_USE_SYCL
    270 BM_BCAST_ADD_ROW_ALL(sycl);
    271 #endif  // TENSORFLOW_USE_SYCL
    272 #undef BM_BCAST_ADD_ROW_ALL
    273 #undef BM_BCAST_ADD_ROW
    274 
    275 #define BM_BCAST_ADD_COL(DEVICE, R, C)                             \
    276   void BM_##DEVICE##_BcastAddCol_R##R##_C##C(int iters, int arg) { \
    277     const int rows = RowsFromArg(arg);                             \
    278     const int cols = ColsFromArg(arg);                             \
    279     const int64 tot = static_cast<int64>(iters) * rows * cols;     \
    280     testing::ItemsProcessed(tot);                                  \
    281     testing::BytesProcessed(tot * sizeof(float));                  \
    282     test::Benchmark(#DEVICE, BcastAdd(rows, cols, 1)).Run(iters);  \
    283   }                                                                \
    284   BENCHMARK(BM_##DEVICE##_BcastAddCol_R##R##_C##C)->Arg(RowsAndColsArg(R, C));
    285 
    286 #define BM_BCAST_ADD_COL_ALL(DEVICE)   \
    287   BM_BCAST_ADD_COL(DEVICE, 512, 2048); \
    288   BM_BCAST_ADD_COL(DEVICE, 512, 4096); \
    289   BM_BCAST_ADD_COL(DEVICE, 2048, 512); \
    290   BM_BCAST_ADD_COL(DEVICE, 4096, 512);
    291 BM_BCAST_ADD_COL_ALL(cpu);
    292 #if GOOGLE_CUDA
    293 BM_BCAST_ADD_COL_ALL(gpu);
    294 #endif  // GOOGLE_CUDA
    295 #ifdef TENSORFLOW_USE_SYCL
    296 BM_BCAST_ADD_COL_ALL(sycl);
    297 #endif  // TENSORFLOW_USE_SYCL
    298 #undef BM_BCAST_ADD_COL_ALL
    299 #undef BM_BCAST_ADD_COL
    300 
    301 }  // namespace
    302 }  // namespace tensorflow
    303