Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
     17 #include "tensorflow/core/framework/tensor.h"
     18 #include "tensorflow/core/framework/types.h"
     19 #include "tensorflow/core/platform/test.h"
     20 #include "tensorflow/core/platform/test_benchmark.h"
     21 
     22 namespace tensorflow {
     23 
     24 // Creates a Graph which "reduce"s a 3D float tensor of "num" elements
     25 // into a scalar.
     26 template <typename T>
     27 static Graph* ToScalar(const string& reduce, int num_x, int num_y) {
     28   auto* g = new Graph(OpRegistry::Global());
     29   Tensor data(DataTypeToEnum<T>::value, TensorShape({num_x, num_y}));
     30   data.flat<T>().setRandom();
     31   Tensor axes(DT_INT32, TensorShape({2}));
     32   axes.flat<int32>()(0) = 0;
     33   axes.flat<int32>()(1) = 1;
     34   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
     35                       test::graph::Constant(g, axes));
     36   return g;
     37 }
     38 
     39 static Graph* ColReduce(const string& reduce, int num_x, int num_y) {
     40   auto* g = new Graph(OpRegistry::Global());
     41   Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
     42   data.flat<float>().setRandom();
     43   Tensor axes(DT_INT32, TensorShape({1}));
     44   axes.flat<int32>()(0) = 0;
     45   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
     46                       test::graph::Constant(g, axes));
     47   return g;
     48 }
     49 
     50 static Graph* RowReduce(const string& reduce, int num_x, int num_y) {
     51   auto* g = new Graph(OpRegistry::Global());
     52   Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
     53   data.flat<float>().setRandom();
     54   Tensor axes(DT_INT32, TensorShape({1}));
     55   axes.flat<int32>()(0) = 1;
     56   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
     57                       test::graph::Constant(g, axes));
     58   return g;
     59 }
     60 
     61 static Graph* ThreeDYReduce(const string& reduce, int num_y, int num_z) {
     62   auto* g = new Graph(OpRegistry::Global());
     63   Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z}));
     64   data.flat<float>().setRandom();
     65   Tensor axes(DT_INT32, TensorShape({1}));
     66   axes.flat<int32>()(0) = 1;
     67   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
     68                       test::graph::Constant(g, axes));
     69   return g;
     70 }
     71 
     72 static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) {
     73   auto* g = new Graph(OpRegistry::Global());
     74   Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z}));
     75   data.flat<float>().setRandom();
     76   Tensor axes(DT_INT32, TensorShape({2}));
     77   axes.flat<int32>()(0) = 0;
     78   axes.flat<int32>()(1) = 2;
     79   test::graph::Reduce(g, reduce, test::graph::Constant(g, data),
     80                       test::graph::Constant(g, axes));
     81   return g;
     82 }
     83 
     84 // Creates a bench which reduces a 3D tensor with total "num" floats
     85 // into a scalar on a "device". Runs the bench for "iters" times.
     86 template <typename T>
     87 static void ReduceToScalar(int iters, const string& device,
     88                            const string& reduce, int num_x, int num_y) {
     89   testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
     90   testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
     91                           sizeof(T));
     92   test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y)).Run(iters);
     93 }
     94 
     95 static void DoRowReduce(int iters, const string& device, const string& reduce,
     96                         int num_x, int num_y) {
     97   testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
     98   testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
     99                           sizeof(float));
    100   test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters);
    101 }
    102 
    103 static void DoColReduce(int iters, const string& device, const string& reduce,
    104                         int num_x, int num_y) {
    105   testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
    106   testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
    107                           sizeof(float));
    108   test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters);
    109 }
    110 
    111 static void Do3DYReduce(int iters, const string& device, const string& reduce,
    112                         int num_x, int num_y) {
    113   testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
    114   testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
    115                           sizeof(float));
    116   test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters);
    117 }
    118 
    119 static void Do3DXZReduce(int iters, const string& device, const string& reduce,
    120                          int num_x, int num_y) {
    121   testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
    122   testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
    123                           sizeof(float));
    124   test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters);
    125 }
    126 
    127 static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) {
    128   ReduceToScalar<float>(iters, "gpu", "Sum", num_x, num_y);
    129 }
    130 BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192);
    131 
    132 static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) {
    133   ReduceToScalar<std::complex<float>>(iters, "gpu", "Sum", num_x, num_y);
    134 }
    135 BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192);
    136 
    137 static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) {
    138   ReduceToScalar<Eigen::half>(iters, "gpu", "Sum", num_x, num_y);
    139 }
    140 BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192);
    141 
    142 static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) {
    143   DoRowReduce(iters, "gpu", "Sum", num_x, num_y);
    144 }
    145 BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192);
    146 
    147 static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) {
    148   DoColReduce(iters, "gpu", "Sum", num_x, num_y);
    149 }
    150 BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192);
    151 
    152 static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) {
    153   Do3DYReduce(iters, "gpu", "Sum", num_x, num_y);
    154 }
    155 BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096);
    156 
    157 static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) {
    158   Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y);
    159 }
    160 BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096);
    161 
    162 static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) {
    163   ReduceToScalar<float>(iters, "gpu", "Mean", num_x, num_y);
    164 }
    165 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
    166 
    167 static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) {
    168   ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y);
    169 }
    170 BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
    171 
    172 static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) {
    173   ReduceToScalar<float>(iters, "gpu", "Min", num_x, num_y);
    174 }
    175 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
    176 
    177 static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) {
    178   ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y);
    179 }
    180 BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192);
    181 
    182 static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) {
    183   ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y);
    184 }
    185 BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192);
    186 
    187 }  // end namespace tensorflow
    188