/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {

// Creates a Graph which "reduce"s a 2D float tensor of num_x * num_y
// elements into a scalar.
26 template <typename T> 27 static Graph* ToScalar(const string& reduce, int num_x, int num_y) { 28 auto* g = new Graph(OpRegistry::Global()); 29 Tensor data(DataTypeToEnum<T>::value, TensorShape({num_x, num_y})); 30 data.flat<T>().setRandom(); 31 Tensor axes(DT_INT32, TensorShape({2})); 32 axes.flat<int32>()(0) = 0; 33 axes.flat<int32>()(1) = 1; 34 test::graph::Reduce(g, reduce, test::graph::Constant(g, data), 35 test::graph::Constant(g, axes)); 36 return g; 37 } 38 39 static Graph* ColReduce(const string& reduce, int num_x, int num_y) { 40 auto* g = new Graph(OpRegistry::Global()); 41 Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); 42 data.flat<float>().setRandom(); 43 Tensor axes(DT_INT32, TensorShape({1})); 44 axes.flat<int32>()(0) = 0; 45 test::graph::Reduce(g, reduce, test::graph::Constant(g, data), 46 test::graph::Constant(g, axes)); 47 return g; 48 } 49 50 static Graph* RowReduce(const string& reduce, int num_x, int num_y) { 51 auto* g = new Graph(OpRegistry::Global()); 52 Tensor data(DT_FLOAT, TensorShape({num_x, num_y})); 53 data.flat<float>().setRandom(); 54 Tensor axes(DT_INT32, TensorShape({1})); 55 axes.flat<int32>()(0) = 1; 56 test::graph::Reduce(g, reduce, test::graph::Constant(g, data), 57 test::graph::Constant(g, axes)); 58 return g; 59 } 60 61 static Graph* ThreeDYReduce(const string& reduce, int num_y, int num_z) { 62 auto* g = new Graph(OpRegistry::Global()); 63 Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); 64 data.flat<float>().setRandom(); 65 Tensor axes(DT_INT32, TensorShape({1})); 66 axes.flat<int32>()(0) = 1; 67 test::graph::Reduce(g, reduce, test::graph::Constant(g, data), 68 test::graph::Constant(g, axes)); 69 return g; 70 } 71 72 static Graph* ThreeDXZReduce(const string& reduce, int num_y, int num_z) { 73 auto* g = new Graph(OpRegistry::Global()); 74 Tensor data(DT_FLOAT, TensorShape({4, num_y, num_z})); 75 data.flat<float>().setRandom(); 76 Tensor axes(DT_INT32, TensorShape({2})); 77 axes.flat<int32>()(0) = 0; 78 
axes.flat<int32>()(1) = 2; 79 test::graph::Reduce(g, reduce, test::graph::Constant(g, data), 80 test::graph::Constant(g, axes)); 81 return g; 82 } 83 84 // Creates a bench which reduces a 3D tensor with total "num" floats 85 // into a scalar on a "device". Runs the bench for "iters" times. 86 template <typename T> 87 static void ReduceToScalar(int iters, const string& device, 88 const string& reduce, int num_x, int num_y) { 89 testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y); 90 testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y * 91 sizeof(T)); 92 test::Benchmark(device, ToScalar<T>(reduce, num_x, num_y)).Run(iters); 93 } 94 95 static void DoRowReduce(int iters, const string& device, const string& reduce, 96 int num_x, int num_y) { 97 testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y); 98 testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y * 99 sizeof(float)); 100 test::Benchmark(device, RowReduce(reduce, num_x, num_y)).Run(iters); 101 } 102 103 static void DoColReduce(int iters, const string& device, const string& reduce, 104 int num_x, int num_y) { 105 testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y); 106 testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y * 107 sizeof(float)); 108 test::Benchmark(device, ColReduce(reduce, num_x, num_y)).Run(iters); 109 } 110 111 static void Do3DYReduce(int iters, const string& device, const string& reduce, 112 int num_x, int num_y) { 113 testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y); 114 testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y * 115 sizeof(float)); 116 test::Benchmark(device, ThreeDYReduce(reduce, num_x, num_y)).Run(iters); 117 } 118 119 static void Do3DXZReduce(int iters, const string& device, const string& reduce, 120 int num_x, int num_y) { 121 testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y); 122 testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y * 123 
sizeof(float)); 124 test::Benchmark(device, ThreeDXZReduce(reduce, num_x, num_y)).Run(iters); 125 } 126 127 static void BM_Sum2DToScalarGPU(int iters, int num_x, int num_y) { 128 ReduceToScalar<float>(iters, "gpu", "Sum", num_x, num_y); 129 } 130 BENCHMARK(BM_Sum2DToScalarGPU)->RangePair(1, 8192, 1, 8192); 131 132 static void BM_Sum2DToScalarGPUComplex(int iters, int num_x, int num_y) { 133 ReduceToScalar<std::complex<float>>(iters, "gpu", "Sum", num_x, num_y); 134 } 135 BENCHMARK(BM_Sum2DToScalarGPUComplex)->RangePair(1, 8192, 1, 8192); 136 137 static void BM_Sum2DToScalarGPUHalf(int iters, int num_x, int num_y) { 138 ReduceToScalar<Eigen::half>(iters, "gpu", "Sum", num_x, num_y); 139 } 140 BENCHMARK(BM_Sum2DToScalarGPUHalf)->RangePair(1, 8192, 1, 8192); 141 142 static void BM_Sum2DRowReduceGPU(int iters, int num_x, int num_y) { 143 DoRowReduce(iters, "gpu", "Sum", num_x, num_y); 144 } 145 BENCHMARK(BM_Sum2DRowReduceGPU)->RangePair(1, 8192, 1, 8192); 146 147 static void BM_Sum2DColumnReduceGPU(int iters, int num_x, int num_y) { 148 DoColReduce(iters, "gpu", "Sum", num_x, num_y); 149 } 150 BENCHMARK(BM_Sum2DColumnReduceGPU)->RangePair(1, 8192, 1, 8192); 151 152 static void BM_Sum3DYReduceGPU(int iters, int num_x, int num_y) { 153 Do3DYReduce(iters, "gpu", "Sum", num_x, num_y); 154 } 155 BENCHMARK(BM_Sum3DYReduceGPU)->RangePair(64, 4096, 64, 4096); 156 157 static void BM_Sum3DXZReduceGPU(int iters, int num_x, int num_y) { 158 Do3DXZReduce(iters, "gpu", "Sum", num_x, num_y); 159 } 160 BENCHMARK(BM_Sum3DXZReduceGPU)->RangePair(64, 4096, 64, 4096); 161 162 static void BM_Mean2DToScalarGPU(int iters, int num_x, int num_y) { 163 ReduceToScalar<float>(iters, "gpu", "Mean", num_x, num_y); 164 } 165 BENCHMARK(BM_Mean2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); 166 167 static void BM_Max2DToScalarGPU(int iters, int num_x, int num_y) { 168 ReduceToScalar<float>(iters, "gpu", "Max", num_x, num_y); 169 } 170 BENCHMARK(BM_Max2DToScalarGPU)->RangePair(2048, 8192, 2048, 
8192); 171 172 static void BM_Min2DToScalarGPU(int iters, int num_x, int num_y) { 173 ReduceToScalar<float>(iters, "gpu", "Min", num_x, num_y); 174 } 175 BENCHMARK(BM_Min2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); 176 177 static void BM_Min2DToScalarGPUHalf(int iters, int num_x, int num_y) { 178 ReduceToScalar<Eigen::half>(iters, "gpu", "Min", num_x, num_y); 179 } 180 BENCHMARK(BM_Min2DToScalarGPUHalf)->RangePair(2048, 8192, 2048, 8192); 181 182 static void BM_Bool2DToScalarGPU(int iters, int num_x, int num_y) { 183 ReduceToScalar<bool>(iters, "gpu", "All", num_x, num_y); 184 } 185 BENCHMARK(BM_Bool2DToScalarGPU)->RangePair(2048, 8192, 2048, 8192); 186 187 } // end namespace tensorflow 188