1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include <functional> 17 #include <memory> 18 #include <vector> 19 20 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" 21 #include "tensorflow/core/framework/allocator.h" 22 #include "tensorflow/core/framework/op_kernel.h" 23 #include "tensorflow/core/framework/tensor.h" 24 #include "tensorflow/core/framework/types.h" 25 #include "tensorflow/core/framework/types.pb.h" 26 #include "tensorflow/core/graph/node_builder.h" 27 #include "tensorflow/core/graph/testlib.h" 28 #include "tensorflow/core/kernels/ops_testutil.h" 29 #include "tensorflow/core/kernels/ops_util.h" 30 #include "tensorflow/core/lib/core/status_test_util.h" 31 #include "tensorflow/core/platform/prefetch.h" 32 #include "tensorflow/core/platform/test.h" 33 #include "tensorflow/core/platform/test_benchmark.h" 34 35 namespace tensorflow { 36 namespace { 37 38 // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim' 39 // in size, and concat them together along "concat_dimension" 40 template <typename T> 41 static void ConcatHelper(int iters, int concat_dimension, int dim2) { 42 testing::StopTiming(); 43 Graph* g = new Graph(OpRegistry::Global()); 44 45 DataType dt = DataTypeToEnum<T>::v(); 46 const int kDim1 = 100; 47 Tensor concat_dim(DT_INT32, TensorShape({})); 48 concat_dim.scalar<int32>()() = concat_dimension; 49 Tensor in0(dt, TensorShape({kDim1, dim2})); 50 in0.flat<T>().setRandom(); 51 Tensor in1(dt, TensorShape({kDim1, dim2})); 52 in1.flat<T>().setRandom(); 53 54 Node* node; 55 TF_CHECK_OK( 56 NodeBuilder(g->NewName("n"), "Concat") 57 .Input(test::graph::Constant(g, concat_dim)) 58 .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)}) 59 .Attr("N", 2) 60 .Attr("T", dt) 61 .Finalize(g, &node)); 62 63 testing::BytesProcessed(static_cast<int64>(iters) * 64 ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T)); 65 testing::StartTiming(); 66 test::Benchmark("cpu", g).Run(iters); 67 testing::UseRealTime(); 68 } 69 70 static void BM_ConcatDim0Float(int iters, int dim2) { 71 ConcatHelper<float>(iters, 0, dim2); 72 } 73 74 static void BM_ConcatDim1Float(int iters, int dim2) { 75 ConcatHelper<float>(iters, 1, dim2); 76 } 77 78 BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000); 79 BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000); 80 81 static void BM_ConcatDim1int16(int iters, int dim2) { 82 ConcatHelper<int16>(iters, 1, dim2); 83 } 84 static void BM_ConcatDim1bfloat16(int iters, int dim2) { 85 ConcatHelper<bfloat16>(iters, 1, dim2); 86 } 87 88 BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000); 89 BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000); 90 91 template <typename T> 92 static void ConcatManyHelper(int iters, int concat_dimension, int dim2) { 93 testing::StopTiming(); 94 Graph* g = new Graph(OpRegistry::Global()); 95 96 DataType dt = DataTypeToEnum<T>::v(); 97 const int kDim1 = 40000; 98 const int kNumInputs = 64; 99 Tensor concat_dim(DT_INT32, TensorShape({})); 100 concat_dim.scalar<int32>()() = concat_dimension; 101 std::vector<NodeBuilder::NodeOut> inputs; 102 inputs.reserve(kNumInputs); 103 for (int i = 0; i < kNumInputs; ++i) { 104 Tensor in(dt, TensorShape({kDim1, dim2})); 105 in.flat<T>().setRandom(); 106 inputs.push_back(test::graph::Constant(g, in)); 107 } 108 109 Node* node; 110 TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat") 111 .Input(test::graph::Constant(g, concat_dim)) 112 .Input(inputs) 113 .Attr("N", 64) 114 .Attr("T", dt) 115 .Finalize(g, &node)); 116 testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 * 117 kNumInputs * sizeof(T)); 118 testing::StartTiming(); 119 test::Benchmark("cpu", g).Run(iters); 120 testing::UseRealTime(); 121 } 122 123 static void BM_ConcatManyDim1bfloat16(int iters, int dim2) { 124 ConcatManyHelper<bfloat16>(iters, 1, dim2); 125 } 126 127 BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60); 128 129 static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) { 130 testing::StopTiming(); 131 132 const int kDim1 = 100; 133 std::vector<float> data1(kDim1 * dim2, 1.0f); 134 std::vector<float> data2(kDim1 * dim2, 2.0f); 135 136 testing::BytesProcessed(static_cast<int64>(iters) * 137 ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float)); 138 testing::StartTiming(); 139 while (--iters > 0) { 140 const size_t n0 = data1.size(); 141 const size_t n1 = data2.size(); 142 float* result = new float[n0 + n1]; 143 memcpy(&result[0], &data1[0], n0 * sizeof(float)); 144 memcpy(&result[n0], &data2[0], n1 * sizeof(float)); 145 delete[] result; 146 } 147 } 148 149 static void BM_MemcpyAlternativeDim0(int iters, int dim2) { 150 MemcpyAlternativeHelper(iters, 0, dim2); 151 } 152 static void BM_MemcpyAlternativeDim1(int iters, int dim2) { 153 MemcpyAlternativeHelper(iters, 1, dim2); 154 } 155 156 BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000); 157 BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000); 158 159 typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>, 160 Eigen::Unaligned> 161 EigenMap; 162 static void MemcpyManyAlternative1(int iters, int dim2) { 163 testing::StopTiming(); 164 165 const int kDim1 = 40000; 166 const int kNumCopies = 64; 167 const int size = kDim1 * dim2 * kNumCopies; 168 bfloat16* data = new bfloat16[size]; 169 EigenMap map(data, size); 170 map.setRandom(); 171 172 testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 * 173 kNumCopies * sizeof(bfloat16)); 174 testing::StartTiming(); 175 while (iters-- > 0) { 176 std::vector<bfloat16*> inputs(kNumCopies); 177 for (int i = 0; i < kNumCopies; ++i) { 178 inputs[i] = &data[i * kDim1 * dim2]; 179 } 180 bfloat16* result = new bfloat16[size]; 181 for (int j = 0; j < kNumCopies; ++j) { 182 bfloat16* output = &result[j * dim2]; 183 for (int i = 0; i < kDim1; ++i) { 184 if (i + 1 < kDim1) { 185 port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2); 186 } 187 memcpy(output, inputs[j], dim2 * sizeof(bfloat16)); 188 inputs[j] += dim2; 189 output += dim2 * kNumCopies; 190 } 191 } 192 delete[] result; 193 } 194 delete[] data; 195 } 196 197 static void MemcpyManyAlternative2(int iters, int dim2) { 198 testing::StopTiming(); 199 200 const int kDim1 = 40000; 201 const int kNumCopies = 64; 202 const int size = kDim1 * dim2 * kNumCopies; 203 bfloat16* data = new bfloat16[size]; 204 EigenMap map(data, size); 205 map.setRandom(); 206 207 testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 * 208 kNumCopies * sizeof(bfloat16)); 209 testing::StartTiming(); 210 std::vector<bfloat16*> inputs(kNumCopies); 211 while (--iters > 0) { 212 bfloat16* result = new bfloat16[size]; 213 for (int i = 0; i < kNumCopies; ++i) { 214 inputs[i] = &data[i * kDim1 * dim2]; 215 } 216 bfloat16* output = result; 217 for (int i = 0; i < kDim1; ++i) { 218 for (int j = 0; j < kNumCopies; ++j) { 219 if (j + 1 < kNumCopies) { 220 port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]); 221 } 222 memcpy(output, inputs[j], dim2 * sizeof(bfloat16)); 223 inputs[j] += dim2; 224 output += dim2; 225 } 226 } 227 delete[] result; 228 } 229 delete[] data; 230 } 231 232 BENCHMARK(MemcpyManyAlternative1) 233 ->Arg(16) 234 ->Arg(17) 235 ->Arg(18) 236 ->Arg(32) 237 ->Arg(33) 238 ->Arg(34) 239 ->Arg(60) 240 ->Arg(64) 241 ->Arg(65); 242 243 BENCHMARK(MemcpyManyAlternative2) 244 ->Arg(16) 245 ->Arg(17) 246 ->Arg(18) 247 ->Arg(32) 248 ->Arg(33) 249 ->Arg(34) 250 ->Arg(60) 251 ->Arg(64) 252 ->Arg(65); 253 254 } // namespace 255 } // namespace tensorflow 256