1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #define EIGEN_USE_THREADS 17 18 #if GOOGLE_CUDA 19 #define EIGEN_USE_GPU 20 #endif // GOOGLE_CUDA 21 22 #include <functional> 23 #include <memory> 24 #include <unordered_map> 25 #include <vector> 26 27 #include "third_party/eigen3/Eigen/Core" 28 #include "tensorflow/cc/ops/const_op.h" 29 #include "tensorflow/cc/ops/nn_ops.h" 30 #include "tensorflow/cc/ops/nn_ops_internal.h" 31 #include "tensorflow/core/common_runtime/device_factory.h" 32 #include "tensorflow/core/common_runtime/eigen_thread_pool.h" 33 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" 34 #include "tensorflow/core/framework/allocator.h" 35 #include "tensorflow/core/framework/fake_input.h" 36 #include "tensorflow/core/framework/graph.pb.h" 37 #include "tensorflow/core/framework/node_def_builder.h" 38 #include "tensorflow/core/framework/op_kernel.h" 39 #include "tensorflow/core/framework/tensor.h" 40 #include "tensorflow/core/framework/tensor_testutil.h" 41 #include "tensorflow/core/framework/types.h" 42 #include "tensorflow/core/framework/types.pb.h" 43 #include "tensorflow/core/graph/graph_constructor.h" 44 #include "tensorflow/core/graph/graph_def_builder.h" 45 #include "tensorflow/core/kernels/ops_testutil.h" 46 #include "tensorflow/core/kernels/ops_util.h" 47 #include 
"tensorflow/core/lib/core/status_test_util.h" 48 #include "tensorflow/core/lib/core/threadpool.h" 49 #include "tensorflow/core/platform/logging.h" 50 #include "tensorflow/core/platform/test.h" 51 #include "tensorflow/core/platform/test_benchmark.h" 52 #include "tensorflow/core/public/session.h" 53 #include "tensorflow/core/public/version.h" 54 #include "tensorflow/core/util/padding.h" 55 #include "tensorflow/core/util/port.h" 56 57 namespace tensorflow { 58 59 static void SetConstOp(const string& name, std::initializer_list<int64> dims, 60 DataType data_type, NodeDef* node) { 61 Tensor tensor(data_type, TensorShape(dims)); 62 for (int64 i = 0; i < tensor.NumElements(); ++i) { 63 switch (data_type) { 64 case DT_FLOAT: 65 tensor.flat<float>()(i) = i / 10.0f; 66 break; 67 case DT_HALF: 68 tensor.flat<Eigen::half>()(i) = Eigen::half(i / 10.0f); 69 break; 70 default: 71 LOG(FATAL) << "Unknown data type " << data_type; 72 } 73 } 74 TF_CHECK_OK(NodeDefBuilder(name, "Const") 75 .Attr("dtype", data_type) 76 .Attr("value", tensor) 77 .Finalize(node)); 78 } 79 80 static void SetConstSizesOp(const string& name, const std::vector<int32>& sizes, 81 NodeDef* node) { 82 TensorShape shape; 83 shape.AddDim(sizes.size()); 84 Tensor tensor(DT_INT32, shape); 85 for (int64 i = 0; i < tensor.NumElements(); ++i) { 86 tensor.flat<int32>()(i) = sizes[i]; 87 } 88 TF_CHECK_OK(NodeDefBuilder(name, "Const") 89 .Attr("dtype", DT_INT32) 90 .Attr("value", tensor) 91 .Finalize(node)); 92 } 93 94 namespace { 95 96 enum CONV_OP { 97 CONV_OP_FORWARD = 0, 98 CONV_OP_BACKPROP_INPUT = 1, 99 CONV_OP_BACKPROP_FILTER = 2, 100 CONV_OP_FUSED = 3, 101 }; 102 103 } // namespace 104 105 static void BM_ConvFloat(int iters, int batch, int rows, int cols, int in_depth, 106 int out_depth, int filter_rows, int filter_cols, 107 CONV_OP op, int num_threads, int stride, 108 Padding padding, bool use_gpu, DataType data_type, 109 const string& label) { 110 if (!IsGoogleCudaEnabled() && use_gpu) { 111 testing::SetLabel( 
112 strings::StrCat("Skipping GPU test (no --config=cuda): ", label)); 113 return; 114 } 115 testing::SetLabel(label); 116 117 // Set the number of threads 118 SessionOptions options; 119 options.config.set_intra_op_parallelism_threads(num_threads); 120 121 // We set up a graph for computing convolution. 122 GraphDef graph; 123 124 // For this, we need an input tensor and a filter tensor. 125 // Compute the output size. 126 int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; 127 TF_CHECK_OK(GetWindowedOutputSize(rows, filter_rows, stride, padding, 128 &out_rows, &pad_rows)); 129 TF_CHECK_OK(GetWindowedOutputSize(cols, filter_cols, stride, padding, 130 &out_cols, &pad_cols)); 131 // Counting the number of floating point operations (both MUL and ADD) 132 int64 num_ops = 0; 133 if (op == CONV_OP_FORWARD) { 134 // Forward computation: 135 // BATCH x OUT_ROW X OUT_COL X IN_DEPTH X PATCH_ROW X PATH_COL X OUT_DEPTH 136 // We multiply by two since there are multiplications and additions. 137 num_ops = static_cast<int64>(batch * in_depth * out_depth) * 138 static_cast<int64>(filter_rows * filter_cols) * 139 static_cast<int64>(out_rows * out_cols) * 2; 140 } else { 141 // Backward computation: 142 // BATCH x IN_ROW X IN_COL X IN_DEPTH X PATCH_ROW X PATCH_COL X OUT_DEPTH 143 // We multiply by two since there are multiplications and additions. 
144 num_ops = static_cast<int64>(batch * in_depth * out_depth) * 145 static_cast<int64>(filter_rows * filter_cols) * 146 static_cast<int64>(rows * cols) * 2; 147 } 148 149 SetConstOp("input", {batch, rows, cols, in_depth}, data_type, 150 graph.add_node()); 151 SetConstOp("filter", {filter_rows, filter_cols, in_depth, out_depth}, 152 data_type, graph.add_node()); 153 SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, 154 data_type, graph.add_node()); 155 SetConstSizesOp("input_sizes", 156 std::vector<int32>({batch, rows, cols, in_depth}), 157 graph.add_node()); 158 SetConstSizesOp( 159 "filter_sizes", 160 std::vector<int32>({filter_rows, filter_cols, in_depth, out_depth}), 161 graph.add_node()); 162 SetConstSizesOp("resize_size", std::vector<int32>({rows, cols}), 163 graph.add_node()); 164 165 TensorShape paddings_shape({4, 2}); 166 Tensor paddings_tensor(DT_INT32, paddings_shape); 167 for (int64 i = 0; i < paddings_tensor.NumElements(); ++i) { 168 paddings_tensor.flat<int32>()(i) = 0; 169 } 170 TF_CHECK_OK(NodeDefBuilder("paddings", "Const") 171 .Attr("dtype", DT_INT32) 172 .Attr("value", paddings_tensor) 173 .Finalize(graph.add_node())); 174 175 // Now add the convolution op 176 NodeDef* conv = graph.add_node(); 177 switch (op) { 178 case CONV_OP_FORWARD: 179 TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2D") 180 .Input("input", 0, data_type) 181 .Input("filter", 0, data_type) 182 .Attr("strides", {1, stride, stride, 1}) 183 .Attr("padding", padding == VALID ? "VALID" : "SAME") 184 .Finalize(conv)); 185 break; 186 case CONV_OP_BACKPROP_INPUT: 187 TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropInput") 188 .Input("input_sizes", 0, DT_INT32) 189 .Input("filter", 0, data_type) 190 .Input("output_backprop", 0, data_type) 191 .Attr("strides", {1, stride, stride, 1}) 192 .Attr("padding", padding == VALID ? 
"VALID" : "SAME") 193 .Finalize(conv)); 194 break; 195 case CONV_OP_BACKPROP_FILTER: 196 TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropFilter") 197 .Input("input", 0, data_type) 198 .Input("filter_sizes", 0, DT_INT32) 199 .Input("output_backprop", 0, data_type) 200 .Attr("strides", {1, stride, stride, 1}) 201 .Attr("padding", padding == VALID ? "VALID" : "SAME") 202 .Finalize(conv)); 203 break; 204 case CONV_OP_FUSED: 205 TF_CHECK_OK(NodeDefBuilder("conv2d", "FusedResizeAndPadConv2D") 206 .Input("input", 0, data_type) 207 .Input("resize_size", 0, DT_INT32) 208 .Input("paddings", 0, DT_INT32) 209 .Input("filter", 0, data_type) 210 .Attr("mode", "REFLECT") 211 .Attr("strides", {1, stride, stride, 1}) 212 .Attr("padding", padding == VALID ? "VALID" : "SAME") 213 .Attr("resize_align_corners", false) 214 .Finalize(conv)); 215 break; 216 } 217 Graph* g = new Graph(OpRegistry::Global()); 218 GraphConstructorOptions opts; 219 TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g)); 220 221 string device = use_gpu ? 
"gpu" : "cpu"; 222 testing::UseRealTime(); 223 test::Benchmark(device, g, &options).Run(iters); 224 testing::ItemsProcessed(num_ops * iters); 225 } 226 227 // BS: batch_size 228 // R: tensor_in_rows 229 // C: tensor_in_cols 230 // ID: input_depth 231 // OD: output_depth 232 // KR: kernel_rows 233 // KC: kernel_cols 234 #define BM_ConvFloatFwd(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \ 235 static void BM_ConvFloatFwdCPU1_##LABEL(int iters) { \ 236 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \ 237 PAD, false, DT_FLOAT, \ 238 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 239 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu1")); \ 240 } \ 241 static void BM_ConvFloatFwdCPU4_##LABEL(int iters) { \ 242 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 4, STR, \ 243 PAD, false, DT_FLOAT, \ 244 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 245 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu4")); \ 246 } \ 247 static void BM_ConvFloatFusedCPU1_##LABEL(int iters) { \ 248 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FUSED, 1, STR, PAD, \ 249 false, DT_FLOAT, \ 250 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 251 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu1")); \ 252 } \ 253 static void BM_ConvFloatFusedCPU4_##LABEL(int iters) { \ 254 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FUSED, 4, STR, PAD, \ 255 false, DT_FLOAT, \ 256 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 257 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu4")); \ 258 } \ 259 static void BM_ConvFloatFwdGPU_##LABEL(int iters) { \ 260 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \ 261 PAD, true, DT_FLOAT, \ 262 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 263 KR, "_", KC, "_", STR, "_", PAD, "_f_gpu")); \ 264 } \ 265 static void BM_ConvHalfFwdGPU_##LABEL(int iters) { \ 266 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR, \ 267 PAD, true, DT_HALF, \ 268 
strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 269 KR, "_", KC, "_", STR, "_", PAD, "_h_gpu")); \ 270 } \ 271 BENCHMARK(BM_ConvFloatFwdCPU1_##LABEL); \ 272 BENCHMARK(BM_ConvFloatFwdCPU4_##LABEL); \ 273 BENCHMARK(BM_ConvFloatFusedCPU1_##LABEL); \ 274 BENCHMARK(BM_ConvFloatFusedCPU4_##LABEL); \ 275 BENCHMARK(BM_ConvFloatFwdGPU_##LABEL); \ 276 BENCHMARK(BM_ConvHalfFwdGPU_##LABEL) 277 278 BM_ConvFloatFwd(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0); 279 BM_ConvFloatFwd(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1); 280 BM_ConvFloatFwd(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2); 281 BM_ConvFloatFwd(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3); 282 BM_ConvFloatFwd(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4); 283 BM_ConvFloatFwd(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5); 284 BM_ConvFloatFwd(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6); 285 BM_ConvFloatFwd(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7); 286 BM_ConvFloatFwd(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8); 287 BM_ConvFloatFwd(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9); 288 BM_ConvFloatFwd(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10); 289 BM_ConvFloatFwd(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11); 290 BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12); 291 BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13); 292 BM_ConvFloatFwd(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14); 293 BM_ConvFloatFwd(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15); 294 BM_ConvFloatFwd(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16); 295 BM_ConvFloatFwd(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17); 296 BM_ConvFloatFwd(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18); 297 BM_ConvFloatFwd(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19); 298 BM_ConvFloatFwd(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20); 299 BM_ConvFloatFwd(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21); 300 BM_ConvFloatFwd(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22); 301 BM_ConvFloatFwd(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23); 302 BM_ConvFloatFwd(32, 17, 17, 192, 
192, 1, 3, 1, SAME, conv24); 303 BM_ConvFloatFwd(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25); 304 BM_ConvFloatFwd(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26); 305 BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27); 306 BM_ConvFloatFwd(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28); 307 BM_ConvFloatFwd(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29); 308 BM_ConvFloatFwd(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30); 309 BM_ConvFloatFwd(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31); 310 BM_ConvFloatFwd(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32); 311 BM_ConvFloatFwd(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33); 312 BM_ConvFloatFwd(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34); 313 BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35); 314 BM_ConvFloatFwd(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36); 315 BM_ConvFloatFwd(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37); 316 BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38); 317 BM_ConvFloatFwd(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39); 318 BM_ConvFloatFwd(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40); 319 BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41); 320 BM_ConvFloatFwd(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42); 321 BM_ConvFloatFwd(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43); 322 BM_ConvFloatFwd(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44); 323 BM_ConvFloatFwd(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45); 324 BM_ConvFloatFwd(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46); 325 BM_ConvFloatFwd(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47); 326 BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48); 327 BM_ConvFloatFwd(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49); 328 BM_ConvFloatFwd(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50); 329 BM_ConvFloatFwd(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51); 330 BM_ConvFloatFwd(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52); 331 BM_ConvFloatFwd(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53); 332 BM_ConvFloatFwd(32, 147, 147, 24, 64, 1, 1, 1, VALID, 
conv54); 333 334 #define BM_ConvFloatBkInAndFilter(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL) \ 335 static void BM_ConvFloatBkInCPU1_##LABEL(int iters) { \ 336 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \ 337 STR, PAD, false, DT_FLOAT, \ 338 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 339 KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ 340 } \ 341 static void BM_ConvFloatBkInCPU4_##LABEL(int iters) { \ 342 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 4, \ 343 STR, PAD, false, DT_FLOAT, \ 344 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 345 KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ 346 } \ 347 static void BM_ConvFloatBkInGPU_##LABEL(int iters) { \ 348 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1, \ 349 STR, PAD, true, DT_FLOAT, \ 350 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 351 KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ 352 } \ 353 static void BM_ConvFloatBkFilterCPU1_##LABEL(int iters) { \ 354 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ 355 STR, PAD, false, DT_FLOAT, \ 356 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 357 KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ 358 } \ 359 static void BM_ConvFloatBkFilterCPU4_##LABEL(int iters) { \ 360 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 4, \ 361 STR, PAD, false, DT_FLOAT, \ 362 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 363 KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ 364 } \ 365 static void BM_ConvFloatBkFilterGPU_##LABEL(int iters) { \ 366 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ 367 STR, PAD, true, DT_FLOAT, \ 368 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 369 KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ 370 } \ 371 static void BM_ConvHalfBkInGPU_##LABEL(int iters) { \ 372 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, 
CONV_OP_BACKPROP_INPUT, 1, \ 373 STR, PAD, true, DT_HALF, \ 374 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 375 KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ 376 } \ 377 static void BM_ConvHalfBkFilterGPU_##LABEL(int iters) { \ 378 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ 379 STR, PAD, true, DT_HALF, \ 380 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_", \ 381 KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ 382 } \ 383 BENCHMARK(BM_ConvFloatBkInCPU1_##LABEL); \ 384 BENCHMARK(BM_ConvFloatBkInCPU4_##LABEL); \ 385 BENCHMARK(BM_ConvFloatBkInGPU_##LABEL); \ 386 BENCHMARK(BM_ConvFloatBkFilterCPU1_##LABEL); \ 387 BENCHMARK(BM_ConvFloatBkFilterCPU4_##LABEL); \ 388 BENCHMARK(BM_ConvFloatBkFilterGPU_##LABEL); \ 389 BENCHMARK(BM_ConvHalfBkInGPU_##LABEL); \ 390 BENCHMARK(BM_ConvHalfBkFilterGPU_##LABEL) 391 392 // Benchmarks from the inception model 393 394 BM_ConvFloatBkInAndFilter(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0); 395 BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1); 396 BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2); 397 BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3); 398 BM_ConvFloatBkInAndFilter(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4); 399 BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5); 400 BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6); 401 BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7); 402 BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8); 403 BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9); 404 BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10); 405 BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11); 406 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12); 407 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13); 408 BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 
192, 1, 1, 1, SAME, conv14); 409 BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15); 410 BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16); 411 BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17); 412 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18); 413 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19); 414 BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20); 415 BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21); 416 BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22); 417 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23); 418 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24); 419 BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25); 420 BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26); 421 BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27); 422 BM_ConvFloatBkInAndFilter(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28); 423 BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29); 424 BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30); 425 BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31); 426 BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32); 427 BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33); 428 BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34); 429 BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35); 430 BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36); 431 BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37); 432 BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38); 433 BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39); 434 BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 320, 1, 1, 1, 
SAME, conv40); 435 BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41); 436 BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42); 437 BM_ConvFloatBkInAndFilter(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43); 438 BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44); 439 BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45); 440 BM_ConvFloatBkInAndFilter(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46); 441 BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47); 442 BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48); 443 BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49); 444 BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50); 445 BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51); 446 BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52); 447 BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53); 448 BM_ConvFloatBkInAndFilter(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54); 449 450 #define BM_ConvFloatBkFCPU(BS, R, C, ID, OD, KR, KC, TH, LABEL) \ 451 static void \ 452 BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH( \ 453 int iters) { \ 454 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, TH, \ 455 1, VALID, false, DT_FLOAT, LABEL); \ 456 } \ 457 BENCHMARK( \ 458 BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH); 459 460 // Benchmarks from https://github.com/soumith/convnet-benchmarks 461 BM_ConvFloatBkFCPU(128, 128, 128, 3, 96, 11, 11, 4, "convnet-layer1"); 462 BM_ConvFloatBkFCPU(128, 64, 64, 64, 128, 9, 9, 4, "convnet-layer2"); 463 BM_ConvFloatBkFCPU(128, 32, 32, 128, 128, 9, 9, 4, "convnet-layer3"); 464 BM_ConvFloatBkFCPU(128, 16, 16, 128, 128, 7, 7, 4, "convnet-layer4"); 465 BM_ConvFloatBkFCPU(128, 13, 13, 384, 384, 3, 3, 4, "convnet-layer5"); 466 467 #define BM_ConvFloatBkFGPU(BS, R, C, ID, OD, KR, KC, LABEL) \ 468 
static void BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC( \ 469 int iters) { \ 470 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ 471 1, VALID, true, DT_FLOAT, LABEL); \ 472 } \ 473 static void BM_ConvHalfBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC( \ 474 int iters) { \ 475 BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \ 476 1, VALID, true, DT_HALF, LABEL); \ 477 } \ 478 BENCHMARK(BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC); \ 479 BENCHMARK(BM_ConvHalfBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC) 480 481 // Benchmarks from https://github.com/soumith/convnet-benchmarks 482 BM_ConvFloatBkFGPU(128, 128, 128, 3, 96, 11, 11, "convnet-layer1"); 483 BM_ConvFloatBkFGPU(128, 64, 64, 64, 128, 9, 9, "convnet-layer2"); 484 BM_ConvFloatBkFGPU(128, 32, 32, 128, 128, 9, 9, "convnet-layer3"); 485 BM_ConvFloatBkFGPU(128, 16, 16, 128, 128, 7, 7, "convnet-layer4"); 486 BM_ConvFloatBkFGPU(128, 13, 13, 384, 384, 3, 3, "convnet-layer5"); 487 488 namespace { 489 490 enum DEPTHWISE_CONV_OP { 491 DEPTHWISE_CONV_OP_FWD = 0, 492 DEPTHWISE_CONV_OP_BACKPROP_INPUT = 1, 493 DEPTHWISE_CONV_OP_BACKPROP_FILTER = 2 494 }; 495 496 } // namespace 497 498 static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols, 499 int in_depth, int depth_multiplier, 500 int out_depth, int filter_rows, 501 int filter_cols, DEPTHWISE_CONV_OP op, 502 int num_threads, int stride, Padding padding, 503 bool use_gpu, const string& label) { 504 if (!IsGoogleCudaEnabled() && use_gpu) { 505 testing::SetLabel( 506 strings::StrCat("Skipping GPU test (no --config=cuda): ", label)); 507 return; 508 } 509 testing::SetLabel(label); 510 511 // Set the number of threads 512 SessionOptions options; 513 options.config.set_intra_op_parallelism_threads(num_threads); 514 515 // We set up a graph for computing convolution. 516 GraphDef graph; 517 518 // For this, we need an input tensor and a filter tensor. 
519 // Compute the output size. 520 int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0; 521 TF_CHECK_OK(GetWindowedOutputSize(rows, filter_rows, stride, padding, 522 &out_rows, &pad_rows)); 523 TF_CHECK_OK(GetWindowedOutputSize(cols, filter_cols, stride, padding, 524 &out_cols, &pad_cols)); 525 526 int64 num_ops = 0; 527 if (op == DEPTHWISE_CONV_OP_FWD) { 528 // Counting the number of floating point operations (both MUL and ADD) 529 // Forward computation: 530 // BATCH x OUT_ROW X OUT_COL X FLTR_ROW X FLTR_COL X DEPTH_MULT X IN_DEPTH 531 // We multiply by two since there are multiplications and additions. 532 num_ops = static_cast<int64>(batch * out_rows * out_cols) * 533 static_cast<int64>(filter_rows * filter_cols) * 534 static_cast<int64>(in_depth * depth_multiplier) * 2; 535 } else { 536 // Backward computation: both input and filter backprop take the same 537 // amount of computation: 538 // BATCH x IN_ROW X IN_COL X FLTR_ROW X FLTR_COL X DEPTH_MULT X IN_DEPTH 539 // We multiply by two since there are multiplications and additions. 540 // We divide by stride squared to approximate the affect of decreasing 541 // number of bprop output points per bprop input point with increasing 542 // stride. 
543 num_ops = (static_cast<int64>(batch * rows * cols) * 544 static_cast<int64>(filter_rows * filter_cols) * 545 static_cast<int64>(in_depth * depth_multiplier) * 2) / 546 (stride * stride); 547 } 548 549 // FIXME 550 SetConstOp("input", {batch, rows, cols, in_depth}, DT_FLOAT, 551 graph.add_node()); 552 SetConstOp("depthwise_filter", 553 {filter_rows, filter_cols, in_depth, depth_multiplier}, DT_FLOAT, 554 graph.add_node()); 555 SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth}, 556 DT_FLOAT, graph.add_node()); 557 SetConstSizesOp("input_sizes", 558 std::vector<int32>({batch, rows, cols, in_depth}), 559 graph.add_node()); 560 SetConstSizesOp("filter_sizes", 561 std::vector<int32>( 562 {filter_rows, filter_cols, in_depth, depth_multiplier}), 563 graph.add_node()); 564 565 // Now add the convolution op 566 NodeDef* conv = graph.add_node(); 567 switch (op) { 568 case DEPTHWISE_CONV_OP_FWD: 569 TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d", "DepthwiseConv2dNative") 570 .Input("input", 0, DT_FLOAT) 571 .Input("depthwise_filter", 0, DT_FLOAT) 572 .Attr("strides", {1, stride, stride, 1}) 573 .Attr("padding", padding == VALID ? "VALID" : "SAME") 574 .Finalize(conv)); 575 break; 576 case DEPTHWISE_CONV_OP_BACKPROP_INPUT: 577 TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d_backprop_input", 578 "DepthwiseConv2dNativeBackpropInput") 579 .Input("input_sizes", 0, DT_INT32) 580 .Input("depthwise_filter", 0, DT_FLOAT) 581 .Input("output_backprop", 0, DT_FLOAT) 582 .Attr("strides", {1, stride, stride, 1}) 583 .Attr("padding", padding == VALID ? "VALID" : "SAME") 584 .Finalize(conv)); 585 break; 586 case DEPTHWISE_CONV_OP_BACKPROP_FILTER: 587 TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d_backprop_filter", 588 "DepthwiseConv2dNativeBackpropFilter") 589 .Input("input", 0, DT_FLOAT) 590 .Input("filter_sizes", 0, DT_INT32) 591 .Input("output_backprop", 0, DT_FLOAT) 592 .Attr("strides", {1, stride, stride, 1}) 593 .Attr("padding", padding == VALID ? 
"VALID" : "SAME") 594 .Finalize(conv)); 595 break; 596 } 597 Graph* g = new Graph(OpRegistry::Global()); 598 GraphConstructorOptions opts; 599 TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g)); 600 601 string device = use_gpu ? "gpu" : "cpu"; 602 testing::UseRealTime(); 603 test::Benchmark(device, g, &options).Run(iters); 604 testing::ItemsProcessed(num_ops * iters); 605 } 606 607 // BS: batch_size 608 // R: tensor_in_rows 609 // C: tensor_in_cols 610 // ID: input_depth 611 // DM: depth_multiplier 612 // OD: output_depth 613 // KR: kernel_rows 614 // KC: kernel_cols 615 // STR: stride 616 // PAD: padding 617 618 #define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, \ 619 LABEL) \ 620 static void BM_ConvFloatDepthwiseFwdCPU1_##LABEL(int iters) { \ 621 BM_ConvFloatDepthwise( \ 622 iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \ 623 PAD, false, \ 624 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ 625 KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \ 626 } \ 627 static void BM_ConvFloatDepthwiseFwdCPU4_##LABEL(int iters) { \ 628 BM_ConvFloatDepthwise( \ 629 iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 4, STR, \ 630 PAD, false, \ 631 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ 632 KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \ 633 } \ 634 static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) { \ 635 BM_ConvFloatDepthwise( \ 636 iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \ 637 PAD, true, \ 638 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \ 639 KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \ 640 } \ 641 BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL); \ 642 BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL); \ 643 BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL); 644 645 // The configurations below are mostly from mobilenet models. 
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
// Benchmarks with different stride and padding options.
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7);
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);
BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 3, 3, 1, SAME, conv9);
BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 5, 5, 1, SAME, conv10);

// Defines and registers six backprop benchmarks for one depthwise-conv
// configuration: input backprop (BkIn) and filter backprop (BkFilter), each in
// a CPU1, CPU4 and GPU flavour. Every generated function forwards to
// BM_ConvFloatDepthwise with the matching DEPTHWISE_CONV_OP_BACKPROP_* value
// and a label assembled from the numeric arguments plus a "_cpu1"/"_cpu4"/
// "_gpu" suffix. LABEL is pasted into the generated function names, so it
// must be a valid identifier fragment that is unique per invocation.
// NOTE(review): the integer passed right after the op enum (1 or 4) is
// presumably the thread count; the GPU variants here pass 4 while the forward
// GPU variant passes 1 -- confirm whether that difference is intentional.
// The macro deliberately ends without a semicolon; call sites supply it.
#define BM_ConvFloatDepthwiseBk(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, LABEL) \
  static void BM_ConvFloatDepthwiseBkInCPU1_##LABEL(int iters) { \
    BM_ConvFloatDepthwise( \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_BACKPROP_INPUT, \
        1, STR, PAD, false, \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
  } \
  static void BM_ConvFloatDepthwiseBkInCPU4_##LABEL(int iters) { \
    BM_ConvFloatDepthwise( \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_BACKPROP_INPUT, \
        4, STR, PAD, false, \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
  } \
  static void BM_ConvFloatDepthwiseBkInGPU_##LABEL(int iters) { \
    BM_ConvFloatDepthwise( \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_BACKPROP_INPUT, \
        4, STR, PAD, true, \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
  } \
  static void BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL(int iters) { \
    BM_ConvFloatDepthwise( \
        iters, BS, R, C, ID, DM, OD, KR, KC, \
        DEPTHWISE_CONV_OP_BACKPROP_FILTER, 1, STR, PAD, false, \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu1")); \
  } \
  static void BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL(int iters) { \
    BM_ConvFloatDepthwise( \
        iters, BS, R, C, ID, DM, OD, KR, KC, \
        DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, false, \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu4")); \
  } \
  static void BM_ConvFloatDepthwiseBkFilterGPU_##LABEL(int iters) { \
    BM_ConvFloatDepthwise( \
        iters, BS, R, C, ID, DM, OD, KR, KC, \
        DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, true, \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_gpu")); \
  } \
  BENCHMARK(BM_ConvFloatDepthwiseBkInCPU1_##LABEL); \
  BENCHMARK(BM_ConvFloatDepthwiseBkInCPU4_##LABEL); \
  BENCHMARK(BM_ConvFloatDepthwiseBkInGPU_##LABEL); \
  BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL); \
  BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL); \
  BENCHMARK(BM_ConvFloatDepthwiseBkFilterGPU_##LABEL)

// The configurations below are mostly from mobilenet models.
710 BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0); 711 BM_ConvFloatDepthwiseBk(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1); 712 BM_ConvFloatDepthwiseBk(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2); 713 BM_ConvFloatDepthwiseBk(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3); 714 BM_ConvFloatDepthwiseBk(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4); 715 BM_ConvFloatDepthwiseBk(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5); 716 BM_ConvFloatDepthwiseBk(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6); 717 // Benchmarks with different stride and padding options, varying depth 718 // multiplier. 719 BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7); 720 BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8); 721 722 // Vary depth multiplier. 723 BM_ConvFloatDepthwiseBk(32, 112, 112, 1, 24, 24, 3, 3, 1, SAME, conv9); 724 BM_ConvFloatDepthwiseBk(32, 112, 112, 2, 12, 24, 3, 3, 1, SAME, conv10); 725 BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv11); 726 BM_ConvFloatDepthwiseBk(32, 112, 112, 8, 3, 24, 3, 3, 1, SAME, conv12); 727 BM_ConvFloatDepthwiseBk(32, 112, 112, 12, 2, 24, 3, 3, 1, SAME, conv13); 728 BM_ConvFloatDepthwiseBk(32, 112, 112, 24, 1, 24, 3, 3, 1, SAME, conv14); 729 730 static void BM_LRNFloat(int iters, int depth, int cols, int rows, 731 int batch_size, int range, int num_threads, 732 const string& label) { 733 tensorflow::testing::StopTiming(); 734 std::unique_ptr<Device> device( 735 DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); 736 737 thread::ThreadPool threadpool(Env::Default(), "test", num_threads); 738 EigenThreadPoolWrapper wrapper(&threadpool); 739 Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); 740 device->set_eigen_cpu_device(&eigen_cpu_device); 741 742 gtl::InlinedVector<TensorValue, 4> inputs; 743 TensorShape shape({batch_size, rows, cols, depth}); 744 745 Tensor input(DT_FLOAT, shape); 746 test::FillIota<float>(&input, 1.0); 747 
inputs.push_back({nullptr, &input}); 748 749 // Convolution op. 750 NodeDef lrn_node_def; 751 TF_CHECK_OK(NodeDefBuilder("lrn_op", "LRN") 752 .Input("input", 0, DT_FLOAT) 753 .Attr("depth_radius", range) 754 .Attr("bias", 1.0) 755 .Attr("alpha", 0.1) 756 .Attr("beta", 0.5) 757 .Finalize(&lrn_node_def)); 758 759 Status status; 760 std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(), 761 cpu_allocator(), lrn_node_def, 762 TF_GRAPH_DEF_VERSION, &status)); 763 TF_CHECK_OK(status); 764 765 OpKernelContext::Params params; 766 params.device = device.get(); 767 params.frame_iter = FrameAndIter(0, 0); 768 params.inputs = &inputs; 769 params.op_kernel = op.get(); 770 std::vector<AllocatorAttributes> attrs; 771 test::SetOutputAttrs(¶ms, &attrs); 772 773 std::unique_ptr<OpKernelContext> context(new OpKernelContext(¶ms)); 774 775 op->Compute(context.get()); 776 tensorflow::testing::StartTiming(); 777 for (int i = 0; i < iters; ++i) { 778 delete context->release_output(0).tensor; 779 op->Compute(context.get()); 780 } 781 tensorflow::testing::StopTiming(); 782 testing::ItemsProcessed(context->mutable_output(0)->NumElements() * iters * 783 (2 * range + 1) * 2); 784 testing::SetLabel(label); 785 } 786 787 #define BM_LRNFloatFwdCPU(DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL) \ 788 static void \ 789 BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS( \ 790 int iters) { \ 791 BM_LRNFloat(iters, DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL); \ 792 } \ 793 BENCHMARK( \ 794 BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS) 795 796 // clang-format off 797 // DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL 798 BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 1, "lrn 1 thread"); 799 BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 1, "lrn 1 thread"); 800 BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 1, "lrn 1 thread"); 801 BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 4, "lrn 4 threads"); 802 BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 4, "lrn 4 threads"); 803 
BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 4, "lrn 4 threads"); 804 BM_LRNFloatFwdCPU(64, 56, 56, 32, 5, 8, "lrn 8 threads"); 805 BM_LRNFloatFwdCPU(192, 28, 28, 64, 2, 8, "lrn 8 threads"); 806 BM_LRNFloatFwdCPU(192, 56, 56, 32, 5, 8, "lrn 8 threads"); 807 // clang-format on 808 809 /* 810 AvgPooling Op 811 */ 812 static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth, 813 int kernel_rows, int kernel_cols, int stride, 814 Padding padding, int num_threads, const string& label) { 815 tensorflow::testing::StopTiming(); 816 std::unique_ptr<Device> device( 817 DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); 818 819 thread::ThreadPool threadpool(Env::Default(), "test", num_threads); 820 EigenThreadPoolWrapper wrapper(&threadpool); 821 Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); 822 device->set_eigen_cpu_device(&eigen_cpu_device); 823 824 gtl::InlinedVector<TensorValue, 4> inputs; 825 TensorShape shape1({batch_size, rows, cols, depth}); 826 Tensor input1(DT_FLOAT, shape1); 827 test::FillIota<float>(&input1, 1.0); 828 inputs.push_back({nullptr, &input1}); 829 830 // AvgPooling op. 831 NodeDef avgpool_node_def; 832 CHECK_EQ(kernel_rows, kernel_cols); 833 Status status = NodeDefBuilder("avgpool_op", "AvgPool") 834 .Input(FakeInput(DT_FLOAT)) 835 .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) 836 .Attr("strides", {1, stride, stride, 1}) 837 .Attr("padding", padding == VALID ? 
"VALID" : "SAME") 838 .Finalize(&avgpool_node_def); 839 TF_CHECK_OK(status); 840 841 std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(), 842 cpu_allocator(), avgpool_node_def, 843 TF_GRAPH_DEF_VERSION, &status)); 844 TF_CHECK_OK(status); 845 OpKernelContext::Params params; 846 params.device = device.get(); 847 params.frame_iter = FrameAndIter(0, 0); 848 params.inputs = &inputs; 849 params.op_kernel = op.get(); 850 std::vector<AllocatorAttributes> attrs; 851 test::SetOutputAttrs(¶ms, &attrs); 852 853 std::unique_ptr<OpKernelContext> avgpool_context( 854 new OpKernelContext(¶ms)); 855 856 op->Compute(avgpool_context.get()); 857 tensorflow::testing::StartTiming(); 858 for (int i = 0; i < iters; ++i) { 859 delete avgpool_context->release_output(0).tensor; 860 op->Compute(avgpool_context.get()); 861 } 862 tensorflow::testing::StopTiming(); 863 testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() * 864 iters); 865 testing::SetLabel(label); 866 } 867 868 // BS: batch_size 869 // IR: input_rows 870 // IC: input_cols 871 // ND: node_depth 872 // KR: kernel_rows 873 // KC: kernel_cols 874 // ST: stride. We use the same stride for both directions. 
875 // PT: padding 876 #define BM_AvgPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ 877 static void \ 878 BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \ 879 int iters) { \ 880 BM_AvgPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \ 881 } \ 882 BENCHMARK( \ 883 BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) 884 885 // Labels are taken from the 2014-July-24 version of imagenet 886 BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "avgpool0_VALID"); 887 BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "avgpool1_VALID"); 888 BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "avgpool4_VALID"); 889 BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "avgpool10_VALID"); 890 BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "avgpool0_SAME"); 891 BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "avgpool1_SAME"); 892 BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "avgpool4_SAME"); 893 BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "avgpool10_SAME"); 894 BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "avgpool0_VALID"); 895 BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "avgpool1_VALID"); 896 BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "avgpool4_VALID"); 897 BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "avgpool10_VALID"); 898 BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "avgpool0_SAME"); 899 BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "avgpool1_SAME"); 900 BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "avgpool4_SAME"); 901 BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME"); 902 903 static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols, 904 int depth, int kernel_rows, int kernel_cols, 905 int stride, Padding padding, int num_threads, 906 const string& label) { 907 tensorflow::testing::StopTiming(); 908 std::unique_ptr<Device> device( 909 DeviceFactory::NewDevice("CPU", {}, 
"/job:a/replica:0/task:0")); 910 911 thread::ThreadPool threadpool(Env::Default(), "test", num_threads); 912 EigenThreadPoolWrapper wrapper(&threadpool); 913 Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); 914 device->set_eigen_cpu_device(&eigen_cpu_device); 915 916 gtl::InlinedVector<TensorValue, 4> inputs; 917 918 int64 out_height, out_width, pad_rows, pad_cols; 919 TF_CHECK_OK(GetWindowedOutputSize(rows, kernel_rows, stride, padding, 920 &out_height, &pad_rows)); 921 TF_CHECK_OK(GetWindowedOutputSize(cols, kernel_cols, stride, padding, 922 &out_width, &pad_cols)); 923 TensorShape output_shape({batch_size, out_height, out_width, depth}); 924 TensorShape shape2({4}); 925 Tensor input_shape_tensor(DT_INT32, shape2); 926 int32 input_dims[] = {batch_size, rows, cols, depth}; 927 for (int i = 0; i < 4; i++) { 928 input_shape_tensor.flat<int32>()(i) = input_dims[i]; 929 } 930 inputs.push_back({nullptr, &input_shape_tensor}); 931 932 Tensor output_backprop(DT_FLOAT, output_shape); 933 test::FillIota<float>(&output_backprop, 11.0); 934 inputs.push_back({nullptr, &output_backprop}); 935 936 // AvgPoolGrad op. 937 NodeDef avgpool_grad_node_def; 938 Status status = NodeDefBuilder("avgpool_grad_op", "AvgPoolGrad") 939 .Input(FakeInput()) 940 .Input(FakeInput(DT_FLOAT)) 941 .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) 942 .Attr("strides", {1, stride, stride, 1}) 943 .Attr("padding", padding == VALID ? 
"VALID" : "SAME") 944 .Finalize(&avgpool_grad_node_def); 945 TF_CHECK_OK(status); 946 std::unique_ptr<OpKernel> op( 947 CreateOpKernel(DEVICE_CPU, nullptr, cpu_allocator(), 948 avgpool_grad_node_def, TF_GRAPH_DEF_VERSION, &status)); 949 TF_CHECK_OK(status); 950 OpKernelContext::Params params; 951 params.device = device.get(); 952 params.frame_iter = FrameAndIter(0, 0); 953 params.inputs = &inputs; 954 params.op_kernel = op.get(); 955 std::vector<AllocatorAttributes> attrs; 956 test::SetOutputAttrs(¶ms, &attrs); 957 958 std::unique_ptr<OpKernelContext> avgpool_context( 959 new OpKernelContext(¶ms)); 960 961 op->Compute(avgpool_context.get()); 962 tensorflow::testing::StartTiming(); 963 for (int i = 0; i < iters; ++i) { 964 delete avgpool_context->release_output(0).tensor; 965 op->Compute(avgpool_context.get()); 966 } 967 tensorflow::testing::StopTiming(); 968 testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() * 969 iters); 970 testing::SetLabel(label); 971 } 972 973 // BS: batch_size 974 // IR: input_rows 975 // IC: input_cols 976 // ND: node_depth 977 // KR: kernel_rows 978 // KC: kernel_cols 979 // ST: stride. We use the same stride for both directions. 980 // PT: padding 981 // The resulted symbol is too long. 
Need to use two macros to fit in 80-chars 982 #define BM_AvgPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ 983 static void \ 984 BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \ 985 int iters) { \ 986 BM_AvgPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \ 987 } \ 988 BENCHMARK( \ 989 BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) 990 991 // Shapes taken from the 2015/05/16 inception model 992 BM_AvgPoolBkCPU(32, 35, 35, 192, 3, 3, 1, SAME, 1, "avgpool_grad0_SAME"); 993 BM_AvgPoolBkCPU(32, 35, 35, 256, 3, 3, 1, SAME, 1, "avgpool_grad1_SAME"); 994 BM_AvgPoolBkCPU(32, 17, 17, 768, 3, 3, 1, SAME, 1, "avgpool_grad2_SAME"); 995 BM_AvgPoolBkCPU(32, 17, 17, 1024, 3, 3, 1, SAME, 1, "avgpool_grad3_SAME"); 996 BM_AvgPoolBkCPU(32, 17, 17, 1152, 3, 3, 1, SAME, 1, "avgpool_grad4_SAME"); 997 BM_AvgPoolBkCPU(32, 17, 17, 1216, 3, 3, 1, SAME, 1, "avgpool_grad5_SAME"); 998 BM_AvgPoolBkCPU(32, 17, 17, 1248, 5, 5, 3, VALID, 1, "avgpool_grad6_VALID"); 999 BM_AvgPoolBkCPU(32, 8, 8, 1760, 3, 3, 1, SAME, 1, "avgpool_grad7_SAME"); 1000 BM_AvgPoolBkCPU(32, 8, 8, 2048, 8, 8, 1, VALID, 1, "avgpool_grad8_VALID"); 1001 1002 /* 1003 MaxPooling Op 1004 */ 1005 static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth, 1006 int kernel_rows, int kernel_cols, int stride, 1007 Padding padding, int num_threads, const string& label) { 1008 tensorflow::testing::StopTiming(); 1009 SessionOptions options; 1010 options.config.set_intra_op_parallelism_threads(num_threads); 1011 1012 std::unique_ptr<Device> device( 1013 DeviceFactory::NewDevice("CPU", options, "/job:a/replica:0/task:0")); 1014 1015 thread::ThreadPool threadpool(Env::Default(), "test", num_threads); 1016 EigenThreadPoolWrapper wrapper(&threadpool); 1017 Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); 1018 device->set_eigen_cpu_device(&eigen_cpu_device); 1019 1020 gtl::InlinedVector<TensorValue, 4> inputs; 1021 TensorShape 
shape1({batch_size, rows, cols, depth}); 1022 Tensor input1(DT_FLOAT, shape1); 1023 test::FillIota<float>(&input1, 1.0); 1024 inputs.push_back({nullptr, &input1}); 1025 1026 // MaxPooling op. 1027 NodeDef maxpool_node_def; 1028 CHECK_EQ(kernel_rows, kernel_cols); 1029 Status status = NodeDefBuilder("maxpool_op", "MaxPool") 1030 .Input(FakeInput()) 1031 .Attr("ksize", {1, kernel_rows, kernel_cols, 1}) 1032 .Attr("strides", {1, stride, stride, 1}) 1033 .Attr("padding", padding == VALID ? "VALID" : "SAME") 1034 .Finalize(&maxpool_node_def); 1035 TF_CHECK_OK(status); 1036 std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(), 1037 cpu_allocator(), maxpool_node_def, 1038 TF_GRAPH_DEF_VERSION, &status)); 1039 TF_CHECK_OK(status); 1040 OpKernelContext::Params params; 1041 params.device = device.get(); 1042 params.frame_iter = FrameAndIter(0, 0); 1043 params.inputs = &inputs; 1044 params.op_kernel = op.get(); 1045 std::vector<AllocatorAttributes> attrs; 1046 test::SetOutputAttrs(¶ms, &attrs); 1047 1048 std::unique_ptr<OpKernelContext> maxpool_context( 1049 new OpKernelContext(¶ms)); 1050 1051 op->Compute(maxpool_context.get()); 1052 tensorflow::testing::StartTiming(); 1053 for (int i = 0; i < iters; ++i) { 1054 delete maxpool_context->release_output(0).tensor; 1055 op->Compute(maxpool_context.get()); 1056 } 1057 tensorflow::testing::StopTiming(); 1058 testing::ItemsProcessed(maxpool_context->mutable_output(0)->NumElements() * 1059 iters); 1060 testing::SetLabel(label); 1061 } 1062 1063 // BS: batch_size 1064 // IR: input_rows 1065 // IC: input_cols 1066 // ND: node_depth 1067 // KR: kernel_rows 1068 // KC: kernel_cols 1069 // ST: stride. We use the same stride for both directions. 
1070 // PT: padding 1071 #define BM_MaxPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ 1072 static void \ 1073 BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \ 1074 int iters) { \ 1075 BM_MaxPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL); \ 1076 } \ 1077 BENCHMARK( \ 1078 BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH) 1079 1080 // Labels are taken from the 2014-July-24 version of imagenet 1081 /* TODO XXX 1082 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID"); 1083 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID"); 1084 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID"); 1085 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "maxpool10_VALID"); 1086 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME"); 1087 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME"); 1088 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME"); 1089 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME"); 1090 */ 1091 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID"); 1092 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID"); 1093 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID"); 1094 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "maxpool10_VALID"); 1095 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "maxpool0_SAME"); 1096 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "maxpool1_SAME"); 1097 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "maxpool4_SAME"); 1098 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME"); 1099 1100 static void BM_MaxPoolBk(int iters, int batch_size, int rows, int cols, 1101 int depth, int kernel_rows, int kernel_cols, 1102 int stride, Padding padding, int num_threads, 1103 bool use_gpu, const string& label) { 1104 auto root = Scope::NewRootScope().ExitOnError(); 1105 1106 
int64 out_height, out_width, pad_rows, pad_cols; 1107 TF_CHECK_OK(GetWindowedOutputSize(rows, kernel_rows, stride, padding, 1108 &out_height, &pad_rows)); 1109 TF_CHECK_OK(GetWindowedOutputSize(cols, kernel_cols, stride, padding, 1110 &out_width, &pad_cols)); 1111 1112 Tensor input_data(DT_FLOAT, TensorShape({batch_size, rows, cols, depth})); 1113 input_data.flat<float>().setRandom(); 1114 1115 Tensor output_data(DT_FLOAT, 1116 TensorShape({batch_size, out_height, out_width, depth})); 1117 output_data.flat<float>().setRandom(); 1118 1119 Tensor output_diff(DT_FLOAT, 1120 TensorShape({batch_size, out_height, out_width, depth})); 1121 output_diff.flat<float>().setRandom(); 1122 1123 CHECK_EQ(kernel_rows, kernel_cols); 1124 ops::internal::MaxPoolGrad(root, input_data, output_data, output_diff, 1125 {1, kernel_rows, kernel_cols, 1} /* ksize */, 1126 {1, stride, stride, 1} /* stride */, 1127 padding == VALID ? "VALID" : "SAME"); 1128 TF_CHECK_OK(root.status()); 1129 Graph* g = new Graph(OpRegistry::Global()); 1130 TF_CHECK_OK(root.ToGraph(g)); 1131 string device = use_gpu ? "gpu" : "cpu"; 1132 testing::UseRealTime(); 1133 test::Benchmark(device, g).Run(iters); 1134 1135 testing::ItemsProcessed(batch_size * rows * cols * depth * iters); 1136 testing::SetLabel(label); 1137 } 1138 1139 // BS: batch_size 1140 // IR: input_rows 1141 // IC: input_cols 1142 // ND: node_depth 1143 // KR: kernel_rows 1144 // KC: kernel_cols 1145 // ST: stride. We use the same stride for both directions. 1146 // PT: padding 1147 // The resulted symbol is too long. 
Need to use two macros to fit in 80-chars 1148 // clang-format off 1149 #define BM_MaxPoolBkGPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ 1150 static void \ 1151 BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ 1152 ##PT##_##TH( \ 1153 int iters) { \ 1154 BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, true, LABEL); \ 1155 } \ 1156 BENCHMARK( \ 1157 BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ 1158 ##PT##_##TH) \ 1159 1160 #define BM_MaxPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL) \ 1161 static void \ 1162 BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ 1163 ##PT##_##TH( \ 1164 int iters) { \ 1165 BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, false, LABEL); \ 1166 } \ 1167 BENCHMARK( \ 1168 BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_ \ 1169 ##PT##_##TH) 1170 // clang-format on 1171 1172 // Shapes taken from the 2015/05/16 inception model 1173 BM_MaxPoolBkGPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID"); 1174 BM_MaxPoolBkGPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID"); 1175 BM_MaxPoolBkGPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID"); 1176 BM_MaxPoolBkGPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID"); 1177 BM_MaxPoolBkGPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID"); 1178 1179 BM_MaxPoolBkCPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID"); 1180 BM_MaxPoolBkCPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID"); 1181 BM_MaxPoolBkCPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID"); 1182 BM_MaxPoolBkCPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID"); 1183 BM_MaxPoolBkCPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID"); 1184 1185 /* 1186 Relu Op 1187 Run benchmark with: 1188 */ 1189 static void BM_ReluFloat(int iters, int batch_size, int rows, int cols, 1190 int depth, int num_threads, const string& label) { 1191 
tensorflow::testing::StopTiming(); 1192 std::unique_ptr<Device> device( 1193 DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); 1194 1195 thread::ThreadPool threadpool(Env::Default(), "test", num_threads); 1196 EigenThreadPoolWrapper wrapper(&threadpool); 1197 Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads); 1198 device->set_eigen_cpu_device(&eigen_cpu_device); 1199 1200 gtl::InlinedVector<TensorValue, 4> inputs; 1201 TensorShape shape1({batch_size, rows, cols, depth}); 1202 Tensor input1(DT_FLOAT, shape1); 1203 test::FillIota<float>(&input1, 1.0); 1204 inputs.push_back({nullptr, &input1}); 1205 1206 // Reluing op. 1207 NodeDef relu_node_def; 1208 Status status = NodeDefBuilder("relu_op", "Relu") 1209 .Input(FakeInput(DT_FLOAT)) 1210 .Finalize(&relu_node_def); 1211 TF_CHECK_OK(status); 1212 std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(), 1213 cpu_allocator(), relu_node_def, 1214 TF_GRAPH_DEF_VERSION, &status)); 1215 TF_CHECK_OK(status); 1216 OpKernelContext::Params params; 1217 params.device = device.get(); 1218 params.frame_iter = FrameAndIter(0, 0); 1219 params.inputs = &inputs; 1220 params.op_kernel = op.get(); 1221 std::vector<AllocatorAttributes> attrs; 1222 test::SetOutputAttrs(¶ms, &attrs); 1223 1224 std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(¶ms)); 1225 1226 op->Compute(relu_context.get()); 1227 tensorflow::testing::StartTiming(); 1228 for (int i = 0; i < iters; ++i) { 1229 delete relu_context->release_output(0).tensor; 1230 op->Compute(relu_context.get()); 1231 } 1232 tensorflow::testing::StopTiming(); 1233 testing::ItemsProcessed(relu_context->mutable_output(0)->NumElements() * 1234 iters); 1235 testing::SetLabel(label); 1236 } 1237 1238 // BS: batch_size 1239 // IR: input_rows 1240 // IC: input_cols 1241 // ND: node_depth 1242 #define BM_Relu(BS, IR, IC, ND, TH, LABEL) \ 1243 static void BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH(int iters) { \ 1244 BM_ReluFloat(iters, BS, IR, 
IC, ND, TH, LABEL); \ 1245 } \ 1246 BENCHMARK(BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH) 1247 1248 BM_Relu(32, 112, 112, 64, 1, "relu0"); 1249 BM_Relu(32, 56, 56, 192, 1, "relu1"); 1250 BM_Relu(32, 28, 28, 352, 1, "relu4"); 1251 BM_Relu(32, 14, 14, 576, 1, "relu10"); 1252 BM_Relu(32, 112, 112, 64, 4, "relu0"); 1253 BM_Relu(32, 56, 56, 192, 4, "relu1"); 1254 BM_Relu(32, 28, 28, 352, 4, "relu4"); 1255 BM_Relu(32, 14, 14, 576, 4, "relu10"); 1256 1257 static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth, 1258 int num_threads, bool use_gpu, 1259 const string& label) { 1260 auto root = Scope::NewRootScope().ExitOnError(); 1261 1262 Tensor input(DT_FLOAT, TensorShape({batch_size, node_depth})); 1263 input.flat<float>().setRandom(); 1264 1265 auto softmax = ops::Softmax(root, input); 1266 1267 TF_CHECK_OK(root.status()); 1268 Graph* g = new Graph(OpRegistry::Global()); 1269 TF_CHECK_OK(root.ToGraph(g)); 1270 string device = use_gpu ? "gpu" : "cpu"; 1271 SessionOptions opts; 1272 opts.config.set_inter_op_parallelism_threads(1); 1273 opts.config.set_intra_op_parallelism_threads(num_threads); 1274 opts.config.set_use_per_session_threads(true); 1275 opts.config.mutable_graph_options() 1276 ->mutable_optimizer_options() 1277 ->set_opt_level(OptimizerOptions_Level_L0); 1278 testing::UseRealTime(); 1279 test::Benchmark(device, g, &opts).Run(iters); 1280 testing::ItemsProcessed(batch_size * node_depth * iters); 1281 testing::SetLabel(label); 1282 } 1283 1284 #define BM_ImageNetSoftmaxFwd(BATCH_SIZE, NODE_DEPTH, TH, GPU, LABEL) \ 1285 static void \ 1286 BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH##_##GPU( \ 1287 int iters) { \ 1288 BM_ImageNetSoftmaxFwd(iters, BATCH_SIZE, NODE_DEPTH, TH, GPU, LABEL); \ 1289 } \ 1290 BENCHMARK(BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH##_##GPU) 1291 1292 // Labels are taken from the 2014-July-24 version of imagenet 1293 BM_ImageNetSoftmaxFwd(32, 1008, 1, false, "softmax32"); 1294 
BM_ImageNetSoftmaxFwd(128, 1008, 1, false, "softmax128"); 1295 BM_ImageNetSoftmaxFwd(32, 1008, 4, false, "softmax32"); 1296 BM_ImageNetSoftmaxFwd(128, 1008, 4, false, "softmax128"); 1297 BM_ImageNetSoftmaxFwd(32, 1008, 1, true, "softmax32"); 1298 BM_ImageNetSoftmaxFwd(128, 1008, 1, true, "softmax128"); 1299 BM_ImageNetSoftmaxFwd(8192, 1024, 1, true, "softmax32"); 1300 BM_ImageNetSoftmaxFwd(8192, 32768, 1, true, "softmax128"); 1301 1302 static void BM_TopK(int iters, int rows, int cols, int k, int num_threads, 1303 bool use_gpu, const string& label) { 1304 testing::StopTiming(); 1305 auto root = Scope::NewRootScope().ExitOnError(); 1306 1307 Tensor input(DT_FLOAT, TensorShape({rows, cols})); 1308 input.flat<float>().setRandom(); 1309 1310 Tensor input_k(DT_INT32, TensorShape({})); 1311 input_k.scalar<int32>()() = k; 1312 1313 auto top_k = ops::TopK(root, input, input_k, ops::TopK::Sorted(true)); 1314 1315 TF_CHECK_OK(root.status()); 1316 Graph* g = new Graph(OpRegistry::Global()); 1317 TF_CHECK_OK(root.ToGraph(g)); 1318 string device = use_gpu ? 
"gpu" : "cpu"; 1319 SessionOptions opts; 1320 opts.config.set_inter_op_parallelism_threads(1); 1321 opts.config.set_intra_op_parallelism_threads(num_threads); 1322 opts.config.set_use_per_session_threads(true); 1323 opts.config.mutable_graph_options() 1324 ->mutable_optimizer_options() 1325 ->set_opt_level(OptimizerOptions_Level_L0); 1326 testing::UseRealTime(); 1327 testing::StartTiming(); 1328 test::Benchmark(device, g, &opts).Run(iters); 1329 testing::ItemsProcessed(rows * cols * iters); 1330 testing::SetLabel(label); 1331 } 1332 1333 // IR: input_rows 1334 // IC: input_cols 1335 // IK: k 1336 // TH: number of threads 1337 #define BM_TopKGPU(IR, IC, IK, TH, LABEL) \ 1338 static void BM_TopK_GPU_##IR##_##IC##_##IK##_##TH(int iters) { \ 1339 BM_TopK(iters, IR, IC, IK, TH, true, LABEL); \ 1340 } \ 1341 BENCHMARK(BM_TopK_GPU_##IR##_##IC##_##IK##_##TH) 1342 1343 #define BM_TopKCPU(IR, IC, IK, TH, LABEL) \ 1344 static void BM_TopK_CPU_##IR##_##IC##_##IK##_##TH(int iters) { \ 1345 BM_TopK(iters, IR, IC, IK, TH, false, LABEL); \ 1346 } \ 1347 BENCHMARK(BM_TopK_CPU_##IR##_##IC##_##IK##_##TH) 1348 1349 // clang-format on 1350 1351 BM_TopKCPU(1, 100, 1, 16, "topk_r_1_c_100_k_1_th_16"); 1352 BM_TopKCPU(1, 100, 2, 16, "topk_r_1_c_100_k_2_th_16"); 1353 BM_TopKCPU(1, 100, 10, 16, "topk_r_1_c_100_k_10_th_16"); 1354 BM_TopKCPU(1, 100, 50, 16, "topk_r_1_c_100_k_50_th_16"); 1355 BM_TopKCPU(1, 100, 100, 16, "topk_r_1_c_100_k_100_th_16"); 1356 BM_TopKCPU(32, 100, 1, 16, "topk_r_32_c_100_k_1_th_16"); 1357 BM_TopKCPU(32, 100, 2, 16, "topk_r_32_c_100_k_2_th_16"); 1358 BM_TopKCPU(32, 100, 10, 16, "topk_r_32_c_100_k_10_th_16"); 1359 BM_TopKCPU(32, 100, 50, 16, "topk_r_32_c_100_k_50_th_16"); 1360 BM_TopKCPU(32, 100, 100, 16, "topk_r_32_c_100_k_100_th_16"); 1361 BM_TopKCPU(128, 100, 1, 16, "topk_r_128_c_100_k_1_th_16"); 1362 BM_TopKCPU(128, 100, 2, 16, "topk_r_128_c_100_k_2_th_16"); 1363 BM_TopKCPU(128, 100, 10, 16, "topk_r_128_c_100_k_10_th_16"); 1364 BM_TopKCPU(128, 100, 50, 16, 
"topk_r_128_c_100_k_50_th_16"); 1365 BM_TopKCPU(128, 100, 100, 16, "topk_r_128_c_100_k_100_th_16"); 1366 BM_TopKCPU(128, 1000, 1, 16, "topk_r_128_c_1000_k_1_th_16"); 1367 BM_TopKCPU(128, 1000, 2, 16, "topk_r_128_c_1000_k_2_th_16"); 1368 BM_TopKCPU(128, 1000, 10, 16, "topk_r_128_c_1000_k_10_th_16"); 1369 BM_TopKCPU(128, 1000, 50, 16, "topk_r_128_c_1000_k_50_th_16"); 1370 BM_TopKCPU(128, 1000, 100, 16, "topk_r_128_c_1000_k_100_th_16"); 1371 BM_TopKCPU(128, 1000, 500, 16, "topk_r_128_c_1000_k_500_th_16"); 1372 BM_TopKCPU(128, 1000, 1000, 16, "topk_r_128_c_1000_k_1000_th_16"); 1373 1374 // From NMT Codebase: 1375 // batch_sizes: 16, 128 1376 // vocab_sizes: 10000 for small dataset, 35000 for large. 1377 // beam_widths: 1, 2, 5, 10 1378 BM_TopKCPU(16, 10000, 10000, 16, "topk_nmt_r_16_c_10000_k_10000_th_16"); 1379 BM_TopKCPU(16, 20000, 20000, 16, "topk_nmt_r_16_c_20000_k_20000_th_16"); 1380 BM_TopKCPU(16, 50000, 50000, 16, "topk_nmt_r_16_c_50000_k_50000_th_16"); 1381 BM_TopKCPU(16, 100000, 100000, 16, "topk_nmt_r_16_c_100000_k_100000_th_16"); 1382 BM_TopKCPU(16, 35000, 35000, 16, "topk_nmt_r_16_c_35000_k_35000_th_16"); 1383 BM_TopKCPU(16, 70000, 70000, 16, "topk_nmt_r_16_c_70000_k_70000_th_16"); 1384 BM_TopKCPU(16, 175000, 175000, 16, "topk_nmt_r_16_c_175000_k_175000_th_16"); 1385 BM_TopKCPU(16, 350000, 350000, 16, "topk_nmt_r_16_c_350000_k_350000_th_16"); 1386 BM_TopKCPU(128, 10000, 10000, 16, "topk_nmt_r_128_c_10000_k_10000_th_16"); 1387 BM_TopKCPU(128, 20000, 20000, 16, "topk_nmt_r_128_c_20000_k_20000_th_16"); 1388 BM_TopKCPU(128, 50000, 50000, 16, "topk_nmt_r_128_c_50000_k_50000_th_16"); 1389 BM_TopKCPU(128, 100000, 100000, 16, "topk_nmt_r_128_c_100000_k_100000_th_16"); 1390 BM_TopKCPU(128, 35000, 35000, 16, "topk_nmt_r_128_c_35000_k_35000_th_16"); 1391 BM_TopKCPU(128, 70000, 70000, 16, "topk_nmt_r_128_c_70000_k_70000_th_16"); 1392 BM_TopKCPU(128, 175000, 175000, 16, "topk_nmt_r_128_c_175000_k_175000_th_16"); 1393 BM_TopKCPU(128, 350000, 350000, 16, 
"topk_nmt_r_128_c_350000_k_350000_th_16"); 1394 1395 } // namespace tensorflow 1396