/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA

#include <functional>
#include <memory>
#include <unordered_map>
#include <vector>

#include "third_party/eigen3/Eigen/Core"
#include "tensorflow/cc/ops/const_op.h"
#include "tensorflow/cc/ops/nn_ops.h"
#include "tensorflow/cc/ops/nn_ops_internal.h"
#include "tensorflow/core/common_runtime/device_factory.h"
#include "tensorflow/core/common_runtime/eigen_thread_pool.h"
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/fake_input.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/node_def_builder.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_testutil.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/graph_constructor.h"
#include "tensorflow/core/graph/graph_def_builder.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
#include "tensorflow/core/public/session.h"
#include "tensorflow/core/public/version.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/port.h"

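// These are microbenchmarks for the convolution, depthwise convolution, LRN,
// and average-pooling kernels. A typical invocation (a sketch; the exact
// Bazel target name may vary by checkout) is:
//   bazel run -c opt --config=cuda //tensorflow/core/kernels:nn_ops_test \
//     -- --benchmarks=all
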
namespace tensorflow {

static void SetConstOp(const string& name, std::initializer_list<int64> dims,
                       DataType data_type, NodeDef* node) {
  Tensor tensor(data_type, TensorShape(dims));
  for (int64 i = 0; i < tensor.NumElements(); ++i) {
    switch (data_type) {
      case DT_FLOAT:
        tensor.flat<float>()(i) = i / 10.0f;
        break;
      case DT_HALF:
        tensor.flat<Eigen::half>()(i) = Eigen::half(i / 10.0f);
        break;
      default:
        LOG(FATAL) << "Unknown data type " << data_type;
    }
  }
  TF_CHECK_OK(NodeDefBuilder(name, "Const")
                  .Attr("dtype", data_type)
                  .Attr("value", tensor)
                  .Finalize(node));
}

static void SetConstSizesOp(const string& name, const std::vector<int32>& sizes,
                            NodeDef* node) {
  TensorShape shape;
  shape.AddDim(sizes.size());
  Tensor tensor(DT_INT32, shape);
  for (int64 i = 0; i < tensor.NumElements(); ++i) {
    tensor.flat<int32>()(i) = sizes[i];
  }
  TF_CHECK_OK(NodeDefBuilder(name, "Const")
                  .Attr("dtype", DT_INT32)
                  .Attr("value", tensor)
                  .Finalize(node));
}

namespace {

enum CONV_OP {
  CONV_OP_FORWARD = 0,
  CONV_OP_BACKPROP_INPUT = 1,
  CONV_OP_BACKPROP_FILTER = 2,
  CONV_OP_FUSED = 3,
};

}  // namespace

static void BM_ConvFloat(int iters, int batch, int rows, int cols, int in_depth,
                         int out_depth, int filter_rows, int filter_cols,
                         CONV_OP op, int num_threads, int stride,
                         Padding padding, bool use_gpu, DataType data_type,
                         const string& label) {
  if (!IsGoogleCudaEnabled() && use_gpu) {
    testing::SetLabel(
        strings::StrCat("Skipping GPU test (no --config=cuda): ", label));
    return;
  }
  testing::SetLabel(label);

  // Set the number of threads.
  SessionOptions options;
  options.config.set_intra_op_parallelism_threads(num_threads);

  // We set up a graph for computing convolution.
  GraphDef graph;

  // For this, we need an input tensor and a filter tensor.
  // Compute the output size.
  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
  TF_CHECK_OK(GetWindowedOutputSize(rows, filter_rows, stride, padding,
                                    &out_rows, &pad_rows));
  TF_CHECK_OK(GetWindowedOutputSize(cols, filter_cols, stride, padding,
                                    &out_cols, &pad_cols));
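  // With stride 1, SAME padding keeps out_rows == rows and out_cols == cols,
  // while VALID padding yields out_rows == rows - filter_rows + 1 (and
  // likewise for columns).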
  // Counting the number of floating point operations (both MUL and ADD).
  int64 num_ops = 0;
  if (op == CONV_OP_FORWARD) {
    // Forward computation:
    // BATCH x OUT_ROW x OUT_COL x IN_DEPTH x PATCH_ROW x PATCH_COL x OUT_DEPTH
    // We multiply by two since there are multiplications and additions.
    num_ops = static_cast<int64>(batch * in_depth * out_depth) *
              static_cast<int64>(filter_rows * filter_cols) *
              static_cast<int64>(out_rows * out_cols) * 2;
  } else {
    // Backward computation:
    // BATCH x IN_ROW x IN_COL x IN_DEPTH x PATCH_ROW x PATCH_COL x OUT_DEPTH
    // We multiply by two since there are multiplications and additions.
    num_ops = static_cast<int64>(batch * in_depth * out_depth) *
              static_cast<int64>(filter_rows * filter_cols) *
              static_cast<int64>(rows * cols) * 2;
  }
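  // As a concrete check, conv4 below (batch 32, 8x8 input, 448 -> 384
  // channels, 3x3 filter, stride 1, SAME) works out to
  //   32 * 448 * 384 * 3 * 3 * 8 * 8 * 2 ~= 6.3e9 ops per iteration.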

  SetConstOp("input", {batch, rows, cols, in_depth}, data_type,
             graph.add_node());
  SetConstOp("filter", {filter_rows, filter_cols, in_depth, out_depth},
             data_type, graph.add_node());
  SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth},
             data_type, graph.add_node());
  SetConstSizesOp("input_sizes",
                  std::vector<int32>({batch, rows, cols, in_depth}),
                  graph.add_node());
  SetConstSizesOp(
      "filter_sizes",
      std::vector<int32>({filter_rows, filter_cols, in_depth, out_depth}),
      graph.add_node());
  SetConstSizesOp("resize_size", std::vector<int32>({rows, cols}),
                  graph.add_node());

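  // The fused op takes an explicit paddings input; all zeros here means no
  // mirror padding is applied ahead of the convolution.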
  TensorShape paddings_shape({4, 2});
  Tensor paddings_tensor(DT_INT32, paddings_shape);
  for (int64 i = 0; i < paddings_tensor.NumElements(); ++i) {
    paddings_tensor.flat<int32>()(i) = 0;
  }
  TF_CHECK_OK(NodeDefBuilder("paddings", "Const")
                  .Attr("dtype", DT_INT32)
                  .Attr("value", paddings_tensor)
                  .Finalize(graph.add_node()));

  // Now add the convolution op.
  NodeDef* conv = graph.add_node();
  switch (op) {
    case CONV_OP_FORWARD:
      TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2D")
                      .Input("input", 0, data_type)
                      .Input("filter", 0, data_type)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(conv));
      break;
    case CONV_OP_BACKPROP_INPUT:
      TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropInput")
                      .Input("input_sizes", 0, DT_INT32)
                      .Input("filter", 0, data_type)
                      .Input("output_backprop", 0, data_type)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(conv));
      break;
    case CONV_OP_BACKPROP_FILTER:
      TF_CHECK_OK(NodeDefBuilder("conv2d", "Conv2DBackpropFilter")
                      .Input("input", 0, data_type)
                      .Input("filter_sizes", 0, DT_INT32)
                      .Input("output_backprop", 0, data_type)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(conv));
      break;
    case CONV_OP_FUSED:
      TF_CHECK_OK(NodeDefBuilder("conv2d", "FusedResizeAndPadConv2D")
                      .Input("input", 0, data_type)
                      .Input("resize_size", 0, DT_INT32)
                      .Input("paddings", 0, DT_INT32)
                      .Input("filter", 0, data_type)
                      .Attr("mode", "REFLECT")
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Attr("resize_align_corners", false)
                      .Finalize(conv));
      break;
  }
  Graph* g = new Graph(OpRegistry::Global());
  GraphConstructorOptions opts;
  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g));

  string device = use_gpu ? "gpu" : "cpu";
  testing::UseRealTime();
  test::Benchmark(device, g, &options).Run(iters);
  testing::ItemsProcessed(num_ops * iters);
}

// BS: batch_size
// R: tensor_in_rows
// C: tensor_in_cols
// ID: input_depth
// OD: output_depth
// KR: kernel_rows
// KC: kernel_cols
// STR: stride
// PAD: padding
#define BM_ConvFloatFwd(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL)             \
  static void BM_ConvFloatFwdCPU1_##LABEL(int iters) {                         \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR,     \
                 PAD, false, DT_FLOAT,                                         \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",    \
                                 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu1")); \
  }                                                                            \
  static void BM_ConvFloatFwdCPU4_##LABEL(int iters) {                         \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 4, STR,     \
                 PAD, false, DT_FLOAT,                                         \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",    \
                                 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu4")); \
  }                                                                            \
  static void BM_ConvFloatFusedCPU1_##LABEL(int iters) {                       \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FUSED, 1, STR, PAD,  \
                 false, DT_FLOAT,                                              \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",    \
                                 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu1")); \
  }                                                                            \
  static void BM_ConvFloatFusedCPU4_##LABEL(int iters) {                       \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FUSED, 4, STR, PAD,  \
                 false, DT_FLOAT,                                              \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",    \
                                 KR, "_", KC, "_", STR, "_", PAD, "_f_cpu4")); \
  }                                                                            \
  static void BM_ConvFloatFwdGPU_##LABEL(int iters) {                          \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR,     \
                 PAD, true, DT_FLOAT,                                          \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",    \
                                 KR, "_", KC, "_", STR, "_", PAD, "_f_gpu"));  \
  }                                                                            \
  static void BM_ConvHalfFwdGPU_##LABEL(int iters) {                           \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_FORWARD, 1, STR,     \
                 PAD, true, DT_HALF,                                           \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",    \
                                 KR, "_", KC, "_", STR, "_", PAD, "_h_gpu"));  \
  }                                                                            \
  BENCHMARK(BM_ConvFloatFwdCPU1_##LABEL);                                      \
  BENCHMARK(BM_ConvFloatFwdCPU4_##LABEL);                                      \
  BENCHMARK(BM_ConvFloatFusedCPU1_##LABEL);                                    \
  BENCHMARK(BM_ConvFloatFusedCPU4_##LABEL);                                    \
  BENCHMARK(BM_ConvFloatFwdGPU_##LABEL);                                       \
  BENCHMARK(BM_ConvHalfFwdGPU_##LABEL)
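
// Each BM_ConvFloatFwd instantiation below expands into six benchmarks:
// forward CPU with 1 and 4 threads, fused resize+pad+conv CPU with 1 and 4
// threads, and forward GPU in float and half precision.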

BM_ConvFloatFwd(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0);
BM_ConvFloatFwd(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1);
BM_ConvFloatFwd(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2);
BM_ConvFloatFwd(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3);
BM_ConvFloatFwd(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4);
BM_ConvFloatFwd(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5);
BM_ConvFloatFwd(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6);
BM_ConvFloatFwd(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7);
BM_ConvFloatFwd(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8);
BM_ConvFloatFwd(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9);
BM_ConvFloatFwd(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10);
BM_ConvFloatFwd(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11);
BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12);
BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13);
BM_ConvFloatFwd(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14);
BM_ConvFloatFwd(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15);
BM_ConvFloatFwd(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16);
BM_ConvFloatFwd(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17);
BM_ConvFloatFwd(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18);
BM_ConvFloatFwd(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19);
BM_ConvFloatFwd(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20);
BM_ConvFloatFwd(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21);
BM_ConvFloatFwd(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22);
BM_ConvFloatFwd(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23);
BM_ConvFloatFwd(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24);
BM_ConvFloatFwd(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25);
BM_ConvFloatFwd(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26);
BM_ConvFloatFwd(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27);
BM_ConvFloatFwd(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28);
BM_ConvFloatFwd(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29);
BM_ConvFloatFwd(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30);
BM_ConvFloatFwd(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31);
BM_ConvFloatFwd(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32);
BM_ConvFloatFwd(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33);
BM_ConvFloatFwd(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34);
BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35);
BM_ConvFloatFwd(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36);
BM_ConvFloatFwd(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37);
BM_ConvFloatFwd(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38);
BM_ConvFloatFwd(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39);
BM_ConvFloatFwd(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40);
BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41);
BM_ConvFloatFwd(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42);
BM_ConvFloatFwd(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43);
BM_ConvFloatFwd(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44);
BM_ConvFloatFwd(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45);
BM_ConvFloatFwd(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46);
BM_ConvFloatFwd(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47);
BM_ConvFloatFwd(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48);
BM_ConvFloatFwd(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49);
BM_ConvFloatFwd(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50);
BM_ConvFloatFwd(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51);
BM_ConvFloatFwd(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52);
BM_ConvFloatFwd(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53);
BM_ConvFloatFwd(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54);

#define BM_ConvFloatBkInAndFilter(BS, R, C, ID, OD, KR, KC, STR, PAD, LABEL)  \
  static void BM_ConvFloatBkInCPU1_##LABEL(int iters) {                       \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1,  \
                 STR, PAD, false, DT_FLOAT,                                   \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_cpu1"));  \
  }                                                                           \
  static void BM_ConvFloatBkInCPU4_##LABEL(int iters) {                       \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 4,  \
                 STR, PAD, false, DT_FLOAT,                                   \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));  \
  }                                                                           \
  static void BM_ConvFloatBkInGPU_##LABEL(int iters) {                        \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1,  \
                 STR, PAD, true, DT_FLOAT,                                    \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_gpu"));   \
  }                                                                           \
  static void BM_ConvFloatBkFilterCPU1_##LABEL(int iters) {                   \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
                 STR, PAD, false, DT_FLOAT,                                   \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_cpu1"));  \
  }                                                                           \
  static void BM_ConvFloatBkFilterCPU4_##LABEL(int iters) {                   \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 4, \
                 STR, PAD, false, DT_FLOAT,                                   \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));  \
  }                                                                           \
  static void BM_ConvFloatBkFilterGPU_##LABEL(int iters) {                    \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
                 STR, PAD, true, DT_FLOAT,                                    \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_gpu"));   \
  }                                                                           \
  static void BM_ConvHalfBkInGPU_##LABEL(int iters) {                         \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_INPUT, 1,  \
                 STR, PAD, true, DT_HALF,                                     \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_gpu"));   \
  }                                                                           \
  static void BM_ConvHalfBkFilterGPU_##LABEL(int iters) {                     \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1, \
                 STR, PAD, true, DT_HALF,                                     \
                 strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", OD, "_",   \
                                 KR, "_", KC, "_", STR, "_", PAD, "_gpu"));   \
  }                                                                           \
  BENCHMARK(BM_ConvFloatBkInCPU1_##LABEL);                                    \
  BENCHMARK(BM_ConvFloatBkInCPU4_##LABEL);                                    \
  BENCHMARK(BM_ConvFloatBkInGPU_##LABEL);                                     \
  BENCHMARK(BM_ConvFloatBkFilterCPU1_##LABEL);                                \
  BENCHMARK(BM_ConvFloatBkFilterCPU4_##LABEL);                                \
  BENCHMARK(BM_ConvFloatBkFilterGPU_##LABEL);                                 \
  BENCHMARK(BM_ConvHalfBkInGPU_##LABEL);                                      \
  BENCHMARK(BM_ConvHalfBkFilterGPU_##LABEL)

// Benchmarks from the inception model

BM_ConvFloatBkInAndFilter(32, 5, 5, 1248, 128, 1, 1, 1, SAME, conv0);
BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 1, 3, 1, SAME, conv1);
BM_ConvFloatBkInAndFilter(32, 8, 8, 384, 384, 3, 1, 1, SAME, conv2);
BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 192, 1, 1, 1, SAME, conv3);
BM_ConvFloatBkInAndFilter(32, 8, 8, 448, 384, 3, 3, 1, SAME, conv4);
BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 320, 1, 1, 1, SAME, conv5);
BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 448, 1, 1, 1, SAME, conv6);
BM_ConvFloatBkInAndFilter(32, 8, 8, 2048, 384, 1, 1, 1, SAME, conv7);
BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 384, 1, 1, 1, SAME, conv8);
BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 192, 1, 1, 1, SAME, conv9);
BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 448, 1, 1, 1, SAME, conv10);
BM_ConvFloatBkInAndFilter(32, 8, 8, 1760, 320, 1, 1, 1, SAME, conv11);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 2, VALID, conv12);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 3, 1, SAME, conv13);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 192, 1, 1, 1, SAME, conv14);
BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 320, 3, 3, 2, VALID, conv15);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1248, 128, 1, 1, 1, SAME, conv16);
BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 1, 3, 1, SAME, conv17);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 3, 1, 1, SAME, conv18);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 256, 1, 3, 1, SAME, conv19);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 192, 1, 1, 1, SAME, conv20);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1216, 96, 1, 1, 1, SAME, conv21);
BM_ConvFloatBkInAndFilter(32, 17, 17, 224, 224, 3, 1, 1, SAME, conv22);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 224, 3, 3, 1, SAME, conv23);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 1, 3, 1, SAME, conv24);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 192, 1, 1, 1, SAME, conv25);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 128, 1, 1, 1, SAME, conv26);
BM_ConvFloatBkInAndFilter(32, 17, 17, 192, 192, 3, 1, 1, SAME, conv27);
BM_ConvFloatBkInAndFilter(32, 17, 17, 160, 192, 3, 3, 1, SAME, conv28);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1152, 160, 1, 1, 1, SAME, conv29);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 128, 1, 1, 1, SAME, conv30);
BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 1, 3, 1, SAME, conv31);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 160, 1, 1, 1, SAME, conv32);
BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 192, 3, 1, 1, SAME, conv33);
BM_ConvFloatBkInAndFilter(32, 17, 17, 1024, 256, 1, 1, 1, SAME, conv34);
BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 1, 1, SAME, conv35);
BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 192, 1, 1, 1, SAME, conv36);
BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 1, 3, 1, SAME, conv37);
BM_ConvFloatBkInAndFilter(32, 17, 17, 128, 128, 3, 3, 1, SAME, conv38);
BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 128, 1, 1, 1, SAME, conv39);
BM_ConvFloatBkInAndFilter(32, 17, 17, 768, 320, 1, 1, 1, SAME, conv40);
BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 2, VALID, conv41);
BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 384, 3, 3, 2, VALID, conv42);
BM_ConvFloatBkInAndFilter(32, 35, 35, 64, 96, 3, 3, 1, SAME, conv43);
BM_ConvFloatBkInAndFilter(32, 35, 35, 288, 64, 1, 1, 1, SAME, conv44);
BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 64, 1, 1, 1, SAME, conv45);
BM_ConvFloatBkInAndFilter(32, 35, 35, 48, 64, 5, 5, 1, SAME, conv46);
BM_ConvFloatBkInAndFilter(32, 35, 35, 256, 48, 1, 1, 1, SAME, conv47);
BM_ConvFloatBkInAndFilter(32, 35, 35, 96, 96, 3, 3, 1, SAME, conv48);
BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 32, 1, 1, 1, SAME, conv49);
BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 64, 1, 1, 1, SAME, conv50);
BM_ConvFloatBkInAndFilter(32, 35, 35, 192, 48, 1, 1, 1, SAME, conv51);
BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 192, 3, 3, 1, VALID, conv52);
BM_ConvFloatBkInAndFilter(32, 73, 73, 64, 64, 1, 1, 1, VALID, conv53);
BM_ConvFloatBkInAndFilter(32, 147, 147, 24, 64, 1, 1, 1, VALID, conv54);

#define BM_ConvFloatBkFCPU(BS, R, C, ID, OD, KR, KC, TH, LABEL)                \
  static void                                                                  \
      BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH(  \
          int iters) {                                                         \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, TH, \
                 1, VALID, false, DT_FLOAT, LABEL);                            \
  }                                                                            \
  BENCHMARK(                                                                   \
      BM_ConvFloatBkFCPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC##_##TH);

// Benchmarks from https://github.com/soumith/convnet-benchmarks
BM_ConvFloatBkFCPU(128, 128, 128, 3, 96, 11, 11, 4, "convnet-layer1");
BM_ConvFloatBkFCPU(128, 64, 64, 64, 128, 9, 9, 4, "convnet-layer2");
BM_ConvFloatBkFCPU(128, 32, 32, 128, 128, 9, 9, 4, "convnet-layer3");
BM_ConvFloatBkFCPU(128, 16, 16, 128, 128, 7, 7, 4, "convnet-layer4");
BM_ConvFloatBkFCPU(128, 13, 13, 384, 384, 3, 3, 4, "convnet-layer5");

#define BM_ConvFloatBkFGPU(BS, R, C, ID, OD, KR, KC, LABEL)                    \
  static void BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC( \
      int iters) {                                                             \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1,  \
                 1, VALID, true, DT_FLOAT, LABEL);                             \
  }                                                                            \
  static void BM_ConvHalfBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC(  \
      int iters) {                                                             \
    BM_ConvFloat(iters, BS, R, C, ID, OD, KR, KC, CONV_OP_BACKPROP_FILTER, 1,  \
                 1, VALID, true, DT_HALF, LABEL);                              \
  }                                                                            \
  BENCHMARK(BM_ConvFloatBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC);  \
  BENCHMARK(BM_ConvHalfBkFGPU_##BS##_##R##_##C##_##ID##_##OD##_##KR##_##KC)

// Benchmarks from https://github.com/soumith/convnet-benchmarks
BM_ConvFloatBkFGPU(128, 128, 128, 3, 96, 11, 11, "convnet-layer1");
BM_ConvFloatBkFGPU(128, 64, 64, 64, 128, 9, 9, "convnet-layer2");
BM_ConvFloatBkFGPU(128, 32, 32, 128, 128, 9, 9, "convnet-layer3");
BM_ConvFloatBkFGPU(128, 16, 16, 128, 128, 7, 7, "convnet-layer4");
BM_ConvFloatBkFGPU(128, 13, 13, 384, 384, 3, 3, "convnet-layer5");

namespace {

enum DEPTHWISE_CONV_OP {
  DEPTHWISE_CONV_OP_FWD = 0,
  DEPTHWISE_CONV_OP_BACKPROP_INPUT = 1,
  DEPTHWISE_CONV_OP_BACKPROP_FILTER = 2
};

}  // namespace

static void BM_ConvFloatDepthwise(int iters, int batch, int rows, int cols,
                                  int in_depth, int depth_multiplier,
                                  int out_depth, int filter_rows,
                                  int filter_cols, DEPTHWISE_CONV_OP op,
                                  int num_threads, int stride, Padding padding,
                                  bool use_gpu, const string& label) {
  if (!IsGoogleCudaEnabled() && use_gpu) {
    testing::SetLabel(
        strings::StrCat("Skipping GPU test (no --config=cuda): ", label));
    return;
  }
  testing::SetLabel(label);

  // Set the number of threads.
  SessionOptions options;
  options.config.set_intra_op_parallelism_threads(num_threads);

  // We set up a graph for computing convolution.
  GraphDef graph;

  // For this, we need an input tensor and a filter tensor.
  // Compute the output size.
  int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
  TF_CHECK_OK(GetWindowedOutputSize(rows, filter_rows, stride, padding,
                                    &out_rows, &pad_rows));
  TF_CHECK_OK(GetWindowedOutputSize(cols, filter_cols, stride, padding,
                                    &out_cols, &pad_cols));

  int64 num_ops = 0;
  if (op == DEPTHWISE_CONV_OP_FWD) {
    // Counting the number of floating point operations (both MUL and ADD).
    // Forward computation:
    // BATCH x OUT_ROW x OUT_COL x FLTR_ROW x FLTR_COL x DEPTH_MULT x IN_DEPTH
    // We multiply by two since there are multiplications and additions.
    num_ops = static_cast<int64>(batch * out_rows * out_cols) *
              static_cast<int64>(filter_rows * filter_cols) *
              static_cast<int64>(in_depth * depth_multiplier) * 2;
  } else {
    // Backward computation: both input and filter backprop take the same
    // amount of computation:
    // BATCH x IN_ROW x IN_COL x FLTR_ROW x FLTR_COL x DEPTH_MULT x IN_DEPTH
    // We multiply by two since there are multiplications and additions.
    // We divide by stride squared to approximate the effect of the decreasing
    // number of bprop output points per bprop input point with increasing
    // stride.
    num_ops = (static_cast<int64>(batch * rows * cols) *
               static_cast<int64>(filter_rows * filter_cols) *
               static_cast<int64>(in_depth * depth_multiplier) * 2) /
              (stride * stride);
  }
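
  // As a concrete check, conv1 below (batch 32, 112x112 input, 64 channels,
  // depth multiplier 1, 3x3 filter, stride 1, SAME) works out to
  //   32 * 112 * 112 * 3 * 3 * 64 * 1 * 2 ~= 4.6e8 ops per iteration.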

  // FIXME: unlike BM_ConvFloat above, these depthwise benchmarks are
  // float-only; the data type is not parameterized.
  SetConstOp("input", {batch, rows, cols, in_depth}, DT_FLOAT,
             graph.add_node());
  SetConstOp("depthwise_filter",
             {filter_rows, filter_cols, in_depth, depth_multiplier}, DT_FLOAT,
             graph.add_node());
  SetConstOp("output_backprop", {batch, out_rows, out_cols, out_depth},
             DT_FLOAT, graph.add_node());
  SetConstSizesOp("input_sizes",
                  std::vector<int32>({batch, rows, cols, in_depth}),
                  graph.add_node());
  SetConstSizesOp("filter_sizes",
                  std::vector<int32>(
                      {filter_rows, filter_cols, in_depth, depth_multiplier}),
                  graph.add_node());

  // Now add the convolution op.
  NodeDef* conv = graph.add_node();
  switch (op) {
    case DEPTHWISE_CONV_OP_FWD:
      TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d", "DepthwiseConv2dNative")
                      .Input("input", 0, DT_FLOAT)
                      .Input("depthwise_filter", 0, DT_FLOAT)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(conv));
      break;
    case DEPTHWISE_CONV_OP_BACKPROP_INPUT:
      TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d_backprop_input",
                                 "DepthwiseConv2dNativeBackpropInput")
                      .Input("input_sizes", 0, DT_INT32)
                      .Input("depthwise_filter", 0, DT_FLOAT)
                      .Input("output_backprop", 0, DT_FLOAT)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(conv));
      break;
    case DEPTHWISE_CONV_OP_BACKPROP_FILTER:
      TF_CHECK_OK(NodeDefBuilder("depthwise_conv2d_backprop_filter",
                                 "DepthwiseConv2dNativeBackpropFilter")
                      .Input("input", 0, DT_FLOAT)
                      .Input("filter_sizes", 0, DT_INT32)
                      .Input("output_backprop", 0, DT_FLOAT)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(conv));
      break;
  }
  Graph* g = new Graph(OpRegistry::Global());
  GraphConstructorOptions opts;
  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph, g));

  string device = use_gpu ? "gpu" : "cpu";
  testing::UseRealTime();
  test::Benchmark(device, g, &options).Run(iters);
  testing::ItemsProcessed(num_ops * iters);
}

// BS: batch_size
// R: tensor_in_rows
// C: tensor_in_cols
// ID: input_depth
// DM: depth_multiplier
// OD: output_depth
// KR: kernel_rows
// KC: kernel_cols
// STR: stride
// PAD: padding

#define BM_ConvFloatDepthwiseFwd(BS, R, C, ID, DM, OD, KR, KC, STR, PAD,    \
                                 LABEL)                                     \
  static void BM_ConvFloatDepthwiseFwdCPU1_##LABEL(int iters) {             \
    BM_ConvFloatDepthwise(                                                  \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \
        PAD, false,                                                         \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu1"));         \
  }                                                                         \
  static void BM_ConvFloatDepthwiseFwdCPU4_##LABEL(int iters) {             \
    BM_ConvFloatDepthwise(                                                  \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 4, STR, \
        PAD, false,                                                         \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));         \
  }                                                                         \
  static void BM_ConvFloatDepthwiseFwdGPU_##LABEL(int iters) {              \
    BM_ConvFloatDepthwise(                                                  \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_FWD, 1, STR, \
        PAD, true,                                                          \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_", \
                        KR, "_", KC, "_", STR, "_", PAD, "_gpu"));          \
  }                                                                         \
  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU1_##LABEL);                          \
  BENCHMARK(BM_ConvFloatDepthwiseFwdCPU4_##LABEL);                          \
  BENCHMARK(BM_ConvFloatDepthwiseFwdGPU_##LABEL);

// The configurations below are mostly from mobilenet models.
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
BM_ConvFloatDepthwiseFwd(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
BM_ConvFloatDepthwiseFwd(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
BM_ConvFloatDepthwiseFwd(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
BM_ConvFloatDepthwiseFwd(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
BM_ConvFloatDepthwiseFwd(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
// Benchmarks with different stride and padding options.
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7);
BM_ConvFloatDepthwiseFwd(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);
BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 3, 3, 1, SAME, conv9);
BM_ConvFloatDepthwiseFwd(1, 100, 100, 72, 1, 72, 5, 5, 1, SAME, conv10);

#define BM_ConvFloatDepthwiseBk(BS, R, C, ID, DM, OD, KR, KC, STR, PAD, LABEL) \
  static void BM_ConvFloatDepthwiseBkInCPU1_##LABEL(int iters) {               \
    BM_ConvFloatDepthwise(                                                     \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_BACKPROP_INPUT, \
        1, STR, PAD, false,                                                    \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",    \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu1"));            \
  }                                                                            \
  static void BM_ConvFloatDepthwiseBkInCPU4_##LABEL(int iters) {               \
    BM_ConvFloatDepthwise(                                                     \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_BACKPROP_INPUT, \
        4, STR, PAD, false,                                                    \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",    \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));            \
  }                                                                            \
  static void BM_ConvFloatDepthwiseBkInGPU_##LABEL(int iters) {                \
    BM_ConvFloatDepthwise(                                                     \
        iters, BS, R, C, ID, DM, OD, KR, KC, DEPTHWISE_CONV_OP_BACKPROP_INPUT, \
        4, STR, PAD, true,                                                     \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",    \
                        KR, "_", KC, "_", STR, "_", PAD, "_gpu"));             \
  }                                                                            \
  static void BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL(int iters) {           \
    BM_ConvFloatDepthwise(                                                     \
        iters, BS, R, C, ID, DM, OD, KR, KC,                                   \
        DEPTHWISE_CONV_OP_BACKPROP_FILTER, 1, STR, PAD, false,                 \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",    \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu1"));            \
  }                                                                            \
  static void BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL(int iters) {           \
    BM_ConvFloatDepthwise(                                                     \
        iters, BS, R, C, ID, DM, OD, KR, KC,                                   \
        DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, false,                 \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",    \
                        KR, "_", KC, "_", STR, "_", PAD, "_cpu4"));            \
  }                                                                            \
  static void BM_ConvFloatDepthwiseBkFilterGPU_##LABEL(int iters) {            \
    BM_ConvFloatDepthwise(                                                     \
        iters, BS, R, C, ID, DM, OD, KR, KC,                                   \
        DEPTHWISE_CONV_OP_BACKPROP_FILTER, 4, STR, PAD, true,                  \
        strings::StrCat(BS, "_", R, "_", C, "_", ID, "_", DM, "_", OD, "_",    \
                        KR, "_", KC, "_", STR, "_", PAD, "_gpu"));             \
  }                                                                            \
  BENCHMARK(BM_ConvFloatDepthwiseBkInCPU1_##LABEL);                            \
  BENCHMARK(BM_ConvFloatDepthwiseBkInCPU4_##LABEL);                            \
  BENCHMARK(BM_ConvFloatDepthwiseBkInGPU_##LABEL);                             \
  BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU1_##LABEL);                        \
  BENCHMARK(BM_ConvFloatDepthwiseBkFilterCPU4_##LABEL);                        \
  BENCHMARK(BM_ConvFloatDepthwiseBkFilterGPU_##LABEL)

// The configurations below are mostly from mobilenet models.
BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv0);
BM_ConvFloatDepthwiseBk(32, 112, 112, 64, 1, 64, 3, 3, 1, SAME, conv1);
BM_ConvFloatDepthwiseBk(32, 56, 56, 128, 1, 128, 3, 3, 1, SAME, conv2);
BM_ConvFloatDepthwiseBk(32, 56, 56, 128, 1, 128, 3, 3, 2, SAME, conv3);
BM_ConvFloatDepthwiseBk(32, 28, 28, 128, 1, 128, 3, 3, 1, SAME, conv4);
BM_ConvFloatDepthwiseBk(32, 14, 14, 512, 1, 512, 3, 3, 1, SAME, conv5);
BM_ConvFloatDepthwiseBk(32, 7, 7, 1024, 1, 1024, 3, 3, 1, SAME, conv6);
// Benchmarks with different stride and padding options.
BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 2, SAME, conv7);
BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 2, VALID, conv8);

// Vary depth multiplier.
BM_ConvFloatDepthwiseBk(32, 112, 112, 1, 24, 24, 3, 3, 1, SAME, conv9);
BM_ConvFloatDepthwiseBk(32, 112, 112, 2, 12, 24, 3, 3, 1, SAME, conv10);
BM_ConvFloatDepthwiseBk(32, 112, 112, 3, 8, 24, 3, 3, 1, SAME, conv11);
BM_ConvFloatDepthwiseBk(32, 112, 112, 8, 3, 24, 3, 3, 1, SAME, conv12);
BM_ConvFloatDepthwiseBk(32, 112, 112, 12, 2, 24, 3, 3, 1, SAME, conv13);
BM_ConvFloatDepthwiseBk(32, 112, 112, 24, 1, 24, 3, 3, 1, SAME, conv14);

static void BM_LRNFloat(int iters, int depth, int cols, int rows,
                        int batch_size, int range, int num_threads,
                        const string& label) {
  tensorflow::testing::StopTiming();
  std::unique_ptr<Device> device(
      DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));

  thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
  EigenThreadPoolWrapper wrapper(&threadpool);
  Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
  device->set_eigen_cpu_device(&eigen_cpu_device);

  gtl::InlinedVector<TensorValue, 4> inputs;
  TensorShape shape({batch_size, rows, cols, depth});

  Tensor input(DT_FLOAT, shape);
  test::FillIota<float>(&input, 1.0);
  inputs.push_back({nullptr, &input});

  // LRN op.
  NodeDef lrn_node_def;
  TF_CHECK_OK(NodeDefBuilder("lrn_op", "LRN")
                  .Input("input", 0, DT_FLOAT)
                  .Attr("depth_radius", range)
                  .Attr("bias", 1.0)
                  .Attr("alpha", 0.1)
                  .Attr("beta", 0.5)
                  .Finalize(&lrn_node_def));

  Status status;
  std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
                                              cpu_allocator(), lrn_node_def,
                                              TF_GRAPH_DEF_VERSION, &status));
  TF_CHECK_OK(status);

  OpKernelContext::Params params;
  params.device = device.get();
  params.frame_iter = FrameAndIter(0, 0);
  params.inputs = &inputs;
  params.op_kernel = op.get();
  std::vector<AllocatorAttributes> attrs;
  test::SetOutputAttrs(&params, &attrs);

  std::unique_ptr<OpKernelContext> context(new OpKernelContext(&params));

  op->Compute(context.get());
  tensorflow::testing::StartTiming();
  for (int i = 0; i < iters; ++i) {
    delete context->release_output(0).tensor;
    op->Compute(context.get());
  }
  tensorflow::testing::StopTiming();
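  // Each output element reads a window of (2 * range + 1) inputs along the
  // depth dimension, so count one multiply and one add per window element.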
  testing::ItemsProcessed(context->mutable_output(0)->NumElements() * iters *
                          (2 * range + 1) * 2);
  testing::SetLabel(label);
}

#define BM_LRNFloatFwdCPU(DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL)   \
  static void                                                                \
      BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS( \
          int iters) {                                                       \
    BM_LRNFloat(iters, DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL);     \
  }                                                                          \
  BENCHMARK(                                                                 \
      BM_LRNFloat_##DEPTH##_##COLS##_##ROWS##_##BATCH##_##RANGE##_##THREADS)

// clang-format off
//                DEPTH, COLS, ROWS, BATCH, RANGE, THREADS, LABEL
BM_LRNFloatFwdCPU(64,    56,   56,   32,    5,     1,       "lrn 1 thread");
BM_LRNFloatFwdCPU(192,   28,   28,   64,    2,     1,       "lrn 1 thread");
BM_LRNFloatFwdCPU(192,   56,   56,   32,    5,     1,       "lrn 1 thread");
BM_LRNFloatFwdCPU(64,    56,   56,   32,    5,     4,       "lrn 4 threads");
BM_LRNFloatFwdCPU(192,   28,   28,   64,    2,     4,       "lrn 4 threads");
BM_LRNFloatFwdCPU(192,   56,   56,   32,    5,     4,       "lrn 4 threads");
BM_LRNFloatFwdCPU(64,    56,   56,   32,    5,     8,       "lrn 8 threads");
BM_LRNFloatFwdCPU(192,   28,   28,   64,    2,     8,       "lrn 8 threads");
BM_LRNFloatFwdCPU(192,   56,   56,   32,    5,     8,       "lrn 8 threads");
// clang-format on

/*
AvgPooling Op
*/
static void BM_AvgPool(int iters, int batch_size, int rows, int cols, int depth,
                       int kernel_rows, int kernel_cols, int stride,
                       Padding padding, int num_threads, const string& label) {
  tensorflow::testing::StopTiming();
  std::unique_ptr<Device> device(
      DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));

  thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
  EigenThreadPoolWrapper wrapper(&threadpool);
  Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
  device->set_eigen_cpu_device(&eigen_cpu_device);

  gtl::InlinedVector<TensorValue, 4> inputs;
  TensorShape shape1({batch_size, rows, cols, depth});
  Tensor input1(DT_FLOAT, shape1);
  test::FillIota<float>(&input1, 1.0);
  inputs.push_back({nullptr, &input1});

  // AvgPooling op.
  NodeDef avgpool_node_def;
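  // Only square pooling windows are supported by this benchmark.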
  CHECK_EQ(kernel_rows, kernel_cols);
  Status status = NodeDefBuilder("avgpool_op", "AvgPool")
                      .Input(FakeInput(DT_FLOAT))
                      .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", padding == VALID ? "VALID" : "SAME")
                      .Finalize(&avgpool_node_def);
  TF_CHECK_OK(status);

  std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
                                              cpu_allocator(), avgpool_node_def,
                                              TF_GRAPH_DEF_VERSION, &status));
  TF_CHECK_OK(status);
  OpKernelContext::Params params;
  params.device = device.get();
  params.frame_iter = FrameAndIter(0, 0);
  params.inputs = &inputs;
  params.op_kernel = op.get();
  std::vector<AllocatorAttributes> attrs;
  test::SetOutputAttrs(&params, &attrs);

  std::unique_ptr<OpKernelContext> avgpool_context(
      new OpKernelContext(&params));

  op->Compute(avgpool_context.get());
  tensorflow::testing::StartTiming();
  for (int i = 0; i < iters; ++i) {
    delete avgpool_context->release_output(0).tensor;
    op->Compute(avgpool_context.get());
  }
  tensorflow::testing::StopTiming();
  testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() *
                          iters);
  testing::SetLabel(label);
}

// BS: batch_size
// IR: input_rows
// IC: input_cols
// ND: node_depth
// KR: kernel_rows
// KC: kernel_cols
// ST: stride. We use the same stride for both directions.
// PT: padding
// TH: number of threads
#define BM_AvgPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL)            \
  static void                                                                  \
      BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
          int iters) {                                                         \
    BM_AvgPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL);              \
  }                                                                            \
  BENCHMARK(                                                                   \
      BM_AvgPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)

// Labels are taken from the 2014-July-24 version of Imagenet.
BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "avgpool0_VALID");
BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "avgpool1_VALID");
BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "avgpool4_VALID");
BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "avgpool10_VALID");
BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "avgpool0_SAME");
BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "avgpool1_SAME");
BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "avgpool4_SAME");
BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "avgpool10_SAME");
BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "avgpool0_VALID");
BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "avgpool1_VALID");
BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "avgpool4_VALID");
BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "avgpool10_VALID");
BM_AvgPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "avgpool0_SAME");
BM_AvgPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "avgpool1_SAME");
BM_AvgPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "avgpool4_SAME");
BM_AvgPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "avgpool10_SAME");

static void BM_AvgPoolBk(int iters, int batch_size, int rows, int cols,
                         int depth, int kernel_rows, int kernel_cols,
                         int stride, Padding padding, int num_threads,
                         const string& label) {
  tensorflow::testing::StopTiming();
  std::unique_ptr<Device> device(
      DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));

  thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
  EigenThreadPoolWrapper wrapper(&threadpool);
  Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
  device->set_eigen_cpu_device(&eigen_cpu_device);

  gtl::InlinedVector<TensorValue, 4> inputs;

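  // AvgPoolGrad takes two inputs: the shape of the original forward input (as
  // an int32 vector) and the gradient with respect to the pooled output.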
    918   int64 out_height, out_width, pad_rows, pad_cols;
    919   TF_CHECK_OK(GetWindowedOutputSize(rows, kernel_rows, stride, padding,
    920                                     &out_height, &pad_rows));
    921   TF_CHECK_OK(GetWindowedOutputSize(cols, kernel_cols, stride, padding,
    922                                     &out_width, &pad_cols));
    923   TensorShape output_shape({batch_size, out_height, out_width, depth});
    924   TensorShape shape2({4});
    925   Tensor input_shape_tensor(DT_INT32, shape2);
    926   int32 input_dims[] = {batch_size, rows, cols, depth};
    927   for (int i = 0; i < 4; i++) {
    928     input_shape_tensor.flat<int32>()(i) = input_dims[i];
    929   }
    930   inputs.push_back({nullptr, &input_shape_tensor});
    931 
    932   Tensor output_backprop(DT_FLOAT, output_shape);
    933   test::FillIota<float>(&output_backprop, 11.0);
    934   inputs.push_back({nullptr, &output_backprop});
    935 
    936   // AvgPoolGrad op.
    937   NodeDef avgpool_grad_node_def;
    938   Status status = NodeDefBuilder("avgpool_grad_op", "AvgPoolGrad")
    939                       .Input(FakeInput())
    940                       .Input(FakeInput(DT_FLOAT))
    941                       .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
    942                       .Attr("strides", {1, stride, stride, 1})
    943                       .Attr("padding", padding == VALID ? "VALID" : "SAME")
    944                       .Finalize(&avgpool_grad_node_def);
    945   TF_CHECK_OK(status);
    946   std::unique_ptr<OpKernel> op(
    947       CreateOpKernel(DEVICE_CPU, nullptr, cpu_allocator(),
    948                      avgpool_grad_node_def, TF_GRAPH_DEF_VERSION, &status));
    949   TF_CHECK_OK(status);
    950   OpKernelContext::Params params;
    951   params.device = device.get();
    952   params.frame_iter = FrameAndIter(0, 0);
    953   params.inputs = &inputs;
    954   params.op_kernel = op.get();
    955   std::vector<AllocatorAttributes> attrs;
    956   test::SetOutputAttrs(&params, &attrs);
    957 
    958   std::unique_ptr<OpKernelContext> avgpool_context(
    959       new OpKernelContext(&params));
    960 
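  // One untimed warm-up call absorbs one-time costs such as kernel setup and
  // the first output allocation. Compute() allocates a fresh output on every
  // call, so each timed iteration first deletes the previous output tensor.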
    961   op->Compute(avgpool_context.get());
    962   tensorflow::testing::StartTiming();
    963   for (int i = 0; i < iters; ++i) {
    964     delete avgpool_context->release_output(0).tensor;
    965     op->Compute(avgpool_context.get());
    966   }
    967   tensorflow::testing::StopTiming();
    968   testing::ItemsProcessed(avgpool_context->mutable_output(0)->NumElements() *
    969                           iters);
    970   testing::SetLabel(label);
    971 }
    972 
    973 // BS: batch_size
    974 // IR: input_rows
    975 // IC: input_cols
    976 // ND: node_depth
    977 // KR: kernel_rows
    978 // KC: kernel_cols
    979 // ST: stride. We use the same stride for both directions.
    980 // PT: padding
981 // The resulting symbol is too long; the definition is split to fit 80 columns.
    982 #define BM_AvgPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL)               \
    983   static void                                                                    \
    984       BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
    985           int iters) {                                                           \
    986     BM_AvgPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL);              \
    987   }                                                                              \
    988   BENCHMARK(                                                                     \
    989       BM_AvgPoolBk_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
    990 
    991 // Shapes taken from the 2015/05/16 inception model
    992 BM_AvgPoolBkCPU(32, 35, 35, 192, 3, 3, 1, SAME, 1, "avgpool_grad0_SAME");
    993 BM_AvgPoolBkCPU(32, 35, 35, 256, 3, 3, 1, SAME, 1, "avgpool_grad1_SAME");
    994 BM_AvgPoolBkCPU(32, 17, 17, 768, 3, 3, 1, SAME, 1, "avgpool_grad2_SAME");
    995 BM_AvgPoolBkCPU(32, 17, 17, 1024, 3, 3, 1, SAME, 1, "avgpool_grad3_SAME");
    996 BM_AvgPoolBkCPU(32, 17, 17, 1152, 3, 3, 1, SAME, 1, "avgpool_grad4_SAME");
    997 BM_AvgPoolBkCPU(32, 17, 17, 1216, 3, 3, 1, SAME, 1, "avgpool_grad5_SAME");
    998 BM_AvgPoolBkCPU(32, 17, 17, 1248, 5, 5, 3, VALID, 1, "avgpool_grad6_VALID");
    999 BM_AvgPoolBkCPU(32, 8, 8, 1760, 3, 3, 1, SAME, 1, "avgpool_grad7_SAME");
   1000 BM_AvgPoolBkCPU(32, 8, 8, 2048, 8, 8, 1, VALID, 1, "avgpool_grad8_VALID");
   1001 
   1002 /*
   1003 MaxPooling Op
   1004 */
   1005 static void BM_MaxPool(int iters, int batch_size, int rows, int cols, int depth,
   1006                        int kernel_rows, int kernel_cols, int stride,
   1007                        Padding padding, int num_threads, const string& label) {
   1008   tensorflow::testing::StopTiming();
   1009   SessionOptions options;
   1010   options.config.set_intra_op_parallelism_threads(num_threads);
   1011 
   1012   std::unique_ptr<Device> device(
   1013       DeviceFactory::NewDevice("CPU", options, "/job:a/replica:0/task:0"));
   1014 
   1015   thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
   1016   EigenThreadPoolWrapper wrapper(&threadpool);
   1017   Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
   1018   device->set_eigen_cpu_device(&eigen_cpu_device);
   1019 
   1020   gtl::InlinedVector<TensorValue, 4> inputs;
   1021   TensorShape shape1({batch_size, rows, cols, depth});
   1022   Tensor input1(DT_FLOAT, shape1);
   1023   test::FillIota<float>(&input1, 1.0);
   1024   inputs.push_back({nullptr, &input1});
   1025 
   1026   // MaxPooling op.
   1027   NodeDef maxpool_node_def;
   1028   CHECK_EQ(kernel_rows, kernel_cols);
   1029   Status status = NodeDefBuilder("maxpool_op", "MaxPool")
   1030                       .Input(FakeInput())
   1031                       .Attr("ksize", {1, kernel_rows, kernel_cols, 1})
   1032                       .Attr("strides", {1, stride, stride, 1})
   1033                       .Attr("padding", padding == VALID ? "VALID" : "SAME")
   1034                       .Finalize(&maxpool_node_def);
   1035   TF_CHECK_OK(status);
   1036   std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
   1037                                               cpu_allocator(), maxpool_node_def,
   1038                                               TF_GRAPH_DEF_VERSION, &status));
   1039   TF_CHECK_OK(status);
   1040   OpKernelContext::Params params;
   1041   params.device = device.get();
   1042   params.frame_iter = FrameAndIter(0, 0);
   1043   params.inputs = &inputs;
   1044   params.op_kernel = op.get();
   1045   std::vector<AllocatorAttributes> attrs;
   1046   test::SetOutputAttrs(&params, &attrs);
   1047 
   1048   std::unique_ptr<OpKernelContext> maxpool_context(
   1049       new OpKernelContext(&params));
   1050 
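  // Warm-up Compute() outside the timed region, as in BM_AvgPoolBk above.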
   1051   op->Compute(maxpool_context.get());
   1052   tensorflow::testing::StartTiming();
   1053   for (int i = 0; i < iters; ++i) {
   1054     delete maxpool_context->release_output(0).tensor;
   1055     op->Compute(maxpool_context.get());
   1056   }
   1057   tensorflow::testing::StopTiming();
   1058   testing::ItemsProcessed(maxpool_context->mutable_output(0)->NumElements() *
   1059                           iters);
   1060   testing::SetLabel(label);
   1061 }
   1062 
   1063 // BS: batch_size
   1064 // IR: input_rows
   1065 // IC: input_cols
   1066 // ND: node_depth
   1067 // KR: kernel_rows
   1068 // KC: kernel_cols
   1069 // ST: stride. We use the same stride for both directions.
   1070 // PT: padding
   1071 #define BM_MaxPoolFwdCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL)            \
   1072   static void                                                                  \
   1073       BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH( \
   1074           int iters) {                                                         \
   1075     BM_MaxPool(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL);              \
   1076   }                                                                            \
   1077   BENCHMARK(                                                                   \
   1078       BM_MaxPool_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_##PT##_##TH)
   1079 
1080 // Labels are taken from the 2014-July-24 version of ImageNet.
   1081 /* TODO XXX
   1082 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 1, "maxpool0_VALID");
   1083 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 1, "maxpool1_VALID");
   1084 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 1, "maxpool4_VALID");
   1085 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 1, "maxpool10_VALID");
   1086 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 1, "maxpool0_SAME");
   1087 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 1, "maxpool1_SAME");
   1088 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 1, "maxpool4_SAME");
   1089 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 1, "maxpool10_SAME");
   1090 */
   1091 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, VALID, 4, "maxpool0_VALID");
   1092 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, VALID, 4, "maxpool1_VALID");
   1093 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, VALID, 4, "maxpool4_VALID");
   1094 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, VALID, 4, "maxpool10_VALID");
   1095 BM_MaxPoolFwdCPU(32, 112, 112, 64, 3, 3, 2, SAME, 4, "maxpool0_SAME");
   1096 BM_MaxPoolFwdCPU(32, 56, 56, 192, 3, 3, 2, SAME, 4, "maxpool1_SAME");
   1097 BM_MaxPoolFwdCPU(32, 28, 28, 352, 3, 3, 2, SAME, 4, "maxpool4_SAME");
   1098 BM_MaxPoolFwdCPU(32, 14, 14, 576, 3, 3, 2, SAME, 4, "maxpool10_SAME");
   1099 
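// Unlike the kernel-level benchmarks above, this one builds a small graph and
// runs it through test::Benchmark, which handles session setup and lets the
// same code target either CPU or GPU.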
   1100 static void BM_MaxPoolBk(int iters, int batch_size, int rows, int cols,
   1101                          int depth, int kernel_rows, int kernel_cols,
   1102                          int stride, Padding padding, int num_threads,
   1103                          bool use_gpu, const string& label) {
   1104   auto root = Scope::NewRootScope().ExitOnError();
   1105 
   1106   int64 out_height, out_width, pad_rows, pad_cols;
   1107   TF_CHECK_OK(GetWindowedOutputSize(rows, kernel_rows, stride, padding,
   1108                                     &out_height, &pad_rows));
   1109   TF_CHECK_OK(GetWindowedOutputSize(cols, kernel_cols, stride, padding,
   1110                                     &out_width, &pad_cols));
   1111 
   1112   Tensor input_data(DT_FLOAT, TensorShape({batch_size, rows, cols, depth}));
   1113   input_data.flat<float>().setRandom();
   1114 
   1115   Tensor output_data(DT_FLOAT,
   1116                      TensorShape({batch_size, out_height, out_width, depth}));
   1117   output_data.flat<float>().setRandom();
   1118 
   1119   Tensor output_diff(DT_FLOAT,
   1120                      TensorShape({batch_size, out_height, out_width, depth}));
   1121   output_diff.flat<float>().setRandom();
   1122 
   1123   CHECK_EQ(kernel_rows, kernel_cols);
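  // MaxPoolGrad consumes the original input, the pooled output, and the
  // gradient with respect to that output; each incoming gradient value is
  // routed back to the input position that produced the corresponding max.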
   1124   ops::internal::MaxPoolGrad(root, input_data, output_data, output_diff,
   1125                              {1, kernel_rows, kernel_cols, 1} /* ksize */,
   1126                              {1, stride, stride, 1} /* stride */,
   1127                              padding == VALID ? "VALID" : "SAME");
   1128   TF_CHECK_OK(root.status());
   1129   Graph* g = new Graph(OpRegistry::Global());
   1130   TF_CHECK_OK(root.ToGraph(g));
   1131   string device = use_gpu ? "gpu" : "cpu";
   1132   testing::UseRealTime();
   1133   test::Benchmark(device, g).Run(iters);
   1134 
1135   testing::ItemsProcessed(static_cast<int64>(batch_size) * rows * cols *
                               depth * iters);
   1136   testing::SetLabel(label);
   1137 }
   1138 
   1139 // BS: batch_size
   1140 // IR: input_rows
   1141 // IC: input_cols
   1142 // ND: node_depth
   1143 // KR: kernel_rows
   1144 // KC: kernel_cols
   1145 // ST: stride. We use the same stride for both directions.
   1146 // PT: padding
1147 // The resulting symbols are too long; each definition is split to fit 80 columns.
1148 // clang-format off
   1149 #define BM_MaxPoolBkGPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL)             \
   1150   static void                                                                  \
   1151       BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_       \
   1152           ##PT##_##TH(                                                         \
   1153           int iters) {                                                         \
   1154     BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, true, LABEL);      \
   1155   }                                                                            \
   1156   BENCHMARK(                                                                   \
   1157       BM_MaxPoolBk_GPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_       \
   1158           ##PT##_##TH)                                                         \
   1159 
   1160 #define BM_MaxPoolBkCPU(BS, IR, IC, ND, KR, KC, ST, PT, TH, LABEL)             \
   1161   static void                                                                  \
   1162       BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_       \
   1163           ##PT##_##TH(                                                         \
   1164           int iters) {                                                         \
   1165     BM_MaxPoolBk(iters, BS, IR, IC, ND, KR, KC, ST, PT, TH, false, LABEL);     \
   1166   }                                                                            \
   1167   BENCHMARK(                                                                   \
   1168       BM_MaxPoolBk_CPU_##BS##_##IR##_##IC##_##ND##_##KR##_##KC##_##ST##_       \
   1169           ##PT##_##TH)
   1170 // clang-format on
   1171 
   1172 // Shapes taken from the 2015/05/16 inception model
   1173 BM_MaxPoolBkGPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID");
   1174 BM_MaxPoolBkGPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID");
   1175 BM_MaxPoolBkGPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID");
   1176 BM_MaxPoolBkGPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID");
   1177 BM_MaxPoolBkGPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID");
   1178 
   1179 BM_MaxPoolBkCPU(32, 147, 147, 64, 3, 3, 2, VALID, 1, "maxpool_grad0_VALID");
   1180 BM_MaxPoolBkCPU(32, 71, 71, 192, 3, 3, 2, VALID, 1, "maxpool_grad1_VALID");
   1181 BM_MaxPoolBkCPU(32, 35, 35, 288, 3, 3, 2, VALID, 1, "maxpool_grad2_VALID");
   1182 BM_MaxPoolBkCPU(32, 17, 17, 1248, 3, 3, 2, VALID, 1, "maxpool_grad3_VALID");
   1183 BM_MaxPoolBkCPU(32, 8, 8, 2048, 3, 3, 2, VALID, 1, "maxpool_grad4_VALID");
   1184 
1185 /*
1186 Relu Op
1188 */
   1189 static void BM_ReluFloat(int iters, int batch_size, int rows, int cols,
   1190                          int depth, int num_threads, const string& label) {
   1191   tensorflow::testing::StopTiming();
   1192   std::unique_ptr<Device> device(
   1193       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
   1194 
   1195   thread::ThreadPool threadpool(Env::Default(), "test", num_threads);
   1196   EigenThreadPoolWrapper wrapper(&threadpool);
   1197   Eigen::ThreadPoolDevice eigen_cpu_device(&wrapper, num_threads);
   1198   device->set_eigen_cpu_device(&eigen_cpu_device);
   1199 
   1200   gtl::InlinedVector<TensorValue, 4> inputs;
   1201   TensorShape shape1({batch_size, rows, cols, depth});
   1202   Tensor input1(DT_FLOAT, shape1);
   1203   test::FillIota<float>(&input1, 1.0);
   1204   inputs.push_back({nullptr, &input1});
   1205 
1206   // Relu op.
   1207   NodeDef relu_node_def;
   1208   Status status = NodeDefBuilder("relu_op", "Relu")
   1209                       .Input(FakeInput(DT_FLOAT))
   1210                       .Finalize(&relu_node_def);
   1211   TF_CHECK_OK(status);
   1212   std::unique_ptr<OpKernel> op(CreateOpKernel(DEVICE_CPU, device.get(),
   1213                                               cpu_allocator(), relu_node_def,
   1214                                               TF_GRAPH_DEF_VERSION, &status));
   1215   TF_CHECK_OK(status);
   1216   OpKernelContext::Params params;
   1217   params.device = device.get();
   1218   params.frame_iter = FrameAndIter(0, 0);
   1219   params.inputs = &inputs;
   1220   params.op_kernel = op.get();
   1221   std::vector<AllocatorAttributes> attrs;
   1222   test::SetOutputAttrs(&params, &attrs);
   1223 
   1224   std::unique_ptr<OpKernelContext> relu_context(new OpKernelContext(&params));
   1225 
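  // As above: one untimed warm-up Compute(), then timed iterations that free
  // the previous output before recomputing.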
   1226   op->Compute(relu_context.get());
   1227   tensorflow::testing::StartTiming();
   1228   for (int i = 0; i < iters; ++i) {
   1229     delete relu_context->release_output(0).tensor;
   1230     op->Compute(relu_context.get());
   1231   }
   1232   tensorflow::testing::StopTiming();
   1233   testing::ItemsProcessed(relu_context->mutable_output(0)->NumElements() *
   1234                           iters);
   1235   testing::SetLabel(label);
   1236 }
   1237 
   1238 // BS: batch_size
   1239 // IR: input_rows
   1240 // IC: input_cols
   1241 // ND: node_depth
   1242 #define BM_Relu(BS, IR, IC, ND, TH, LABEL)                               \
   1243   static void BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH(int iters) { \
   1244     BM_ReluFloat(iters, BS, IR, IC, ND, TH, LABEL);                      \
   1245   }                                                                      \
   1246   BENCHMARK(BM_ReluFloat_##BS##_##IR##_##IC##_##ND##_##TH)
   1247 
   1248 BM_Relu(32, 112, 112, 64, 1, "relu0");
   1249 BM_Relu(32, 56, 56, 192, 1, "relu1");
   1250 BM_Relu(32, 28, 28, 352, 1, "relu4");
   1251 BM_Relu(32, 14, 14, 576, 1, "relu10");
   1252 BM_Relu(32, 112, 112, 64, 4, "relu0");
   1253 BM_Relu(32, 56, 56, 192, 4, "relu1");
   1254 BM_Relu(32, 28, 28, 352, 4, "relu4");
   1255 BM_Relu(32, 14, 14, 576, 4, "relu10");
   1256 
   1257 static void BM_ImageNetSoftmaxFwd(int iters, int batch_size, int node_depth,
   1258                                   int num_threads, bool use_gpu,
   1259                                   const string& label) {
   1260   auto root = Scope::NewRootScope().ExitOnError();
   1261 
   1262   Tensor input(DT_FLOAT, TensorShape({batch_size, node_depth}));
   1263   input.flat<float>().setRandom();
   1264 
   1265   auto softmax = ops::Softmax(root, input);
   1266 
   1267   TF_CHECK_OK(root.status());
   1268   Graph* g = new Graph(OpRegistry::Global());
   1269   TF_CHECK_OK(root.ToGraph(g));
   1270   string device = use_gpu ? "gpu" : "cpu";
   1271   SessionOptions opts;
   1272   opts.config.set_inter_op_parallelism_threads(1);
   1273   opts.config.set_intra_op_parallelism_threads(num_threads);
   1274   opts.config.set_use_per_session_threads(true);
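  // Optimizer level L0 disables graph rewrites such as constant folding, so
  // the benchmark measures the Softmax op itself rather than a precomputed
  // result.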
   1275   opts.config.mutable_graph_options()
   1276       ->mutable_optimizer_options()
   1277       ->set_opt_level(OptimizerOptions_Level_L0);
   1278   testing::UseRealTime();
   1279   test::Benchmark(device, g, &opts).Run(iters);
1280   testing::ItemsProcessed(static_cast<int64>(batch_size) * node_depth *
                               iters);
   1281   testing::SetLabel(label);
   1282 }
   1283 
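// Note: this macro deliberately shares its name with the function above. The
// preprocessor does not re-expand a macro name inside its own expansion, so
// the call in the macro body resolves to the function.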
   1284 #define BM_ImageNetSoftmaxFwd(BATCH_SIZE, NODE_DEPTH, TH, GPU, LABEL)     \
   1285   static void                                                             \
   1286       BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH##_##GPU(   \
   1287           int iters) {                                                    \
   1288     BM_ImageNetSoftmaxFwd(iters, BATCH_SIZE, NODE_DEPTH, TH, GPU, LABEL); \
   1289   }                                                                       \
   1290   BENCHMARK(BM_ImageNetSoftmaxFwd_##BATCH_SIZE##_##NODE_DEPTH##_##TH##_##GPU)
   1291 
1292 // Labels are taken from the 2014-July-24 version of ImageNet.
   1293 BM_ImageNetSoftmaxFwd(32, 1008, 1, false, "softmax32");
   1294 BM_ImageNetSoftmaxFwd(128, 1008, 1, false, "softmax128");
   1295 BM_ImageNetSoftmaxFwd(32, 1008, 4, false, "softmax32");
   1296 BM_ImageNetSoftmaxFwd(128, 1008, 4, false, "softmax128");
   1297 BM_ImageNetSoftmaxFwd(32, 1008, 1, true, "softmax32");
   1298 BM_ImageNetSoftmaxFwd(128, 1008, 1, true, "softmax128");
1299 BM_ImageNetSoftmaxFwd(8192, 1024, 1, true, "softmax8192x1024");
1300 BM_ImageNetSoftmaxFwd(8192, 32768, 1, true, "softmax8192x32768");
   1301 
   1302 static void BM_TopK(int iters, int rows, int cols, int k, int num_threads,
   1303                     bool use_gpu, const string& label) {
   1304   testing::StopTiming();
   1305   auto root = Scope::NewRootScope().ExitOnError();
   1306 
   1307   Tensor input(DT_FLOAT, TensorShape({rows, cols}));
   1308   input.flat<float>().setRandom();
   1309 
   1310   Tensor input_k(DT_INT32, TensorShape({}));
   1311   input_k.scalar<int32>()() = k;
   1312 
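  // Sorted(true) requests the k largest values per row in descending order;
  // k is passed as a scalar tensor rather than an attr, so in principle it
  // could vary at runtime.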
   1313   auto top_k = ops::TopK(root, input, input_k, ops::TopK::Sorted(true));
   1314 
   1315   TF_CHECK_OK(root.status());
   1316   Graph* g = new Graph(OpRegistry::Global());
   1317   TF_CHECK_OK(root.ToGraph(g));
   1318   string device = use_gpu ? "gpu" : "cpu";
   1319   SessionOptions opts;
   1320   opts.config.set_inter_op_parallelism_threads(1);
   1321   opts.config.set_intra_op_parallelism_threads(num_threads);
   1322   opts.config.set_use_per_session_threads(true);
   1323   opts.config.mutable_graph_options()
   1324       ->mutable_optimizer_options()
   1325       ->set_opt_level(OptimizerOptions_Level_L0);
   1326   testing::UseRealTime();
   1327   testing::StartTiming();
   1328   test::Benchmark(device, g, &opts).Run(iters);
1329   testing::ItemsProcessed(static_cast<int64>(rows) * cols * iters);
   1330   testing::SetLabel(label);
   1331 }
   1332 
   1333 // IR: input_rows
   1334 // IC: input_cols
   1335 // IK: k
   1336 // TH: number of threads
   1337 #define BM_TopKGPU(IR, IC, IK, TH, LABEL)                        \
   1338   static void BM_TopK_GPU_##IR##_##IC##_##IK##_##TH(int iters) { \
   1339     BM_TopK(iters, IR, IC, IK, TH, true, LABEL);                 \
   1340   }                                                              \
   1341   BENCHMARK(BM_TopK_GPU_##IR##_##IC##_##IK##_##TH)
   1342 
   1343 #define BM_TopKCPU(IR, IC, IK, TH, LABEL)                        \
   1344   static void BM_TopK_CPU_##IR##_##IC##_##IK##_##TH(int iters) { \
   1345     BM_TopK(iters, IR, IC, IK, TH, false, LABEL);                \
   1346   }                                                              \
   1347   BENCHMARK(BM_TopK_CPU_##IR##_##IC##_##IK##_##TH)
   1348 
   1351 BM_TopKCPU(1, 100, 1, 16, "topk_r_1_c_100_k_1_th_16");
   1352 BM_TopKCPU(1, 100, 2, 16, "topk_r_1_c_100_k_2_th_16");
   1353 BM_TopKCPU(1, 100, 10, 16, "topk_r_1_c_100_k_10_th_16");
   1354 BM_TopKCPU(1, 100, 50, 16, "topk_r_1_c_100_k_50_th_16");
   1355 BM_TopKCPU(1, 100, 100, 16, "topk_r_1_c_100_k_100_th_16");
   1356 BM_TopKCPU(32, 100, 1, 16, "topk_r_32_c_100_k_1_th_16");
   1357 BM_TopKCPU(32, 100, 2, 16, "topk_r_32_c_100_k_2_th_16");
   1358 BM_TopKCPU(32, 100, 10, 16, "topk_r_32_c_100_k_10_th_16");
   1359 BM_TopKCPU(32, 100, 50, 16, "topk_r_32_c_100_k_50_th_16");
   1360 BM_TopKCPU(32, 100, 100, 16, "topk_r_32_c_100_k_100_th_16");
   1361 BM_TopKCPU(128, 100, 1, 16, "topk_r_128_c_100_k_1_th_16");
   1362 BM_TopKCPU(128, 100, 2, 16, "topk_r_128_c_100_k_2_th_16");
   1363 BM_TopKCPU(128, 100, 10, 16, "topk_r_128_c_100_k_10_th_16");
   1364 BM_TopKCPU(128, 100, 50, 16, "topk_r_128_c_100_k_50_th_16");
   1365 BM_TopKCPU(128, 100, 100, 16, "topk_r_128_c_100_k_100_th_16");
   1366 BM_TopKCPU(128, 1000, 1, 16, "topk_r_128_c_1000_k_1_th_16");
   1367 BM_TopKCPU(128, 1000, 2, 16, "topk_r_128_c_1000_k_2_th_16");
   1368 BM_TopKCPU(128, 1000, 10, 16, "topk_r_128_c_1000_k_10_th_16");
   1369 BM_TopKCPU(128, 1000, 50, 16, "topk_r_128_c_1000_k_50_th_16");
   1370 BM_TopKCPU(128, 1000, 100, 16, "topk_r_128_c_1000_k_100_th_16");
   1371 BM_TopKCPU(128, 1000, 500, 16, "topk_r_128_c_1000_k_500_th_16");
   1372 BM_TopKCPU(128, 1000, 1000, 16, "topk_r_128_c_1000_k_1000_th_16");
   1373 
1374 // From the NMT codebase:
   1375 //   batch_sizes: 16, 128
   1376 //   vocab_sizes: 10000 for small dataset, 35000 for large.
   1377 //   beam_widths: 1, 2, 5, 10
   1378 BM_TopKCPU(16, 10000, 10000, 16, "topk_nmt_r_16_c_10000_k_10000_th_16");
   1379 BM_TopKCPU(16, 20000, 20000, 16, "topk_nmt_r_16_c_20000_k_20000_th_16");
   1380 BM_TopKCPU(16, 50000, 50000, 16, "topk_nmt_r_16_c_50000_k_50000_th_16");
   1381 BM_TopKCPU(16, 100000, 100000, 16, "topk_nmt_r_16_c_100000_k_100000_th_16");
   1382 BM_TopKCPU(16, 35000, 35000, 16, "topk_nmt_r_16_c_35000_k_35000_th_16");
   1383 BM_TopKCPU(16, 70000, 70000, 16, "topk_nmt_r_16_c_70000_k_70000_th_16");
   1384 BM_TopKCPU(16, 175000, 175000, 16, "topk_nmt_r_16_c_175000_k_175000_th_16");
   1385 BM_TopKCPU(16, 350000, 350000, 16, "topk_nmt_r_16_c_350000_k_350000_th_16");
   1386 BM_TopKCPU(128, 10000, 10000, 16, "topk_nmt_r_128_c_10000_k_10000_th_16");
   1387 BM_TopKCPU(128, 20000, 20000, 16, "topk_nmt_r_128_c_20000_k_20000_th_16");
   1388 BM_TopKCPU(128, 50000, 50000, 16, "topk_nmt_r_128_c_50000_k_50000_th_16");
   1389 BM_TopKCPU(128, 100000, 100000, 16, "topk_nmt_r_128_c_100000_k_100000_th_16");
   1390 BM_TopKCPU(128, 35000, 35000, 16, "topk_nmt_r_128_c_35000_k_35000_th_16");
   1391 BM_TopKCPU(128, 70000, 70000, 16, "topk_nmt_r_128_c_70000_k_70000_th_16");
   1392 BM_TopKCPU(128, 175000, 175000, 16, "topk_nmt_r_128_c_175000_k_175000_th_16");
   1393 BM_TopKCPU(128, 350000, 350000, 16, "topk_nmt_r_128_c_350000_k_350000_th_16");
   1394 
   1395 }  // namespace tensorflow
   1396