/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include <cstring>
#include <functional>
#include <memory>
#include <vector>

#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/types.pb.h"
#include "tensorflow/core/graph/node_builder.h"
#include "tensorflow/core/graph/testlib.h"
#include "tensorflow/core/kernels/ops_testutil.h"
#include "tensorflow/core/kernels/ops_util.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/prefetch.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"

namespace tensorflow {
namespace {

// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x dim2
// in size, and concat them together along "concat_dimension".
template <typename T>
static void ConcatHelper(int iters, int concat_dimension, int dim2) {
  testing::StopTiming();
  Graph* g = new Graph(OpRegistry::Global());

  DataType dt = DataTypeToEnum<T>::v();
  const int kDim1 = 100;
  Tensor concat_dim(DT_INT32, TensorShape({}));
  concat_dim.scalar<int32>()() = concat_dimension;
  Tensor in0(dt, TensorShape({kDim1, dim2}));
  in0.flat<T>().setRandom();
  Tensor in1(dt, TensorShape({kDim1, dim2}));
  in1.flat<T>().setRandom();

  Node* node;
  TF_CHECK_OK(
      NodeBuilder(g->NewName("n"), "Concat")
          .Input(test::graph::Constant(g, concat_dim))
          .Input({test::graph::Constant(g, in0), test::graph::Constant(g, in1)})
          .Attr("N", 2)
          .Attr("T", dt)
          .Finalize(g, &node));

  testing::BytesProcessed(static_cast<int64>(iters) *
                          ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
  testing::StartTiming();
  test::Benchmark("cpu", g).Run(iters);
  testing::UseRealTime();
}

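// The Arg() values passed to the BENCHMARK macros below select dim2, the
// length of the second dimension of each input; kDim1 is fixed at 100 inside
// ConcatHelper.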
static void BM_ConcatDim0Float(int iters, int dim2) {
  ConcatHelper<float>(iters, 0, dim2);
}

static void BM_ConcatDim1Float(int iters, int dim2) {
  ConcatHelper<float>(iters, 1, dim2);
}

BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);

static void BM_ConcatDim1int16(int iters, int dim2) {
  ConcatHelper<int16>(iters, 1, dim2);
}

static void BM_ConcatDim1bfloat16(int iters, int dim2) {
  ConcatHelper<bfloat16>(iters, 1, dim2);
}

BENCHMARK(BM_ConcatDim1int16)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1bfloat16)->Arg(1000)->Arg(100000)->Arg(1000000);

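// Benchmarks concatenation of many inputs: kNumInputs = 64 tensors of shape
// [kDim1 = 40000, dim2] are fed to a single Concat node, exercising the
// multi-input path of the kernel.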
template <typename T>
static void ConcatManyHelper(int iters, int concat_dimension, int dim2) {
  testing::StopTiming();
  Graph* g = new Graph(OpRegistry::Global());

  DataType dt = DataTypeToEnum<T>::v();
  const int kDim1 = 40000;
  const int kNumInputs = 64;
  Tensor concat_dim(DT_INT32, TensorShape({}));
  concat_dim.scalar<int32>()() = concat_dimension;
  std::vector<NodeBuilder::NodeOut> inputs;
  inputs.reserve(kNumInputs);
  for (int i = 0; i < kNumInputs; ++i) {
    Tensor in(dt, TensorShape({kDim1, dim2}));
    in.flat<T>().setRandom();
    inputs.push_back(test::graph::Constant(g, in));
  }

  Node* node;
  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Concat")
                  .Input(test::graph::Constant(g, concat_dim))
                  .Input(inputs)
                  .Attr("N", kNumInputs)
                  .Attr("T", dt)
                  .Finalize(g, &node));
  testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
                          kNumInputs * sizeof(T));
  testing::StartTiming();
  test::Benchmark("cpu", g).Run(iters);
  testing::UseRealTime();
}

static void BM_ConcatManyDim1bfloat16(int iters, int dim2) {
  ConcatManyHelper<bfloat16>(iters, 1, dim2);
}

BENCHMARK(BM_ConcatManyDim1bfloat16)->Arg(18)->Arg(34)->Arg(60);

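// Baseline that bypasses the Concat op entirely: allocate an output buffer
// and memcpy the two flat inputs into it back to back. Note that
// concat_dimension is unused here, so the Dim0 and Dim1 variants below do
// identical work; they exist only for comparison against the op-based
// benchmarks above.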
static void MemcpyAlternativeHelper(int iters, int concat_dimension, int dim2) {
  testing::StopTiming();

  const int kDim1 = 100;
  std::vector<float> data1(kDim1 * dim2, 1.0f);
  std::vector<float> data2(kDim1 * dim2, 2.0f);

  testing::BytesProcessed(static_cast<int64>(iters) *
                          ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(float));
  testing::StartTiming();
  while (iters-- > 0) {
    const size_t n0 = data1.size();
    const size_t n1 = data2.size();
    float* result = new float[n0 + n1];
    memcpy(&result[0], &data1[0], n0 * sizeof(float));
    memcpy(&result[n0], &data2[0], n1 * sizeof(float));
    delete[] result;
  }
}

static void BM_MemcpyAlternativeDim0(int iters, int dim2) {
  MemcpyAlternativeHelper(iters, 0, dim2);
}

static void BM_MemcpyAlternativeDim1(int iters, int dim2) {
  MemcpyAlternativeHelper(iters, 1, dim2);
}

BENCHMARK(BM_MemcpyAlternativeDim0)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_MemcpyAlternativeDim1)->Arg(1000)->Arg(100000)->Arg(1000000);

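// Hand-rolled alternatives to a dimension-1 Concat of kNumCopies = 64
// bfloat16 inputs of shape [kDim1, dim2]. EigenMap is used only to fill the
// shared source buffer with random bfloat16 values.
//
// MemcpyManyAlternative1 walks the inputs one at a time, copying each row of
// an input into its strided column block of the output and prefetching that
// input's next row before each copy.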
typedef Eigen::TensorMap<Eigen::Tensor<bfloat16, 1, Eigen::RowMajor>,
                         Eigen::Unaligned>
    EigenMap;

static void MemcpyManyAlternative1(int iters, int dim2) {
  testing::StopTiming();

  const int kDim1 = 40000;
  const int kNumCopies = 64;
  const int size = kDim1 * dim2 * kNumCopies;
  bfloat16* data = new bfloat16[size];
  EigenMap map(data, size);
  map.setRandom();

  testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
                          kNumCopies * sizeof(bfloat16));
  testing::StartTiming();
  while (iters-- > 0) {
    std::vector<bfloat16*> inputs(kNumCopies);
    for (int i = 0; i < kNumCopies; ++i) {
      inputs[i] = &data[i * kDim1 * dim2];
    }
    bfloat16* result = new bfloat16[size];
    for (int j = 0; j < kNumCopies; ++j) {
      bfloat16* output = &result[j * dim2];
      for (int i = 0; i < kDim1; ++i) {
        if (i + 1 < kDim1) {
          port::prefetch<port::PREFETCH_HINT_T0>(inputs[j] + dim2);
        }
        memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
        inputs[j] += dim2;
        output += dim2 * kNumCopies;
      }
    }
    delete[] result;
  }
  delete[] data;
}

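// Produces the same concatenated layout as MemcpyManyAlternative1, but walks
// the output row by row: for each row, copy that row from every input in
// turn, prefetching the next input's row before each memcpy.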
static void MemcpyManyAlternative2(int iters, int dim2) {
  testing::StopTiming();

  const int kDim1 = 40000;
  const int kNumCopies = 64;
  const int size = kDim1 * dim2 * kNumCopies;
  bfloat16* data = new bfloat16[size];
  EigenMap map(data, size);
  map.setRandom();

  testing::BytesProcessed(static_cast<int64>(iters) * kDim1 * dim2 *
                          kNumCopies * sizeof(bfloat16));
  testing::StartTiming();
  std::vector<bfloat16*> inputs(kNumCopies);
  while (iters-- > 0) {
    bfloat16* result = new bfloat16[size];
    for (int i = 0; i < kNumCopies; ++i) {
      inputs[i] = &data[i * kDim1 * dim2];
    }
    bfloat16* output = result;
    for (int i = 0; i < kDim1; ++i) {
      for (int j = 0; j < kNumCopies; ++j) {
        if (j + 1 < kNumCopies) {
          port::prefetch<port::PREFETCH_HINT_T0>(inputs[j + 1]);
        }
        memcpy(output, inputs[j], dim2 * sizeof(bfloat16));
        inputs[j] += dim2;
        output += dim2;
      }
    }
    delete[] result;
  }
  delete[] data;
}

BENCHMARK(MemcpyManyAlternative1)
    ->Arg(16)
    ->Arg(17)
    ->Arg(18)
    ->Arg(32)
    ->Arg(33)
    ->Arg(34)
    ->Arg(60)
    ->Arg(64)
    ->Arg(65);

BENCHMARK(MemcpyManyAlternative2)
    ->Arg(16)
    ->Arg(17)
    ->Arg(18)
    ->Arg(32)
    ->Arg(33)
    ->Arg(34)
    ->Arg(60)
    ->Arg(64)
    ->Arg(65);

}  // namespace
}  // namespace tensorflow