Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #include <functional>
     17 #include <vector>
     18 
     19 #include "tensorflow/core/common_runtime/device.h"
     20 #include "tensorflow/core/common_runtime/device_factory.h"
     21 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
     22 #include "tensorflow/core/framework/allocator.h"
     23 #include "tensorflow/core/framework/fake_input.h"
     24 #include "tensorflow/core/framework/node_def_builder.h"
     25 #include "tensorflow/core/framework/op_kernel.h"
     26 #include "tensorflow/core/framework/tensor.h"
     27 #include "tensorflow/core/framework/tensor_testutil.h"
     28 #include "tensorflow/core/framework/types.h"
     29 #include "tensorflow/core/framework/types.pb.h"
     30 #include "tensorflow/core/graph/node_builder.h"
     31 #include "tensorflow/core/graph/testlib.h"
     32 #include "tensorflow/core/kernels/ops_testutil.h"
     33 #include "tensorflow/core/kernels/ops_util.h"
     34 #include "tensorflow/core/platform/test.h"
     35 #include "tensorflow/core/platform/test_benchmark.h"
     36 #include "tensorflow/core/public/session_options.h"
     37 #include "tensorflow/core/public/version.h"
     38 
     39 namespace tensorflow {
     40 
     41 template <typename Index>
     42 static void BM_SegmentReduction(int iters, const string& reduction,
     43                                 Index num_rows, Index num_cols,
     44                                 Index segment_size) {
     45   testing::StopTiming();
     46   std::unique_ptr<Device> device(
     47       DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
     48 
     49   // Create inputs
     50   gtl::InlinedVector<TensorValue, 4> reduction_inputs;
     51   TensorShape shape1({num_rows, num_cols});
     52   Tensor input1(DT_FLOAT, shape1);
     53   reduction_inputs.push_back({nullptr, &input1});
     54 
     55   TensorShape shape2({num_rows});
     56   Tensor input2(DataTypeToEnum<Index>::v(), shape2);
     57   test::FillFn<Index>(&input2, [&num_rows, &segment_size](Index i) -> Index {
     58     return std::min(i / segment_size, num_rows - 1);
     59   });
     60   reduction_inputs.push_back({nullptr, &input2});
     61 
     62   NodeDef reduction_node_def;
     63   TF_CHECK_OK(NodeDefBuilder(reduction, reduction)
     64                   .Input(FakeInput(DT_FLOAT))
     65                   .Input(FakeInput(DataTypeToEnum<Index>::v()))
     66                   .Finalize(&reduction_node_def));
     67   Status status;
     68   std::unique_ptr<OpKernel> reduction_op(
     69       CreateOpKernel(DEVICE_CPU, device.get(), cpu_allocator(),
     70                      reduction_node_def, TF_GRAPH_DEF_VERSION, &status));
     71   OpKernelContext::Params params;
     72   params.device = device.get();
     73   params.frame_iter = FrameAndIter(0, 0);
     74   params.inputs = &reduction_inputs;
     75   params.op_kernel = reduction_op.get();
     76   std::vector<AllocatorAttributes> attrs;
     77   test::SetOutputAttrs(&params, &attrs);
     78 
     79   std::unique_ptr<OpKernelContext> reduction_context(
     80       new OpKernelContext(&params));
     81 
     82   reduction_op->Compute(reduction_context.get());
     83   TF_CHECK_OK(reduction_context->status());
     84   testing::StartTiming();
     85   for (int i = 0; i < iters; ++i) {
     86     delete reduction_context->release_output(0).tensor;
     87     reduction_op->Compute(reduction_context.get());
     88   }
     89   int64 bytes_per_iter =
     90       static_cast<int64>(num_rows * num_cols * sizeof(float));
     91   testing::BytesProcessed(bytes_per_iter * iters);
     92 }
     93 
     94 #define BM_Reduce(O, R, C, S)                                      \
     95   static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \
     96     BM_SegmentReduction<int32>(iters, #O, R, C, S);                \
     97   }                                                                \
     98   static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \
     99     BM_SegmentReduction<int64>(iters, #O, R, C, S);                \
    100   }                                                                \
    101   BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32);              \
    102   BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64);
    103 
    104 #define BM_Reduce_Arg(R, C, S)    \
    105   BM_Reduce(SegmentSum, R, C, S); \
    106   BM_Reduce(SegmentMean, R, C, S);
    107 
    108 BM_Reduce_Arg(64, 32, 1);
    109 BM_Reduce_Arg(4096, 128, 1);
    110 
    111 BM_Reduce_Arg(16, 8, 2);
    112 BM_Reduce_Arg(64, 32, 2);
    113 BM_Reduce_Arg(4096, 32, 2);
    114 BM_Reduce_Arg(4096, 128, 2);
    115 
    116 static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) {
    117   testing::StopTiming();
    118   Graph* g = new Graph(OpRegistry::Global());
    119   CHECK_LE(uniqueness, 1.0);
    120   CHECK_GT(uniqueness, 0.0);
    121 
    122   const int kNumIndices = size;
    123   Tensor indices(DT_INT32, TensorShape({kNumIndices}));
    124   auto indices_flat = indices.flat<int32>();
    125   Tensor segments(DT_INT32, TensorShape({kNumIndices}));
    126   auto segments_flat = segments.flat<int32>();
    127 
    128   int kUniqueIndices = uniqueness * kNumIndices;
    129   Tensor output_dim0(DT_INT32, TensorShape({}));
    130   output_dim0.scalar<int32>()() = kUniqueIndices;
    131 
    132   for (int i = 0; i < kNumIndices; ++i) {
    133     indices_flat(i) = (i * 31) % kUniqueIndices;
    134     segments_flat(i) = i * .8;
    135   }
    136 
    137   const int kDim1 = segments_flat(kNumIndices - 1) + 1;
    138   const int kDim2 = 128;
    139   Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2}));
    140   input.flat<float>().setRandom();
    141 
    142   Node* node;
    143   TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad")
    144                   .Input(test::graph::Constant(g, input))
    145                   .Input(test::graph::Constant(g, indices))
    146                   .Input(test::graph::Constant(g, segments))
    147                   .Input(test::graph::Constant(g, output_dim0))
    148                   .Attr("T", DT_FLOAT)
    149                   .Finalize(g, &node));
    150 
    151   testing::UseRealTime();
    152   testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) *
    153                           sizeof(float));
    154   testing::StartTiming();
    155   test::Benchmark("cpu", g).Run(iters);
    156 }
    157 
    158 static void BM_SparseSegmentMeanGrad_Low(int iters, int size) {
    159   return SparseSegmentMeanGradHelper(iters, 1.0, size);
    160 }
    161 
    162 static void BM_SparseSegmentMeanGrad_High(int iters, int size) {
    163   return SparseSegmentMeanGradHelper(iters, 0.01, size);
    164 }
    165 
    166 BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000);
    167 BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000);
    168 
    169 }  // namespace tensorflow
    170