1 /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. 2 3 Licensed under the Apache License, Version 2.0 (the "License"); 4 you may not use this file except in compliance with the License. 5 You may obtain a copy of the License at 6 7 http://www.apache.org/licenses/LICENSE-2.0 8 9 Unless required by applicable law or agreed to in writing, software 10 distributed under the License is distributed on an "AS IS" BASIS, 11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 See the License for the specific language governing permissions and 13 limitations under the License. 14 ==============================================================================*/ 15 16 #include <functional> 17 #include <vector> 18 19 #include "tensorflow/core/common_runtime/device.h" 20 #include "tensorflow/core/common_runtime/device_factory.h" 21 #include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" 22 #include "tensorflow/core/framework/allocator.h" 23 #include "tensorflow/core/framework/fake_input.h" 24 #include "tensorflow/core/framework/node_def_builder.h" 25 #include "tensorflow/core/framework/op_kernel.h" 26 #include "tensorflow/core/framework/tensor.h" 27 #include "tensorflow/core/framework/tensor_testutil.h" 28 #include "tensorflow/core/framework/types.h" 29 #include "tensorflow/core/framework/types.pb.h" 30 #include "tensorflow/core/graph/node_builder.h" 31 #include "tensorflow/core/graph/testlib.h" 32 #include "tensorflow/core/kernels/ops_testutil.h" 33 #include "tensorflow/core/kernels/ops_util.h" 34 #include "tensorflow/core/platform/test.h" 35 #include "tensorflow/core/platform/test_benchmark.h" 36 #include "tensorflow/core/public/session_options.h" 37 #include "tensorflow/core/public/version.h" 38 39 namespace tensorflow { 40 41 template <typename Index> 42 static void BM_SegmentReduction(int iters, const string& reduction, 43 Index num_rows, Index num_cols, 44 Index segment_size) { 45 testing::StopTiming(); 46 std::unique_ptr<Device> device( 47 DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0")); 48 49 // Create inputs 50 gtl::InlinedVector<TensorValue, 4> reduction_inputs; 51 TensorShape shape1({num_rows, num_cols}); 52 Tensor input1(DT_FLOAT, shape1); 53 reduction_inputs.push_back({nullptr, &input1}); 54 55 TensorShape shape2({num_rows}); 56 Tensor input2(DataTypeToEnum<Index>::v(), shape2); 57 test::FillFn<Index>(&input2, [&num_rows, &segment_size](Index i) -> Index { 58 return std::min(i / segment_size, num_rows - 1); 59 }); 60 reduction_inputs.push_back({nullptr, &input2}); 61 62 NodeDef reduction_node_def; 63 TF_CHECK_OK(NodeDefBuilder(reduction, reduction) 64 .Input(FakeInput(DT_FLOAT)) 65 .Input(FakeInput(DataTypeToEnum<Index>::v())) 66 .Finalize(&reduction_node_def)); 67 Status status; 68 std::unique_ptr<OpKernel> reduction_op( 69 CreateOpKernel(DEVICE_CPU, device.get(), cpu_allocator(), 70 reduction_node_def, TF_GRAPH_DEF_VERSION, &status)); 71 OpKernelContext::Params params; 72 params.device = device.get(); 73 params.frame_iter = FrameAndIter(0, 0); 74 params.inputs = &reduction_inputs; 75 params.op_kernel = reduction_op.get(); 76 std::vector<AllocatorAttributes> attrs; 77 test::SetOutputAttrs(¶ms, &attrs); 78 79 std::unique_ptr<OpKernelContext> reduction_context( 80 new OpKernelContext(¶ms)); 81 82 reduction_op->Compute(reduction_context.get()); 83 TF_CHECK_OK(reduction_context->status()); 84 testing::StartTiming(); 85 for (int i = 0; i < iters; ++i) { 86 delete reduction_context->release_output(0).tensor; 87 reduction_op->Compute(reduction_context.get()); 88 } 89 int64 bytes_per_iter = 90 static_cast<int64>(num_rows * num_cols * sizeof(float)); 91 testing::BytesProcessed(bytes_per_iter * iters); 92 } 93 94 #define BM_Reduce(O, R, C, S) \ 95 static void BM_Reduce_##O##_##R##_##C##_##S##_int32(int iters) { \ 96 BM_SegmentReduction<int32>(iters, #O, R, C, S); \ 97 } \ 98 static void BM_Reduce_##O##_##R##_##C##_##S##_int64(int iters) { \ 99 BM_SegmentReduction<int64>(iters, #O, R, C, S); \ 100 } \ 101 BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int32); \ 102 BENCHMARK(BM_Reduce_##O##_##R##_##C##_##S##_int64); 103 104 #define BM_Reduce_Arg(R, C, S) \ 105 BM_Reduce(SegmentSum, R, C, S); \ 106 BM_Reduce(SegmentMean, R, C, S); 107 108 BM_Reduce_Arg(64, 32, 1); 109 BM_Reduce_Arg(4096, 128, 1); 110 111 BM_Reduce_Arg(16, 8, 2); 112 BM_Reduce_Arg(64, 32, 2); 113 BM_Reduce_Arg(4096, 32, 2); 114 BM_Reduce_Arg(4096, 128, 2); 115 116 static void SparseSegmentMeanGradHelper(int iters, float uniqueness, int size) { 117 testing::StopTiming(); 118 Graph* g = new Graph(OpRegistry::Global()); 119 CHECK_LE(uniqueness, 1.0); 120 CHECK_GT(uniqueness, 0.0); 121 122 const int kNumIndices = size; 123 Tensor indices(DT_INT32, TensorShape({kNumIndices})); 124 auto indices_flat = indices.flat<int32>(); 125 Tensor segments(DT_INT32, TensorShape({kNumIndices})); 126 auto segments_flat = segments.flat<int32>(); 127 128 int kUniqueIndices = uniqueness * kNumIndices; 129 Tensor output_dim0(DT_INT32, TensorShape({})); 130 output_dim0.scalar<int32>()() = kUniqueIndices; 131 132 for (int i = 0; i < kNumIndices; ++i) { 133 indices_flat(i) = (i * 31) % kUniqueIndices; 134 segments_flat(i) = i * .8; 135 } 136 137 const int kDim1 = segments_flat(kNumIndices - 1) + 1; 138 const int kDim2 = 128; 139 Tensor input(DT_FLOAT, TensorShape({kDim1, kDim2})); 140 input.flat<float>().setRandom(); 141 142 Node* node; 143 TF_CHECK_OK(NodeBuilder(g->NewName("n"), "SparseSegmentMeanGrad") 144 .Input(test::graph::Constant(g, input)) 145 .Input(test::graph::Constant(g, indices)) 146 .Input(test::graph::Constant(g, segments)) 147 .Input(test::graph::Constant(g, output_dim0)) 148 .Attr("T", DT_FLOAT) 149 .Finalize(g, &node)); 150 151 testing::UseRealTime(); 152 testing::BytesProcessed(static_cast<int64>(iters) * (kDim1 * kDim2) * 153 sizeof(float)); 154 testing::StartTiming(); 155 test::Benchmark("cpu", g).Run(iters); 156 } 157 158 static void BM_SparseSegmentMeanGrad_Low(int iters, int size) { 159 return SparseSegmentMeanGradHelper(iters, 1.0, size); 160 } 161 162 static void BM_SparseSegmentMeanGrad_High(int iters, int size) { 163 return SparseSegmentMeanGradHelper(iters, 0.01, size); 164 } 165 166 BENCHMARK(BM_SparseSegmentMeanGrad_Low)->Arg(1000)->Arg(100000); 167 BENCHMARK(BM_SparseSegmentMeanGrad_High)->Arg(1000)->Arg(100000); 168 169 } // namespace tensorflow 170