/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#define EIGEN_USE_THREADS

#include <numeric>

#include "tensorflow/core/kernels/aggregate_ops.h"
#include "tensorflow/core/kernels/aggregate_ops_cpu.h"

#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/variant.h"
#include "tensorflow/core/framework/variant_encode_decode.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/lib/gtl/inlined_vector.h"
#include "tensorflow/core/platform/logging.h"

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;
#ifdef TENSORFLOW_USE_SYCL
typedef Eigen::SyclDevice SYCLDevice;
#endif  // TENSORFLOW_USE_SYCL

template <typename Device, typename T>
class AddNOp : public OpKernel {
 public:
  explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    if (!ctx->ValidateInputsAreSameShape(this)) return;

    const Tensor& input0 = ctx->input(0);
    const int num = ctx->num_inputs();

    if (num == 1) {
      ctx->set_output(0, input0);
      return;
    }

    // Try to forward and accumulate the result in one of the input buffers.
    int reused_input = -1;
    gtl::InlinedVector<int, 8> input_indices(num);
    std::iota(input_indices.begin(), input_indices.end(), 0);
    Tensor* output = nullptr;
    for (int input_idx = 0; input_idx < num; ++input_idx) {
      if (ctx->forward_input_to_output_with_shape(input_idx, 0, input0.shape(),
                                                  &output)) {
        reused_input = input_idx;
        break;
      }
    }
    if (reused_input == -1) {
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, input0.shape(), &output));
    } else if (reused_input > 0) {
      // Move the forwarded buffer to the front so we don't double count
      // anything if there are more than 8 inputs.
      input_indices[0] = reused_input;
      input_indices[reused_input] = 0;
    }
    auto To = output->flat<T>();

#define I(IDX) ctx->input(input_indices[IDX]).flat<T>()

#if defined(__ANDROID_TYPES_SLIM__)
    // On Android by default, we only support additions of two arguments, so
    // we can reduce the number of template instantiations.
    OP_REQUIRES(ctx, num == 2,
                errors::InvalidArgument("Only additions of two arguments "
                                        "supported. Num inputs: ",
                                        num));
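    // Binary case: a single functor call computes To = I(0) + I(1).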
Num inputs: ", 87 num)); 88 functor::Add2Functor<Device, T> functor2; 89 functor2(ctx->template eigen_device<Device>(), To, I(0), I(1)); 90 #else 91 static const int kWidth = 8; 92 int r = num % kWidth; 93 94 switch (r) { 95 case 2: { 96 functor::Add2Functor<Device, T> functor2; 97 functor2(ctx->template eigen_device<Device>(), To, I(0), I(1)); 98 break; 99 } 100 case 3: { 101 functor::Add3Functor<Device, T> functor3; 102 functor3(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2)); 103 break; 104 } 105 case 4: { 106 functor::Add4Functor<Device, T> functor4; 107 functor4(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), 108 I(3)); 109 break; 110 } 111 case 5: { 112 functor::Add5Functor<Device, T> functor5; 113 functor5(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), 114 I(3), I(4)); 115 break; 116 } 117 case 6: { 118 functor::Add6Functor<Device, T> functor6; 119 functor6(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), 120 I(3), I(4), I(5)); 121 break; 122 } 123 case 7: { 124 functor::Add7Functor<Device, T> functor7; 125 functor7(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), 126 I(3), I(4), I(5), I(6)); 127 break; 128 } 129 case 0: { 130 functor::Add8Functor<Device, T> functor8; 131 functor8(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), 132 I(3), I(4), I(5), I(6), I(7)); 133 r = 8; 134 break; 135 } 136 case 1: { 137 functor::Add9Functor<Device, T> functor9; 138 functor9(ctx->template eigen_device<Device>(), To, I(0), I(1), I(2), 139 I(3), I(4), I(5), I(6), I(7), I(8)); 140 r = 9; 141 break; 142 } 143 } 144 145 for (; r < num; r += kWidth) { 146 functor::Add8pFunctor<Device, T> functor8p; 147 functor8p(ctx->template eigen_device<Device>(), To, I(r), I(r + 1), 148 I(r + 2), I(r + 3), I(r + 4), I(r + 5), I(r + 6), I(r + 7)); 149 } 150 #endif // defined(__ANDROID_TYPES_SLIM__) 151 152 #undef I 153 } 154 }; 155 156 template <typename Device> 157 class AddNOp<Device, Variant> : public OpKernel { 158 public: 159 explicit AddNOp(OpKernelConstruction* context) : OpKernel(context) {} 160 161 void Compute(OpKernelContext* ctx) override { 162 if (!ctx->ValidateInputsAreSameShape(this)) return; 163 164 const Tensor& input0 = ctx->input(0); 165 const int num = ctx->num_inputs(); 166 167 if (num == 1) { 168 ctx->set_output(0, input0); 169 return; 170 } 171 172 for (int i = 0; i < num; ++i) { 173 // Step 1: ensure unary variants. 174 OP_REQUIRES( 175 ctx, ctx->input(i).dims() == 0, 176 errors::InvalidArgument( 177 "AddN of non-scalar Tensor with dtype=DT_VARIANT is not " 178 "supported; inputs[", 179 i, " has shape: ", ctx->input(i).shape().DebugString(), ".")); 180 } 181 182 TensorShape common_shape; 183 OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(0), &common_shape)); 184 // Step 2: access all variants and ensure shapes match. 185 for (int i = 1; i < num; ++i) { 186 TensorShape check_shape; 187 OP_REQUIRES_OK(ctx, GetUnaryVariantShape(ctx->input(i), &check_shape)); 188 OP_REQUIRES(ctx, common_shape == check_shape, 189 errors::InvalidArgument( 190 "AddN of Variants of differing shapes; inputs[0] shape: ", 191 common_shape.DebugString(), ", inputs[", i, 192 "] shape: ", check_shape.DebugString())); 193 } 194 195 // Step 3: attempt to add using 196 // BinaryOpVariants(ADD_VARIANT_BINARY_OP, ...) 197 // For the output create a default-constructed variant object. 198 // TODO(ebrevdo): Perform summation in a tree-structure. 
    Tensor out(cpu_allocator(), DT_VARIANT, TensorShape({}));
    Variant* v_out = &(out.scalar<Variant>()());
    OP_REQUIRES_OK(
        ctx, BinaryOpVariants<Device>(
                 ctx, ADD_VARIANT_BINARY_OP, ctx->input(0).scalar<Variant>()(),
                 ctx->input(1).scalar<Variant>()(), v_out));
    for (int i = 2; i < num; ++i) {
      const Variant tmp = std::move(*v_out);
      const Variant& inp = ctx->input(i).scalar<Variant>()();
      OP_REQUIRES_OK(ctx, BinaryOpVariants<Device>(ctx, ADD_VARIANT_BINARY_OP,
                                                   inp, tmp, v_out));
    }
    ctx->set_output(0, out);
  }
};

#define REGISTER_ADDN(type, dev)                                   \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("AddN").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
      AddNOp<dev##Device, type>)

#define REGISTER_ADDN_CPU(type) REGISTER_ADDN(type, CPU)

TF_CALL_NUMBER_TYPES(REGISTER_ADDN_CPU);
REGISTER_ADDN_CPU(Variant);

#undef REGISTER_ADDN_CPU

#if GOOGLE_CUDA
#define REGISTER_ADDN_GPU(type) REGISTER_ADDN(type, GPU)
TF_CALL_GPU_NUMBER_TYPES(REGISTER_ADDN_GPU);
TF_CALL_complex64(REGISTER_ADDN_GPU);
TF_CALL_complex128(REGISTER_ADDN_GPU);
TF_CALL_variant(REGISTER_ADDN_GPU);
#undef REGISTER_ADDN_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);

#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
REGISTER_ADDN(float, SYCL);
REGISTER_ADDN(double, SYCL);

// A special SYCL kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("AddN")
                            .Device(DEVICE_SYCL)
                            .TypeConstraint<int32>("T")
                            .HostMemory("inputs")
                            .HostMemory("sum"),
                        AddNOp<CPUDevice, int32>);
#endif  // TENSORFLOW_USE_SYCL

#undef REGISTER_ADDN

}  // namespace tensorflow