/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.
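//
// Bincount counts the number of occurrences of each value in an int32 array
// `arr`, optionally weighting each occurrence by the matching entry in
// `weights`. For example, with arr = [1, 1, 2], size = 4 and empty weights,
// the output is [0, 2, 1, 0]; with weights = [0.5, 0.25, 1.0] it is
// [0, 0.75, 1.0, 0]. Values in arr that are >= size are ignored.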

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/bincount_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/lib/core/threadpool.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

using thread::ThreadPool;

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace functor {

template <typename T>
struct BincountFunctor<CPUDevice, T> {
  static Status Compute(OpKernelContext* context,
                        const typename TTypes<int32, 1>::ConstTensor& arr,
                        const typename TTypes<T, 1>::ConstTensor& weights,
                        typename TTypes<T, 1>::Tensor& output) {
    const int size = output.size();

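    // Reject negative input values up front: the per-thread loop below only
    // checks the upper bound, so a negative value would otherwise index
    // before the start of the bins.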
    Tensor all_nonneg_t;
    TF_RETURN_IF_ERROR(context->allocate_temp(
        DT_BOOL, TensorShape({}), &all_nonneg_t, AllocatorAttributes()));
    all_nonneg_t.scalar<bool>().device(context->eigen_cpu_device()) =
        (arr >= 0).all();
    if (!all_nonneg_t.scalar<bool>()()) {
      return errors::InvalidArgument("Input arr must be non-negative!");
    }

    // Allocate partial output bin sums for each worker thread. Worker ids in
    // ParallelForWithWorkerId range from 0 to NumThreads() inclusive.
    ThreadPool* thread_pool =
        context->device()->tensorflow_cpu_worker_threads()->workers;
    const int64 num_threads = thread_pool->NumThreads() + 1;
    Tensor partial_bins_t;
    TF_RETURN_IF_ERROR(context->allocate_temp(DataTypeToEnum<T>::value,
                                              TensorShape({num_threads, size}),
                                              &partial_bins_t));
    auto partial_bins = partial_bins_t.matrix<T>();
    partial_bins.setZero();
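    // Each worker accumulates into its own row of partial_bins, so the
    // parallel loop needs no atomics or locks; the rows are summed into the
    // final output once all workers have finished.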
    thread_pool->ParallelForWithWorkerId(
        arr.size(), 8 /* cost */,
        [&](int64 start_ind, int64 limit_ind, int64 worker_id) {
          for (int64 i = start_ind; i < limit_ind; i++) {
            int32 value = arr(i);
            if (value < size) {
              if (weights.size()) {
                partial_bins(worker_id, value) += weights(i);
              } else {
                // Complex numbers don't support "++".
                partial_bins(worker_id, value) += T(1);
              }
            }
          }
        });

    // Sum the partial bins along the 0th axis.
    Eigen::array<int, 1> reduce_dims({0});
    output.device(context->eigen_cpu_device()) = partial_bins.sum(reduce_dims);
    return Status::OK();
  }
};

}  // namespace functor

template <typename Device, typename T>
class BincountOp : public OpKernel {
 public:
  explicit BincountOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& arr_t = ctx->input(0);
    const Tensor& size_tensor = ctx->input(1);
    const Tensor& weights_t = ctx->input(2);

    // The size input must be a scalar; scalar<int32>() below requires it.
    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(size_tensor.shape()),
                errors::InvalidArgument("size must be a scalar, got shape ",
                                        size_tensor.shape().DebugString()));
    int32 size = size_tensor.scalar<int32>()();
    OP_REQUIRES(
        ctx, size >= 0,
        errors::InvalidArgument("size (", size, ") must be non-negative"));
    // Weights must either be empty or supply one weight per entry in arr;
    // otherwise the functor would read weights out of bounds.
    OP_REQUIRES(
        ctx,
        weights_t.NumElements() == 0 || weights_t.shape() == arr_t.shape(),
        errors::InvalidArgument(
            "weights must have the same shape as arr or be empty, got ",
            weights_t.shape().DebugString(), " and ",
            arr_t.shape().DebugString()));

    const auto arr = arr_t.flat<int32>();
    const auto weights = weights_t.flat<T>();
    Tensor* output_t;
    OP_REQUIRES_OK(ctx,
                   ctx->allocate_output(0, TensorShape({size}), &output_t));
    auto output = output_t->flat<T>();
    OP_REQUIRES_OK(ctx, functor::BincountFunctor<Device, T>::Compute(
                            ctx, arr, weights, output));
  }
};

#define REGISTER_KERNELS(type)                                       \
  REGISTER_KERNEL_BUILDER(                                           \
      Name("Bincount").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      BincountOp<CPUDevice, type>)

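// Register the CPU kernel for every standard numeric dtype.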
TF_CALL_NUMBER_TYPES(REGISTER_KERNELS);
#undef REGISTER_KERNELS

#if GOOGLE_CUDA

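// The size input stays in host memory (HostMemory("size") below) so the op
// can read the scalar when computing the output shape without a
// device-to-host copy.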
#define REGISTER_KERNELS(type)                            \
  REGISTER_KERNEL_BUILDER(Name("Bincount")                \
                              .Device(DEVICE_GPU)         \
                              .HostMemory("size")         \
                              .TypeConstraint<type>("T"), \
                          BincountOp<GPUDevice, type>)

TF_CALL_int32(REGISTER_KERNELS);
TF_CALL_float(REGISTER_KERNELS);
#undef REGISTER_KERNELS

#endif  // GOOGLE_CUDA

}  // end namespace tensorflow