Home | History | Annotate | Download | only in kernels
      1 /* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
      2 
      3 Licensed under the Apache License, Version 2.0 (the "License");
      4 you may not use this file except in compliance with the License.
      5 You may obtain a copy of the License at
      6 
      7     http://www.apache.org/licenses/LICENSE-2.0
      8 
      9 Unless required by applicable law or agreed to in writing, software
     10 distributed under the License is distributed on an "AS IS" BASIS,
     11 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     12 See the License for the specific language governing permissions and
     13 limitations under the License.
     14 ==============================================================================*/
     15 
     16 #if GOOGLE_CUDA
     17 
     18 #define EIGEN_USE_GPU
     19 
     20 #include "tensorflow/core/kernels/compare_and_bitpack_op.h"
     21 
     22 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
     23 #include "tensorflow/core/framework/op_kernel.h"
     24 #include "tensorflow/core/framework/register_types.h"
     25 #include "tensorflow/core/framework/tensor_types.h"
     26 #include "tensorflow/core/platform/types.h"
     27 #include "tensorflow/core/util/cuda_kernel_helper.h"
     28 
     29 namespace tensorflow {
     30 
     31 typedef Eigen::GpuDevice GPUDevice;
     32 
     33 namespace functor {
     34 
     35 template <typename T>
     36 __global__ void CompareAndBitpackKernel(const int size, const T* threshold,
     37                                         const T* input, uint8* output) {
     38   // TODO(ebrevdo): Erich said: to get a better memory access pattern
     39   // you could have 8 threads load this data and do a comparison, then
     40   // use the ballot instruction to combine the values from each thread
     41   // in the warp in one instruction (so each thread will have the
     42   // result for 4 blocks) followed by an appropriate shift and mask to
     43   // get the 8-bits of interest.
     44   const T thresh = ldg(threshold);
     45   CUDA_1D_KERNEL_LOOP(i, size) {
     46     const T* block = input + 8 * i;
     47     output[i] =
     48         ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |
     49          (((ldg(block + 2) > thresh) << 5)) |
     50          (((ldg(block + 3) > thresh) << 4)) |
     51          (((ldg(block + 4) > thresh) << 3)) |
     52          (((ldg(block + 5) > thresh) << 2)) |
     53          (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh))));
     54   }
     55 }
     56 
     57 template <>
     58 __global__ void CompareAndBitpackKernel<bool>(const int size,
     59                                               const bool* threshold,
     60                                               const bool* input,
     61                                               uint8* output) {
     62   // TODO(ebrevdo): Erich said: I think you could again have multiple
     63   // threads work on one block and use the ballot instruction to the
     64   // bit packing in one instruction.
     65   CUDA_1D_KERNEL_LOOP(i, size) {
     66     const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));
     67     // NOTE(ebrevdo): This assumes memory is little-endian.
     68     output[i] =
     69         ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) |
     70          (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) |
     71          (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) |
     72          (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) |
     73          (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) |
     74          (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
     75          (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
     76   }
     77 }
     78 
     79 template <>
     80 __global__ void CompareAndBitpackKernel<float>(const int size,
     81                                                const float* threshold,
     82                                                const float* input,
     83                                                uint8* output) {
     84   const float thresh = ldg(threshold);
     85   CUDA_1D_KERNEL_LOOP(i, size) {
     86     const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));
     87     const float4 block1 =
     88         ldg(reinterpret_cast<const float4*>(input + 8 * i + 4));
     89     output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) |
     90                  (((block0.z > thresh) << 5)) | (((block0.w > thresh) << 4)) |
     91                  (((block1.x > thresh) << 3)) | (((block1.y > thresh) << 2)) |
     92                  (((block1.z > thresh) << 1)) | (((block1.w > thresh))));
     93   }
     94 }
     95 
     96 template <>
     97 __global__ void CompareAndBitpackKernel<double>(const int size,
     98                                                 const double* threshold,
     99                                                 const double* input,
    100                                                 uint8* output) {
    101   const double thresh = ldg(threshold);
    102   CUDA_1D_KERNEL_LOOP(i, size) {
    103     const double2 block0 = ldg(reinterpret_cast<const double2*>(input + 8 * i));
    104     const double2 block1 =
    105         ldg(reinterpret_cast<const double2*>(input + 8 * i + 2));
    106     const double2 block2 =
    107         ldg(reinterpret_cast<const double2*>(input + 8 * i + 4));
    108     const double2 block3 =
    109         ldg(reinterpret_cast<const double2*>(input + 8 * i + 6));
    110     output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) |
    111                  (((block1.x > thresh) << 5)) | (((block1.y > thresh) << 4)) |
    112                  (((block2.x > thresh) << 3)) | (((block2.y > thresh) << 2)) |
    113                  (((block3.x > thresh) << 1)) | (((block3.y > thresh))));
    114   }
    115 }
    116 
// Defines the GPU specialization of the CompareAndBitpack functor for type T:
// computes a launch configuration sized by the number of output bytes and
// launches CompareAndBitpackKernel<T> on the device's stream. `input` must
// hold 8 * output.size() elements of T (the kernel reads input + 8*i for
// each output byte i). Comments live outside the macro body because `//`
// inside would swallow the line-continuation backslashes.
// NOTE(review): the launch result is not checked (no cudaGetLastError());
// presumably errors surface at the next TF stream sync — confirm.
#define DEFINE_GPU_SPECS(T)                                               \
  template <>                                                             \
  void CompareAndBitpack<GPUDevice, T>::operator()(                       \
      OpKernelContext* c, typename TTypes<T>::ConstMatrix input,          \
      typename TTypes<T>::ConstScalar threshold,                          \
      TTypes<uint8>::Matrix output) {                                     \
    const GPUDevice& d = c->eigen_device<GPUDevice>();                    \
    int64 total_count = output.size();                                    \
    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);        \
                                                                          \
    CompareAndBitpackKernel<T>                                            \
        <<<config.block_count, config.thread_per_block, 0, d.stream()>>>( \
            total_count, threshold.data(), input.data(), output.data());  \
  }
    131 
    132 TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
    133 TF_CALL_bool(DEFINE_GPU_SPECS)
    134 
    135 #undef DECLARE_GPU_SPECS
    136 
    137 }  // namespace functor
    138 
    139 }  // namespace tensorflow
    140 
    141 #endif  // GOOGLE_CUDA
    142