/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#if GOOGLE_CUDA

#define EIGEN_USE_GPU

#include "tensorflow/core/kernels/compare_and_bitpack_op.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/cuda_kernel_helper.h"

namespace tensorflow {

typedef Eigen::GpuDevice GPUDevice;

namespace functor {

// Generic kernel: each thread produces one output byte by comparing 8
// consecutive input elements against the device-resident scalar *threshold
// and packing the 8 boolean results MSB-first (input[8*i] lands in bit 7,
// input[8*i + 7] in bit 0).  `size` is the number of OUTPUT bytes, so the
// input must hold 8 * size elements.  Uses a grid-stride loop
// (CUDA_1D_KERNEL_LOOP), so any launch configuration is correct.
template <typename T>
__global__ void CompareAndBitpackKernel(const int size, const T* threshold,
                                        const T* input, uint8* output) {
  // TODO(ebrevdo): Erich said: to get a better memory access pattern
  // you could have 8 threads load this data and do a comparison, then
  // use the ballot instruction to combine the values from each thread
  // in the warp in one instruction (so each thread will have the
  // result for 4 blocks) followed by an appropriate shift and mask to
  // get the 8-bits of interest.
  const T thresh = ldg(threshold);
  CUDA_1D_KERNEL_LOOP(i, size) {
    const T* block = input + 8 * i;
    // (x > thresh) yields bool, which promotes to int before the shift, so
    // each term contributes exactly one bit of the packed byte.
    output[i] =
        ((((ldg(block) > thresh) << 7)) | (((ldg(block + 1) > thresh) << 6)) |
         (((ldg(block + 2) > thresh) << 5)) |
         (((ldg(block + 3) > thresh) << 4)) |
         (((ldg(block + 4) > thresh) << 3)) |
         (((ldg(block + 5) > thresh) << 2)) |
         (((ldg(block + 6) > thresh) << 1)) | (((ldg(block + 7) > thresh))));
  }
}

// bool specialization: the threshold comparison (x > false) reduces to x
// itself, so `threshold` is ignored and the 8 input bools are loaded as a
// single 64-bit word whose per-byte low bits are shuffled into place.
// NOTE(review): the int64 load assumes `input + 8 * i` is 8-byte aligned —
// holds when the tensor base is aligned, since the offset is a multiple of
// 8 bools; confirm against the allocator's alignment guarantee.
template <>
__global__ void CompareAndBitpackKernel<bool>(const int size,
                                              const bool* threshold,
                                              const bool* input,
                                              uint8* output) {
  // TODO(ebrevdo): Erich said: I think you could again have multiple
  // threads work on one block and use the ballot instruction to the
  // bit packing in one instruction.
  CUDA_1D_KERNEL_LOOP(i, size) {
    const int64 block = ldg(reinterpret_cast<const int64*>(input + 8 * i));
    // NOTE(ebrevdo): This assumes memory is little-endian.
    // Byte k of the word (bit position 8*k) holds input[8*i + k]; it must
    // end up at output bit (7 - k), hence the shift by (k*8 - (7 - k)).
    output[i] =
        ((((block & (1LL << (7 * 8))) >> (7 * 8 - 0))) |
         (((block & (1LL << (6 * 8))) >> (6 * 8 - 1))) |
         (((block & (1LL << (5 * 8))) >> (5 * 8 - 2))) |
         (((block & (1LL << (4 * 8))) >> (4 * 8 - 3))) |
         (((block & (1LL << (3 * 8))) >> (3 * 8 - 4))) |
         (((block & (1LL << (2 * 8))) >> (2 * 8 - 5))) |
         (((block & (1LL << 8)) >> (1 * 8 - 6))) | (((block & (1LL)) << 7)));
  }
}

// float specialization: fetches the 8 inputs as two float4 vector loads to
// cut the number of memory transactions, then packs as in the generic case.
// NOTE(review): reinterpret_cast to float4 requires 16-byte alignment of
// `input + 8 * i`; the offset is 32 bytes per i, so this rests on the tensor
// base allocation being 16-byte aligned — TODO confirm.
template <>
__global__ void CompareAndBitpackKernel<float>(const int size,
                                               const float* threshold,
                                               const float* input,
                                               uint8* output) {
  const float thresh = ldg(threshold);
  CUDA_1D_KERNEL_LOOP(i, size) {
    const float4 block0 = ldg(reinterpret_cast<const float4*>(input + 8 * i));
    const float4 block1 =
        ldg(reinterpret_cast<const float4*>(input + 8 * i + 4));
    output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) |
                 (((block0.z > thresh) << 5)) | (((block0.w > thresh) << 4)) |
                 (((block1.x > thresh) << 3)) | (((block1.y > thresh) << 2)) |
                 (((block1.z > thresh) << 1)) | (((block1.w > thresh))));
  }
}

// double specialization: fetches the 8 inputs as four double2 vector loads,
// then packs as in the generic case.
// NOTE(review): reinterpret_cast to double2 requires 16-byte alignment of
// `input + 8 * i` (offset is 64 bytes per i) — same base-alignment
// assumption as the float specialization; TODO confirm.
template <>
__global__ void CompareAndBitpackKernel<double>(const int size,
                                                const double* threshold,
                                                const double* input,
                                                uint8* output) {
  const double thresh = ldg(threshold);
  CUDA_1D_KERNEL_LOOP(i, size) {
    const double2 block0 = ldg(reinterpret_cast<const double2*>(input + 8 * i));
    const double2 block1 =
        ldg(reinterpret_cast<const double2*>(input + 8 * i + 2));
    const double2 block2 =
        ldg(reinterpret_cast<const double2*>(input + 8 * i + 4));
    const double2 block3 =
        ldg(reinterpret_cast<const double2*>(input + 8 * i + 6));
    output[i] = ((((block0.x > thresh) << 7)) | (((block0.y > thresh) << 6)) |
                 (((block1.x > thresh) << 5)) | (((block1.y > thresh) << 4)) |
                 (((block2.x > thresh) << 3)) | (((block2.y > thresh) << 2)) |
                 (((block3.x > thresh) << 1)) | (((block3.y > thresh))));
  }
}

// Specializes the CompareAndBitpack functor for the GPU: one thread per
// output byte, launch configuration picked by GetCudaLaunchConfig, enqueued
// on the device's stream.  `output.size()` is the packed byte count, so the
// input matrix must hold 8x as many elements.
#define DEFINE_GPU_SPECS(T)                                          \
  template <>                                                        \
  void CompareAndBitpack<GPUDevice, T>::operator()(                  \
      OpKernelContext* c, typename TTypes<T>::ConstMatrix input,     \
      typename TTypes<T>::ConstScalar threshold,                     \
      TTypes<uint8>::Matrix output) {                                \
    const GPUDevice& d = c->eigen_device<GPUDevice>();               \
    int64 total_count = output.size();                               \
    CudaLaunchConfig config = GetCudaLaunchConfig(total_count, d);   \
                                                                     \
    CompareAndBitpackKernel<T>                                       \
        <<<config.block_count, config.thread_per_block, 0,           \
           d.stream()>>>(total_count, threshold.data(), input.data(), \
                         output.data());                             \
  }

TF_CALL_GPU_NUMBER_TYPES(DEFINE_GPU_SPECS)
TF_CALL_bool(DEFINE_GPU_SPECS)

// Fixed: previously this undef'd the wrong name (DECLARE_GPU_SPECS), leaving
// DEFINE_GPU_SPECS defined past this point.
#undef DEFINE_GPU_SPECS

}  // namespace functor

}  // namespace tensorflow

#endif  // GOOGLE_CUDA