/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace hist
{
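    // 256-bin histogram of an 8-bit single-channel image. Each 32x8 block
    // builds a partial histogram in shared memory (one bin per thread), reads
    // each row as packed 32-bit words so four pixels are consumed per load,
    // and finally merges its partial result into the global histogram with
    // one global atomicAdd per non-empty bin.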
    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
    {
        __shared__ int shist[256];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        shist[tid] = 0;
        __syncthreads();

        if (y < rows)
        {
            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);

            // Process four packed pixels per 32-bit load; shared-memory
            // atomics are emulated on architectures without native support.
            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                unsigned int data = rowPtr[x];

                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
            }

            // Let one thread mop up the tail of rows whose width is not a
            // multiple of four.
            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    unsigned int data = ((const uchar*)rowPtr)[x];
                    Emulation::smem::atomicAdd(&shist[data], 1);
                }
            }
        }

        __syncthreads();

        const int histVal = shist[tid];
        if (histVal > 0)
            ::atomicAdd(hist + tid, histVal);
    }

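    // Host-side launcher: a 32x8 block covers 8 image rows, so the grid is
    // one-dimensional over row groups. hist must point to a zero-initialized
    // 256-entry device buffer, since the kernel only accumulates into it.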
    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

/////////////////////////////////////////////////////////////////////////

namespace hist
{
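    // Increments the shared-memory bin for a single pixel value, provided the
    // value falls inside the histogram range [lowerLevel, upperLevel].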
    __device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel)
    {
        if (data >= lowerLevel && data <= upperLevel)
        {
            const uint ind = (data - lowerLevel) / binSize;
            Emulation::smem::atomicAdd(shist + ind, 1);
        }
    }

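    // Evenly spaced histogram for 8-bit input: same structure as
    // histogram256Kernel (shared-memory partial histogram, packed 32-bit
    // loads, scalar handling of the row tail), but the bin count is a runtime
    // parameter, so the shared array is sized dynamically at launch time.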
    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
    {
        extern __shared__ int shist[];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        if (tid < binCount)
            shist[tid] = 0;

        __syncthreads();

        if (y < rows)
        {
            const uchar* rowPtr = src + y * step;
            const uint* rowPtr4 = (uint*) rowPtr;

            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                const uint data = rowPtr4[x];

                histEvenInc(shist, (data >>  0) & 0xFFU, binSize, lowerLevel, upperLevel);
                histEvenInc(shist, (data >>  8) & 0xFFU, binSize, lowerLevel, upperLevel);
                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
            }

            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    const uchar data = rowPtr[x];
                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
                }
            }
        }

        __syncthreads();

        if (tid < binCount)
        {
            const int histVal = shist[tid];

            if (histVal > 0)
                ::atomicAdd(hist + tid, histVal);
        }
    }

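    // Host-side launcher: derives the bin width from the requested range and
    // reserves binCount ints of dynamic shared memory per block. As with
    // histogram256, hist must be zero-initialized by the caller.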
    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        const int binSize = divUp(upperLevel - lowerLevel, binCount);

        const size_t smem_size = binCount * sizeof(int);

        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

/////////////////////////////////////////////////////////////////////////

namespace hist
{
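    // Histogram equalization. The caller-supplied 256-entry lookup table is
    // staged in constant memory so every thread reads it through the constant
    // cache; the functor maps each pixel through the table and scales the
    // result by 255 / (image area), which maps a cumulative pixel count into
    // the [0, 255] range.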
    __constant__ int c_lut[256];

    struct EqualizeHist : unary_function<uchar, uchar>
    {
        float scale;

        __host__ EqualizeHist(float _scale) : scale(_scale) {}

        __device__ __forceinline__ uchar operator ()(uchar val) const
        {
            const int lut = c_lut[val];
            return __float2int_rn(scale * lut);
        }
    };
}

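// Tune the generic device::transform framework for EqualizeHist: with
// smart_shift = 4 each thread processes four pixels per iteration through
// vectorized loads and stores.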
namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace hist
{
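    // Uploads the lookup table into c_lut (asynchronously when a stream is
    // given) and remaps src through it. The copy kind is device-to-device, so
    // lut must point to device memory.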
    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
    {
        if (stream == 0)
            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
        else
            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );

        const float scale = 255.0f / (src.cols * src.rows);

        device::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
    }
}

#endif /* CUDA_DISABLER */