Home | History | Annotate | Download | only in cuda
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #if !defined CUDA_DISABLER
     44 
     45 #include "opencv2/core/cuda/common.hpp"
     46 #include "opencv2/core/cuda/border_interpolate.hpp"
     47 #include "opencv2/core/cuda/vec_traits.hpp"
     48 #include "opencv2/core/cuda/vec_math.hpp"
     49 #include "opencv2/core/cuda/saturate_cast.hpp"
     50 
     51 namespace cv { namespace cuda { namespace device
     52 {
     53     namespace imgproc
     54     {
     55         template <typename T, typename B> __global__ void pyrDown(const PtrStepSz<T> src, PtrStep<T> dst, const B b, int dst_cols)
     56         {
     57             typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type work_t;
     58 
     59             __shared__ work_t smem[256 + 4];
     60 
     61             const int x = blockIdx.x * blockDim.x + threadIdx.x;
     62             const int y = blockIdx.y;
     63 
     64             const int src_y = 2 * y;
     65 
     66             if (src_y >= 2 && src_y < src.rows - 2 && x >= 2 && x < src.cols - 2)
     67             {
     68                 {
     69                     work_t sum;
     70 
     71                     sum =       0.0625f * src(src_y - 2, x);
     72                     sum = sum + 0.25f   * src(src_y - 1, x);
     73                     sum = sum + 0.375f  * src(src_y    , x);
     74                     sum = sum + 0.25f   * src(src_y + 1, x);
     75                     sum = sum + 0.0625f * src(src_y + 2, x);
     76 
     77                     smem[2 + threadIdx.x] = sum;
     78                 }
     79 
     80                 if (threadIdx.x < 2)
     81                 {
     82                     const int left_x = x - 2;
     83 
     84                     work_t sum;
     85 
     86                     sum =       0.0625f * src(src_y - 2, left_x);
     87                     sum = sum + 0.25f   * src(src_y - 1, left_x);
     88                     sum = sum + 0.375f  * src(src_y    , left_x);
     89                     sum = sum + 0.25f   * src(src_y + 1, left_x);
     90                     sum = sum + 0.0625f * src(src_y + 2, left_x);
     91 
     92                     smem[threadIdx.x] = sum;
     93                 }
     94 
     95                 if (threadIdx.x > 253)
     96                 {
     97                     const int right_x = x + 2;
     98 
     99                     work_t sum;
    100 
    101                     sum =       0.0625f * src(src_y - 2, right_x);
    102                     sum = sum + 0.25f   * src(src_y - 1, right_x);
    103                     sum = sum + 0.375f  * src(src_y    , right_x);
    104                     sum = sum + 0.25f   * src(src_y + 1, right_x);
    105                     sum = sum + 0.0625f * src(src_y + 2, right_x);
    106 
    107                     smem[4 + threadIdx.x] = sum;
    108                 }
    109             }
    110             else
    111             {
    112                 {
    113                     work_t sum;
    114 
    115                     sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(x));
    116                     sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(x));
    117                     sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(x));
    118                     sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(x));
    119                     sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(x));
    120 
    121                     smem[2 + threadIdx.x] = sum;
    122                 }
    123 
    124                 if (threadIdx.x < 2)
    125                 {
    126                     const int left_x = x - 2;
    127 
    128                     work_t sum;
    129 
    130                     sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col(left_x));
    131                     sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col(left_x));
    132                     sum = sum + 0.375f  * src(src_y                    , b.idx_col(left_x));
    133                     sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col(left_x));
    134                     sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col(left_x));
    135 
    136                     smem[threadIdx.x] = sum;
    137                 }
    138 
    139                 if (threadIdx.x > 253)
    140                 {
    141                     const int right_x = x + 2;
    142 
    143                     work_t sum;
    144 
    145                     sum =       0.0625f * src(b.idx_row_low (src_y - 2), b.idx_col_high(right_x));
    146                     sum = sum + 0.25f   * src(b.idx_row_low (src_y - 1), b.idx_col_high(right_x));
    147                     sum = sum + 0.375f  * src(src_y                    , b.idx_col_high(right_x));
    148                     sum = sum + 0.25f   * src(b.idx_row_high(src_y + 1), b.idx_col_high(right_x));
    149                     sum = sum + 0.0625f * src(b.idx_row_high(src_y + 2), b.idx_col_high(right_x));
    150 
    151                     smem[4 + threadIdx.x] = sum;
    152                 }
    153             }
    154 
    155             __syncthreads();
    156 
    157             if (threadIdx.x < 128)
    158             {
    159                 const int tid2 = threadIdx.x * 2;
    160 
    161                 work_t sum;
    162 
    163                 sum =       0.0625f * smem[2 + tid2 - 2];
    164                 sum = sum + 0.25f   * smem[2 + tid2 - 1];
    165                 sum = sum + 0.375f  * smem[2 + tid2    ];
    166                 sum = sum + 0.25f   * smem[2 + tid2 + 1];
    167                 sum = sum + 0.0625f * smem[2 + tid2 + 2];
    168 
    169                 const int dst_x = (blockIdx.x * blockDim.x + tid2) / 2;
    170 
    171                 if (dst_x < dst_cols)
    172                     dst.ptr(y)[dst_x] = saturate_cast<T>(sum);
    173             }
    174         }
    175 
    176         template <typename T, template <typename> class B> void pyrDown_caller(PtrStepSz<T> src, PtrStepSz<T> dst, cudaStream_t stream)
    177         {
    178             const dim3 block(256);
    179             const dim3 grid(divUp(src.cols, block.x), dst.rows);
    180 
    181             B<T> b(src.rows, src.cols);
    182 
    183             pyrDown<T><<<grid, block, 0, stream>>>(src, dst, b, dst.cols);
    184             cudaSafeCall( cudaGetLastError() );
    185 
    186             if (stream == 0)
    187                 cudaSafeCall( cudaDeviceSynchronize() );
    188         }
    189 
    190         template <typename T> void pyrDown_gpu(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream)
    191         {
    192             pyrDown_caller<T, BrdReflect101>(static_cast< PtrStepSz<T> >(src), static_cast< PtrStepSz<T> >(dst), stream);
    193         }
    194 
    195         template void pyrDown_gpu<uchar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    196         //template void pyrDown_gpu<uchar2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    197         template void pyrDown_gpu<uchar3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    198         template void pyrDown_gpu<uchar4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    199 
    200         //template void pyrDown_gpu<schar>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    201         //template void pyrDown_gpu<char2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    202         //template void pyrDown_gpu<char3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    203         //template void pyrDown_gpu<char4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    204 
    205         template void pyrDown_gpu<ushort>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    206         //template void pyrDown_gpu<ushort2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    207         template void pyrDown_gpu<ushort3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    208         template void pyrDown_gpu<ushort4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    209 
    210         template void pyrDown_gpu<short>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    211         //template void pyrDown_gpu<short2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    212         template void pyrDown_gpu<short3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    213         template void pyrDown_gpu<short4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    214 
    215         //template void pyrDown_gpu<int>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    216         //template void pyrDown_gpu<int2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    217         //template void pyrDown_gpu<int3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    218         //template void pyrDown_gpu<int4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    219 
    220         template void pyrDown_gpu<float>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    221         //template void pyrDown_gpu<float2>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    222         template void pyrDown_gpu<float3>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    223         template void pyrDown_gpu<float4>(PtrStepSzb src, PtrStepSzb dst, cudaStream_t stream);
    224     } // namespace imgproc
    225 }}} // namespace cv { namespace cuda { namespace cudev
    226 
    227 
    228 #endif /* CUDA_DISABLER */
    229