/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__

#include "../../common.hpp"
#include "../../util/tuple.hpp"
#include "../../util/saturate_cast.hpp"
#include "../../util/atomic.hpp"
#include "../../util/vec_traits.hpp"
#include "../../util/type_traits.hpp"
#include "../../util/limits.hpp"
#include "../../block/reduce.hpp"
#include "../../functional/functional.hpp"
#include "../../ptr2d/traits.hpp"

namespace cv { namespace cudev {

namespace grid_reduce_detail
{
    // Unroll

    template <int cn> struct Unroll;
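
    // Unroll<cn> adapts a cn-channel value to the tuple-based blockReduce
    // interface: smem() splits one shared buffer of BLOCK_SIZE * cn elements
    // into cn per-channel views, res() exposes the channels of the running
    // value as a tuple of references, and op() replicates the scalar operator
    // once per channel. For cn == 1 everything degenerates to the plain
    // scalar overloads.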

    template <> struct Unroll<1>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static volatile R* smem(R* ptr)
        {
            return ptr;
        }

        template <typename R>
        __device__ __forceinline__ static R& res(R& val)
        {
            return val;
        }

        template <class Op>
        __device__ __forceinline__ static const Op& op(const Op& aop)
        {
            return aop;
        }
    };

    template <> struct Unroll<2>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop);
        }
    };

    template <> struct Unroll<3>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE, ptr + 2 * BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y, val.z);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop, aop);
        }
    };

    template <> struct Unroll<4>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE, ptr + 2 * BLOCK_SIZE, ptr + 3 * BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y, val.z, val.w);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op, Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop, aop, aop);
        }
    };

    // AtomicUnroll

    template <typename R, int cn> struct AtomicUnroll;
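
    // AtomicUnroll<R, cn> folds one block's partial result into the global
    // accumulator with one atomic per channel lane. The atomicAdd/atomicMin/
    // atomicMax overloads used here come from ../../util/atomic.hpp, included
    // above, which also supplies the floating-point variants.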

    template <typename R> struct AtomicUnroll<R, 1>
    {
        __device__ __forceinline__ static void add(R* ptr, R val)
        {
            atomicAdd(ptr, val);
        }

        __device__ __forceinline__ static void min(R* ptr, R val)
        {
            atomicMin(ptr, val);
        }

        __device__ __forceinline__ static void max(R* ptr, R val)
        {
            atomicMax(ptr, val);
        }
    };

    template <typename R> struct AtomicUnroll<R, 2>
    {
        typedef typename MakeVec<R, 2>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
        }
    };

    template <typename R> struct AtomicUnroll<R, 3>
    {
        typedef typename MakeVec<R, 3>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
            atomicAdd(ptr + 2, val.z);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
            atomicMin(ptr + 2, val.z);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
            atomicMax(ptr + 2, val.z);
        }
    };

    template <typename R> struct AtomicUnroll<R, 4>
    {
        typedef typename MakeVec<R, 4>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
            atomicAdd(ptr + 2, val.z);
            atomicAdd(ptr + 3, val.w);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
            atomicMin(ptr + 2, val.z);
            atomicMin(ptr + 3, val.w);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
            atomicMax(ptr + 2, val.z);
            atomicMax(ptr + 3, val.w);
        }
    };

    // SumReductor

    template <typename src_type, typename work_type> struct SumReductor
    {
        typedef typename VecTraits<work_type>::elem_type work_elem_type;
        enum { cn = VecTraits<src_type>::cn };

        work_type sum;

        __device__ __forceinline__ SumReductor()
        {
            sum = VecTraits<work_type>::all(0);
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            sum = sum + saturate_cast<work_type>(srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_elem_type* result, int tid)
        {
            __shared__ work_elem_type smem[BLOCK_SIZE * cn];

            blockReduce<BLOCK_SIZE>(Unroll<cn>::template smem<BLOCK_SIZE>(smem), Unroll<cn>::res(sum), tid, Unroll<cn>::op(plus<work_elem_type>()));

            if (tid == 0)
                AtomicUnroll<work_elem_type, cn>::add(result, sum);
        }
    };
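
    // Note: each block atomicAdd's its partial sum into the result buffer, so
    // the buffer must be zero-initialized before the kernel launch; nothing in
    // this file clears it.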

    // MinMaxReductor

    template <typename T> struct minop : minimum<T>
    {
        __device__ __forceinline__ static T initial()
        {
            return numeric_limits<T>::max();
        }

        __device__ __forceinline__ static void atomic(T* result, T myval)
        {
            atomicMin(result, myval);
        }
    };

    template <typename T> struct maxop : maximum<T>
    {
        __device__ __forceinline__ static T initial()
        {
            return -numeric_limits<T>::max();
        }

        __device__ __forceinline__ static void atomic(T* result, T myval)
        {
            atomicMax(result, myval);
        }
    };
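
    // -numeric_limits<T>::max() serves as the identity for max() (mirroring
    // the +max() identity for min() above); this is the usual pre-C++11
    // stand-in for lowest() and assumes T is a signed or floating-point type.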

    struct both
    {
    };
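
    // Tag type selecting the MinMaxReductor specialization below, which
    // tracks the minimum and the maximum in a single pass.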

    template <class Op, typename src_type, typename work_type> struct MinMaxReductor
    {
        work_type myval;

        __device__ __forceinline__ MinMaxReductor()
        {
            myval = Op::initial();
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            Op op;

            myval = op(myval, srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_type* result, int tid)
        {
            __shared__ work_type smem[BLOCK_SIZE];

            Op op;

            blockReduce<BLOCK_SIZE>(smem, myval, tid, op);

            if (tid == 0)
                Op::atomic(result, myval);
        }
    };

    template <typename src_type, typename work_type> struct MinMaxReductor<both, src_type, work_type>
    {
        work_type mymin;
        work_type mymax;

        __device__ __forceinline__ MinMaxReductor()
        {
            mymin = numeric_limits<work_type>::max();
            mymax = -numeric_limits<work_type>::max();
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            minimum<work_type> minOp;
            maximum<work_type> maxOp;

            mymin = minOp(mymin, srcVal);
            mymax = maxOp(mymax, srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_type* result, int tid)
        {
            __shared__ work_type sminval[BLOCK_SIZE];
            __shared__ work_type smaxval[BLOCK_SIZE];

            minimum<work_type> minOp;
            maximum<work_type> maxOp;

            blockReduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), tie(mymin, mymax), tid, make_tuple(minOp, maxOp));

            if (tid == 0)
            {
                atomicMin(result, mymin);
                atomicMax(result + 1, mymax);
            }
        }
    };
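
    // Note: result[0] receives the global minimum and result[1] the global
    // maximum via atomicMin/atomicMax, so the caller is expected to seed them
    // with +max and -max (or otherwise neutral) values before the launch.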

    // glob_reduce

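    // Each block covers a PATCH_X x PATCH_Y arrangement of block-sized tiles:
    // every thread strides through up to PATCH_X * PATCH_Y pixels, folding the
    // masked values into a private Reductor before the block-wide reduction.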
    template <class Reductor, int BLOCK_SIZE, int PATCH_X, int PATCH_Y, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
    {
        const int x0 = blockIdx.x * blockDim.x * PATCH_X + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * PATCH_Y + threadIdx.y;

        Reductor reductor;

        for (int i = 0, y = y0; i < PATCH_Y && y < rows; ++i, y += blockDim.y)
        {
            for (int j = 0, x = x0; j < PATCH_X && x < cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    reductor.reduceVal(src(y, x));
                }
            }
        }

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        reductor.template reduceGrid<BLOCK_SIZE>(result, tid);
    }

    template <class Reductor, class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

        reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
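
    // The Policy parameter is a compile-time launch configuration. A minimal
    // sketch of the shape this wrapper assumes (the name ExamplePolicy and the
    // concrete numbers are illustrative only, not part of the library):
    //
    //     struct ExamplePolicy
    //     {
    //         enum {
    //             block_size_x = 32, block_size_y = 8,
    //             patch_size_x = 4,  patch_size_y = 4
    //         };
    //     };
    //
    // With these values each 32x8 block reduces a 128x32 pixel tile and the
    // launch grid is divUp(cols, 128) x divUp(rows, 32) blocks.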

    // callers

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        typedef typename VecTraits<ResType>::elem_type res_elem_type;

        reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
    }
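
    // The cast to res_elem_type* matches SumReductor::reduceGrid, which
    // accumulates channel-wise: AtomicUnroll adds cn consecutive elem-type
    // lanes rather than one vector value.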

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }
}

}}

#endif