/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#ifndef __OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP__

#include <thrust/tuple.h>
#include "../warp.hpp"
#include "../warp_shuffle.hpp"

//! @cond IGNORED

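// Block-wide key/value reduction machinery: keys are combined with a user
// comparator and the associated values are carried along with the winning
// keys. Single keys/values as well as thrust::tuple packs of up to ten
// keys/values (with a matching tuple of comparators) are supported.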
namespace cv { namespace cuda { namespace device
{
    namespace reduce_key_val_detail
    {
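        // GetType<T*> / GetType<volatile T*> / GetType<T&> strip the pointer,
        // volatile or reference qualifier to recover the underlying element type.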
        template <typename T> struct GetType;
        template <typename T> struct GetType<T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<volatile T*>
        {
            typedef T type;
        };
        template <typename T> struct GetType<T&>
        {
            typedef T type;
        };

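        // For<I, N> is a compile-time loop over the tuple elements in [I, N):
        // each member applies its element-wise operation to element I and then
        // recurses with I + 1.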
        template <unsigned int I, unsigned int N>
        struct For
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(smem)[tid] = thrust::get<I>(data);

                For<I + 1, N>::loadToSmem(smem, data, tid);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, unsigned int tid)
            {
                thrust::get<I>(data) = thrust::get<I>(smem)[tid];

                For<I + 1, N>::loadFromSmem(smem, data, tid);
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple& val, unsigned int delta, int width)
            {
                thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);

                For<I + 1, N>::copyShfl(val, delta, width);
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple& svals, const ReferenceTuple& val, unsigned int tid, unsigned int delta)
            {
                thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];

                For<I + 1, N>::copy(svals, val, tid, delta);
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple& key, const ValReferenceTuple& val, const CmpTuple& cmp, unsigned int delta, int width)
            {
                typename GetType<typename thrust::tuple_element<I, KeyReferenceTuple>::type>::type reg = shfl_down(thrust::get<I>(key), delta, width);

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(key) = reg;
                    thrust::get<I>(val) = shfl_down(thrust::get<I>(val), delta, width);
                }

                For<I + 1, N>::mergeShfl(key, val, cmp, delta, width);
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                         const ValPointerTuple& svals, const ValReferenceTuple& val,
                                         const CmpTuple& cmp,
                                         unsigned int tid, unsigned int delta)
            {
                typename GetType<typename thrust::tuple_element<I, KeyPointerTuple>::type>::type reg = thrust::get<I>(skeys)[tid + delta];

                if (thrust::get<I>(cmp)(reg, thrust::get<I>(key)))
                {
                    thrust::get<I>(skeys)[tid] = thrust::get<I>(key) = reg;
                    thrust::get<I>(svals)[tid] = thrust::get<I>(val) = thrust::get<I>(svals)[tid + delta];
                }

                For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
            }
        };
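        // Recursion terminator: every operation is a no-op once I == N.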
        template <unsigned int N>
        struct For<N, N>
        {
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadToSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void loadFromSmem(const PointerTuple&, const ReferenceTuple&, unsigned int)
            {
            }

            template <class ReferenceTuple>
            static __device__ void copyShfl(const ReferenceTuple&, unsigned int, int)
            {
            }
            template <class PointerTuple, class ReferenceTuple>
            static __device__ void copy(const PointerTuple&, const ReferenceTuple&, unsigned int, unsigned int)
            {
            }

            template <class KeyReferenceTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void mergeShfl(const KeyReferenceTuple&, const ValReferenceTuple&, const CmpTuple&, unsigned int, int)
            {
            }
            template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
            static __device__ void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                         const ValPointerTuple&, const ValReferenceTuple&,
                                         const CmpTuple&,
                                         unsigned int, unsigned int)
            {
            }
        };

        //////////////////////////////////////////////////////
        // loadToSmem

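        // The scalar overloads stage a single key/value in shared memory; the
        // thrust::tuple overloads fan the operation out over every tuple
        // component via For<0, tuple_size>.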
        template <typename T>
        __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, unsigned int tid)
        {
            smem[tid] = data;
        }
        template <typename T>
        __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, unsigned int tid)
        {
            data = smem[tid];
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadToSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                   const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                   unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void loadFromSmem(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                     const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                     unsigned int tid)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
        }

        //////////////////////////////////////////////////////
        // copyVals

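        // copyVals / copyValsShfl move the value(s) that travel with a winning
        // key, either through shared memory (from tid + delta) or via warp shuffle.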
        template <typename V>
        __device__ __forceinline__ void copyValsShfl(V& val, unsigned int delta, int width)
        {
            val = shfl_down(val, delta, width);
        }
        template <typename V>
        __device__ __forceinline__ void copyVals(volatile V* svals, V& val, unsigned int tid, unsigned int delta)
        {
            svals[tid] = val = svals[tid + delta];
        }
        template <typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyValsShfl(const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                     unsigned int delta,
                                                     int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9> >::value>::copyShfl(val, delta, width);
        }
        template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
        __device__ __forceinline__ void copyVals(const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                                 const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                 unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
        }

        //////////////////////////////////////////////////////
        // merge

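        // merge / mergeShfl compare the partner key (read from shared memory at
        // tid + delta, or shuffled down by delta lanes) against the local key
        // and, when the comparator prefers the partner, adopt both that key and
        // its associated value(s).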
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key, V& val, const Cmp& cmp, unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K, typename V, class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename K,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void mergeShfl(K& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const Cmp& cmp,
                                                  unsigned int delta, int width)
        {
            K reg = shfl_down(key, delta, width);

            if (cmp(reg, key))
            {
                key = reg;
                copyValsShfl(val, delta, width);
            }
        }
        template <typename K,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp>
        __device__ __forceinline__ void merge(volatile K* skeys, K& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const Cmp& cmp, unsigned int tid, unsigned int delta)
        {
            K reg = skeys[tid + delta];

            if (cmp(reg, key))
            {
                skeys[tid] = key = reg;
                copyVals(svals, val, tid, delta);
            }
        }
        template <typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void mergeShfl(const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                                  const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                                  const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                                  unsigned int delta, int width)
        {
            For<0, thrust::tuple_size<thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9> >::value>::mergeShfl(key, val, cmp, delta, width);
        }
        template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
                  typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
                  typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
                  typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
                  class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
        __device__ __forceinline__ void merge(const thrust::tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                              const thrust::tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                              const thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                              const thrust::tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                              const thrust::tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                              unsigned int tid, unsigned int delta)
        {
            For<0, thrust::tuple_size<thrust::tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
        }

        //////////////////////////////////////////////////////
        // Generic

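        // Generic<N>: tree reduction done entirely through shared memory. Works
        // for any block size N, synchronizing between the wide halving steps and
        // finishing without barriers once only the first warp is active.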
        template <unsigned int N> struct Generic
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);
                if (N >= 32)
                    __syncthreads();

                if (N >= 2048)
                {
                    if (tid < 1024)
                        merge(skeys, key, svals, val, cmp, tid, 1024);

                    __syncthreads();
                }
                if (N >= 1024)
                {
                    if (tid < 512)
                        merge(skeys, key, svals, val, cmp, tid, 512);

                    __syncthreads();
                }
                if (N >= 512)
                {
                    if (tid < 256)
                        merge(skeys, key, svals, val, cmp, tid, 256);

                    __syncthreads();
                }
                if (N >= 256)
                {
                    if (tid < 128)
                        merge(skeys, key, svals, val, cmp, tid, 128);

                    __syncthreads();
                }
                if (N >= 128)
                {
                    if (tid < 64)
                        merge(skeys, key, svals, val, cmp, tid, 64);

                    __syncthreads();
                }
                if (N >= 64)
                {
                    if (tid < 32)
                        merge(skeys, key, svals, val, cmp, tid, 32);
                }

                if (tid < 16)
                {
                    merge(skeys, key, svals, val, cmp, tid, 16);
                    merge(skeys, key, svals, val, cmp, tid, 8);
                    merge(skeys, key, svals, val, cmp, tid, 4);
                    merge(skeys, key, svals, val, cmp, tid, 2);
                    merge(skeys, key, svals, val, cmp, tid, 1);
                }
            }
        };

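        // Unroll<I, ...>: fully unrolled tail of the reduction, merging with
        // steps I, I/2, ..., 1; the Unroll<0, ...> specialization terminates
        // the recursion.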
        template <unsigned int I, class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll
        {
            static __device__ void loopShfl(KR key, VR val, Cmp cmp, unsigned int N)
            {
                mergeShfl(key, val, cmp, I, N);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            }
            static __device__ void loop(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                merge(skeys, key, svals, val, cmp, tid, I);
                Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            }
        };
        template <class KP, class KR, class VP, class VR, class Cmp>
        struct Unroll<0, KP, KR, VP, VR, Cmp>
        {
            static __device__ void loopShfl(KR, VR, Cmp, unsigned int)
            {
            }
            static __device__ void loop(KP, KR, VP, VR, unsigned int, Cmp)
            {
            }
        };

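        // WarpOptimized<N>: for power-of-two N <= 32 the whole reduction fits in
        // a single warp, so no __syncthreads() is needed between steps. The
        // warp-shuffle path is currently compiled out in favour of shared memory.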
        template <unsigned int N> struct WarpOptimized
        {
            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
            #if 0 // __CUDA_ARCH__ >= 300
                (void) skeys;
                (void) svals;
                (void) tid;

                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, N);
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (tid < N / 2)
                    Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            #endif
            }
        };

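        // GenericOptimized32<N>: for power-of-two N <= 1024, reduce within each
        // warp first, write one partial result per warp to shared memory, then
        // let the first warp reduce the remaining M = N / 32 partials.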
        template <unsigned int N> struct GenericOptimized32
        {
            enum { M = N / 32 };

            template <class KP, class KR, class VP, class VR, class Cmp>
            static __device__ void reduce(KP skeys, KR key, VP svals, VR val, unsigned int tid, Cmp cmp)
            {
                const unsigned int laneId = Warp::laneId();

            #if 0 // __CUDA_ARCH__ >= 300
                Unroll<16, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, warpSize);

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #else
                loadToSmem(skeys, key, tid);
                loadToSmem(svals, val, tid);

                if (laneId < 16)
                    Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

                __syncthreads();

                if (laneId == 0)
                {
                    loadToSmem(skeys, key, tid / 32);
                    loadToSmem(svals, val, tid / 32);
                }
            #endif

                __syncthreads();

                loadFromSmem(skeys, key, tid);

                if (tid < 32)
                {
                #if 0 // __CUDA_ARCH__ >= 300
                    loadFromSmem(svals, val, tid);

                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loopShfl(key, val, cmp, M);
                #else
                    Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
                #endif
                }
            }
        };

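        // StaticIf / IsPowerOf2 / Dispatcher pick the reductor for a given block
        // size N at compile time: WarpOptimized for power-of-two N <= 32,
        // GenericOptimized32 for power-of-two N <= 1024, Generic otherwise.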
        template <bool val, class T1, class T2> struct StaticIf;
        template <class T1, class T2> struct StaticIf<true, T1, T2>
        {
            typedef T1 type;
        };
        template <class T1, class T2> struct StaticIf<false, T1, T2>
        {
            typedef T2 type;
        };

        template <unsigned int N> struct IsPowerOf2
        {
            enum { value = ((N != 0) && !(N & (N - 1))) };
        };

        template <unsigned int N> struct Dispatcher
        {
            typedef typename StaticIf<
                (N <= 32) && IsPowerOf2<N>::value,
                WarpOptimized<N>,
                typename StaticIf<
                    (N <= 1024) && IsPowerOf2<N>::value,
                    GenericOptimized32<N>,
                    Generic<N>
                >::type
            >::type reductor;
        };
    }
}}}
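
// Usage sketch (illustrative only, not part of the library): a minimal kernel
// that computes a per-block argmin with this header, assuming a power-of-two
// block size. The kernel name blockArgMin and the LessF functor are local to
// this example; the explicit reference template arguments are needed so the
// reductor can write the result back into key and val.
//
//   struct LessF
//   {
//       __device__ bool operator()(float a, float b) const { return a < b; }
//   };
//
//   template <unsigned int BLOCK_SIZE>
//   __global__ void blockArgMin(const float* data, float* blockMin, int* blockArg)
//   {
//       __shared__ float skeys[BLOCK_SIZE];   // shared staging area for keys
//       __shared__ int   svals[BLOCK_SIZE];   // shared staging area for values
//
//       const unsigned int tid = threadIdx.x;
//       const int idx = blockIdx.x * BLOCK_SIZE + tid;
//
//       float key = data[idx];                // per-thread key: the element
//       int   val = idx;                      // per-thread value: its index
//
//       cv::cuda::device::reduce_key_val_detail::Dispatcher<BLOCK_SIZE>::reductor
//           ::template reduce<volatile float*, float&, volatile int*, int&, const LessF&>(
//               skeys, key, svals, val, tid, LessF());
//
//       if (tid == 0)
//       {
//           blockMin[blockIdx.x] = key;       // smallest key seen by the block
//           blockArg[blockIdx.x] = val;       // index that produced it
//       }
//   }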

//! @endcond

#endif // __OPENCV_CUDA_PRED_VAL_REDUCE_DETAIL_HPP__