// OpenCV cudev detail header: block-wide reduce-by-key implementation.
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                          License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
     16 // Third party copyrights are property of their respective owners.
     17 //
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 #pragma once
     45 
     46 #ifndef __OPENCV_CUDEV_BLOCK_REDUCE_KEY_VAL_DETAIL_HPP__
     47 #define __OPENCV_CUDEV_BLOCK_REDUCE_KEY_VAL_DETAIL_HPP__
     48 
     49 #include "../../common.hpp"
     50 #include "../../util/tuple.hpp"
     51 #include "../../util/type_traits.hpp"
     52 #include "../../warp/warp.hpp"
     53 
     54 namespace cv { namespace cudev {
     55 
     56 namespace block_reduce_key_val_detail
     57 {
     58     // GetType
     59 
     60     template <typename T> struct GetType;
     61 
     62     template <typename T> struct GetType<T*>
     63     {
     64         typedef T type;
     65     };
     66 
     67     template <typename T> struct GetType<volatile T*>
     68     {
     69         typedef T type;
     70     };
     71 
     72     template <typename T> struct GetType<T&>
     73     {
     74         typedef T type;
     75     };
     76 
     77     // For
     78 
     79     template <int I, int N> struct For
     80     {
     81         template <class PointerTuple, class ReferenceTuple>
     82         __device__ static void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, uint tid)
     83         {
     84             get<I>(smem)[tid] = get<I>(data);
     85 
     86             For<I + 1, N>::loadToSmem(smem, data, tid);
     87         }
     88 
     89         template <class PointerTuple, class ReferenceTuple>
     90         __device__ static void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, uint tid)
     91         {
     92             get<I>(data) = get<I>(smem)[tid];
     93 
     94             For<I + 1, N>::loadFromSmem(smem, data, tid);
     95         }
     96 
     97         template <class PointerTuple, class ReferenceTuple>
     98         __device__ static void copy(const PointerTuple& svals, const ReferenceTuple& val, uint tid, uint delta)
     99         {
    100             get<I>(svals)[tid] = get<I>(val) = get<I>(svals)[tid + delta];
    101 
    102             For<I + 1, N>::copy(svals, val, tid, delta);
    103         }
    104 
    105         template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
    106         __device__ static void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
    107                                      const ValPointerTuple& svals, const ValReferenceTuple& val,
    108                                      const CmpTuple& cmp,
    109                                      uint tid, uint delta)
    110         {
    111             typename GetType<typename tuple_element<I, KeyPointerTuple>::type>::type reg = get<I>(skeys)[tid + delta];
    112 
    113             if (get<I>(cmp)(reg, get<I>(key)))
    114             {
    115                 get<I>(skeys)[tid] = get<I>(key) = reg;
    116                 get<I>(svals)[tid] = get<I>(val) = get<I>(svals)[tid + delta];
    117             }
    118 
    119             For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
    120         }
    121     };
    122 
    // Recursion terminator (I == N): every tuple element has been processed,
    // so all operations are no-ops.
    template <int N> struct For<N, N>
    {
        template <class SmemTuple, class DataTuple>
        __device__ static void loadToSmem(const SmemTuple&, const DataTuple&, uint)
        {
        }

        template <class SmemTuple, class DataTuple>
        __device__ static void loadFromSmem(const SmemTuple&, const DataTuple&, uint)
        {
        }

        template <class SmemTuple, class DataTuple>
        __device__ static void copy(const SmemTuple&, const DataTuple&, uint, uint)
        {
        }

        template <class KeySmemTuple, class KeyDataTuple, class ValSmemTuple, class ValDataTuple, class CmpTuple>
        __device__ static void merge(const KeySmemTuple&, const KeyDataTuple&,
                                     const ValSmemTuple&, const ValDataTuple&,
                                     const CmpTuple&,
                                     uint, uint)
        {
        }
    };
    148 
    149     // loadToSmem / loadFromSmem
    150 
    // Store one scalar value into this thread's shared-memory slot.
    template <typename T>
    __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, uint tid)
    {
        smem[tid] = data;
    }
    156 
    // Read one scalar value back from this thread's shared-memory slot.
    template <typename T>
    __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, uint tid)
    {
        data = smem[tid];
    }
    162 
    // Tuple overload: stores every element of the register tuple into the
    // matching shared-memory array via the compile-time For<> loop.
    template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
    __device__ __forceinline__ void loadToSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                               const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                               uint tid)
    {
        typedef tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> smem_tuple_type;
        For<0, tuple_size<smem_tuple_type>::value>::loadToSmem(smem, data, tid);
    }
    171 
    // Tuple overload: reads every shared-memory element back into the
    // register tuple via the compile-time For<> loop.
    template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
    __device__ __forceinline__ void loadFromSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                 const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                 uint tid)
    {
        typedef tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> smem_tuple_type;
        For<0, tuple_size<smem_tuple_type>::value>::loadFromSmem(smem, data, tid);
    }
    180 
    181     // copyVals
    182 
    // copyVals
    //
    // Adopt the partner thread's value: copy slot tid + delta into both
    // slot tid and the register copy.
    template <typename V>
    __device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta)
    {
        svals[tid] = val = svals[tid + delta];
    }
    188 
    // Tuple overload: adopts the partner thread's value for every tuple
    // element via the compile-time For<> loop.
    template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
    __device__ __forceinline__ void copyVals(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                             const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             uint tid, uint delta)
    {
        typedef tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> smem_tuple_type;
        For<0, tuple_size<smem_tuple_type>::value>::copy(svals, val, tid, delta);
    }
    197 
    198     // merge
    199 
    // merge
    //
    // One tree-reduction step: compare this thread's key with its partner's
    // key at distance delta; if the partner wins under cmp, adopt both its
    // key and its value.
    template <typename K, typename V, class Cmp>
    __device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta)
    {
        K candidate = skeys[tid + delta];

        // Keep the current winner unless the partner's key beats it.
        if (!cmp(candidate, key))
            return;

        skeys[tid] = key = candidate;
        copyVals(svals, val, tid, delta);
    }
    211 
    // Scalar-key / tuple-value overload: the single key decides the winner,
    // and the whole value tuple follows it.
    template <typename K,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp>
    __device__ void merge(volatile K* skeys, K& key,
                          const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                          const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                          const Cmp& cmp, uint tid, uint delta)
    {
        K candidate = skeys[tid + delta];

        // Keep the current winner unless the partner's key beats it.
        if (!cmp(candidate, key))
            return;

        skeys[tid] = key = candidate;
        copyVals(svals, val, tid, delta);
    }
    229 
    // All-tuple overload: keys, values, and comparators are tuples; each key
    // element is merged independently with its own comparator (see For::merge).
    template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
              typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
    __device__ __forceinline__ void merge(const tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                          const tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                          const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                          const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                          uint tid, uint delta)
    {
        typedef tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> val_smem_tuple_type;
        For<0, tuple_size<val_smem_tuple_type>::value>::merge(skeys, key, svals, val, cmp, tid, delta);
    }
    244 
    245     // Generic
    246 
    // Fallback tree reduction for an arbitrary block size N.
    // skeys/svals point to shared-memory buffers with at least N slots
    // (scalars or tuples of pointers), key/val are the calling thread's
    // register copies, and cmp(a, b) == true means a should replace b
    // (e.g. 'less' keeps the smallest key). The winner ends up in slot 0.
    template <int N> struct Generic
    {
        template <class KP, class KR, class VP, class VR, class Cmp>
        __device__ static void reduce(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            loadToSmem(skeys, key, tid);
            // Fix: the original called loadValsToSmem(), which is not declared
            // anywhere in this header (a leftover from the older core/cuda
            // implementation). Values are loaded through the unified
            // loadToSmem() overloads, exactly as WarpOptimized and
            // GenericOptimized32 do.
            loadToSmem(svals, val, tid);
            if (N >= 32)
                __syncthreads();

            // Halve the number of active threads each step; a barrier is
            // required while more than one warp participates.
            if (N >= 2048)
            {
                if (tid < 1024)
                    merge(skeys, key, svals, val, cmp, tid, 1024);

                __syncthreads();
            }
            if (N >= 1024)
            {
                if (tid < 512)
                    merge(skeys, key, svals, val, cmp, tid, 512);

                __syncthreads();
            }
            if (N >= 512)
            {
                if (tid < 256)
                    merge(skeys, key, svals, val, cmp, tid, 256);

                __syncthreads();
            }
            if (N >= 256)
            {
                if (tid < 128)
                    merge(skeys, key, svals, val, cmp, tid, 128);

                __syncthreads();
            }
            if (N >= 128)
            {
                if (tid < 64)
                    merge(skeys, key, svals, val, cmp, tid, 64);

                __syncthreads();
            }
            if (N >= 64)
            {
                if (tid < 32)
                    merge(skeys, key, svals, val, cmp, tid, 32);
            }

            // Warp-synchronous tail: no barriers, relies on volatile
            // shared-memory accesses and implicit intra-warp synchrony.
            // NOTE(review): pre-Volta idiom — independent thread scheduling
            // (SM70+) would need __syncwarp() between steps; confirm the
            // target architectures this header is built for.
            if (tid < 16)
            {
                merge(skeys, key, svals, val, cmp, tid, 16);
                merge(skeys, key, svals, val, cmp, tid, 8);
                merge(skeys, key, svals, val, cmp, tid, 4);
                merge(skeys, key, svals, val, cmp, tid, 2);
                merge(skeys, key, svals, val, cmp, tid, 1);
            }
        }
    };
    308 
    309     // Unroll
    310 
    // Fully unrolled merge chain: merge partners at distance I, then I/2,
    // and so on down to 1. Barrier-free — relies on volatile shared memory
    // within a single warp; Unroll<0> terminates the recursion.
    template <int I, class KP, class KR, class VP, class VR, class Cmp> struct Unroll
    {
        __device__ static void loop(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            merge(skeys, key, svals, val, cmp, tid, I);
            Unroll<(I >> 1), KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
        }
    };
    319 
    // Recursion terminator: distance has reached 0, nothing left to merge.
    template <class KP, class KR, class VP, class VR, class Cmp> struct Unroll<0, KP, KR, VP, VR, Cmp>
    {
        __device__ __forceinline__ static void loop(KP, KR, VP, VR, uint, Cmp)
        {
        }
    };
    326 
    327     // WarpOptimized
    328 
    // Reduction for blocks that fit in a single warp (N <= 32, power of two):
    // load everything to shared memory, then run the fully unrolled merge
    // chain on the lower half of the threads. Barrier-free by construction.
    template <int N> struct WarpOptimized
    {
        template <class KP, class KR, class VP, class VR, class Cmp>
        __device__ static void reduce(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            // Keys and values go to separate buffers, so the store order
            // does not matter.
            loadToSmem(svals, val, tid);
            loadToSmem(skeys, key, tid);

            if (tid < N / 2)
                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
        }
    };
    341 
    342     // GenericOptimized32
    343 
    // Two-stage reduction for power-of-two block sizes up to 1024:
    // first each warp reduces its own 32 entries, then the M = N/32 warp
    // winners are compacted to slots [0, M) and reduced by the first warp.
    template <uint N> struct GenericOptimized32
    {
        enum { M = N / 32 };  // number of warps in the block

        template <class KP, class KR, class VP, class VR, class Cmp>
        __device__ static void reduce(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            const uint laneId = Warp::laneId();

            loadToSmem(skeys, key, tid);
            loadToSmem(svals, val, tid);

            // Stage 1: warp-local reduction; each warp's winner ends up in
            // the registers of its lane 0.
            if (laneId < 16)
                Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

            __syncthreads();

            // Compact the per-warp winners into slots [0, M).
            if (laneId == 0)
            {
                loadToSmem(skeys, key, tid / 32);
                loadToSmem(svals, val, tid / 32);
            }

            __syncthreads();

            loadFromSmem(skeys, key, tid);
            // Fix: also reload the value register. The original reloaded only
            // the key, so when a thread's reloaded key was already the winner
            // (no merge fires for it), its val register kept the pre-reduction
            // value and was mispaired with the reloaded key.
            loadFromSmem(svals, val, tid);

            // Stage 2: first warp reduces the M compacted winners.
            if (tid < 32)
            {
                Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            }
        }
    };
    377 
    // Selects the reduction strategy for a block of N threads:
    //  - WarpOptimized<N>      : N is a power of two fitting in one warp (N <= 32);
    //  - GenericOptimized32<N> : N is a power of two up to 1024
    //                            (warp-local pass, then cross-warp pass);
    //  - Generic<N>            : fallback tree reduction for any other N.
    template <int N> struct Dispatcher
    {
        typedef typename SelectIf<
            (N <= 32) && IsPowerOf2<N>::value,
            WarpOptimized<N>,
            typename SelectIf<
                (N <= 1024) && IsPowerOf2<N>::value,
                GenericOptimized32<N>,
                Generic<N>
            >::type
        >::type reductor;
    };
    390 }
    391 
    392 }}
    393 
    394 #endif
    395