/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/
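
// Implementation details for the cudev block-wide key/value reductions: every
// thread contributes a key (or tuple of keys) with an associated value (or
// tuple of values), and the pair whose key the caller's comparator prefers is
// left in shared-memory slot 0 and in thread 0's registers.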

#pragma once

#ifndef __OPENCV_CUDEV_BLOCK_REDUCE_KEY_VAL_DETAIL_HPP__
#define __OPENCV_CUDEV_BLOCK_REDUCE_KEY_VAL_DETAIL_HPP__

#include "../../common.hpp"
#include "../../util/tuple.hpp"
#include "../../util/type_traits.hpp"
#include "../../warp/warp.hpp"

namespace cv { namespace cudev {

namespace block_reduce_key_val_detail
{
    // GetType

    template <typename T> struct GetType;

    template <typename T> struct GetType<T*>
    {
        typedef T type;
    };

    template <typename T> struct GetType<volatile T*>
    {
        typedef T type;
    };

    template <typename T> struct GetType<T&>
    {
        typedef T type;
    };

    // For

    template <int I, int N> struct For
    {
        template <class PointerTuple, class ReferenceTuple>
        __device__ static void loadToSmem(const PointerTuple& smem, const ReferenceTuple& data, uint tid)
        {
            get<I>(smem)[tid] = get<I>(data);

            For<I + 1, N>::loadToSmem(smem, data, tid);
        }

        template <class PointerTuple, class ReferenceTuple>
        __device__ static void loadFromSmem(const PointerTuple& smem, const ReferenceTuple& data, uint tid)
        {
            get<I>(data) = get<I>(smem)[tid];

            For<I + 1, N>::loadFromSmem(smem, data, tid);
        }

        template <class PointerTuple, class ReferenceTuple>
        __device__ static void copy(const PointerTuple& svals, const ReferenceTuple& val, uint tid, uint delta)
        {
            get<I>(svals)[tid] = get<I>(val) = get<I>(svals)[tid + delta];

            For<I + 1, N>::copy(svals, val, tid, delta);
        }

        template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
        __device__ static void merge(const KeyPointerTuple& skeys, const KeyReferenceTuple& key,
                                     const ValPointerTuple& svals, const ValReferenceTuple& val,
                                     const CmpTuple& cmp,
                                     uint tid, uint delta)
        {
            typename GetType<typename tuple_element<I, KeyPointerTuple>::type>::type reg = get<I>(skeys)[tid + delta];

            if (get<I>(cmp)(reg, get<I>(key)))
            {
                get<I>(skeys)[tid] = get<I>(key) = reg;
                get<I>(svals)[tid] = get<I>(val) = get<I>(svals)[tid + delta];
            }

            For<I + 1, N>::merge(skeys, key, svals, val, cmp, tid, delta);
        }
    };

    template <int N> struct For<N, N>
    {
        template <class PointerTuple, class ReferenceTuple>
        __device__ static void loadToSmem(const PointerTuple&, const ReferenceTuple&, uint)
        {
        }

        template <class PointerTuple, class ReferenceTuple>
        __device__ static void loadFromSmem(const PointerTuple&, const ReferenceTuple&, uint)
        {
        }

        template <class PointerTuple, class ReferenceTuple>
        __device__ static void copy(const PointerTuple&, const ReferenceTuple&, uint, uint)
        {
        }

        template <class KeyPointerTuple, class KeyReferenceTuple, class ValPointerTuple, class ValReferenceTuple, class CmpTuple>
        __device__ static void merge(const KeyPointerTuple&, const KeyReferenceTuple&,
                                     const ValPointerTuple&, const ValReferenceTuple&,
                                     const CmpTuple&,
                                     uint, uint)
        {
        }
    };

    // loadToSmem / loadFromSmem
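    //
    // The scalar overloads move a single key or value between a register and
    // the thread's shared-memory slot; the tuple overloads below fan the same
    // operation out across every tuple element via the For<> recursion above.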

    template <typename T>
    __device__ __forceinline__ void loadToSmem(volatile T* smem, T& data, uint tid)
    {
        smem[tid] = data;
    }

    template <typename T>
    __device__ __forceinline__ void loadFromSmem(volatile T* smem, T& data, uint tid)
    {
        data = smem[tid];
    }

    template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
    __device__ __forceinline__ void loadToSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                               const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                               uint tid)
    {
        For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadToSmem(smem, data, tid);
    }

    template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
    __device__ __forceinline__ void loadFromSmem(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& smem,
                                                 const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& data,
                                                 uint tid)
    {
        For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::loadFromSmem(smem, data, tid);
    }

    // copyVals

    template <typename V>
    __device__ __forceinline__ void copyVals(volatile V* svals, V& val, uint tid, uint delta)
    {
        svals[tid] = val = svals[tid + delta];
    }

    template <typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9>
    __device__ __forceinline__ void copyVals(const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                             const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                             uint tid, uint delta)
    {
        For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::copy(svals, val, tid, delta);
    }

    // merge
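    //
    // One step of the tree reduction: read the key held delta slots away,
    // compare it against this thread's key with the caller's comparator, and
    // adopt both that key and its associated value(s) if the neighbour wins.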

    template <typename K, typename V, class Cmp>
    __device__ void merge(volatile K* skeys, K& key, volatile V* svals, V& val, const Cmp& cmp, uint tid, uint delta)
    {
        K reg = skeys[tid + delta];

        if (cmp(reg, key))
        {
            skeys[tid] = key = reg;
            copyVals(svals, val, tid, delta);
        }
    }

    template <typename K,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp>
    __device__ void merge(volatile K* skeys, K& key,
                          const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                          const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                          const Cmp& cmp, uint tid, uint delta)
    {
        K reg = skeys[tid + delta];

        if (cmp(reg, key))
        {
            skeys[tid] = key = reg;
            copyVals(svals, val, tid, delta);
        }
    }

    template <typename KP0, typename KP1, typename KP2, typename KP3, typename KP4, typename KP5, typename KP6, typename KP7, typename KP8, typename KP9,
              typename KR0, typename KR1, typename KR2, typename KR3, typename KR4, typename KR5, typename KR6, typename KR7, typename KR8, typename KR9,
              typename VP0, typename VP1, typename VP2, typename VP3, typename VP4, typename VP5, typename VP6, typename VP7, typename VP8, typename VP9,
              typename VR0, typename VR1, typename VR2, typename VR3, typename VR4, typename VR5, typename VR6, typename VR7, typename VR8, typename VR9,
              class Cmp0, class Cmp1, class Cmp2, class Cmp3, class Cmp4, class Cmp5, class Cmp6, class Cmp7, class Cmp8, class Cmp9>
    __device__ __forceinline__ void merge(const tuple<KP0, KP1, KP2, KP3, KP4, KP5, KP6, KP7, KP8, KP9>& skeys,
                                          const tuple<KR0, KR1, KR2, KR3, KR4, KR5, KR6, KR7, KR8, KR9>& key,
                                          const tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9>& svals,
                                          const tuple<VR0, VR1, VR2, VR3, VR4, VR5, VR6, VR7, VR8, VR9>& val,
                                          const tuple<Cmp0, Cmp1, Cmp2, Cmp3, Cmp4, Cmp5, Cmp6, Cmp7, Cmp8, Cmp9>& cmp,
                                          uint tid, uint delta)
    {
        For<0, tuple_size<tuple<VP0, VP1, VP2, VP3, VP4, VP5, VP6, VP7, VP8, VP9> >::value>::merge(skeys, key, svals, val, cmp, tid, delta);
    }

    // Generic
    //
    // Tree reduction for arbitrary block sizes: halve the number of active
    // threads at each step until a single warp remains, then finish without
    // explicit synchronization inside that warp.

    template <int N> struct Generic
    {
        template <class KP, class KR, class VP, class VR, class Cmp>
        __device__ static void reduce(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            loadToSmem(skeys, key, tid);
            loadToSmem(svals, val, tid); // the loadToSmem overloads handle scalar and tuple values alike

            if (N >= 32)
                __syncthreads();

            if (N >= 2048)
            {
                if (tid < 1024)
                    merge(skeys, key, svals, val, cmp, tid, 1024);

                __syncthreads();
            }
            if (N >= 1024)
            {
                if (tid < 512)
                    merge(skeys, key, svals, val, cmp, tid, 512);

                __syncthreads();
            }
            if (N >= 512)
            {
                if (tid < 256)
                    merge(skeys, key, svals, val, cmp, tid, 256);

                __syncthreads();
            }
            if (N >= 256)
            {
                if (tid < 128)
                    merge(skeys, key, svals, val, cmp, tid, 128);

                __syncthreads();
            }
            if (N >= 128)
            {
                if (tid < 64)
                    merge(skeys, key, svals, val, cmp, tid, 64);

                __syncthreads();
            }
            if (N >= 64)
            {
                if (tid < 32)
                    merge(skeys, key, svals, val, cmp, tid, 32);
            }

            if (tid < 16)
            {
                merge(skeys, key, svals, val, cmp, tid, 16);
                merge(skeys, key, svals, val, cmp, tid, 8);
                merge(skeys, key, svals, val, cmp, tid, 4);
                merge(skeys, key, svals, val, cmp, tid, 2);
                merge(skeys, key, svals, val, cmp, tid, 1);
            }
        }
    };

    // Unroll
    //
    // Fully unrolled merge chain with strides I, I/2, ..., 1.

    template <int I, class KP, class KR, class VP, class VR, class Cmp> struct Unroll
    {
        __device__ static void loop(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            merge(skeys, key, svals, val, cmp, tid, I);
            Unroll<I / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
        }
    };

    template <class KP, class KR, class VP, class VR, class Cmp> struct Unroll<0, KP, KR, VP, VR, Cmp>
    {
        __device__ __forceinline__ static void loop(KP, KR, VP, VR, uint, Cmp)
        {
        }
    };

    // WarpOptimized
    //
    // For power-of-two block sizes that fit in one warp the whole reduction is
    // a single unrolled merge chain, with no __syncthreads() needed.

    template <int N> struct WarpOptimized
    {
        template <class KP, class KR, class VP, class VR, class Cmp>
        __device__ static void reduce(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            loadToSmem(skeys, key, tid);
            loadToSmem(svals, val, tid);

            if (tid < N / 2)
                Unroll<N / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
        }
    };

    // GenericOptimized32
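    //
    // Two-stage scheme for power-of-two block sizes up to 1024: each warp first
    // reduces its own 32 candidates in-warp, lane 0 of every warp parks the warp
    // winner in the first M shared-memory slots, and a final warp reduces those
    // M winners down to slot 0.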

    template <uint N> struct GenericOptimized32
    {
        enum { M = N / 32 };

        template <class KP, class KR, class VP, class VR, class Cmp>
        __device__ static void reduce(KP skeys, KR key, VP svals, VR val, uint tid, Cmp cmp)
        {
            const uint laneId = Warp::laneId();

            loadToSmem(skeys, key, tid);
            loadToSmem(svals, val, tid);

            if (laneId < 16)
                Unroll<16, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);

            __syncthreads();

            if (laneId == 0)
            {
                loadToSmem(skeys, key, tid / 32);
                loadToSmem(svals, val, tid / 32);
            }

            __syncthreads();

            loadFromSmem(skeys, key, tid);

            if (tid < 32)
            {
                // Reload the value as well, so the (key, val) pair in registers
                // stays consistent for the final merge chain.
                loadFromSmem(svals, val, tid);

                Unroll<M / 2, KP, KR, VP, VR, Cmp>::loop(skeys, key, svals, val, tid, cmp);
            }
        }
    };

    // Dispatcher
    //
    // Selects the reduction strategy from the block size N: the fully unrolled
    // in-warp scheme for power-of-two sizes up to 32, the warp-then-block
    // scheme for power-of-two sizes up to 1024, and the generic tree otherwise.

    template <int N> struct Dispatcher
    {
        typedef typename SelectIf<
            (N <= 32) && IsPowerOf2<N>::value,
            WarpOptimized<N>,
            typename SelectIf<
                (N <= 1024) && IsPowerOf2<N>::value,
                GenericOptimized32<N>,
                Generic<N>
            >::type
        >::type reductor;
    };
}

}}

#endif