/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_REDUCE_DETAIL_HPP__

#include "../../common.hpp"
#include "../../util/tuple.hpp"
#include "../../util/saturate_cast.hpp"
#include "../../util/atomic.hpp"
#include "../../util/vec_traits.hpp"
#include "../../util/type_traits.hpp"
#include "../../util/limits.hpp"
#include "../../block/reduce.hpp"
#include "../../functional/functional.hpp"
#include "../../ptr2d/traits.hpp"

namespace cv { namespace cudev {

namespace grid_reduce_detail
{
    // Unroll

    template <int cn> struct Unroll;

    template <> struct Unroll<1>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static volatile R* smem(R* ptr)
        {
            return ptr;
        }

        template <typename R>
        __device__ __forceinline__ static R& res(R& val)
        {
            return val;
        }

        template <class Op>
        __device__ __forceinline__ static const Op& op(const Op& aop)
        {
            return aop;
        }
    };

    template <> struct Unroll<2>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&, typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop);
        }
    };

    template <> struct Unroll<3>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE, ptr + 2 * BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y, val.z);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop, aop);
        }
    };

    template <> struct Unroll<4>
    {
        template <int BLOCK_SIZE, typename R>
        __device__ __forceinline__ static tuple<volatile R*, volatile R*, volatile R*, volatile R*> smem(R* ptr)
        {
            return smem_tuple(ptr, ptr + BLOCK_SIZE, ptr + 2 * BLOCK_SIZE, ptr + 3 * BLOCK_SIZE);
        }

        template <typename R>
        __device__ __forceinline__ static tuple<typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&,
                                                typename VecTraits<R>::elem_type&> res(R& val)
        {
            return tie(val.x, val.y, val.z, val.w);
        }

        template <class Op>
        __device__ __forceinline__ static tuple<Op, Op, Op, Op> op(const Op& aop)
        {
            return make_tuple(aop, aop, aop, aop);
        }
    };

    // AtomicUnroll
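    //
    // Merges a thread block's partial result into the global accumulator,
    // issuing one atomic operation per vector component.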

    template <typename R, int cn> struct AtomicUnroll;

    template <typename R> struct AtomicUnroll<R, 1>
    {
        __device__ __forceinline__ static void add(R* ptr, R val)
        {
            atomicAdd(ptr, val);
        }

        __device__ __forceinline__ static void min(R* ptr, R val)
        {
            atomicMin(ptr, val);
        }

        __device__ __forceinline__ static void max(R* ptr, R val)
        {
            atomicMax(ptr, val);
        }
    };

    template <typename R> struct AtomicUnroll<R, 2>
    {
        typedef typename MakeVec<R, 2>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
        }
    };

    template <typename R> struct AtomicUnroll<R, 3>
    {
        typedef typename MakeVec<R, 3>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
            atomicAdd(ptr + 2, val.z);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
            atomicMin(ptr + 2, val.z);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
            atomicMax(ptr + 2, val.z);
        }
    };

    template <typename R> struct AtomicUnroll<R, 4>
    {
        typedef typename MakeVec<R, 4>::type val_type;

        __device__ __forceinline__ static void add(R* ptr, val_type val)
        {
            atomicAdd(ptr, val.x);
            atomicAdd(ptr + 1, val.y);
            atomicAdd(ptr + 2, val.z);
            atomicAdd(ptr + 3, val.w);
        }

        __device__ __forceinline__ static void min(R* ptr, val_type val)
        {
            atomicMin(ptr, val.x);
            atomicMin(ptr + 1, val.y);
            atomicMin(ptr + 2, val.z);
            atomicMin(ptr + 3, val.w);
        }

        __device__ __forceinline__ static void max(R* ptr, val_type val)
        {
            atomicMax(ptr, val.x);
            atomicMax(ptr + 1, val.y);
            atomicMax(ptr + 2, val.z);
            atomicMax(ptr + 3, val.w);
        }
    };

    // SumReductor

    template <typename src_type, typename work_type> struct SumReductor
    {
        typedef typename VecTraits<work_type>::elem_type work_elem_type;
        enum { cn = VecTraits<src_type>::cn };

        work_type sum;

        __device__ __forceinline__ SumReductor()
        {
            sum = VecTraits<work_type>::all(0);
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            sum = sum + saturate_cast<work_type>(srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_elem_type* result, int tid)
        {
            __shared__ work_elem_type smem[BLOCK_SIZE * cn];

            blockReduce<BLOCK_SIZE>(Unroll<cn>::template smem<BLOCK_SIZE>(smem), Unroll<cn>::res(sum), tid, Unroll<cn>::op(plus<work_elem_type>()));

            if (tid == 0)
                AtomicUnroll<work_elem_type, cn>::add(result, sum);
        }
    };

    // MinMaxReductor
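    //
    // minop/maxop supply the initial value and the global atomic for a single
    // min or max reduction; the `both` tag selects the specialization that
    // tracks the minimum and maximum in one pass and stores them to result[0]
    // and result[1].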

    template <typename T> struct minop : minimum<T>
    {
        __device__ __forceinline__ static T initial()
        {
            return numeric_limits<T>::max();
        }

        __device__ __forceinline__ static void atomic(T* result, T myval)
        {
            atomicMin(result, myval);
        }
    };

    template <typename T> struct maxop : maximum<T>
    {
        __device__ __forceinline__ static T initial()
        {
            return -numeric_limits<T>::max();
        }

        __device__ __forceinline__ static void atomic(T* result, T myval)
        {
            atomicMax(result, myval);
        }
    };

    struct both
    {
    };

    template <class Op, typename src_type, typename work_type> struct MinMaxReductor
    {
        work_type myval;

        __device__ __forceinline__ MinMaxReductor()
        {
            myval = Op::initial();
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            Op op;

            myval = op(myval, srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_type* result, int tid)
        {
            __shared__ work_type smem[BLOCK_SIZE];

            Op op;

            blockReduce<BLOCK_SIZE>(smem, myval, tid, op);

            if (tid == 0)
                Op::atomic(result, myval);
        }
    };

    template <typename src_type, typename work_type> struct MinMaxReductor<both, src_type, work_type>
    {
        work_type mymin;
        work_type mymax;

        __device__ __forceinline__ MinMaxReductor()
        {
            mymin = numeric_limits<work_type>::max();
            mymax = -numeric_limits<work_type>::max();
        }

        __device__ __forceinline__ void reduceVal(typename TypeTraits<src_type>::parameter_type srcVal)
        {
            minimum<work_type> minOp;
            maximum<work_type> maxOp;

            mymin = minOp(mymin, srcVal);
            mymax = maxOp(mymax, srcVal);
        }

        template <int BLOCK_SIZE>
        __device__ void reduceGrid(work_type* result, int tid)
        {
            __shared__ work_type sminval[BLOCK_SIZE];
            __shared__ work_type smaxval[BLOCK_SIZE];

            minimum<work_type> minOp;
            maximum<work_type> maxOp;

            blockReduce<BLOCK_SIZE>(smem_tuple(sminval, smaxval), tie(mymin, mymax), tid, make_tuple(minOp, maxOp));

            if (tid == 0)
            {
                atomicMin(result, mymin);
                atomicMax(result + 1, mymax);
            }
        }
    };

    // glob_reduce
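    //
    // Each thread accumulates a PATCH_X x PATCH_Y patch of masked source values,
    // the block combines its threads' partial results in shared memory, and
    // thread 0 merges the block's result into the global output with the
    // reductor's atomic. The host overload derives the launch configuration
    // from the Policy and synchronizes when running on the default stream.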

    template <class Reductor, int BLOCK_SIZE, int PATCH_X, int PATCH_Y, class SrcPtr, typename ResType, class MaskPtr>
    __global__ void reduce(const SrcPtr src, ResType* result, const MaskPtr mask, const int rows, const int cols)
    {
        const int x0 = blockIdx.x * blockDim.x * PATCH_X + threadIdx.x;
        const int y0 = blockIdx.y * blockDim.y * PATCH_Y + threadIdx.y;

        Reductor reductor;

        for (int i = 0, y = y0; i < PATCH_Y && y < rows; ++i, y += blockDim.y)
        {
            for (int j = 0, x = x0; j < PATCH_X && x < cols; ++j, x += blockDim.x)
            {
                if (mask(y, x))
                {
                    reductor.reduceVal(src(y, x));
                }
            }
        }

        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        reductor.template reduceGrid<BLOCK_SIZE>(result, tid);
    }

    template <class Reductor, class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void reduce(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x * Policy::patch_size_x), divUp(rows, block.y * Policy::patch_size_y));

        reduce<Reductor, Policy::block_size_x * Policy::block_size_y, Policy::patch_size_x, Policy::patch_size_y><<<grid, block, 0, stream>>>(src, result, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    // callers

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void sum(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;
        typedef typename VecTraits<ResType>::elem_type res_elem_type;

        reduce<SumReductor<src_type, ResType>, Policy>(src, (res_elem_type*) result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<minop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void maxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<maxop<ResType>, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }

    template <class Policy, class SrcPtr, typename ResType, class MaskPtr>
    __host__ void minMaxVal(const SrcPtr& src, ResType* result, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        typedef typename PtrTraits<SrcPtr>::value_type src_type;

        reduce<MinMaxReductor<both, src_type, ResType>, Policy>(src, result, mask, rows, cols, stream);
    }
}

}}

#endif
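
// A minimal usage sketch (illustrative only, not part of the header). The
// detail-level callers above are normally reached through the cv::cudev grid
// reduction wrappers, which supply a launch Policy; any type exposing the four
// compile-time constants read by the host launcher can serve. The names below
// (MyPolicy, d_src, d_mask, d_result) are hypothetical:
//
//     struct MyPolicy
//     {
//         enum
//         {
//             block_size_x = 32,
//             block_size_y = 8,
//             patch_size_x = 4,
//             patch_size_y = 4
//         };
//     };
//
//     // d_src and d_mask are device-side 2D pointer objects (e.g. GlobPtr),
//     // d_result points to device memory zero-initialized before the call:
//     grid_reduce_detail::sum<MyPolicy>(d_src, d_result, d_mask, rows, cols, stream);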