1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved. 16 // Third party copyrights are property of their respective owners. 17 // 18 // Redistribution and use in source and binary forms, with or without modification, 19 // are permitted provided that the following conditions are met: 20 // 21 // * Redistribution's of source code must retain the above copyright notice, 22 // this list of conditions and the following disclaimer. 23 // 24 // * Redistribution's in binary form must reproduce the above copyright notice, 25 // this list of conditions and the following disclaimer in the documentation 26 // and/or other materials provided with the distribution. 27 // 28 // * The name of the copyright holders may not be used to endorse or promote products 29 // derived from this software without specific prior written permission. 30 // 31 // This software is provided by the copyright holders and contributors "as is" and 32 // any express or implied warranties, including, but not limited to, the implied 33 // warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef __OPENCV_CUDEV_GRID_TRANSFORM_DETAIL_HPP__
#define __OPENCV_CUDEV_GRID_TRANSFORM_DETAIL_HPP__

#include "../../common.hpp"
#include "../../util/tuple.hpp"
#include "../../util/saturate_cast.hpp"
#include "../../util/vec_traits.hpp"
#include "../../ptr2d/glob.hpp"
#include "../../ptr2d/traits.hpp"

namespace cv { namespace cudev {

namespace grid_transform_detail
{
    // OpUnroller
    //
    // Applies a unary or binary element operation to each component
    // (.x/.y/.z/.w) of a vectorized value, testing the per-pixel mask
    // separately for every component. <cn> is the number of packed elements
    // (the vectorization factor, i.e. SHIFT in transformSmart). Components
    // whose mask test fails are left untouched in dst — the caller is expected
    // to have pre-loaded dst with its current contents so the subsequent
    // vector store does not clobber masked-out pixels.

    template <int cn> struct OpUnroller;

    template <> struct OpUnroller<1>
    {
        // Unary case: dst.x = op(src.x), gated by mask at (y, x_shifted).
        template <typename T, typename D, class UnOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
        }

        // Binary case: dst.x = op(src1.x, src2.x), gated by mask.
        template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
        }
    };

    template <> struct OpUnroller<2>
    {
        // Unary case: components x/y map to pixels x_shifted .. x_shifted+1.
        template <typename T, typename D, class UnOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
            if (mask(y, x_shifted + 1))
                dst.y = op(src.y);
        }

        // Binary case.
        template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
            if (mask(y, x_shifted + 1))
                dst.y = op(src1.y, src2.y);
        }
    };

    template <> struct OpUnroller<3>
    {
        // Unary case: components x/y/z map to pixels x_shifted .. x_shifted+2.
        template <typename T, typename D, class UnOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
            if (mask(y, x_shifted + 1))
                dst.y = op(src.y);
            if (mask(y, x_shifted + 2))
                dst.z = op(src.z);
        }

        // Binary case.
        template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
            if (mask(y, x_shifted + 1))
                dst.y = op(src1.y, src2.y);
            if (mask(y, x_shifted + 2))
                dst.z = op(src1.z, src2.z);
        }
    };

    template <> struct OpUnroller<4>
    {
        // Unary case: components x/y/z/w map to pixels x_shifted .. x_shifted+3.
        template <typename T, typename D, class UnOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T& src, D& dst, const UnOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src.x);
            if (mask(y, x_shifted + 1))
                dst.y = op(src.y);
            if (mask(y, x_shifted + 2))
                dst.z = op(src.z);
            if (mask(y, x_shifted + 3))
                dst.w = op(src.w);
        }

        // Binary case.
        template <typename T1, typename T2, typename D, class BinOp, class MaskPtr>
        __device__ __forceinline__ static void unroll(const T1& src1, const T2& src2, D& dst, const BinOp& op, const MaskPtr& mask, int x_shifted, int y)
        {
            if (mask(y, x_shifted))
                dst.x = op(src1.x, src2.x);
            if (mask(y, x_shifted + 1))
                dst.y = op(src1.y, src2.y);
            if (mask(y, x_shifted + 2))
                dst.z = op(src1.z, src2.z);
            if (mask(y, x_shifted + 3))
                dst.w = op(src1.w, src2.w);
        }
    };

    // transformSimple
    //
    // Naive per-pixel kernels: one thread per output pixel on a 2D grid
    // covering cols x rows. Out-of-range threads and masked-out pixels return
    // early; results are saturate_cast'ed to the destination element type.

    // Unary variant: dst(y, x) = op(src(y, x)).
    template <class SrcPtr, typename DstType, class UnOp, class MaskPtr>
    __global__ void transformSimple(const SrcPtr src, GlobPtr<DstType> dst, const UnOp op, const MaskPtr mask, const int rows, const int cols)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        // Bounds guard first, then mask — masked pixels keep their old value.
        if (x >= cols || y >= rows || !mask(y, x))
            return;

        dst(y, x) = saturate_cast<DstType>(op(src(y, x)));
    }

    // Binary variant: dst(y, x) = op(src1(y, x), src2(y, x)).
    template <class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
    __global__ void transformSimple(const SrcPtr1 src1, const SrcPtr2 src2, GlobPtr<DstType> dst, const BinOp op, const MaskPtr mask, const int rows, const int cols)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= cols || y >= rows || !mask(y, x))
            return;

        dst(y, x) = saturate_cast<DstType>(op(src1(y, x), src2(y, x)));
    }

    // transformSmart
    //
    // Vectorized kernels: each thread handles SHIFT consecutive pixels of one
    // row via a single vector load/store (MakeVec<T, SHIFT>::type). The
    // destination vector is read before OpUnroller runs so that components
    // rejected by the mask keep their previous values when the whole vector is
    // stored back. Threads whose SHIFT-wide span would run past the row end
    // fall back to an element-wise tail loop. TransformDispatcher<true> only
    // selects this kernel after verifying that data pointers and row steps are
    // aligned to the vector size, so the reinterpreting casts below are safe.

    // Unary variant.
    template <int SHIFT, typename SrcType, typename DstType, class UnOp, class MaskPtr>
    __global__ void transformSmart(const GlobPtr<SrcType> src_, GlobPtr<DstType> dst_, const UnOp op, const MaskPtr mask, const int rows, const int cols)
    {
        typedef typename MakeVec<SrcType, SHIFT>::type read_type;
        typedef typename MakeVec<DstType, SHIFT>::type write_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x_shifted = x * SHIFT;  // first pixel this thread owns

        if (y < rows)
        {
            const SrcType* src = src_.row(y);
            DstType* dst = dst_.row(y);

            if (x_shifted + SHIFT - 1 < cols)
            {
                // Full vector fits in the row: one vector load per operand.
                const read_type src_n_el = ((const read_type*)src)[x];
                // Pre-load dst so masked-out components survive the store below.
                write_type dst_n_el = ((const write_type*)dst)[x];

                OpUnroller<SHIFT>::unroll(src_n_el, dst_n_el, op, mask, x_shifted, y);

                ((write_type*)dst)[x] = dst_n_el;
            }
            else
            {
                // Row tail: process the remaining pixels one by one.
                for (int real_x = x_shifted; real_x < cols; ++real_x)
                {
                    if (mask(y, real_x))
                        dst[real_x] = op(src[real_x]);
                }
            }
        }
    }

    // Binary variant.
    template <int SHIFT, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
    __global__ void transformSmart(const GlobPtr<SrcType1> src1_, const GlobPtr<SrcType2> src2_, GlobPtr<DstType> dst_, const BinOp op, const MaskPtr mask, const int rows, const int cols)
    {
        typedef typename MakeVec<SrcType1, SHIFT>::type read_type1;
        typedef typename MakeVec<SrcType2, SHIFT>::type read_type2;
        typedef typename MakeVec<DstType, SHIFT>::type write_type;

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;
        const int x_shifted = x * SHIFT;  // first pixel this thread owns

        if (y < rows)
        {
            const SrcType1* src1 = src1_.row(y);
            const SrcType2* src2 = src2_.row(y);
            DstType* dst = dst_.row(y);

            if (x_shifted + SHIFT - 1 < cols)
            {
                const read_type1 src1_n_el = ((const read_type1*)src1)[x];
                const read_type2 src2_n_el = ((const read_type2*)src2)[x];
                // Pre-load dst so masked-out components survive the store below.
                write_type dst_n_el = ((const write_type*)dst)[x];

                OpUnroller<SHIFT>::unroll(src1_n_el, src2_n_el, dst_n_el, op, mask, x_shifted, y);

                ((write_type*)dst)[x] = dst_n_el;
            }
            else
            {
                // Row tail: process the remaining pixels one by one.
                for (int real_x = x_shifted; real_x < cols; ++real_x)
                {
                    if (mask(y, real_x))
                        dst[real_x] = op(src1[real_x], src2[real_x]);
                }
            }
        }
    }

    // TransformDispatcher
    //
    // Selects between the naive per-pixel kernel (UseSmart == false) and the
    // vectorized kernel (UseSmart == true). Policy supplies block_size_x,
    // block_size_y and shift (the vectorization factor).

    template <bool UseSmart, class Policy> struct TransformDispatcher;

    template <class Policy> struct TransformDispatcher<false, Policy>
    {
        // Launch transformSimple (unary). cudaGetLastError() catches launch
        // configuration failures; a null stream is the legacy blocking stream,
        // so synchronize to keep the call synchronous for such callers.
        template <class SrcPtr, typename DstType, class UnOp, class MaskPtr>
        __host__ static void call(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
        {
            const dim3 block(Policy::block_size_x, Policy::block_size_y);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            transformSimple<<<grid, block, 0, stream>>>(src, dst, op, mask, rows, cols);
            CV_CUDEV_SAFE_CALL( cudaGetLastError() );

            if (stream == 0)
                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
        }

        // Launch transformSimple (binary). Same launch/sync policy as above.
        template <class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
        __host__ static void call(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
        {
            const dim3 block(Policy::block_size_x, Policy::block_size_y);
            const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

            transformSimple<<<grid, block, 0, stream>>>(src1, src2, dst, op, mask, rows, cols);
            CV_CUDEV_SAFE_CALL( cudaGetLastError() );

            if (stream == 0)
                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
        }
    };

    template <class Policy> struct TransformDispatcher<true, Policy>
    {
        // True when ptr is aligned to a size-byte boundary.
        template <typename T>
        __host__ static bool isAligned(const T* ptr, size_t size)
        {
            return reinterpret_cast<size_t>(ptr) % size == 0;
        }

        // True when the row step (in bytes) is a multiple of size.
        __host__ static bool isAligned(size_t step, size_t size)
        {
            return step % size == 0;
        }

        // Launch transformSmart (unary) when the data allows vector access;
        // otherwise fall back to the naive kernel.
        template <typename SrcType, typename DstType, class UnOp, class MaskPtr>
        __host__ static void call(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
        {
            // Vector loads/stores require base pointers and row steps aligned
            // to the full vector size; shift == 1 gains nothing from them.
            if (Policy::shift == 1 ||
                !isAligned(src.data, Policy::shift * sizeof(SrcType)) || !isAligned(src.step, Policy::shift * sizeof(SrcType)) ||
                !isAligned(dst.data, Policy::shift * sizeof(DstType)) || !isAligned(dst.step, Policy::shift * sizeof(DstType)))
            {
                TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
                return;
            }

            const dim3 block(Policy::block_size_x, Policy::block_size_y);
            // Each thread covers Policy::shift pixels, so shrink the grid in x.
            const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y));

            transformSmart<Policy::shift><<<grid, block, 0, stream>>>(src, dst, op, mask, rows, cols);
            CV_CUDEV_SAFE_CALL( cudaGetLastError() );

            if (stream == 0)
                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
        }

        // Launch transformSmart (binary) when the data allows vector access;
        // otherwise fall back to the naive kernel.
        template <typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
        __host__ static void call(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
        {
            if (Policy::shift == 1 ||
                !isAligned(src1.data, Policy::shift * sizeof(SrcType1)) || !isAligned(src1.step, Policy::shift * sizeof(SrcType1)) ||
                !isAligned(src2.data, Policy::shift * sizeof(SrcType2)) || !isAligned(src2.step, Policy::shift * sizeof(SrcType2)) ||
                !isAligned(dst.data, Policy::shift * sizeof(DstType)) || !isAligned(dst.step, Policy::shift * sizeof(DstType)))
            {
                TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
                return;
            }

            const dim3 block(Policy::block_size_x, Policy::block_size_y);
            // Each thread covers Policy::shift pixels, so shrink the grid in x.
            const dim3 grid(divUp(cols, block.x * Policy::shift), divUp(rows, block.y));

            transformSmart<Policy::shift><<<grid, block, 0, stream>>>(src1, src2, dst, op, mask, rows, cols);
            CV_CUDEV_SAFE_CALL( cudaGetLastError() );

            if (stream == 0)
                CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
        }
    };

    // Generic source pointer: arbitrary Ptr2D expressions cannot be read with
    // vector loads, so always use the naive kernel.
    template <class Policy, class SrcPtr, typename DstType, class UnOp, class MaskPtr>
    __host__ void transform_unary(const SrcPtr& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<false, Policy>::call(src, dst, op, mask, rows, cols, stream);
    }

    // Generic source pointers (binary): same — naive kernel only.
    template <class Policy, class SrcPtr1, class SrcPtr2, typename DstType, class BinOp, class MaskPtr>
    __host__ void transform_binary(const SrcPtr1& src1, const SrcPtr2& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<false, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
    }

    // Raw global-memory source: try the vectorized path when both types are
    // single-channel and the policy asks for shift != 1. Runtime alignment is
    // still checked inside TransformDispatcher<true>.
    template <class Policy, typename SrcType, typename DstType, class UnOp, class MaskPtr>
    __host__ void transform_unary(const GlobPtr<SrcType>& src, const GlobPtr<DstType>& dst, const UnOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<VecTraits<SrcType>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src, dst, op, mask, rows, cols, stream);
    }

    // Raw global-memory sources (binary): vectorized path under the same
    // single-channel / shift conditions.
    template <class Policy, typename SrcType1, typename SrcType2, typename DstType, class BinOp, class MaskPtr>
    __host__ void transform_binary(const GlobPtr<SrcType1>& src1, const GlobPtr<SrcType2>& src2, const GlobPtr<DstType>& dst, const BinOp& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        TransformDispatcher<VecTraits<SrcType1>::cn == 1 && VecTraits<SrcType2>::cn == 1 && VecTraits<DstType>::cn == 1 && Policy::shift != 1, Policy>::call(src1, src2, dst, op, mask, rows, cols, stream);
    }

    // transform_tuple
    //
    // Applies a tuple of ops to one source value and scatters each result into
    // the corresponding destination of a tuple of pointers. Unroll<count>
    // walks the tuples at compile time from index count-1 down to 0.

    template <int count> struct Unroll
    {
        // Write op[count-1](srcVal) into dst[count-1] at (y, x), then recurse.
        template <class SrcVal, class DstPtrTuple, class OpTuple>
        __device__ static void transform(const SrcVal& srcVal, DstPtrTuple& dst, const OpTuple& op, int y, int x)
        {
            typedef typename tuple_element<count - 1, DstPtrTuple>::type dst_ptr_type;
            typedef typename PtrTraits<dst_ptr_type>::value_type dst_type;

            get<count - 1>(dst)(y, x) = saturate_cast<dst_type>(get<count - 1>(op)(srcVal));
            Unroll<count - 1>::transform(srcVal, dst, op, y, x);
        }
    };
    // Recursion terminator: nothing left to apply.
    template <> struct Unroll<0>
    {
        template <class SrcVal, class DstPtrTuple, class OpTuple>
        __device__ __forceinline__ static void transform(const SrcVal&, DstPtrTuple&, const OpTuple&, int, int)
        {
        }
    };

    // Kernel: one thread per pixel; reads the source once and fans the value
    // out through every op/dst pair in the tuples.
    template <class SrcPtr, class DstPtrTuple, class OpTuple, class MaskPtr>
    __global__ void transform_tuple(const SrcPtr src, DstPtrTuple dst, const OpTuple op, const MaskPtr mask, const int rows, const int cols)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x >= cols || y >= rows || !mask(y, x))
            return;

        typename PtrTraits<SrcPtr>::value_type srcVal = src(y, x);

        Unroll<tuple_size<DstPtrTuple>::value>::transform(srcVal, dst, op, y, x);
    }

    // Host launcher for the tuple kernel. Same launch/error/sync policy as
    // TransformDispatcher<false>::call.
    template <class Policy, class SrcPtrTuple, class DstPtrTuple, class OpTuple, class MaskPtr>
    __host__ void transform_tuple(const SrcPtrTuple& src, const DstPtrTuple& dst, const OpTuple& op, const MaskPtr& mask, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(Policy::block_size_x, Policy::block_size_y);
        const dim3 grid(divUp(cols, block.x), divUp(rows, block.y));

        transform_tuple<<<grid, block, 0, stream>>>(src, dst, op, mask, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
}

}}

#endif