1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #include "opencv2/core/cuda/common.hpp" 44 #include "opencv2/core/cuda/saturate_cast.hpp" 45 #include "opencv2/core/cuda/vec_math.hpp" 46 #include "opencv2/core/cuda/border_interpolate.hpp" 47 48 using namespace cv::cuda; 49 using namespace cv::cuda::device; 50 51 namespace column_filter 52 { 53 #define MAX_KERNEL_SIZE 32 54 55 __constant__ float c_kernel[MAX_KERNEL_SIZE]; 56 57 template <int KSIZE, typename T, typename D, typename B> 58 __global__ void linearColumnFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) 59 { 60 #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) 61 const int BLOCK_DIM_X = 16; 62 const int BLOCK_DIM_Y = 16; 63 const int PATCH_PER_BLOCK = 4; 64 const int HALO_SIZE = KSIZE <= 16 ? 1 : 2; 65 #else 66 const int BLOCK_DIM_X = 16; 67 const int BLOCK_DIM_Y = 8; 68 const int PATCH_PER_BLOCK = 2; 69 const int HALO_SIZE = 2; 70 #endif 71 72 typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; 73 74 __shared__ sum_t smem[(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_Y][BLOCK_DIM_X]; 75 76 const int x = blockIdx.x * BLOCK_DIM_X + threadIdx.x; 77 78 if (x >= src.cols) 79 return; 80 81 const T* src_col = src.ptr() + x; 82 83 const int yStart = blockIdx.y * (BLOCK_DIM_Y * PATCH_PER_BLOCK) + threadIdx.y; 84 85 if (blockIdx.y > 0) 86 { 87 //Upper halo 88 #pragma unroll 89 for (int j = 0; j < HALO_SIZE; ++j) 90 smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, x)); 91 } 92 else 93 { 94 //Upper halo 95 #pragma unroll 96 for (int j = 0; j < HALO_SIZE; ++j) 97 smem[threadIdx.y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_low(yStart - (HALO_SIZE - j) * BLOCK_DIM_Y, src_col, src.step)); 98 } 99 100 if (blockIdx.y + 2 < gridDim.y) 101 { 102 //Main data 103 #pragma unroll 104 for (int j = 0; j < PATCH_PER_BLOCK; ++j) 105 smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + j * BLOCK_DIM_Y, x)); 106 107 //Lower halo 108 #pragma unroll 109 for (int j = 0; j < HALO_SIZE; ++j) 110 smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(src(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, x)); 111 } 112 else 113 { 114 //Main data 115 #pragma unroll 116 for (int j = 0; j < PATCH_PER_BLOCK; ++j) 117 smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + j * BLOCK_DIM_Y, src_col, src.step)); 118 119 //Lower halo 120 #pragma unroll 121 for (int j = 0; j < HALO_SIZE; ++j) 122 smem[threadIdx.y + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_Y + j * BLOCK_DIM_Y][threadIdx.x] = saturate_cast<sum_t>(brd.at_high(yStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_Y, src_col, src.step)); 123 } 124 125 __syncthreads(); 126 127 #pragma unroll 128 for (int j = 0; j < PATCH_PER_BLOCK; ++j) 129 { 130 const int y = yStart + j * BLOCK_DIM_Y; 131 132 if (y < src.rows) 133 { 134 sum_t sum = VecTraits<sum_t>::all(0); 135 136 #pragma unroll 137 for (int k = 0; k < KSIZE; ++k) 138 sum = sum + smem[threadIdx.y + HALO_SIZE * BLOCK_DIM_Y + j * BLOCK_DIM_Y - anchor + k][threadIdx.x] * c_kernel[k]; 139 140 dst(y, x) = saturate_cast<D>(sum); 141 } 142 } 143 } 144 145 template <int KSIZE, typename T, typename D, template<typename> class B> 146 void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) 147 { 148 int BLOCK_DIM_X; 149 int BLOCK_DIM_Y; 150 int PATCH_PER_BLOCK; 151 152 if (cc >= 20) 153 { 154 BLOCK_DIM_X = 16; 155 BLOCK_DIM_Y = 16; 156 PATCH_PER_BLOCK = 4; 157 } 158 else 159 { 160 BLOCK_DIM_X = 16; 161 BLOCK_DIM_Y = 8; 162 PATCH_PER_BLOCK = 2; 163 } 164 165 const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); 166 const dim3 grid(divUp(src.cols, BLOCK_DIM_X), divUp(src.rows, BLOCK_DIM_Y * PATCH_PER_BLOCK)); 167 168 B<T> brd(src.rows); 169 170 linearColumnFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); 171 172 cudaSafeCall( cudaGetLastError() ); 173 174 if (stream == 0) 175 cudaSafeCall( cudaDeviceSynchronize() ); 176 } 177 } 178 179 namespace filter 180 { 181 template <typename T, typename D> 182 void linearColumn(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream) 183 { 184 typedef void (*caller_t)(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream); 185 186 static const caller_t callers[5][33] = 187 { 188 { 189 0, 190 column_filter::caller< 1, T, D, BrdColConstant>, 191 column_filter::caller< 2, T, D, BrdColConstant>, 192 column_filter::caller< 3, T, D, BrdColConstant>, 193 column_filter::caller< 4, T, D, BrdColConstant>, 194 column_filter::caller< 5, T, D, BrdColConstant>, 195 column_filter::caller< 6, T, D, BrdColConstant>, 196 column_filter::caller< 7, T, D, BrdColConstant>, 197 column_filter::caller< 8, T, D, BrdColConstant>, 198 column_filter::caller< 9, T, D, BrdColConstant>, 199 column_filter::caller<10, T, D, BrdColConstant>, 200 column_filter::caller<11, T, D, BrdColConstant>, 201 column_filter::caller<12, T, D, BrdColConstant>, 202 column_filter::caller<13, T, D, BrdColConstant>, 203 column_filter::caller<14, T, D, BrdColConstant>, 204 column_filter::caller<15, T, D, BrdColConstant>, 205 column_filter::caller<16, T, D, BrdColConstant>, 206 column_filter::caller<17, T, D, BrdColConstant>, 207 column_filter::caller<18, T, D, BrdColConstant>, 208 column_filter::caller<19, T, D, BrdColConstant>, 209 column_filter::caller<20, T, D, BrdColConstant>, 210 column_filter::caller<21, T, D, BrdColConstant>, 211 column_filter::caller<22, T, D, BrdColConstant>, 212 column_filter::caller<23, T, D, BrdColConstant>, 213 column_filter::caller<24, T, D, BrdColConstant>, 214 column_filter::caller<25, T, D, BrdColConstant>, 215 column_filter::caller<26, T, D, BrdColConstant>, 216 column_filter::caller<27, T, D, BrdColConstant>, 217 column_filter::caller<28, T, D, BrdColConstant>, 218 column_filter::caller<29, T, D, BrdColConstant>, 219 column_filter::caller<30, T, D, BrdColConstant>, 220 column_filter::caller<31, T, D, BrdColConstant>, 221 column_filter::caller<32, T, D, BrdColConstant> 222 }, 223 { 224 0, 225 column_filter::caller< 1, T, D, BrdColReplicate>, 226 column_filter::caller< 2, T, D, BrdColReplicate>, 227 column_filter::caller< 3, T, D, BrdColReplicate>, 228 column_filter::caller< 4, T, D, BrdColReplicate>, 229 column_filter::caller< 5, T, D, BrdColReplicate>, 230 column_filter::caller< 6, T, D, BrdColReplicate>, 231 column_filter::caller< 7, T, D, BrdColReplicate>, 232 column_filter::caller< 8, T, D, BrdColReplicate>, 233 column_filter::caller< 9, T, D, BrdColReplicate>, 234 column_filter::caller<10, T, D, BrdColReplicate>, 235 column_filter::caller<11, T, D, BrdColReplicate>, 236 column_filter::caller<12, T, D, BrdColReplicate>, 237 column_filter::caller<13, T, D, BrdColReplicate>, 238 column_filter::caller<14, T, D, BrdColReplicate>, 239 column_filter::caller<15, T, D, BrdColReplicate>, 240 column_filter::caller<16, T, D, BrdColReplicate>, 241 column_filter::caller<17, T, D, BrdColReplicate>, 242 column_filter::caller<18, T, D, BrdColReplicate>, 243 column_filter::caller<19, T, D, BrdColReplicate>, 244 column_filter::caller<20, T, D, BrdColReplicate>, 245 column_filter::caller<21, T, D, BrdColReplicate>, 246 column_filter::caller<22, T, D, BrdColReplicate>, 247 column_filter::caller<23, T, D, BrdColReplicate>, 248 column_filter::caller<24, T, D, BrdColReplicate>, 249 column_filter::caller<25, T, D, BrdColReplicate>, 250 column_filter::caller<26, T, D, BrdColReplicate>, 251 column_filter::caller<27, T, D, BrdColReplicate>, 252 column_filter::caller<28, T, D, BrdColReplicate>, 253 column_filter::caller<29, T, D, BrdColReplicate>, 254 column_filter::caller<30, T, D, BrdColReplicate>, 255 column_filter::caller<31, T, D, BrdColReplicate>, 256 column_filter::caller<32, T, D, BrdColReplicate> 257 }, 258 { 259 0, 260 column_filter::caller< 1, T, D, BrdColReflect>, 261 column_filter::caller< 2, T, D, BrdColReflect>, 262 column_filter::caller< 3, T, D, BrdColReflect>, 263 column_filter::caller< 4, T, D, BrdColReflect>, 264 column_filter::caller< 5, T, D, BrdColReflect>, 265 column_filter::caller< 6, T, D, BrdColReflect>, 266 column_filter::caller< 7, T, D, BrdColReflect>, 267 column_filter::caller< 8, T, D, BrdColReflect>, 268 column_filter::caller< 9, T, D, BrdColReflect>, 269 column_filter::caller<10, T, D, BrdColReflect>, 270 column_filter::caller<11, T, D, BrdColReflect>, 271 column_filter::caller<12, T, D, BrdColReflect>, 272 column_filter::caller<13, T, D, BrdColReflect>, 273 column_filter::caller<14, T, D, BrdColReflect>, 274 column_filter::caller<15, T, D, BrdColReflect>, 275 column_filter::caller<16, T, D, BrdColReflect>, 276 column_filter::caller<17, T, D, BrdColReflect>, 277 column_filter::caller<18, T, D, BrdColReflect>, 278 column_filter::caller<19, T, D, BrdColReflect>, 279 column_filter::caller<20, T, D, BrdColReflect>, 280 column_filter::caller<21, T, D, BrdColReflect>, 281 column_filter::caller<22, T, D, BrdColReflect>, 282 column_filter::caller<23, T, D, BrdColReflect>, 283 column_filter::caller<24, T, D, BrdColReflect>, 284 column_filter::caller<25, T, D, BrdColReflect>, 285 column_filter::caller<26, T, D, BrdColReflect>, 286 column_filter::caller<27, T, D, BrdColReflect>, 287 column_filter::caller<28, T, D, BrdColReflect>, 288 column_filter::caller<29, T, D, BrdColReflect>, 289 column_filter::caller<30, T, D, BrdColReflect>, 290 column_filter::caller<31, T, D, BrdColReflect>, 291 column_filter::caller<32, T, D, BrdColReflect> 292 }, 293 { 294 0, 295 column_filter::caller< 1, T, D, BrdColWrap>, 296 column_filter::caller< 2, T, D, BrdColWrap>, 297 column_filter::caller< 3, T, D, BrdColWrap>, 298 column_filter::caller< 4, T, D, BrdColWrap>, 299 column_filter::caller< 5, T, D, BrdColWrap>, 300 column_filter::caller< 6, T, D, BrdColWrap>, 301 column_filter::caller< 7, T, D, BrdColWrap>, 302 column_filter::caller< 8, T, D, BrdColWrap>, 303 column_filter::caller< 9, T, D, BrdColWrap>, 304 column_filter::caller<10, T, D, BrdColWrap>, 305 column_filter::caller<11, T, D, BrdColWrap>, 306 column_filter::caller<12, T, D, BrdColWrap>, 307 column_filter::caller<13, T, D, BrdColWrap>, 308 column_filter::caller<14, T, D, BrdColWrap>, 309 column_filter::caller<15, T, D, BrdColWrap>, 310 column_filter::caller<16, T, D, BrdColWrap>, 311 column_filter::caller<17, T, D, BrdColWrap>, 312 column_filter::caller<18, T, D, BrdColWrap>, 313 column_filter::caller<19, T, D, BrdColWrap>, 314 column_filter::caller<20, T, D, BrdColWrap>, 315 column_filter::caller<21, T, D, BrdColWrap>, 316 column_filter::caller<22, T, D, BrdColWrap>, 317 column_filter::caller<23, T, D, BrdColWrap>, 318 column_filter::caller<24, T, D, BrdColWrap>, 319 column_filter::caller<25, T, D, BrdColWrap>, 320 column_filter::caller<26, T, D, BrdColWrap>, 321 column_filter::caller<27, T, D, BrdColWrap>, 322 column_filter::caller<28, T, D, BrdColWrap>, 323 column_filter::caller<29, T, D, BrdColWrap>, 324 column_filter::caller<30, T, D, BrdColWrap>, 325 column_filter::caller<31, T, D, BrdColWrap>, 326 column_filter::caller<32, T, D, BrdColWrap> 327 }, 328 { 329 0, 330 column_filter::caller< 1, T, D, BrdColReflect101>, 331 column_filter::caller< 2, T, D, BrdColReflect101>, 332 column_filter::caller< 3, T, D, BrdColReflect101>, 333 column_filter::caller< 4, T, D, BrdColReflect101>, 334 column_filter::caller< 5, T, D, BrdColReflect101>, 335 column_filter::caller< 6, T, D, BrdColReflect101>, 336 column_filter::caller< 7, T, D, BrdColReflect101>, 337 column_filter::caller< 8, T, D, BrdColReflect101>, 338 column_filter::caller< 9, T, D, BrdColReflect101>, 339 column_filter::caller<10, T, D, BrdColReflect101>, 340 column_filter::caller<11, T, D, BrdColReflect101>, 341 column_filter::caller<12, T, D, BrdColReflect101>, 342 column_filter::caller<13, T, D, BrdColReflect101>, 343 column_filter::caller<14, T, D, BrdColReflect101>, 344 column_filter::caller<15, T, D, BrdColReflect101>, 345 column_filter::caller<16, T, D, BrdColReflect101>, 346 column_filter::caller<17, T, D, BrdColReflect101>, 347 column_filter::caller<18, T, D, BrdColReflect101>, 348 column_filter::caller<19, T, D, BrdColReflect101>, 349 column_filter::caller<20, T, D, BrdColReflect101>, 350 column_filter::caller<21, T, D, BrdColReflect101>, 351 column_filter::caller<22, T, D, BrdColReflect101>, 352 column_filter::caller<23, T, D, BrdColReflect101>, 353 column_filter::caller<24, T, D, BrdColReflect101>, 354 column_filter::caller<25, T, D, BrdColReflect101>, 355 column_filter::caller<26, T, D, BrdColReflect101>, 356 column_filter::caller<27, T, D, BrdColReflect101>, 357 column_filter::caller<28, T, D, BrdColReflect101>, 358 column_filter::caller<29, T, D, BrdColReflect101>, 359 column_filter::caller<30, T, D, BrdColReflect101>, 360 column_filter::caller<31, T, D, BrdColReflect101>, 361 column_filter::caller<32, T, D, BrdColReflect101> 362 } 363 }; 364 365 if (stream == 0) 366 cudaSafeCall( cudaMemcpyToSymbol(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice) ); 367 else 368 cudaSafeCall( cudaMemcpyToSymbolAsync(column_filter::c_kernel, kernel, ksize * sizeof(float), 0, cudaMemcpyDeviceToDevice, stream) ); 369 370 callers[brd_type][ksize]((PtrStepSz<T>)src, (PtrStepSz<D>)dst, anchor, cc, stream); 371 } 372 } 373