/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace canny
{
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __host__ __device__ __forceinline__ L1() {}
        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __host__ __device__ __forceinline__ L2() {}
        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
    {
        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTex
    {
        int xoff;
        int yoff;
        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

    template <class Norm> __global__
    void calcMagnitudeKernel(const SrcTex src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        // 3x3 Sobel responses, read through the clamped source texture.
        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }

    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bindTexture(&tex_src, srcWhole);
        SrcTex src(xoff, yoff);

        if (L2Grad)
        {
            L2 norm;
            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }
        else
        {
            L1 norm;
            calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
        }

        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);

    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        // Fixed-point (Q15) representation of tan(22.5 degrees), used to quantize the gradient direction.
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            // Non-maximum suppression: keep the pixel only if its magnitude is a local
            // maximum along the quantized gradient direction.
            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if (dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }

    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        bindTexture(&tex_mag, mag);

        calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __device__ int counter = 0;

    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
    {
        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
    }

    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st)
    {
        // 16x16 block plus a one-pixel apron on each side.
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        // Iteratively promote "weak" pixels (1) that touch a "strong" pixel (2) inside this block.
        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            __syncthreads();

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;

            __syncthreads();
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        n = 0;

        // Strong pixels that still have weak neighbours are pushed onto the global stack
        // so the inter-block pass can continue the propagation.
        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(&counter, 1);
            st[ind] = make_short2(x, y);
        }
    }

    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, counter) );

        cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1,  1, -1,  0,  1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0,  0,  1,  1,  1};

    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ short2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        short2 pos = st1[ind];

        // Each of the first 8 threads inspects one of the 8 neighbours of the seed pixel.
        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        // Breadth-first propagation over the block-local stack: groups of 8 threads pop
        // entries and push newly promoted neighbours until the stack drains or nearly overflows.
        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        // Anything left on the block-local stack is flushed to the global queue (st2)
        // so the host can relaunch this kernel for another round.
        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                s_ind = ::atomicAdd(&counter, s_counter);

                if (s_ind + s_counter > map.cols * map.rows)
                    s_counter = 0;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }

    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, cudaStream_t stream)
    {
        void* counter_ptr;
        cudaSafeCall( cudaGetSymbolAddress(&counter_ptr, canny::counter) );

        int count;
        cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
        cudaSafeCall( cudaStreamSynchronize(stream) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemsetAsync(counter_ptr, 0, sizeof(int), stream) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, count);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpyAsync(&count, counter_ptr, sizeof(int), cudaMemcpyDeviceToHost, stream) );
            cudaSafeCall( cudaStreamSynchronize(stream) );

            count = min(count, map.cols * map.rows);

            //std::swap(st1, st2);
            short2* tmp = st1;
            st1 = st2;
            st2 = tmp;
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            // e == 2 (strong edge) maps to 255, e == 0 or 1 maps to 0.
            return (uchar)(-(e >> 1));
        }

        __host__ __device__ __forceinline__ GetEdges() {}
        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
    {
        transform(map, dst, GetEdges(), WithOutMask(), stream);
    }
}

#endif /* CUDA_DISABLER */
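
//////////////////////////////////////////////////////////////////////////////////////////
//
// Usage sketch (comment only, not compiled). Each function above implements one stage of
// the Canny pipeline; a host-side caller such as the CannyImpl class in the cudaimgproc
// module would be expected to chain them roughly as below. The buffer names (dx, dy, mag,
// map, st1, st2, edges) and the ofs/low_thresh/high_thresh variables are placeholders for
// this illustration, not identifiers defined in this file.
//
//   canny::calcMagnitude(srcWhole, ofs.x, ofs.y, dx, dy, mag, L2gradient, stream);   // Sobel + gradient norm
//   canny::calcMap(dx, dy, mag, map, low_thresh, high_thresh, stream);               // non-maximum suppression + double threshold
//   canny::edgesHysteresisLocal(map, st1.ptr<short2>(), stream);                     // hysteresis inside each 16x16 block
//   canny::edgesHysteresisGlobal(map, st1.ptr<short2>(), st2.ptr<short2>(), stream); // hysteresis across blocks, relaunched until empty
//   canny::getEdges(map, edges, stream);                                             // {0,1,2} map -> 8-bit edge mask
//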