/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_math.hpp"
#include "opencv2/core/cuda/reduce.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace pyrlk
{
    // Window size, half-window size and iteration count, uploaded once by loadConstants().
    __constant__ int c_winSize_x;
    __constant__ int c_winSize_y;
    __constant__ int c_halfWin_x;
    __constant__ int c_halfWin_y;
    __constant__ int c_iters;

    // Texture references for the previous (I) and next (J) images. The sparse kernels
    // sample the float images with bilinear filtering; the dense kernel reads the 8-bit
    // previous image with point sampling.
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);

    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
    texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);

    template <int cn> struct Tex_I;
    template <> struct Tex_I<1>
    {
        static __device__ __forceinline__ float read(float x, float y)
        {
            return tex2D(tex_If, x, y);
        }
    };
    template <> struct Tex_I<4>
    {
        static __device__ __forceinline__ float4 read(float x, float y)
        {
            return tex2D(tex_If4, x, y);
        }
    };

    template <int cn> struct Tex_J;
    template <> struct Tex_J<1>
    {
        static __device__ __forceinline__ float read(float x, float y)
        {
            return tex2D(tex_Jf, x, y);
        }
    };
    template <> struct Tex_J<4>
    {
        static __device__ __forceinline__ float4 read(float x, float y)
        {
            return tex2D(tex_Jf4, x, y);
        }
    };

    // Accumulate a (possibly multi-channel) value into a scalar sum; for 4-channel data
    // the fourth (alpha) channel is ignored.
    __device__ __forceinline__ void accum(float& dst, float val)
    {
        dst += val;
    }
    __device__ __forceinline__ void accum(float& dst, const float4& val)
    {
        dst += val.x + val.y + val.z;
    }

    __device__ __forceinline__ float abs_(float a)
    {
        return ::fabsf(a);
    }
    __device__ __forceinline__ float4 abs_(const float4& a)
    {
        return abs(a);
    }
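
    // sparseKernel: one thread block per tracked point (grid.x == ptcount). The block
    // cooperatively
    //   1. samples a c_winSize_x x c_winSize_y patch around prevPt from image I, together
    //      with its Scharr derivatives (weights 3/10/3), each thread caching a
    //      PATCH_X x PATCH_Y tile in registers;
    //   2. reduces the 2x2 spatial gradient matrix
    //          G = | A11 A12 |   with A11 = sum(Ix*Ix), A12 = sum(Ix*Iy), A22 = sum(Iy*Iy);
    //              | A12 A22 |
    //   3. iterates the Lucas-Kanade update: with b = (b1, b2) = sum(diff * (Ix, Iy)), the
    //      displacement step is delta = -G^-1 * b. Because A11, A12, A22 are stored
    //      pre-multiplied by 1/det(G), this reduces to the two lines used below:
    //          delta.x = A12*b2 - A22*b1;   delta.y = A12*b1 - A11*b2;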
    template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
    __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
    {
    #if __CUDA_ARCH__ <= 110
        const int BLOCK_SIZE = 128;
    #else
        const int BLOCK_SIZE = 256;
    #endif

        __shared__ float smem1[BLOCK_SIZE];
        __shared__ float smem2[BLOCK_SIZE];
        __shared__ float smem3[BLOCK_SIZE];

        const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;

        float2 prevPt = prevPts[blockIdx.x];
        prevPt.x *= (1.0f / (1 << level));
        prevPt.y *= (1.0f / (1 << level));

        if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
        {
            if (tid == 0 && level == 0)
                status[blockIdx.x] = 0;

            return;
        }

        prevPt.x -= c_halfWin_x;
        prevPt.y -= c_halfWin_y;

        // extract the patch from the first image, compute covariance matrix of derivatives

        float A11 = 0;
        float A12 = 0;
        float A22 = 0;

        typedef typename TypeVec<float, cn>::vec_type work_type;

        work_type I_patch   [PATCH_Y][PATCH_X];
        work_type dIdx_patch[PATCH_Y][PATCH_X];
        work_type dIdy_patch[PATCH_Y][PATCH_X];

        for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
        {
            for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
            {
                float x = prevPt.x + xBase + 0.5f;
                float y = prevPt.y + yBase + 0.5f;

                I_patch[i][j] = Tex_I<cn>::read(x, y);

                // Scharr derivative

                work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
                                (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));

                work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
                                (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));

                dIdx_patch[i][j] = dIdx;
                dIdy_patch[i][j] = dIdy;

                accum(A11, dIdx * dIdx);
                accum(A12, dIdx * dIdy);
                accum(A22, dIdy * dIdy);
            }
        }

        reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));

    #if __CUDA_ARCH__ >= 300
        if (tid == 0)
        {
            smem1[0] = A11;
            smem2[0] = A12;
            smem3[0] = A22;
        }
    #endif

        __syncthreads();

        A11 = smem1[0];
        A12 = smem2[0];
        A22 = smem3[0];

        float D = A11 * A22 - A12 * A12;

        if (D < numeric_limits<float>::epsilon())
        {
            if (tid == 0 && level == 0)
                status[blockIdx.x] = 0;

            return;
        }

        D = 1.f / D;

        A11 *= D;
        A12 *= D;
        A22 *= D;

        float2 nextPt = nextPts[blockIdx.x];
        nextPt.x *= 2.f;
        nextPt.y *= 2.f;

        nextPt.x -= c_halfWin_x;
        nextPt.y -= c_halfWin_y;

        for (int k = 0; k < c_iters; ++k)
        {
            if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
            {
                if (tid == 0 && level == 0)
                    status[blockIdx.x] = 0;

                return;
            }

            float b1 = 0;
            float b2 = 0;

            for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
            {
                for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
                {
                    work_type I_val = I_patch[i][j];
                    work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);

                    work_type diff = (J_val - I_val) * 32.0f;

                    accum(b1, diff * dIdx_patch[i][j]);
                    accum(b2, diff * dIdy_patch[i][j]);
                }
            }

            reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));

    #if __CUDA_ARCH__ >= 300
            if (tid == 0)
            {
                smem1[0] = b1;
                smem2[0] = b2;
            }
    #endif

            __syncthreads();

            b1 = smem1[0];
            b2 = smem2[0];

            float2 delta;
            delta.x = A12 * b2 - A22 * b1;
            delta.y = A12 * b1 - A11 * b2;

            nextPt.x += delta.x;
            nextPt.y += delta.y;

            if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
                break;
        }

        float errval = 0;
        if (calcErr)
        {
            for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
            {
                for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
                {
                    work_type I_val = I_patch[i][j];
                    work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);

                    work_type diff = J_val - I_val;

                    accum(errval, abs_(diff));
                }
            }

            reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
        }

        if (tid == 0)
        {
            nextPt.x += c_halfWin_x;
            nextPt.y += c_halfWin_y;

            nextPts[blockIdx.x] = nextPt;

            if (calcErr)
                err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
        }
    }
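
    // The PATCH_X / PATCH_Y template parameters above only cover the full window when the
    // host chooses `block` and `patch` together such that block.x * patch.x >= winSize.x
    // and block.y * patch.y >= winSize.y, with patch.x, patch.y in [1, 5] (the range
    // instantiated by sparse1/sparse4 below). A minimal sketch of one consistent choice,
    // purely illustrative (the real caller lives outside this file):
    //
    //     dim3 block(16, 16, 1);
    //     dim3 patch((winSize.x + block.x - 1) / block.x,
    //                (winSize.y + block.y - 1) / block.y, 1);
    //     CV_Assert(patch.x <= 5 && patch.y <= 5);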
    template <int cn, int PATCH_X, int PATCH_Y>
    void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
                       int level, dim3 block, cudaStream_t stream)
    {
        dim3 grid(ptcount);

        // Launch on the caller's stream so asynchronous calls do not fall back to the
        // default stream.
        if (level == 0 && err)
            sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block, 0, stream>>>(prevPts, nextPts, status, err, level, rows, cols);
        else
            sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block, 0, stream>>>(prevPts, nextPts, status, err, level, rows, cols);

        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

    template <bool calcErr>
    __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
    {
        extern __shared__ int smem[];

        const int patchWidth  = blockDim.x + 2 * c_halfWin_x;
        const int patchHeight = blockDim.y + 2 * c_halfWin_y;

        int* I_patch    = smem;
        int* dIdx_patch = I_patch + patchWidth * patchHeight;
        int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;

        const int xBase = blockIdx.x * blockDim.x;
        const int yBase = blockIdx.y * blockDim.y;

        for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
        {
            for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
            {
                float x = xBase - c_halfWin_x + j + 0.5f;
                float y = yBase - c_halfWin_y + i + 0.5f;

                I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);

                // Scharr derivative

                dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
                                                (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));

                dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
                                                (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
            }
        }

        __syncthreads();

        const int x = xBase + threadIdx.x;
        const int y = yBase + threadIdx.y;

        if (x >= cols || y >= rows)
            return;

        int A11i = 0;
        int A12i = 0;
        int A22i = 0;

        for (int i = 0; i < c_winSize_y; ++i)
        {
            for (int j = 0; j < c_winSize_x; ++j)
            {
                int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
                int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];

                A11i += dIdx * dIdx;
                A12i += dIdx * dIdy;
                A22i += dIdy * dIdy;
            }
        }

        float A11 = A11i;
        float A12 = A12i;
        float A22 = A22i;

        float D = A11 * A22 - A12 * A12;

        if (D < numeric_limits<float>::epsilon())
        {
            if (calcErr)
                err(y, x) = numeric_limits<float>::max();

            return;
        }

        D = 1.f / D;

        A11 *= D;
        A12 *= D;
        A22 *= D;

        float2 nextPt;
        nextPt.x = x + prevU(y/2, x/2) * 2.0f;
        nextPt.y = y + prevV(y/2, x/2) * 2.0f;

        for (int k = 0; k < c_iters; ++k)
        {
            if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
            {
                if (calcErr)
                    err(y, x) = numeric_limits<float>::max();

                return;
            }

            int b1 = 0;
            int b2 = 0;

            for (int i = 0; i < c_winSize_y; ++i)
            {
                for (int j = 0; j < c_winSize_x; ++j)
                {
                    int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
                    int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);

                    int diff = (J - I) * 32;

                    int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
                    int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];

                    b1 += diff * dIdx;
                    b2 += diff * dIdy;
                }
            }

            float2 delta;
            delta.x = A12 * b2 - A22 * b1;
            delta.y = A12 * b1 - A11 * b2;

            nextPt.x += delta.x;
            nextPt.y += delta.y;

            if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
                break;
        }

        u(y, x) = nextPt.x - x;
        v(y, x) = nextPt.y - y;

        if (calcErr)
        {
            int errval = 0;

            for (int i = 0; i < c_winSize_y; ++i)
            {
                for (int j = 0; j < c_winSize_x; ++j)
                {
                    int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
                    int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);

                    errval += ::abs(J - I);
                }
            }

            err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
        }
    }
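
    // denseKernel stages the image patch and both Scharr derivative patches in dynamic
    // shared memory, so a launch must pass
    //     smem_size = 3 * (block.x + 2*halfWin.x) * (block.y + 2*halfWin.y) * sizeof(int)
    // (this is what dense() below computes), and the same winSize must already have been
    // uploaded via loadConstants() so that c_halfWin_* matches this layout. As an
    // illustrative figure only (the window size is not fixed by this file): a 16x16 block
    // with a 21x21 window (halfWin = 10) gives 36x36 patches, i.e.
    // 3 * 36 * 36 * sizeof(int) = 15552 bytes of shared memory per block.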
    void loadConstants(int2 winSize, int iters, cudaStream_t stream)
    {
        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
        cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
        cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );

        cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    }

    void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
                 int level, dim3 block, dim3 patch, cudaStream_t stream)
    {
        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
                               int level, dim3 block, cudaStream_t stream);

        static const func_t funcs[5][5] =
        {
            {sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
            {sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
            {sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
            {sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
            {sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
        };

        bindTexture(&tex_If, I);
        bindTexture(&tex_Jf, J);

        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
                                        level, block, stream);
    }

    void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
                 int level, dim3 block, dim3 patch, cudaStream_t stream)
    {
        typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
                               int level, dim3 block, cudaStream_t stream);

        static const func_t funcs[5][5] =
        {
            {sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
            {sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
            {sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
            {sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
            {sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
        };

        bindTexture(&tex_If4, I);
        bindTexture(&tex_Jf4, J);

        funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
                                        level, block, stream);
    }
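
    // Expected call sequence for the sparse path, per pyramid level from coarsest to
    // finest (a sketch only; the 21x21 window, 30 iterations and the *_dev / *_pyr names
    // are illustrative, not mandated by this file):
    //
    //     pyrlk::loadConstants(make_int2(21, 21), 30, stream);  // window size, iterations
    //     for (int level = nLevels - 1; level >= 0; --level)
    //         pyrlk::sparse1(I_pyr[level], J_pyr[level], prevPts_dev, nextPts_dev,
    //                        status_dev, err_dev, ptcount, level, block, patch, stream);
    //
    // loadConstants() must run first because the kernels read c_winSize_* / c_halfWin_* /
    // c_iters from constant memory; sparse1/sparse4 bind the I/J textures themselves.
    // sparseKernel doubles nextPt at every level, which is why iteration goes from the
    // coarsest level down to level 0.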
    void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
    {
        dim3 block(16, 16);
        dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));

        bindTexture(&tex_Ib, I);
        bindTexture(&tex_Jf, J);

        int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
        const int patchWidth  = block.x + 2 * halfWin.x;
        const int patchHeight = block.y + 2 * halfWin.y;
        size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);

        if (err.data)
        {
            denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
            cudaSafeCall( cudaGetLastError() );
        }
        else
        {
            denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
            cudaSafeCall( cudaGetLastError() );
        }

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

#endif /* CUDA_DISABLER */