/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//               For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Brox et al Optical Flow algorithm
//
// Algorithm is explained in the original paper:
//      T. Brox, A. Bruhn, N. Papenberg, J. Weickert:
//      High accuracy optical flow estimation based on a theory for warping.
//      ECCV 2004.
//
// Implementation by Mikhail Smirnov
// email: msmirnov (at) nvidia.com, devsupport (at) nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
//
////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <vector>
#include <memory>

#include "opencv2/core/cuda/utility.hpp"

#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"


typedef NCVVectorAlloc<Ncv32f> FloatVector;

/////////////////////////////////////////////////////////////////////////////////////////
// Implementation specific constants
/////////////////////////////////////////////////////////////////////////////////////////
__device__ const float eps2 = 1e-6f;

/////////////////////////////////////////////////////////////////////////////////////////
// Additional defines
/////////////////////////////////////////////////////////////////////////////////////////

// rounded up division
inline int iDivUp(int a, int b)
{
    return (a + b - 1)/b;
}

/////////////////////////////////////////////////////////////////////////////////////////
// Texture references
/////////////////////////////////////////////////////////////////////////////////////////

texture<float, 2, cudaReadModeElementType> tex_coarse;
texture<float, 2, cudaReadModeElementType> tex_fine;

texture<float, 2, cudaReadModeElementType> tex_I1;
texture<float, 2, cudaReadModeElementType> tex_I0;

texture<float, 2, cudaReadModeElementType> tex_Ix;
texture<float, 2, cudaReadModeElementType> tex_Ixx;
texture<float, 2, cudaReadModeElementType> tex_Ix0;

texture<float, 2, cudaReadModeElementType> tex_Iy;
texture<float, 2, cudaReadModeElementType> tex_Iyy;
texture<float, 2, cudaReadModeElementType> tex_Iy0;

texture<float, 2, cudaReadModeElementType> tex_Ixy;

texture<float, 1, cudaReadModeElementType> tex_u;
texture<float, 1, cudaReadModeElementType> tex_v;
texture<float, 1, cudaReadModeElementType> tex_du;
texture<float, 1, cudaReadModeElementType> tex_dv;
texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
texture<float, 1, cudaReadModeElementType> tex_numerator_u;
texture<float, 1, cudaReadModeElementType> tex_numerator_v;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;


/////////////////////////////////////////////////////////////////////////////////////////
// SUPPLEMENTARY FUNCTIONS
/////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
/// \brief performs pointwise summation of two vectors stored in device memory
/// \param d_res - pointer to resulting vector (device memory)
/// \param d_op1 - term #1 (device memory)
/// \param d_op2 - term #2 (device memory)
/// \param len - vector size
///////////////////////////////////////////////////////////////////////////////
__global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
{
    const int pos = blockIdx.x*blockDim.x + threadIdx.x;

    if(pos >= len) return;

    d_res[pos] = d_op1[pos] + d_op2[pos];
}

///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
///        Computes \b op1 + \b op2 and stores result to \b res
/// \param res    array, containing op1 + op2 (device memory)
/// \param op1    term #1 (device memory)
/// \param op2    term #2 (device memory)
/// \param count  vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *op1, const float *op2, const int count, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 blocks(iDivUp(count, threads.x));

    pointwise_add<<<blocks, threads, 0, stream>>>(res, op1, op2, count);
}

///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
///        Increments \b res by \b rhs
/// \param res    initial vector, will be replaced with result (device memory)
/// \param rhs    increment (device memory)
/// \param count  vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *rhs, const int count, cudaStream_t stream)
{
    add(res, res, rhs, count, stream);
}

///////////////////////////////////////////////////////////////////////////////
/// \brief kernel for scaling vector by scalar
/// \param d_res  scaled vector (device memory)
/// \param d_src  source vector (device memory)
/// \param scale  scalar to scale by
/// \param len    vector size (number of elements)
///////////////////////////////////////////////////////////////////////////////
__global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len)
{
    const int pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= len) return;

    d_res[pos] = d_src[pos] * scale;
}

///////////////////////////////////////////////////////////////////////////////
/// \brief scale vector by scalar
///
/// kernel wrapper
/// \param d_res   scaled vector (device memory)
/// \param d_src   source vector (device memory)
/// \param scale   scalar to scale by
/// \param len     vector size (number of elements)
/// \param stream  CUDA stream
///////////////////////////////////////////////////////////////////////////////
static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 blocks(iDivUp(len, threads.x));

    scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
}

const int SOR_TILE_WIDTH   = 32;
const int SOR_TILE_HEIGHT  = 6;
const int PSOR_TILE_WIDTH  = 32;
const int PSOR_TILE_HEIGHT = 6;
const int PSOR_PITCH  = PSOR_TILE_WIDTH + 4;
const int PSOR_HEIGHT = PSOR_TILE_HEIGHT + 4;

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function.
///       Compute smooth term diffusivity along x axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_x(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
    //x derivative between pixels (i,j) and (i-1,j)
    const int left = pos-1;
    float u_x = u[pos] + du[pos] - u[left] - du[left];
    float v_x = v[pos] + dv[pos] - v[left] - dv[left];
    const int up        = pos + PSOR_PITCH;
    const int down      = pos - PSOR_PITCH;
    const int up_left   = up - 1;
    const int down_left = down-1;
    //y derivative between pixels (i,j) and (i-1,j)
    float u_y = 0.25f*(u[up] + du[up] + u[up_left] + du[up_left] - u[down] - du[down] - u[down_left] - du[down_left]);
    float v_y = 0.25f*(v[up] + dv[up] + v[up_left] + dv[up_left] - v[down] - dv[down] - v[down_left] - dv[down_left]);
    *s = 0.5f / sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smooth term diffusivity along y axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
    //y derivative between pixels (i,j) and (i,j-1)
    const int down = pos-PSOR_PITCH;
    float u_y = u[pos] + du[pos] - u[down] - du[down];
    float v_y = v[pos] + dv[pos] - v[down] - dv[down];
    const int right      = pos + 1;
    const int left       = pos - 1;
    const int down_right = down + 1;
    const int down_left  = down - 1;
    //x derivative between pixels (i,j) and (i,j-1)
    float u_x = 0.25f*(u[right] + u[down_right] + du[right] + du[down_right] - u[left] - u[down_left] - du[left] - du[down_left]);
    float v_x = 0.25f*(v[right] + v[down_right] + dv[right] + dv[down_right] - v[left] - v[down_left] - dv[left] - dv[down_left]);
    *s = 0.5f/sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function.
///       Load element of 2D global memory to shared memory
///\param smem pointer to shared memory array
///\param is shared memory array column
///\param js shared memory array row
///\param i global memory array column
///\param j global memory array row
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex_id>
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
{
    //position within shared memory array
    const int ijs = js * PSOR_PITCH + is;
    //mirror reflection across borders
    i = max(i, -i-1);
    i = min(i, w-i+w-1);
    j = max(j, -j-1);
    j = min(j, h-j+h-1);
    const int pos = j * p + i;
    switch(tex_id){
        case 0:
            smem[ijs] = tex1Dfetch(tex_u, pos);
            break;
        case 1:
            smem[ijs] = tex1Dfetch(tex_v, pos);
            break;
        case 2:
            smem[ijs] = tex1Dfetch(tex_du, pos);
            break;
        case 3:
            smem[ijs] = tex1Dfetch(tex_dv, pos);
            break;
    }
}

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load part (tile) of 2D global memory to shared memory
///\param smem pointer to target shared memory array
///\param ig column number within source
///\param jg row number within source
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex>
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
{
    const int i = threadIdx.x + 2;
    const int j = threadIdx.y + 2;
    load_array_element<tex>(smem, i, j, ig, jg, w, h, p); //load current pixel
    __syncthreads();
    if(threadIdx.y < 2)
    {
        //load bottom shadow elements
        load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
        if(threadIdx.x < 2)
        {
            //load bottom right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
            //load middle right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
        }
        else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
            //load bottom left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
            //load middle left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
        }
    }
    else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
    {
        //load upper shadow elements
        load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
        if(threadIdx.x < 2)
        {
            //load upper right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
            //load middle right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
        }
        else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
            //load upper left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
            //load middle left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
        }
    }
    else
    {
        //load middle shadow elements
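        //(interior rows: only the left/right halo columns remain to be loaded)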
        if(threadIdx.x < 2)
        {
            //load middle right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
        }
        else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
            //load middle left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
        }
    }
    __syncthreads();
}

///////////////////////////////////////////////////////////////////////////////
/// \brief computes matrix of linearised system for \c du, \c dv
/// Computed values reside in GPU memory. \n
/// Matrix computation is divided into two steps. This kernel performs the first step:\n
/// - compute smoothness term diffusivity between pixels - psi dash smooth
/// - compute robustness factor in the data term - psi dash data
/// \param diffusivity_x (in/out) diffusivity between pixels along x axis in smoothness term
/// \param diffusivity_y (in/out) diffusivity between pixels along y axis in smoothness term
/// \param denominator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param denominator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param numerator_dudv (in/out) precomputed part of expression for new du and dv value in SOR iteration
/// \param numerator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param numerator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param w (in) frame width
/// \param h (in) frame height
/// \param s (in) pitch in floats
/// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance)
///////////////////////////////////////////////////////////////////////////////

__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
                                        float *denominator_u, float *denominator_v,
                                        float *numerator_dudv,
                                        float *numerator_u, float *numerator_v,
                                        int w, int h, int s,
                                        float alpha, float gamma)
{
    __shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
    __shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
    __shared__ float du[PSOR_PITCH * PSOR_HEIGHT];
    __shared__ float dv[PSOR_PITCH * PSOR_HEIGHT];

    //position within tile
    const int i = threadIdx.x;
    const int j = threadIdx.y;
    //position within smem arrays
    const int ijs = (j+2) * PSOR_PITCH + i + 2;
    //position within global memory
    const int ig  = blockIdx.x * blockDim.x + threadIdx.x;
    const int jg  = blockIdx.y * blockDim.y + threadIdx.y;
    const int ijg = jg * s + ig;
    //position within texture
    float x = (float)ig + 0.5f;
    float y = (float)jg + 0.5f;
    //load u and v to smem
    load_array<0>(u,  ig, jg, w, h, s);
    load_array<1>(v,  ig, jg, w, h, s);
    load_array<2>(du, ig, jg, w, h, s);
    load_array<3>(dv, ig, jg, w, h, s);
    //warped position
    float wx = (x + u[ijs])/(float)w;
    float wy = (y + v[ijs])/(float)h;
    x /= (float)w;
    y /= (float)h;
    //compute image derivatives
    const float Iz  = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
    const float Ix  = tex2D(tex_Ix, wx, wy);
    const float Ixz = Ix - tex2D(tex_Ix0, x, y);
    const float Ixy = tex2D(tex_Ixy, wx, wy);
    const float Ixx = tex2D(tex_Ixx, wx, wy);
    const float Iy  = tex2D(tex_Iy, wx, wy);
    const float Iyz = Iy - tex2D(tex_Iy0, x, y);
    const float Iyy = tex2D(tex_Iyy, wx, wy);
    //compute data term
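    //q0, q1, q2 are the brightness- and gradient-constancy residuals linearised in (du, dv);
    //0.5f*rsqrtf(...) below is the derivative of the robust penaliser sqrt(s + eps^2) used by Brox et al.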
    float q0, q1, q2;
    q0 = Iz  + Ix  * du[ijs] + Iy  * dv[ijs];
    q1 = Ixz + Ixx * du[ijs] + Ixy * dv[ijs];
    q2 = Iyz + Ixy * du[ijs] + Iyy * dv[ijs];
    float data_term = 0.5f * rsqrtf(q0*q0 + gamma*(q1*q1 + q2*q2) + eps2);
    //scale data term by 1/alpha
    data_term /= alpha;
    //compute smoothness term (diffusivity)
    float sx, sy;

    if(ig >= w || jg >= h) return;

    diffusivity_along_x(&sx, ijs, u, v, du, dv);
    diffusivity_along_y(&sy, ijs, u, v, du, dv);

    if(ig == 0) sx = 0.0f;
    if(jg == 0) sy = 0.0f;

    numerator_dudv[ijg] = data_term * (Ix*Iy + gamma * Ixy*(Ixx + Iyy));
    numerator_u[ijg]    = data_term * (Ix*Iz + gamma * (Ixx*Ixz + Ixy*Iyz));
    numerator_v[ijg]    = data_term * (Iy*Iz + gamma * (Iyy*Iyz + Ixy*Ixz));
    denominator_u[ijg]  = data_term * (Ix*Ix + gamma * (Ixy*Ixy + Ixx*Ixx));
    denominator_v[ijg]  = data_term * (Iy*Iy + gamma * (Ixy*Ixy + Iyy*Iyy));
    diffusivity_x[ijg]  = sx;
    diffusivity_y[ijg]  = sy;
}

///////////////////////////////////////////////////////////////////////////////
///\brief computes matrix of linearised system for \c du, \c dv
///       (second step: add diffusivity sums to the denominators and invert them)
///\param inv_denominator_u (in/out) data-term part of the du denominator on input, its reciprocal on output
///\param inv_denominator_v (in/out) data-term part of the dv denominator on input, its reciprocal on output
///\param w frame width
///\param h frame height
///\param s pitch in floats
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
                                    int w, int h, int s)
{
    __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
    __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
    //position within tile
    const int i = threadIdx.x;
    const int j = threadIdx.y;
    //position within smem arrays
    const int ijs = j*(PSOR_TILE_WIDTH+1) + i;
    //position within global memory
    const int ig  = blockIdx.x * blockDim.x + threadIdx.x;
    const int jg  = blockIdx.y * blockDim.y + threadIdx.y;
    const int ijg = jg*s + ig;
    int inside = ig < w && jg < h;
    float denom_u;
    float denom_v;
    if(inside)
    {
        denom_u = inv_denominator_u[ijg];
        denom_v = inv_denominator_v[ijg];
    }
    if(inside)
    {
        sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
        sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
    }
    else
    {
        sx[ijs] = 0.0f;
        sy[ijs] = 0.0f;
    }
    int up = ijs+PSOR_TILE_WIDTH+1;
    if(j == PSOR_TILE_HEIGHT-1)
    {
        if(jg < h-1 && inside)
        {
            sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
        }
        else
        {
            sy[up] = 0.0f;
        }
    }
    int right = ijs + 1;
    if(threadIdx.x == PSOR_TILE_WIDTH-1)
    {
        if(ig < w-1 && inside)
        {
            sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
        }
        else
        {
            sx[right] = 0.0f;
        }
    }
    __syncthreads();
    float diffusivity_sum;
    diffusivity_sum = sx[ijs] + sx[ijs+1] + sy[ijs] + sy[ijs+PSOR_TILE_WIDTH+1];
    if(inside)
    {
        denom_u += diffusivity_sum;
        denom_v += diffusivity_sum;
        inv_denominator_u[ijg] = 1.0f/denom_u;
        inv_denominator_v[ijg] = 1.0f/denom_v;
    }
}

/////////////////////////////////////////////////////////////////////////////////////////
// Red-Black SOR
/////////////////////////////////////////////////////////////////////////////////////////

template<int isBlack> __global__ void sor_pass(float *new_du,
                                               float *new_dv,
                                               const float *g_inv_denominator_u,
                                               const float *g_inv_denominator_v,
                                               const float *g_numerator_u,
                                               const float *g_numerator_v,
                                               const float *g_numerator_dudv,
                                               float omega,
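                                               // omega: SOR over-relaxation factor (the host loop below passes 1.99)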
                                               int width,
                                               int height,
                                               int stride)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if(i >= width || j >= height)
        return;

    const int pos   = j * stride + i;
    const int pos_r = i < width - 1  ? pos + 1      : pos;
    const int pos_u = j < height - 1 ? pos + stride : pos;
    const int pos_d = j > 0          ? pos - stride : pos;
    const int pos_l = i > 0          ? pos - 1      : pos;

    //load smooth term
    float s_up, s_left, s_right, s_down;
    s_left = tex1Dfetch(tex_diffusivity_x, pos);
    s_down = tex1Dfetch(tex_diffusivity_y, pos);
    if(i < width-1)
        s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
    else
        s_right = 0.0f; //Neumann BC
    if(j < height-1)
        s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
    else
        s_up = 0.0f; //Neumann BC

    //load u, v and du, dv
    float u_up, u_left, u_right, u_down, u;
    float v_up, v_left, v_right, v_down, v;
    float du_up, du_left, du_right, du_down, du;
    float dv_up, dv_left, dv_right, dv_down, dv;

    u_left  = tex1Dfetch(tex_u, pos_l);
    u_right = tex1Dfetch(tex_u, pos_r);
    u_down  = tex1Dfetch(tex_u, pos_d);
    u_up    = tex1Dfetch(tex_u, pos_u);
    u       = tex1Dfetch(tex_u, pos);

    v_left  = tex1Dfetch(tex_v, pos_l);
    v_right = tex1Dfetch(tex_v, pos_r);
    v_down  = tex1Dfetch(tex_v, pos_d);
    v       = tex1Dfetch(tex_v, pos);
    v_up    = tex1Dfetch(tex_v, pos_u);

    du       = tex1Dfetch(tex_du, pos);
    du_left  = tex1Dfetch(tex_du, pos_l);
    du_right = tex1Dfetch(tex_du, pos_r);
    du_down  = tex1Dfetch(tex_du, pos_d);
    du_up    = tex1Dfetch(tex_du, pos_u);

    dv       = tex1Dfetch(tex_dv, pos);
    dv_left  = tex1Dfetch(tex_dv, pos_l);
    dv_right = tex1Dfetch(tex_dv, pos_r);
    dv_down  = tex1Dfetch(tex_dv, pos_d);
    dv_up    = tex1Dfetch(tex_dv, pos_u);

    float numerator_dudv = g_numerator_dudv[pos];

    if((i+j)%2 == isBlack)
    {
        // update du
        float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
                             u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);

        du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;

        // update dv
        float numerator_v = (s_left*(v_left + dv_left) + s_up*(v_up + dv_up) + s_right*(v_right + dv_right) + s_down*(v_down + dv_down) -
                             v * (s_left + s_right + s_up + s_down) - g_numerator_v[pos] - numerator_dudv*du);

        dv = (1.0f - omega) * dv + omega * g_inv_denominator_v[pos] * numerator_v;
    }
    new_du[pos] = du;
    new_dv[pos] = dv;
}

///////////////////////////////////////////////////////////////////////////////
// utility functions
///////////////////////////////////////////////////////////////////////////////

void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.filterMode     = cudaFilterModePoint;
    tex.normalized     = false;
}

void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
    tex.addressMode[0] = cudaAddressModeMirror;
    tex.addressMode[1] = cudaAddressModeMirror;
    tex.filterMode     = cudaFilterModeLinear;
    tex.normalized     = true;
}

void InitTextures()
{
    initTexture2D(tex_I0);
    initTexture2D(tex_I1);
    initTexture2D(tex_fine);   // for downsampling
    initTexture2D(tex_coarse); // for prolongation

    initTexture2D(tex_Ix);
    initTexture2D(tex_Ixx);
    initTexture2D(tex_Ix0);

    initTexture2D(tex_Iy);
    initTexture2D(tex_Iyy);
    initTexture2D(tex_Iy0);

    initTexture2D(tex_Ixy);

    initTexture1D(tex_u);
    initTexture1D(tex_v);
    initTexture1D(tex_du);
    initTexture1D(tex_dv);
    initTexture1D(tex_diffusivity_x);
    initTexture1D(tex_diffusivity_y);
    initTexture1D(tex_inv_denominator_u);
    initTexture1D(tex_inv_denominator_v);
    initTexture1D(tex_numerator_dudv);
    initTexture1D(tex_numerator_u);
    initTexture1D(tex_numerator_v);
}

namespace
{
    struct ImagePyramid
    {
        std::vector<FloatVector*> img0;
        std::vector<FloatVector*> img1;

        std::vector<Ncv32u> w;
        std::vector<Ncv32u> h;

        explicit ImagePyramid(int outer_iterations)
        {
            img0.reserve(outer_iterations);
            img1.reserve(outer_iterations);

            w.reserve(outer_iterations);
            h.reserve(outer_iterations);
        }

        ~ImagePyramid()
        {
            w.clear();
            h.clear();

            for (int i = static_cast<int>(img0.size()) - 1; i >= 0; --i)
            {
                delete img1[i];
                delete img0[i];
            }

            img0.clear();
            img1.clear();
        }
    };
}

/////////////////////////////////////////////////////////////////////////////////////////
// MAIN FUNCTION
/////////////////////////////////////////////////////////////////////////////////////////
NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                             INCVMemAllocator &gpu_mem_allocator,
                             const NCVMatrix<Ncv32f> &frame0,
                             const NCVMatrix<Ncv32f> &frame1,
                             NCVMatrix<Ncv32f> &uOut,
                             NCVMatrix<Ncv32f> &vOut,
                             cudaStream_t stream)
{
    ncvAssertPrintReturn(desc.alpha > 0.0f                   , "Invalid alpha"                      , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.gamma >= 0.0f                  , "Invalid gamma"                      , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.number_of_inner_iterations > 0 , "Invalid number of inner iterations" , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.number_of_outer_iterations > 0 , "Invalid number of outer iterations" , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.number_of_solver_iterations > 0, "Invalid number of solver iterations", NCV_INCONSISTENT_INPUT);

    const Ncv32u kSourceWidth  = frame0.width();
    const Ncv32u kSourceHeight = frame0.height();

    ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
                    uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);

    ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    bool kSkipProcessing = gpu_mem_allocator.isCounting();

    int cuda_device;
    ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);

    cudaDeviceProp device_props;
    ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);

    Ncv32u alignmentValue = gpu_mem_allocator.alignment();

    const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
    const Ncv32u kSourcePitch = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);

    const Ncv32f scale_factor = desc.scale_factor;
    const Ncv32f alpha = desc.alpha;
    const Ncv32f gamma = desc.gamma;

    const Ncv32u kSizeInPixelsAligned = alignUp(kSourceWidth, kStrideAlignmentFloat)*kSourceHeight;

#if defined SAFE_VECTOR_DECL
#undef SAFE_VECTOR_DECL
#endif
#define SAFE_VECTOR_DECL(name, allocator, size) \
    FloatVector name((allocator), (size)); \
    ncvAssertReturn(name.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    // matrix elements
    SAFE_VECTOR_DECL(diffusivity_x, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(diffusivity_y, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(denom_u,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(denom_v,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(num_dudv, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(num_u,    gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(num_v,    gpu_mem_allocator, kSizeInPixelsAligned);

    // flow components
    SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(v, gpu_mem_allocator, kSizeInPixelsAligned);

    SAFE_VECTOR_DECL(u_new, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(v_new, gpu_mem_allocator, kSizeInPixelsAligned);

    // flow increments
    SAFE_VECTOR_DECL(du, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(dv, gpu_mem_allocator, kSizeInPixelsAligned);

    SAFE_VECTOR_DECL(du_new, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);

    // temporary storage
    SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
                     alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));

    // image derivatives
    SAFE_VECTOR_DECL(Ix,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Ixx, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Ix0, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Iy,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Iyy, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Iy0, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Ixy, gpu_mem_allocator, kSizeInPixelsAligned);

    // spatial derivative filter size
    const int kDFilterSize = 5;
    SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);

    if (!kSkipProcessing)
    {
        const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};

        ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
                                       cudaMemcpyHostToDevice), NCV_CUDA_ERROR);

        InitTextures();
    }

    //prepare image pyramid
    ImagePyramid pyr(desc.number_of_outer_iterations);

    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();

    float scale = 1.0f;

    //cuda arrays for frames
    std::auto_ptr<FloatVector> pI0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
    ncvAssertReturn(pI0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    std::auto_ptr<FloatVector> pI1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
    ncvAssertReturn(pI1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    if (!kSkipProcessing)
    {
        //copy frame data to device
        size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
        size_t src_width_in_bytes = kSourceWidth * sizeof(float);
        size_t src_pitch_in_bytes = frame0.pitch();

        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

        ncvAssertCUDAReturn(
            cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
    }

    FloatVector* I0 = pI0.release();
    FloatVector* I1 = pI1.release();

    //prepare pyramid
    pyr.img0.push_back(I0);
    pyr.img1.push_back(I1);

    pyr.w.push_back(kSourceWidth);
    pyr.h.push_back(kSourceHeight);

    scale *= scale_factor;

    Ncv32u prev_level_width  = kSourceWidth;
    Ncv32u prev_level_height = kSourceHeight;
    while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(pyr.img0.size()) < desc.number_of_outer_iterations))
    {
        //current resolution
        Ncv32u level_width  = static_cast<Ncv32u>(ceilf(kSourceWidth  * scale));
        Ncv32u level_height = static_cast<Ncv32u>(ceilf(kSourceHeight * scale));

        Ncv32u level_width_aligned = alignUp(level_width, kStrideAlignmentFloat);

        Ncv32u buffer_size = alignUp(level_width, kStrideAlignmentFloat) * level_height; // buffer size in floats

        Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);

        std::auto_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

        std::auto_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

        if (!kSkipProcessing)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);

            NcvSize32u srcSize(prev_level_width, prev_level_height);
            NcvSize32u dstSize(level_width, level_height);
            NcvRect32u srcROI(0, 0, prev_level_width, prev_level_height);
            NcvRect32u dstROI(0, 0, level_width, level_height);

            // frame 0
            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
                level_frame0->ptr(), dstSize, level_width_aligned * sizeof(float), dstROI, scale_factor, scale_factor, nppStSupersample) );

            // frame 1
            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
                level_frame1->ptr(), dstSize, level_width_aligned * sizeof(float), dstROI, scale_factor, scale_factor, nppStSupersample) );
        }

        I0 = level_frame0.release();
        I1 = level_frame1.release();

        //store pointers
        pyr.img0.push_back(I0);
        pyr.img1.push_back(I1);

        pyr.w.push_back(level_width);
        pyr.h.push_back(level_height);

        scale *= scale_factor;

        prev_level_width  = level_width;
        prev_level_height = level_height;
    }

    if (!kSkipProcessing)
    {
        //initial values for flow are 0
        ncvAssertCUDAReturn(cudaMemsetAsync(u.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);

        //select images with lowest resolution
        size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);

        FloatVector* ptrU = &u;
        FloatVector* ptrV = &v;
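        // ptrU/ptrV point at the flow for the current pyramid level; u_new/v_new below act as
        // scratch buffers that receive the prolongated flow before the pointers are swapped.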
        FloatVector* ptrUNew = &u_new;
        FloatVector* ptrVNew = &v_new;

        std::vector<FloatVector*>::const_reverse_iterator img0Iter = pyr.img0.rbegin();
        std::vector<FloatVector*>::const_reverse_iterator img1Iter = pyr.img1.rbegin();

        //outer loop
        //warping fixed point iteration
        while(!pyr.w.empty())
        {
            //current grid dimensions
            const Ncv32u kLevelWidth  = pyr.w.back();
            const Ncv32u kLevelHeight = pyr.h.back();
            const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);

            //size of current image in bytes
            const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);

            //number of points at current resolution
            const int kLevelSizeInPixels = kLevelStride * kLevelHeight;

            //initial guess for du and dv
            ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);

            //texture format descriptor
            cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();

            I0 = *img0Iter;
            I1 = *img1Iter;

            ++img0Iter;
            ++img1Iter;

            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);

            //compute derivatives
            dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
            dim3 dThreads(32, 6);

            const int kPitchTex = kLevelStride * sizeof(float);

            NcvSize32u srcSize(kLevelWidth, kLevelHeight);
            Ncv32u nSrcStep = kLevelStride * sizeof(float);
            NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);

            // Ix0
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Iy0
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Ix
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Iy
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Ixx
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Iyy
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Ixy
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix, Ix.ptr(), ch_desc, kLevelWidth, kLevelHeight,
                kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy,  Iy.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);

            // flow
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            // flow increments
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

            dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
            dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);

            dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
            dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);

            // inner loop
            // lagged nonlinearity fixed point iteration
            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
            for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
            {
                //compute coefficients
                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
                    (diffusivity_x.ptr(),
                     diffusivity_y.ptr(),
                     denom_u.ptr(),
                     denom_v.ptr(),
                     num_dudv.ptr(),
                     num_u.ptr(),
                     num_v.ptr(),
                     kLevelWidth,
                     kLevelHeight,
                     kLevelStride,
                     alpha,
                     gamma);

                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);

                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                // linear system coefficients
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                //solve linear system
                for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
                {
                    float omega = 1.99f;

                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                    sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
                        (du_new.ptr(),
                         dv_new.ptr(),
                         denom_u.ptr(),
                         denom_v.ptr(),
                         num_u.ptr(),
                         num_v.ptr(),
                         num_dudv.ptr(),
                         omega,
                         kLevelWidth,
                         kLevelHeight,
                         kLevelStride);

                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                    sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
                        (du.ptr(),
                         dv.ptr(),
                         denom_u.ptr(),
                         denom_v.ptr(),
                         num_u.ptr(),
                         num_v.ptr(),
                         num_dudv.ptr(),
                         omega,
                         kLevelWidth,
                         kLevelHeight,
                         kLevelStride);

                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                }//end of solver loop
            }// end of inner loop

            //update u and v
            add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
            ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
            add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
            ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

            //prolongate using texture
            pyr.w.pop_back();
            pyr.h.pop_back();
            if (!pyr.w.empty())
            {
                //compute new image size
                Ncv32u nw = pyr.w.back();
                Ncv32u nh = pyr.h.back();
                Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);

                dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
                dim3 p_threads(32, 8);

                NcvSize32u inner_srcSize(kLevelWidth, kLevelHeight);
                NcvSize32u dstSize(nw, nh);
                NcvRect32u srcROI(0, 0, kLevelWidth, kLevelHeight);
                NcvRect32u dstROI(0, 0, nw, nh);

                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof(float), srcROI,
                    ptrUNew->ptr(), dstSize, ns * sizeof(float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

                ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof(float), srcROI,
                    ptrVNew->ptr(), dstSize, ns * sizeof(float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

                ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
                ncvAssertCUDALastErrorReturn((int)NCV_CUDA_ERROR);

                cv::cuda::device::swap<FloatVector*>(ptrU, ptrUNew);
                cv::cuda::device::swap<FloatVector*>(ptrV, ptrVNew);
            }
            scale /= scale_factor;
        }

        // end of warping iterations
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), (int)NCV_CUDA_ERROR);

        ncvAssertCUDAReturn( cudaMemcpy2DAsync
            (uOut.ptr(), uOut.pitch(), ptrU->ptr(),
            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), (int)NCV_CUDA_ERROR );

        ncvAssertCUDAReturn( cudaMemcpy2DAsync
            (vOut.ptr(), vOut.pitch(), ptrV->ptr(),
            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), (int)NCV_CUDA_ERROR );

        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), (int)NCV_CUDA_ERROR);
    }

    return NCV_SUCCESS;
}