/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Brox et al Optical Flow algorithm
//
// Algorithm is explained in the original paper:
//      T. Brox, A. Bruhn, N. Papenberg, J. Weickert:
//      High accuracy optical flow estimation based on a theory for warping.
//      ECCV 2004.
//
// Implementation by Mikhail Smirnov
// email: msmirnov (at) nvidia.com, devsupport (at) nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Anton Obukhov, and Alexander Kharlamov.
//
////////////////////////////////////////////////////////////////////////////////

#include <iostream>
#include <vector>
#include <memory>

#include "opencv2/core/cuda/utility.hpp"

#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVBroxOpticalFlow.hpp"


typedef NCVVectorAlloc<Ncv32f> FloatVector;

/////////////////////////////////////////////////////////////////////////////////////////
// Implementation specific constants
/////////////////////////////////////////////////////////////////////////////////////////
__device__ const float eps2 = 1e-6f;

/////////////////////////////////////////////////////////////////////////////////////////
// Additional defines
/////////////////////////////////////////////////////////////////////////////////////////

// rounded up division
inline int iDivUp(int a, int b)
{
    return (a + b - 1)/b;
}
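
// Worked example (restating the helper above): iDivUp(1000, 256) == 4, so the launch
// wrappers below get 4 blocks of 256 threads (1024 threads) for 1000 elements; the
// out-of-range threads simply return early inside the kernels.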

/////////////////////////////////////////////////////////////////////////////////////////
// Texture references
/////////////////////////////////////////////////////////////////////////////////////////

texture<float, 2, cudaReadModeElementType> tex_coarse;
texture<float, 2, cudaReadModeElementType> tex_fine;

texture<float, 2, cudaReadModeElementType> tex_I1;
texture<float, 2, cudaReadModeElementType> tex_I0;

texture<float, 2, cudaReadModeElementType> tex_Ix;
texture<float, 2, cudaReadModeElementType> tex_Ixx;
texture<float, 2, cudaReadModeElementType> tex_Ix0;

texture<float, 2, cudaReadModeElementType> tex_Iy;
texture<float, 2, cudaReadModeElementType> tex_Iyy;
texture<float, 2, cudaReadModeElementType> tex_Iy0;

texture<float, 2, cudaReadModeElementType> tex_Ixy;

texture<float, 1, cudaReadModeElementType> tex_u;
texture<float, 1, cudaReadModeElementType> tex_v;
texture<float, 1, cudaReadModeElementType> tex_du;
texture<float, 1, cudaReadModeElementType> tex_dv;
texture<float, 1, cudaReadModeElementType> tex_numerator_dudv;
texture<float, 1, cudaReadModeElementType> tex_numerator_u;
texture<float, 1, cudaReadModeElementType> tex_numerator_v;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_u;
texture<float, 1, cudaReadModeElementType> tex_inv_denominator_v;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_x;
texture<float, 1, cudaReadModeElementType> tex_diffusivity_y;


/////////////////////////////////////////////////////////////////////////////////////////
// SUPPLEMENTARY FUNCTIONS
/////////////////////////////////////////////////////////////////////////////////////////

///////////////////////////////////////////////////////////////////////////////
/// \brief performs pointwise summation of two vectors stored in device memory
/// \param d_res    - pointer to resulting vector (device memory)
/// \param d_op1    - term #1 (device memory)
/// \param d_op2    - term #2 (device memory)
/// \param len    - vector size
///////////////////////////////////////////////////////////////////////////////
__global__ void pointwise_add(float *d_res, const float *d_op1, const float *d_op2, const int len)
{
    const int pos = blockIdx.x*blockDim.x + threadIdx.x;

    if(pos >= len) return;

    d_res[pos] = d_op1[pos] + d_op2[pos];
}

///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
///  Computes \b op1 + \b op2 and stores result to \b res
/// \param res   array, containing op1 + op2 (device memory)
/// \param op1   term #1 (device memory)
/// \param op2   term #2 (device memory)
/// \param count vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *op1, const float *op2, const int count, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 blocks(iDivUp(count, threads.x));

    pointwise_add<<<blocks, threads, 0, stream>>>(res, op1, op2, count);
}

///////////////////////////////////////////////////////////////////////////////
/// \brief wrapper for summation kernel.
/// Increments \b res by \b rhs
/// \param res   initial vector, will be replaced with result (device memory)
/// \param rhs   increment (device memory)
/// \param count vector size
///////////////////////////////////////////////////////////////////////////////
static void add(float *res, const float *rhs, const int count, cudaStream_t stream)
{
    add(res, res, rhs, count, stream);
}
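
// Usage (taken from the outer loop further below): accumulate the flow increment into
// the flow field in place, i.e. u += du over kLevelSizeInPixels elements on `stream`:
//     add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);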

///////////////////////////////////////////////////////////////////////////////
/// \brief kernel for scaling vector by scalar
/// \param d_res  scaled vector (device memory)
/// \param d_src  source vector (device memory)
/// \param scale  scalar to scale by
/// \param len    vector size (number of elements)
///////////////////////////////////////////////////////////////////////////////
__global__ void scaleVector(float *d_res, const float *d_src, float scale, const int len)
{
    const int pos = blockIdx.x * blockDim.x + threadIdx.x;

    if (pos >= len) return;

    d_res[pos] = d_src[pos] * scale;
}

///////////////////////////////////////////////////////////////////////////////
/// \brief scale vector by scalar
///
/// kernel wrapper
/// \param d_res  scaled vector (device memory)
/// \param d_src  source vector (device memory)
/// \param scale  scalar to scale by
/// \param len    vector size (number of elements)
/// \param stream CUDA stream
///////////////////////////////////////////////////////////////////////////////
static void ScaleVector(float *d_res, const float *d_src, float scale, const int len, cudaStream_t stream)
{
    dim3 threads(256);
    dim3 blocks(iDivUp(len, threads.x));

    scaleVector<<<blocks, threads, 0, stream>>>(d_res, d_src, scale, len);
}

const int SOR_TILE_WIDTH = 32;
const int SOR_TILE_HEIGHT = 6;
const int PSOR_TILE_WIDTH = 32;
const int PSOR_TILE_HEIGHT = 6;
const int PSOR_PITCH = PSOR_TILE_WIDTH + 4;
const int PSOR_HEIGHT = PSOR_TILE_HEIGHT + 4;

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smoothness term diffusivity along x axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_x(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
    //x derivative between pixels (i,j) and (i-1,j)
    const int left = pos-1;
    float u_x = u[pos] + du[pos] - u[left] - du[left];
    float v_x = v[pos] + dv[pos] - v[left] - dv[left];
    const int up        = pos + PSOR_PITCH;
    const int down      = pos - PSOR_PITCH;
    const int up_left   = up - 1;
    const int down_left = down-1;
    //y derivative between pixels (i,j) and (i-1,j)
    float u_y = 0.25f*(u[up] + du[up] + u[up_left] + du[up_left] - u[down] - du[down] - u[down_left] - du[down_left]);
    float v_y = 0.25f*(v[up] + dv[up] + v[up_left] + dv[up_left] - v[down] - dv[down] - v[down_left] - dv[down_left]);
    *s = 0.5f / sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Compute smoothness term diffusivity along y axis
///\param s (out) pointer to memory location for result (diffusivity)
///\param pos (in) position within shared memory array containing \b u
///\param u (in) shared memory array containing \b u
///\param v (in) shared memory array containing \b v
///\param du (in) shared memory array containing \b du
///\param dv (in) shared memory array containing \b dv
///////////////////////////////////////////////////////////////////////////////
__forceinline__ __device__ void diffusivity_along_y(float *s, int pos, const float *u, const float *v, const float *du, const float *dv)
{
    //y derivative between pixels (i,j) and (i,j-1)
    const int down = pos-PSOR_PITCH;
    float u_y = u[pos] + du[pos] - u[down] - du[down];
    float v_y = v[pos] + dv[pos] - v[down] - dv[down];
    const int right      = pos + 1;
    const int left       = pos - 1;
    const int down_right = down + 1;
    const int down_left  = down - 1;
    //x derivative between pixels (i,j) and (i,j-1)
    float u_x = 0.25f*(u[right] + u[down_right] + du[right] + du[down_right] - u[left] - u[down_left] - du[left] - du[down_left]);
    float v_x = 0.25f*(v[right] + v[down_right] + dv[right] + dv[down_right] - v[left] - v[down_left] - dv[left] - dv[down_left]);
    *s = 0.5f/sqrtf(u_x*u_x + v_x*v_x + u_y*u_y + v_y*v_y + eps2);
}
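
// Both helpers restate (in code) the derivative of the regularised penalty
// \f$ \Psi(s^2) = \sqrt{s^2 + \varepsilon^2} \f$ from Brox et al., namely
// \f$ \Psi'(s^2) = \frac{1}{2\sqrt{s^2 + \varepsilon^2}} \f$ with
// \f$ s^2 = u_x^2 + u_y^2 + v_x^2 + v_y^2 \f$, evaluated at the midpoint between the
// current pixel and its left (x) respectively lower (y) neighbour; eps2 above is
// \f$ \varepsilon^2 \f$.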

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load element of 2D global memory to shared memory
///\param smem pointer to shared memory array
///\param is shared memory array column
///\param js shared memory array row
///\param i global memory array column
///\param j global memory array row
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex_id>
__forceinline__ __device__ void load_array_element(float *smem, int is, int js, int i, int j, int w, int h, int p)
{
    //position within shared memory array
    const int ijs = js * PSOR_PITCH + is;
    //mirror reflection across borders
    i = max(i, -i-1);
    i = min(i, w-i+w-1);
    j = max(j, -j-1);
    j = min(j, h-j+h-1);
    const int pos = j * p + i;
    switch(tex_id){
        case 0:
            smem[ijs] = tex1Dfetch(tex_u, pos);
            break;
        case 1:
            smem[ijs] = tex1Dfetch(tex_v, pos);
            break;
        case 2:
            smem[ijs] = tex1Dfetch(tex_du, pos);
            break;
        case 3:
            smem[ijs] = tex1Dfetch(tex_dv, pos);
            break;
    }
}
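
// The two max/min pairs above implement mirror reflection of out-of-range indices,
// e.g. for w == 8: i = -1 -> 0, i = -2 -> 1, i = 8 -> 7, i = 9 -> 6 (and likewise for
// j with h), so the halo loads in load_array() never read outside the image.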

///////////////////////////////////////////////////////////////////////////////
///\brief Utility function. Load part (tile) of 2D global memory to shared memory
///\param smem pointer to target shared memory array
///\param ig column number within source
///\param jg row number within source
///\param w number of columns in global memory array
///\param h number of rows in global memory array
///\param p global memory array pitch in floats
///////////////////////////////////////////////////////////////////////////////
template<int tex>
__forceinline__ __device__ void load_array(float *smem, int ig, int jg, int w, int h, int p)
{
    const int i = threadIdx.x + 2;
    const int j = threadIdx.y + 2;
    load_array_element<tex>(smem, i, j, ig, jg, w, h, p);//load current pixel
    __syncthreads();
    if(threadIdx.y < 2)
    {
        //load bottom shadow elements
        load_array_element<tex>(smem, i, j-2, ig, jg-2, w, h, p);
        if(threadIdx.x < 2)
        {
            //load bottom right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j-2, ig+PSOR_TILE_WIDTH, jg-2, w, h, p);
            //load middle right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
        }
        else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
            //load bottom left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j-2, ig-PSOR_TILE_WIDTH, jg-2, w, h, p);
            //load middle left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
        }
    }
    else if(threadIdx.y >= PSOR_TILE_HEIGHT-2)
    {
        //load upper shadow elements
        load_array_element<tex>(smem, i, j+2, ig, jg+2, w, h, p);
        if(threadIdx.x < 2)
        {
            //load upper right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j+2, ig+PSOR_TILE_WIDTH, jg+2, w, h, p);
            //load middle right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
        }
        else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
            //load upper left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j+2, ig-PSOR_TILE_WIDTH, jg+2, w, h, p);
            //load middle left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
        }
    }
    else
    {
        //load middle shadow elements
        if(threadIdx.x < 2)
        {
            //load middle right shadow elements
            load_array_element<tex>(smem, i+PSOR_TILE_WIDTH, j, ig+PSOR_TILE_WIDTH, jg, w, h, p);
        }
        else if(threadIdx.x >= PSOR_TILE_WIDTH-2)
        {
            //load middle left shadow elements
            load_array_element<tex>(smem, i-PSOR_TILE_WIDTH, j, ig-PSOR_TILE_WIDTH, jg, w, h, p);
        }
    }
    __syncthreads();
}

///////////////////////////////////////////////////////////////////////////////
/// \brief computes matrix of linearised system for \c du, \c dv
/// Computed values reside in GPU memory. \n
/// Matrix computation is divided into two steps. This kernel performs the first step:\n
/// - compute smoothness term diffusivity between pixels - psi dash smooth
/// - compute robustness factor in the data term - psi dash data
/// \param diffusivity_x (in/out) diffusivity between pixels along x axis in smoothness term
/// \param diffusivity_y (in/out) diffusivity between pixels along y axis in smoothness term
/// \param denominator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param denominator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param numerator_dudv (in/out) precomputed part of expression for new du and dv value in SOR iteration
/// \param numerator_u (in/out) precomputed part of expression for new du value in SOR iteration
/// \param numerator_v (in/out) precomputed part of expression for new dv value in SOR iteration
/// \param w (in) frame width
/// \param h (in) frame height
/// \param s (in) pitch in floats
/// \param alpha (in) alpha in Brox model (flow smoothness)
/// \param gamma (in) gamma in Brox model (edge importance)
///////////////////////////////////////////////////////////////////////////////

__global__ void prepare_sor_stage_1_tex(float *diffusivity_x, float *diffusivity_y,
                                                        float *denominator_u, float *denominator_v,
                                                        float *numerator_dudv,
                                                        float *numerator_u, float *numerator_v,
                                                        int w, int h, int s,
                                                        float alpha, float gamma)
{
    __shared__ float u[PSOR_PITCH * PSOR_HEIGHT];
    __shared__ float v[PSOR_PITCH * PSOR_HEIGHT];
    __shared__ float du[PSOR_PITCH * PSOR_HEIGHT];
    __shared__ float dv[PSOR_PITCH * PSOR_HEIGHT];

    //position within tile
    const int i = threadIdx.x;
    const int j = threadIdx.y;
    //position within smem arrays
    const int ijs = (j+2) * PSOR_PITCH + i + 2;
    //position within global memory
    const int ig  = blockIdx.x * blockDim.x + threadIdx.x;
    const int jg  = blockIdx.y * blockDim.y + threadIdx.y;
    const int ijg = jg * s + ig;
    //position within texture
    float x = (float)ig + 0.5f;
    float y = (float)jg + 0.5f;
    //load u, v, du and dv to smem
    load_array<0>(u, ig, jg, w, h, s);
    load_array<1>(v, ig, jg, w, h, s);
    load_array<2>(du, ig, jg, w, h, s);
    load_array<3>(dv, ig, jg, w, h, s);
    //warped position
    float wx = (x + u[ijs])/(float)w;
    float wy = (y + v[ijs])/(float)h;
    x /= (float)w;
    y /= (float)h;
    //compute image derivatives
    const float Iz  = tex2D(tex_I1, wx, wy) - tex2D(tex_I0, x, y);
    const float Ix  = tex2D(tex_Ix, wx, wy);
    const float Ixz = Ix - tex2D(tex_Ix0, x, y);
    const float Ixy = tex2D(tex_Ixy, wx, wy);
    const float Ixx = tex2D(tex_Ixx, wx, wy);
    const float Iy  = tex2D(tex_Iy, wx, wy);
    const float Iyz = Iy - tex2D(tex_Iy0, x, y);
    const float Iyy = tex2D(tex_Iyy, wx, wy);
    //compute data term
    float q0, q1, q2;
    q0 = Iz  + Ix  * du[ijs] + Iy  * dv[ijs];
    q1 = Ixz + Ixx * du[ijs] + Ixy * dv[ijs];
    q2 = Iyz + Ixy * du[ijs] + Iyy * dv[ijs];
    float data_term = 0.5f * rsqrtf(q0*q0 + gamma*(q1*q1 + q2*q2) + eps2);
    //scale data term by 1/alpha
    data_term /= alpha;
    //compute smoothness term (diffusivity)
    float sx, sy;

    if(ig >= w || jg >= h) return;

    diffusivity_along_x(&sx, ijs, u, v, du, dv);
    diffusivity_along_y(&sy, ijs, u, v, du, dv);

    if(ig == 0) sx = 0.0f;
    if(jg == 0) sy = 0.0f;

    numerator_dudv[ijg] = data_term * (Ix*Iy + gamma * Ixy*(Ixx + Iyy));
    numerator_u[ijg]    = data_term * (Ix*Iz + gamma * (Ixx*Ixz + Ixy*Iyz));
    numerator_v[ijg]    = data_term * (Iy*Iz + gamma * (Iyy*Iyz + Ixy*Ixz));
    denominator_u[ijg]  = data_term * (Ix*Ix + gamma * (Ixy*Ixy + Ixx*Ixx));
    denominator_v[ijg]  = data_term * (Iy*Iy + gamma * (Ixy*Ixy + Iyy*Iyy));
    diffusivity_x[ijg]  = sx;
    diffusivity_y[ijg]  = sy;
}
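
// Note on the data term above (a restatement of the code, following Brox et al.):
// q0 is the linearised grey-value constancy residual I1(x+w) - I0(x), while q1 and q2
// are the linearised gradient constancy residuals for Ix and Iy. data_term is then
// \f$ \frac{1}{2\alpha\sqrt{q_0^2 + \gamma(q_1^2 + q_2^2) + \varepsilon^2}} \f$,
// i.e. the robust penalty derivative scaled by 1/alpha, which weights the
// numerator_* and denominator_* entries of the linear system written out at the end.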

///////////////////////////////////////////////////////////////////////////////
///\brief computes matrix of linearised system for \c du, \c dv
///\param inv_denominator_u (in/out) on input the data-term denominator for du, on output the reciprocal of the full denominator
///\param inv_denominator_v (in/out) on input the data-term denominator for dv, on output the reciprocal of the full denominator
///\param w (in) frame width
///\param h (in) frame height
///\param s (in) pitch in floats
///////////////////////////////////////////////////////////////////////////////
__global__ void prepare_sor_stage_2(float *inv_denominator_u, float *inv_denominator_v,
                                    int w, int h, int s)
{
    __shared__ float sx[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
    __shared__ float sy[(PSOR_TILE_WIDTH+1) * (PSOR_TILE_HEIGHT+1)];
    //position within tile
    const int i = threadIdx.x;
    const int j = threadIdx.y;
    //position within smem arrays
    const int ijs = j*(PSOR_TILE_WIDTH+1) + i;
    //position within global memory
    const int ig  = blockIdx.x * blockDim.x + threadIdx.x;
    const int jg  = blockIdx.y * blockDim.y + threadIdx.y;
    const int ijg = jg*s + ig;
    int inside = ig < w && jg < h;
    float denom_u;
    float denom_v;
    if(inside)
    {
        denom_u = inv_denominator_u[ijg];
        denom_v = inv_denominator_v[ijg];
    }
    if(inside)
    {
        sx[ijs] = tex1Dfetch(tex_diffusivity_x, ijg);
        sy[ijs] = tex1Dfetch(tex_diffusivity_y, ijg);
    }
    else
    {
        sx[ijs] = 0.0f;
        sy[ijs] = 0.0f;
    }
    int up = ijs+PSOR_TILE_WIDTH+1;
    if(j == PSOR_TILE_HEIGHT-1)
    {
        if(jg < h-1 && inside)
        {
            sy[up] = tex1Dfetch(tex_diffusivity_y, ijg + s);
        }
        else
        {
            sy[up] = 0.0f;
        }
    }
    int right = ijs + 1;
    if(threadIdx.x == PSOR_TILE_WIDTH-1)
    {
        if(ig < w-1 && inside)
        {
            sx[right] = tex1Dfetch(tex_diffusivity_x, ijg + 1);
        }
        else
        {
            sx[right] = 0.0f;
        }
    }
    __syncthreads();
    float diffusivity_sum;
    diffusivity_sum = sx[ijs] + sx[ijs+1] + sy[ijs] + sy[ijs+PSOR_TILE_WIDTH+1];
    if(inside)
    {
        denom_u += diffusivity_sum;
        denom_v += diffusivity_sum;
        inv_denominator_u[ijg] = 1.0f/denom_u;
        inv_denominator_v[ijg] = 1.0f/denom_v;
    }
}

/////////////////////////////////////////////////////////////////////////////////////////
// Red-Black SOR
/////////////////////////////////////////////////////////////////////////////////////////

template<int isBlack> __global__ void sor_pass(float *new_du,
                                               float *new_dv,
                                               const float *g_inv_denominator_u,
                                               const float *g_inv_denominator_v,
                                               const float *g_numerator_u,
                                               const float *g_numerator_v,
                                               const float *g_numerator_dudv,
                                               float omega,
                                               int width,
                                               int height,
                                               int stride)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if(i >= width || j >= height)
        return;

    const int pos = j * stride + i;
    const int pos_r = i < width - 1 ? pos + 1 : pos;
    const int pos_u = j < height - 1 ? pos + stride : pos;
    const int pos_d = j > 0 ? pos - stride : pos;
    const int pos_l = i > 0 ? pos - 1 : pos;

    //load smoothness term
    float s_up, s_left, s_right, s_down;
    s_left = tex1Dfetch(tex_diffusivity_x, pos);
    s_down = tex1Dfetch(tex_diffusivity_y, pos);
    if(i < width-1)
        s_right = tex1Dfetch(tex_diffusivity_x, pos_r);
    else
        s_right = 0.0f; //Neumann BC
    if(j < height-1)
        s_up = tex1Dfetch(tex_diffusivity_y, pos_u);
    else
        s_up = 0.0f; //Neumann BC

    //load u, v and du, dv
    float u_up, u_left, u_right, u_down, u;
    float v_up, v_left, v_right, v_down, v;
    float du_up, du_left, du_right, du_down, du;
    float dv_up, dv_left, dv_right, dv_down, dv;

    u_left  = tex1Dfetch(tex_u, pos_l);
    u_right = tex1Dfetch(tex_u, pos_r);
    u_down  = tex1Dfetch(tex_u, pos_d);
    u_up    = tex1Dfetch(tex_u, pos_u);
    u       = tex1Dfetch(tex_u, pos);

    v_left  = tex1Dfetch(tex_v, pos_l);
    v_right = tex1Dfetch(tex_v, pos_r);
    v_down  = tex1Dfetch(tex_v, pos_d);
    v       = tex1Dfetch(tex_v, pos);
    v_up    = tex1Dfetch(tex_v, pos_u);

    du       = tex1Dfetch(tex_du, pos);
    du_left  = tex1Dfetch(tex_du, pos_l);
    du_right = tex1Dfetch(tex_du, pos_r);
    du_down  = tex1Dfetch(tex_du, pos_d);
    du_up    = tex1Dfetch(tex_du, pos_u);

    dv       = tex1Dfetch(tex_dv, pos);
    dv_left  = tex1Dfetch(tex_dv, pos_l);
    dv_right = tex1Dfetch(tex_dv, pos_r);
    dv_down  = tex1Dfetch(tex_dv, pos_d);
    dv_up    = tex1Dfetch(tex_dv, pos_u);

    float numerator_dudv    = g_numerator_dudv[pos];

    if((i+j)%2 == isBlack)
    {
        // update du
        float numerator_u = (s_left*(u_left + du_left) + s_up*(u_up + du_up) + s_right*(u_right + du_right) + s_down*(u_down + du_down) -
                             u * (s_left + s_right + s_up + s_down) - g_numerator_u[pos] - numerator_dudv*dv);

        du = (1.0f - omega) * du + omega * g_inv_denominator_u[pos] * numerator_u;

        // update dv
        float numerator_v = (s_left*(v_left + dv_left) + s_up*(v_up + dv_up) + s_right*(v_right + dv_right) + s_down*(v_down + dv_down) -
                             v * (s_left + s_right + s_up + s_down) - g_numerator_v[pos] - numerator_dudv*du);

        dv = (1.0f - omega) * dv + omega * g_inv_denominator_v[pos] * numerator_v;
    }
    new_du[pos] = du;
    new_dv[pos] = dv;
}
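
// Checkerboard split: sor_pass<0> relaxes the cells with (i+j) even and sor_pass<1>
// those with (i+j) odd, each pass reading the other colour's latest values through
// tex_du/tex_dv; cells of the other colour are simply copied into new_du/new_dv.
// With the over-relaxation weight omega (1.99 in the solver loop below) each update
// blends the old value with the Gauss-Seidel estimate:
//     x_new = (1 - omega) * x_old + omega * x_gauss_seidel;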

///////////////////////////////////////////////////////////////////////////////
// utility functions
///////////////////////////////////////////////////////////////////////////////

void initTexture1D(texture<float, 1, cudaReadModeElementType> &tex)
{
    tex.addressMode[0] = cudaAddressModeClamp;
    tex.filterMode = cudaFilterModePoint;
    tex.normalized = false;
}

void initTexture2D(texture<float, 2, cudaReadModeElementType> &tex)
{
    tex.addressMode[0] = cudaAddressModeMirror;
    tex.addressMode[1] = cudaAddressModeMirror;
    tex.filterMode = cudaFilterModeLinear;
    tex.normalized = true;
}

void InitTextures()
{
    initTexture2D(tex_I0);
    initTexture2D(tex_I1);
    initTexture2D(tex_fine);      // for downsampling
    initTexture2D(tex_coarse);    // for prolongation

    initTexture2D(tex_Ix);
    initTexture2D(tex_Ixx);
    initTexture2D(tex_Ix0);

    initTexture2D(tex_Iy);
    initTexture2D(tex_Iyy);
    initTexture2D(tex_Iy0);

    initTexture2D(tex_Ixy);

    initTexture1D(tex_u);
    initTexture1D(tex_v);
    initTexture1D(tex_du);
    initTexture1D(tex_dv);
    initTexture1D(tex_diffusivity_x);
    initTexture1D(tex_diffusivity_y);
    initTexture1D(tex_inv_denominator_u);
    initTexture1D(tex_inv_denominator_v);
    initTexture1D(tex_numerator_dudv);
    initTexture1D(tex_numerator_u);
    initTexture1D(tex_numerator_v);
}

namespace
{
    struct ImagePyramid
    {
        std::vector<FloatVector*> img0;
        std::vector<FloatVector*> img1;

        std::vector<Ncv32u> w;
        std::vector<Ncv32u> h;

        explicit ImagePyramid(int outer_iterations)
        {
            img0.reserve(outer_iterations);
            img1.reserve(outer_iterations);

            w.reserve(outer_iterations);
            h.reserve(outer_iterations);
        }

        ~ImagePyramid()
        {
            w.clear();
            h.clear();

            for (int i = static_cast<int>(img0.size()) - 1; i >= 0; --i)
            {
                delete img1[i];
                delete img0[i];
            }

            img0.clear();
            img1.clear();
        }
    };
}

/////////////////////////////////////////////////////////////////////////////////////////
// MAIN FUNCTION
/////////////////////////////////////////////////////////////////////////////////////////
NCVStatus NCVBroxOpticalFlow(const NCVBroxOpticalFlowDescriptor desc,
                             INCVMemAllocator &gpu_mem_allocator,
                             const NCVMatrix<Ncv32f> &frame0,
                             const NCVMatrix<Ncv32f> &frame1,
                             NCVMatrix<Ncv32f> &uOut,
                             NCVMatrix<Ncv32f> &vOut,
                             cudaStream_t stream)
{
    ncvAssertPrintReturn(desc.alpha > 0.0f                   , "Invalid alpha"                      , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.gamma >= 0.0f                  , "Invalid gamma"                      , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.number_of_inner_iterations > 0 , "Invalid number of inner iterations" , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.number_of_outer_iterations > 0 , "Invalid number of outer iterations" , NCV_INCONSISTENT_INPUT);
    ncvAssertPrintReturn(desc.number_of_solver_iterations > 0, "Invalid number of solver iterations", NCV_INCONSISTENT_INPUT);

    const Ncv32u kSourceWidth  = frame0.width();
    const Ncv32u kSourceHeight = frame0.height();

    ncvAssertPrintReturn(frame1.width() == kSourceWidth && frame1.height() == kSourceHeight, "Frame dims do not match", NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(uOut.width() == kSourceWidth && vOut.width() == kSourceWidth &&
        uOut.height() == kSourceHeight && vOut.height() == kSourceHeight, NCV_INCONSISTENT_INPUT);

    ncvAssertReturn(gpu_mem_allocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    bool kSkipProcessing = gpu_mem_allocator.isCounting();

    int cuda_device;
    ncvAssertCUDAReturn(cudaGetDevice(&cuda_device), NCV_CUDA_ERROR);

    cudaDeviceProp device_props;
    ncvAssertCUDAReturn(cudaGetDeviceProperties(&device_props, cuda_device), NCV_CUDA_ERROR);

    Ncv32u alignmentValue = gpu_mem_allocator.alignment ();

    const Ncv32u kStrideAlignmentFloat = alignmentValue / sizeof(float);
    const Ncv32u kSourcePitch = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);

    const Ncv32f scale_factor = desc.scale_factor;
    const Ncv32f alpha = desc.alpha;
    const Ncv32f gamma = desc.gamma;

    const Ncv32u kSizeInPixelsAligned = alignUp(kSourceWidth, kStrideAlignmentFloat)*kSourceHeight;

#if defined SAFE_VECTOR_DECL
#undef SAFE_VECTOR_DECL
#endif
#define SAFE_VECTOR_DECL(name, allocator, size) \
    FloatVector name((allocator), (size)); \
    ncvAssertReturn(name.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
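
// For reference, SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned) expands to
//     FloatVector u((gpu_mem_allocator), (kSizeInPixelsAligned));
//     ncvAssertReturn(u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
// i.e. every buffer below is allocated through the caller-supplied allocator and the
// function bails out with NCV_ALLOCATOR_BAD_ALLOC if any allocation fails.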

    // matrix elements
    SAFE_VECTOR_DECL(diffusivity_x,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(diffusivity_y,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(denom_u,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(denom_v,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(num_dudv, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(num_u,    gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(num_v,    gpu_mem_allocator, kSizeInPixelsAligned);

    // flow components
    SAFE_VECTOR_DECL(u, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(v, gpu_mem_allocator, kSizeInPixelsAligned);

    SAFE_VECTOR_DECL(u_new, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(v_new, gpu_mem_allocator, kSizeInPixelsAligned);

    // flow increments
    SAFE_VECTOR_DECL(du, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(dv, gpu_mem_allocator, kSizeInPixelsAligned);

    SAFE_VECTOR_DECL(du_new, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(dv_new, gpu_mem_allocator, kSizeInPixelsAligned);

    // temporary storage
    SAFE_VECTOR_DECL(device_buffer, gpu_mem_allocator,
        alignUp(kSourceWidth, kStrideAlignmentFloat) * alignUp(kSourceHeight, kStrideAlignmentFloat));

    // image derivatives
    SAFE_VECTOR_DECL(Ix,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Ixx, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Ix0, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Iy,  gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Iyy, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Iy0, gpu_mem_allocator, kSizeInPixelsAligned);
    SAFE_VECTOR_DECL(Ixy, gpu_mem_allocator, kSizeInPixelsAligned);

    // spatial derivative filter size
    const int kDFilterSize = 5;
    SAFE_VECTOR_DECL(derivativeFilter, gpu_mem_allocator, kDFilterSize);

    if (!kSkipProcessing)
    {
        const float derivativeFilterHost[kDFilterSize] = {1.0f, -8.0f, 0.0f, 8.0f, -1.0f};

        ncvAssertCUDAReturn(cudaMemcpy(derivativeFilter.ptr(), derivativeFilterHost, sizeof(float) * kDFilterSize,
            cudaMemcpyHostToDevice), NCV_CUDA_ERROR);

        InitTextures();
    }

    //prepare image pyramid
    ImagePyramid pyr(desc.number_of_outer_iterations);

    cudaChannelFormatDesc channel_desc = cudaCreateChannelDesc<float>();

    float scale = 1.0f;

    //cuda arrays for frames
    std::auto_ptr<FloatVector> pI0(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
    ncvAssertReturn(pI0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    std::auto_ptr<FloatVector> pI1(new FloatVector(gpu_mem_allocator, kSizeInPixelsAligned));
    ncvAssertReturn(pI1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    if (!kSkipProcessing)
    {
        //copy frame data to device
        size_t dst_width_in_bytes = alignUp(kSourceWidth, kStrideAlignmentFloat) * sizeof(float);
        size_t src_width_in_bytes = kSourceWidth * sizeof(float);
        size_t src_pitch_in_bytes = frame0.pitch();

        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI0->ptr(), dst_width_in_bytes, frame0.ptr(),
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );

        ncvAssertCUDAReturn( cudaMemcpy2DAsync(pI1->ptr(), dst_width_in_bytes, frame1.ptr(),
            src_pitch_in_bytes, src_width_in_bytes, kSourceHeight, cudaMemcpyDeviceToDevice, stream), NCV_CUDA_ERROR );
    }

    FloatVector* I0 = pI0.release();
    FloatVector* I1 = pI1.release();

    //prepare pyramid
    pyr.img0.push_back(I0);
    pyr.img1.push_back(I1);

    pyr.w.push_back(kSourceWidth);
    pyr.h.push_back(kSourceHeight);

    scale *= scale_factor;

    Ncv32u prev_level_width  = kSourceWidth;
    Ncv32u prev_level_height = kSourceHeight;
    while((prev_level_width > 15) && (prev_level_height > 15) && (static_cast<Ncv32u>(pyr.img0.size()) < desc.number_of_outer_iterations))
    {
        //current resolution
        Ncv32u level_width  = static_cast<Ncv32u>(ceilf(kSourceWidth  * scale));
        Ncv32u level_height = static_cast<Ncv32u>(ceilf(kSourceHeight * scale));

        Ncv32u level_width_aligned  = alignUp(level_width,  kStrideAlignmentFloat);

        Ncv32u buffer_size = alignUp(level_width, kStrideAlignmentFloat) * level_height; // buffer size in floats

        Ncv32u prev_level_pitch = alignUp(prev_level_width, kStrideAlignmentFloat) * sizeof(float);

        std::auto_ptr<FloatVector> level_frame0(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame0->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

        std::auto_ptr<FloatVector> level_frame1(new FloatVector(gpu_mem_allocator, buffer_size));
        ncvAssertReturn(level_frame1->isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

        if (!kSkipProcessing)
        {
            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);

            NcvSize32u srcSize (prev_level_width, prev_level_height);
            NcvSize32u dstSize (level_width, level_height);
            NcvRect32u srcROI (0, 0, prev_level_width, prev_level_height);
            NcvRect32u dstROI (0, 0, level_width, level_height);

            // frame 0
            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I0->ptr(), srcSize, prev_level_pitch, srcROI,
                level_frame0->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );

            // frame 1
            ncvAssertReturnNcvStat( nppiStResize_32f_C1R (I1->ptr(), srcSize, prev_level_pitch, srcROI,
                level_frame1->ptr(), dstSize, level_width_aligned * sizeof (float), dstROI, scale_factor, scale_factor, nppStSupersample) );
        }

        I0 = level_frame0.release();
        I1 = level_frame1.release();

        //store pointers
        pyr.img0.push_back(I0);
        pyr.img1.push_back(I1);

        pyr.w.push_back(level_width);
        pyr.h.push_back(level_height);

        scale *= scale_factor;

        prev_level_width  = level_width;
        prev_level_height = level_height;
    }

    if (!kSkipProcessing)
    {
        //initial values for flow are 0
        ncvAssertCUDAReturn(cudaMemsetAsync(u.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaMemsetAsync(v.ptr(), 0, kSizeInPixelsAligned * sizeof(float), stream), NCV_CUDA_ERROR);

        //select images with lowest resolution
        size_t pitch = alignUp(pyr.w.back(), kStrideAlignmentFloat) * sizeof(float);
        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, pyr.img0.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, pyr.img1.back()->ptr(), channel_desc, pyr.w.back(), pyr.h.back(), pitch), NCV_CUDA_ERROR);
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);

        FloatVector* ptrU = &u;
        FloatVector* ptrV = &v;
        FloatVector* ptrUNew = &u_new;
        FloatVector* ptrVNew = &v_new;

        std::vector<FloatVector*>::const_reverse_iterator img0Iter = pyr.img0.rbegin();
        std::vector<FloatVector*>::const_reverse_iterator img1Iter = pyr.img1.rbegin();

        //outer loop
        //warping fixed point iteration
        while(!pyr.w.empty())
        {
            //current grid dimensions
            const Ncv32u kLevelWidth  = pyr.w.back();
            const Ncv32u kLevelHeight = pyr.h.back();
            const Ncv32u kLevelStride = alignUp(kLevelWidth, kStrideAlignmentFloat);

            //size of current image in bytes
            const int kLevelSizeInBytes = kLevelStride * kLevelHeight * sizeof(float);

            //number of points at current resolution
            const int kLevelSizeInPixels = kLevelStride * kLevelHeight;

            //initial guess for du and dv
            ncvAssertCUDAReturn(cudaMemsetAsync(du.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaMemsetAsync(dv.ptr(), 0, kLevelSizeInBytes, stream), NCV_CUDA_ERROR);

            //texture format descriptor
            cudaChannelFormatDesc ch_desc = cudaCreateChannelDesc<float>();

            I0 = *img0Iter;
            I1 = *img1Iter;

            ++img0Iter;
            ++img1Iter;

            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I0, I0->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_I1, I1->ptr(), ch_desc, kLevelWidth, kLevelHeight, kLevelStride*sizeof(float)), NCV_CUDA_ERROR);

            //compute derivatives
            dim3 dBlocks(iDivUp(kLevelWidth, 32), iDivUp(kLevelHeight, 6));
            dim3 dThreads(32, 6);

            const int kPitchTex = kLevelStride * sizeof(float);

            NcvSize32u srcSize(kLevelWidth, kLevelHeight);
            Ncv32u nSrcStep = kLevelStride * sizeof(float);
            NcvRect32u oROI(0, 0, kLevelWidth, kLevelHeight);

            // Ix0
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Ix0.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Iy0
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I0->ptr(), srcSize, nSrcStep, Iy0.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Ix
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Ix.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Iy
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (I1->ptr(), srcSize, nSrcStep, Iy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Ixx
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Ix.ptr(), srcSize, nSrcStep, Ixx.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Iyy
            ncvAssertReturnNcvStat( nppiStFilterColumnBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Iyy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            // Ixy
            ncvAssertReturnNcvStat( nppiStFilterRowBorder_32f_C1R (Iy.ptr(), srcSize, nSrcStep, Ixy.ptr(), srcSize, nSrcStep, oROI,
                nppStBorderMirror, derivativeFilter.ptr(), kDFilterSize, kDFilterSize/2, 1.0f/12.0f) );

            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix,  Ix.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixx, Ixx.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ix0, Ix0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy,  Iy.ptr(),  ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iyy, Iyy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Iy0, Iy0.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture2D(0, tex_Ixy, Ixy.ptr(), ch_desc, kLevelWidth, kLevelHeight, kPitchTex), NCV_CUDA_ERROR);

            //    flow
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_u, ptrU->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_v, ptrV->ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            //    flow increments
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
            ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

            dim3 psor_blocks(iDivUp(kLevelWidth, PSOR_TILE_WIDTH), iDivUp(kLevelHeight, PSOR_TILE_HEIGHT));
            dim3 psor_threads(PSOR_TILE_WIDTH, PSOR_TILE_HEIGHT);

            dim3 sor_blocks(iDivUp(kLevelWidth, SOR_TILE_WIDTH), iDivUp(kLevelHeight, SOR_TILE_HEIGHT));
            dim3 sor_threads(SOR_TILE_WIDTH, SOR_TILE_HEIGHT);

            // inner loop
            // lagged nonlinearity fixed point iteration
            ncvAssertCUDAReturn(cudaStreamSynchronize(stream), NCV_CUDA_ERROR);
            for (Ncv32u current_inner_iteration = 0; current_inner_iteration < desc.number_of_inner_iterations; ++current_inner_iteration)
            {
                //compute coefficients
                prepare_sor_stage_1_tex<<<psor_blocks, psor_threads, 0, stream>>>
                    (diffusivity_x.ptr(),
                     diffusivity_y.ptr(),
                     denom_u.ptr(),
                     denom_v.ptr(),
                     num_dudv.ptr(),
                     num_u.ptr(),
                     num_v.ptr(),
                     kLevelWidth,
                     kLevelHeight,
                     kLevelStride,
                     alpha,
                     gamma);

                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                prepare_sor_stage_2<<<psor_blocks, psor_threads, 0, stream>>>(denom_u.ptr(), denom_v.ptr(), kLevelWidth, kLevelHeight, kLevelStride);

                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                //    linear system coefficients
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_x, diffusivity_x.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_diffusivity_y, diffusivity_y.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_dudv, num_dudv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_u, num_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_numerator_v, num_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_u, denom_u.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                ncvAssertCUDAReturn(cudaBindTexture(0, tex_inv_denominator_v, denom_v.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                //solve linear system
                for (Ncv32u solver_iteration = 0; solver_iteration < desc.number_of_solver_iterations; ++solver_iteration)
                {
                    float omega = 1.99f;

                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                    sor_pass<0><<<sor_blocks, sor_threads, 0, stream>>>
                        (du_new.ptr(),
                        dv_new.ptr(),
                        denom_u.ptr(),
                        denom_v.ptr(),
                        num_u.ptr(),
                        num_v.ptr(),
                        num_dudv.ptr(),
                        omega,
                        kLevelWidth,
                        kLevelHeight,
                        kLevelStride);

                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv_new.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);

                    sor_pass<1><<<sor_blocks, sor_threads, 0, stream>>>
                        (du.ptr(),
                        dv.ptr(),
                        denom_u.ptr(),
                        denom_v.ptr(),
                        num_u.ptr(),
                        num_v.ptr(),
                        num_dudv.ptr(),
                        omega,
                        kLevelWidth,
                        kLevelHeight,
                        kLevelStride);

                    ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_du, du.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                    ncvAssertCUDAReturn(cudaBindTexture(0, tex_dv, dv.ptr(), ch_desc, kLevelSizeInBytes), NCV_CUDA_ERROR);
                }//end of solver loop
            }// end of inner loop

            //update u and v
            add(ptrU->ptr(), du.ptr(), kLevelSizeInPixels, stream);
            ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);
            add(ptrV->ptr(), dv.ptr(), kLevelSizeInPixels, stream);
            ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

            //prolongate using texture
            pyr.w.pop_back();
            pyr.h.pop_back();
            if (!pyr.w.empty())
            {
                //compute new image size
                Ncv32u nw = pyr.w.back();
                Ncv32u nh = pyr.h.back();
                Ncv32u ns = alignUp(nw, kStrideAlignmentFloat);

                dim3 p_blocks(iDivUp(nw, 32), iDivUp(nh, 8));
                dim3 p_threads(32, 8);

                NcvSize32u inner_srcSize (kLevelWidth, kLevelHeight);
                NcvSize32u dstSize (nw, nh);
                NcvRect32u srcROI (0, 0, kLevelWidth, kLevelHeight);
                NcvRect32u dstROI (0, 0, nw, nh);

                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrU->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
                    ptrUNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

                ScaleVector(ptrUNew->ptr(), ptrUNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
                ncvAssertCUDALastErrorReturn(NCV_CUDA_ERROR);

                ncvAssertReturnNcvStat( nppiStResize_32f_C1R (ptrV->ptr(), inner_srcSize, kLevelStride * sizeof (float), srcROI,
                    ptrVNew->ptr(), dstSize, ns * sizeof (float), dstROI, 1.0f/scale_factor, 1.0f/scale_factor, nppStBicubic) );

                ScaleVector(ptrVNew->ptr(), ptrVNew->ptr(), 1.0f/scale_factor, ns * nh, stream);
                ncvAssertCUDALastErrorReturn((int)NCV_CUDA_ERROR);

                cv::cuda::device::swap<FloatVector*>(ptrU, ptrUNew);
                cv::cuda::device::swap<FloatVector*>(ptrV, ptrVNew);
            }
            scale /= scale_factor;
        }

        // end of warping iterations
        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), (int)NCV_CUDA_ERROR);

        ncvAssertCUDAReturn( cudaMemcpy2DAsync
            (uOut.ptr(), uOut.pitch(), ptrU->ptr(),
            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), (int)NCV_CUDA_ERROR );

        ncvAssertCUDAReturn( cudaMemcpy2DAsync
            (vOut.ptr(), vOut.pitch(), ptrV->ptr(),
            kSourcePitch, kSourceWidth*sizeof(float), kSourceHeight, cudaMemcpyDeviceToDevice, stream), (int)NCV_CUDA_ERROR );

        ncvAssertCUDAReturn(cudaStreamSynchronize(stream), (int)NCV_CUDA_ERROR);
    }

    return NCV_SUCCESS;
}
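
////////////////////////////////////////////////////////////////////////////////
// Host-side usage sketch (illustrative only). It assumes frame0/frame1 and uOut/vOut
// are device NCVMatrix<Ncv32f> buffers of identical size, and that `allocator` is a
// concrete INCVMemAllocator (e.g. a stack allocator sized by a prior counting pass,
// which this function supports via gpu_mem_allocator.isCounting()). The descriptor
// values are example settings, not library defaults:
//
//     NCVBroxOpticalFlowDescriptor desc;
//     desc.alpha                       = 0.197f;   // smoothness weight
//     desc.gamma                       = 50.0f;    // gradient constancy weight
//     desc.scale_factor                = 0.8f;     // pyramid scale per level
//     desc.number_of_inner_iterations  = 10;       // lagged-nonlinearity iterations
//     desc.number_of_outer_iterations  = 77;       // warping iterations / pyramid levels
//     desc.number_of_solver_iterations = 10;       // red-black SOR sweeps
//
//     NCVStatus status = NCVBroxOpticalFlow(desc, allocator, frame0, frame1, uOut, vOut, stream);
//     ncvAssertReturnNcvStat(status);
////////////////////////////////////////////////////////////////////////////////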