Home | History | Annotate | Download | only in cuda
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #if !defined CUDA_DISABLER
     44 
     45 #include "opencv2/core/cuda/common.hpp"
     46 #include "opencv2/core/cuda/utility.hpp"
     47 #include "opencv2/core/cuda/functional.hpp"
     48 #include "opencv2/core/cuda/limits.hpp"
     49 #include "opencv2/core/cuda/vec_math.hpp"
     50 #include "opencv2/core/cuda/reduce.hpp"
     51 
     52 using namespace cv::cuda;
     53 using namespace cv::cuda::device;
     54 
     55 namespace pyrlk
     56 {
     57     __constant__ int c_winSize_x;
     58     __constant__ int c_winSize_y;
     59     __constant__ int c_halfWin_x;
     60     __constant__ int c_halfWin_y;
     61     __constant__ int c_iters;
     62 
     63     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_If(false, cudaFilterModeLinear, cudaAddressModeClamp);
     64     texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_If4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     65     texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_Ib(false, cudaFilterModePoint, cudaAddressModeClamp);
     66 
     67     texture<float, cudaTextureType2D, cudaReadModeElementType> tex_Jf(false, cudaFilterModeLinear, cudaAddressModeClamp);
     68     texture<float4, cudaTextureType2D, cudaReadModeElementType> tex_Jf4(false, cudaFilterModeLinear, cudaAddressModeClamp);
     69 
     70     template <int cn> struct Tex_I;
     71     template <> struct Tex_I<1>
     72     {
     73         static __device__ __forceinline__ float read(float x, float y)
     74         {
     75             return tex2D(tex_If, x, y);
     76         }
     77     };
     78     template <> struct Tex_I<4>
     79     {
     80         static __device__ __forceinline__ float4 read(float x, float y)
     81         {
     82             return tex2D(tex_If4, x, y);
     83         }
     84     };
     85 
     86     template <int cn> struct Tex_J;
     87     template <> struct Tex_J<1>
     88     {
     89         static __device__ __forceinline__ float read(float x, float y)
     90         {
     91             return tex2D(tex_Jf, x, y);
     92         }
     93     };
     94     template <> struct Tex_J<4>
     95     {
     96         static __device__ __forceinline__ float4 read(float x, float y)
     97         {
     98             return tex2D(tex_Jf4, x, y);
     99         }
    100     };
    101 
    102     __device__ __forceinline__ void accum(float& dst, float val)
    103     {
    104         dst += val;
    105     }
    106     __device__ __forceinline__ void accum(float& dst, const float4& val)
    107     {
    108         dst += val.x + val.y + val.z;
    109     }
    110 
    111     __device__ __forceinline__ float abs_(float a)
    112     {
    113         return ::fabsf(a);
    114     }
    115     __device__ __forceinline__ float4 abs_(const float4& a)
    116     {
    117         return abs(a);
    118     }
    119 
    120     template <int cn, int PATCH_X, int PATCH_Y, bool calcErr>
    121     __global__ void sparseKernel(const float2* prevPts, float2* nextPts, uchar* status, float* err, const int level, const int rows, const int cols)
    122     {
    123     #if __CUDA_ARCH__ <= 110
    124         const int BLOCK_SIZE = 128;
    125     #else
    126         const int BLOCK_SIZE = 256;
    127     #endif
    128 
    129         __shared__ float smem1[BLOCK_SIZE];
    130         __shared__ float smem2[BLOCK_SIZE];
    131         __shared__ float smem3[BLOCK_SIZE];
    132 
    133         const unsigned int tid = threadIdx.y * blockDim.x + threadIdx.x;
    134 
    135         float2 prevPt = prevPts[blockIdx.x];
    136         prevPt.x *= (1.0f / (1 << level));
    137         prevPt.y *= (1.0f / (1 << level));
    138 
    139         if (prevPt.x < 0 || prevPt.x >= cols || prevPt.y < 0 || prevPt.y >= rows)
    140         {
    141             if (tid == 0 && level == 0)
    142                 status[blockIdx.x] = 0;
    143 
    144             return;
    145         }
    146 
    147         prevPt.x -= c_halfWin_x;
    148         prevPt.y -= c_halfWin_y;
    149 
    150         // extract the patch from the first image, compute covariation matrix of derivatives
    151 
    152         float A11 = 0;
    153         float A12 = 0;
    154         float A22 = 0;
    155 
    156         typedef typename TypeVec<float, cn>::vec_type work_type;
    157 
    158         work_type I_patch   [PATCH_Y][PATCH_X];
    159         work_type dIdx_patch[PATCH_Y][PATCH_X];
    160         work_type dIdy_patch[PATCH_Y][PATCH_X];
    161 
    162         for (int yBase = threadIdx.y, i = 0; yBase < c_winSize_y; yBase += blockDim.y, ++i)
    163         {
    164             for (int xBase = threadIdx.x, j = 0; xBase < c_winSize_x; xBase += blockDim.x, ++j)
    165             {
    166                 float x = prevPt.x + xBase + 0.5f;
    167                 float y = prevPt.y + yBase + 0.5f;
    168 
    169                 I_patch[i][j] = Tex_I<cn>::read(x, y);
    170 
    171                 // Sharr Deriv
    172 
    173                 work_type dIdx = 3.0f * Tex_I<cn>::read(x+1, y-1) + 10.0f * Tex_I<cn>::read(x+1, y) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
    174                                  (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x-1, y) + 3.0f * Tex_I<cn>::read(x-1, y+1));
    175 
    176                 work_type dIdy = 3.0f * Tex_I<cn>::read(x-1, y+1) + 10.0f * Tex_I<cn>::read(x, y+1) + 3.0f * Tex_I<cn>::read(x+1, y+1) -
    177                                 (3.0f * Tex_I<cn>::read(x-1, y-1) + 10.0f * Tex_I<cn>::read(x, y-1) + 3.0f * Tex_I<cn>::read(x+1, y-1));
    178 
    179                 dIdx_patch[i][j] = dIdx;
    180                 dIdy_patch[i][j] = dIdy;
    181 
    182                 accum(A11, dIdx * dIdx);
    183                 accum(A12, dIdx * dIdy);
    184                 accum(A22, dIdy * dIdy);
    185             }
    186         }
    187 
    188         reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2, smem3), thrust::tie(A11, A12, A22), tid, thrust::make_tuple(plus<float>(), plus<float>(), plus<float>()));
    189 
    190     #if __CUDA_ARCH__ >= 300
    191         if (tid == 0)
    192         {
    193             smem1[0] = A11;
    194             smem2[0] = A12;
    195             smem3[0] = A22;
    196         }
    197     #endif
    198 
    199         __syncthreads();
    200 
    201         A11 = smem1[0];
    202         A12 = smem2[0];
    203         A22 = smem3[0];
    204 
    205         float D = A11 * A22 - A12 * A12;
    206 
    207         if (D < numeric_limits<float>::epsilon())
    208         {
    209             if (tid == 0 && level == 0)
    210                 status[blockIdx.x] = 0;
    211 
    212             return;
    213         }
    214 
    215         D = 1.f / D;
    216 
    217         A11 *= D;
    218         A12 *= D;
    219         A22 *= D;
    220 
    221         float2 nextPt = nextPts[blockIdx.x];
    222         nextPt.x *= 2.f;
    223         nextPt.y *= 2.f;
    224 
    225         nextPt.x -= c_halfWin_x;
    226         nextPt.y -= c_halfWin_y;
    227 
    228         for (int k = 0; k < c_iters; ++k)
    229         {
    230             if (nextPt.x < -c_halfWin_x || nextPt.x >= cols || nextPt.y < -c_halfWin_y || nextPt.y >= rows)
    231             {
    232                 if (tid == 0 && level == 0)
    233                     status[blockIdx.x] = 0;
    234 
    235                 return;
    236             }
    237 
    238             float b1 = 0;
    239             float b2 = 0;
    240 
    241             for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
    242             {
    243                 for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
    244                 {
    245                     work_type I_val = I_patch[i][j];
    246                     work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
    247 
    248                     work_type diff = (J_val - I_val) * 32.0f;
    249 
    250                     accum(b1, diff * dIdx_patch[i][j]);
    251                     accum(b2, diff * dIdy_patch[i][j]);
    252                 }
    253             }
    254 
    255             reduce<BLOCK_SIZE>(smem_tuple(smem1, smem2), thrust::tie(b1, b2), tid, thrust::make_tuple(plus<float>(), plus<float>()));
    256 
    257         #if __CUDA_ARCH__ >= 300
    258             if (tid == 0)
    259             {
    260                 smem1[0] = b1;
    261                 smem2[0] = b2;
    262             }
    263         #endif
    264 
    265             __syncthreads();
    266 
    267             b1 = smem1[0];
    268             b2 = smem2[0];
    269 
    270             float2 delta;
    271             delta.x = A12 * b2 - A22 * b1;
    272             delta.y = A12 * b1 - A11 * b2;
    273 
    274             nextPt.x += delta.x;
    275             nextPt.y += delta.y;
    276 
    277             if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
    278                 break;
    279         }
    280 
    281         float errval = 0;
    282         if (calcErr)
    283         {
    284             for (int y = threadIdx.y, i = 0; y < c_winSize_y; y += blockDim.y, ++i)
    285             {
    286                 for (int x = threadIdx.x, j = 0; x < c_winSize_x; x += blockDim.x, ++j)
    287                 {
    288                     work_type I_val = I_patch[i][j];
    289                     work_type J_val = Tex_J<cn>::read(nextPt.x + x + 0.5f, nextPt.y + y + 0.5f);
    290 
    291                     work_type diff = J_val - I_val;
    292 
    293                     accum(errval, abs_(diff));
    294                 }
    295             }
    296 
    297             reduce<BLOCK_SIZE>(smem1, errval, tid, plus<float>());
    298         }
    299 
    300         if (tid == 0)
    301         {
    302             nextPt.x += c_halfWin_x;
    303             nextPt.y += c_halfWin_y;
    304 
    305             nextPts[blockIdx.x] = nextPt;
    306 
    307             if (calcErr)
    308                 err[blockIdx.x] = static_cast<float>(errval) / (cn * c_winSize_x * c_winSize_y);
    309         }
    310     }
    311 
    312     template <int cn, int PATCH_X, int PATCH_Y>
    313     void sparse_caller(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
    314                        int level, dim3 block, cudaStream_t stream)
    315     {
    316         dim3 grid(ptcount);
    317 
    318         if (level == 0 && err)
    319             sparseKernel<cn, PATCH_X, PATCH_Y, true><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
    320         else
    321             sparseKernel<cn, PATCH_X, PATCH_Y, false><<<grid, block>>>(prevPts, nextPts, status, err, level, rows, cols);
    322 
    323         cudaSafeCall( cudaGetLastError() );
    324 
    325         if (stream == 0)
    326             cudaSafeCall( cudaDeviceSynchronize() );
    327     }
    328 
    329     template <bool calcErr>
    330     __global__ void denseKernel(PtrStepf u, PtrStepf v, const PtrStepf prevU, const PtrStepf prevV, PtrStepf err, const int rows, const int cols)
    331     {
    332         extern __shared__ int smem[];
    333 
    334         const int patchWidth  = blockDim.x + 2 * c_halfWin_x;
    335         const int patchHeight = blockDim.y + 2 * c_halfWin_y;
    336 
    337         int* I_patch = smem;
    338         int* dIdx_patch = I_patch + patchWidth * patchHeight;
    339         int* dIdy_patch = dIdx_patch + patchWidth * patchHeight;
    340 
    341         const int xBase = blockIdx.x * blockDim.x;
    342         const int yBase = blockIdx.y * blockDim.y;
    343 
    344         for (int i = threadIdx.y; i < patchHeight; i += blockDim.y)
    345         {
    346             for (int j = threadIdx.x; j < patchWidth; j += blockDim.x)
    347             {
    348                 float x = xBase - c_halfWin_x + j + 0.5f;
    349                 float y = yBase - c_halfWin_y + i + 0.5f;
    350 
    351                 I_patch[i * patchWidth + j] = tex2D(tex_Ib, x, y);
    352 
    353                 // Sharr Deriv
    354 
    355                 dIdx_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x+1, y-1) + 10 * tex2D(tex_Ib, x+1, y) + 3 * tex2D(tex_Ib, x+1, y+1) -
    356                                                 (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x-1, y) + 3 * tex2D(tex_Ib, x-1, y+1));
    357 
    358                 dIdy_patch[i * patchWidth + j] = 3 * tex2D(tex_Ib, x-1, y+1) + 10 * tex2D(tex_Ib, x, y+1) + 3 * tex2D(tex_Ib, x+1, y+1) -
    359                                                 (3 * tex2D(tex_Ib, x-1, y-1) + 10 * tex2D(tex_Ib, x, y-1) + 3 * tex2D(tex_Ib, x+1, y-1));
    360             }
    361         }
    362 
    363         __syncthreads();
    364 
    365         const int x = xBase + threadIdx.x;
    366         const int y = yBase + threadIdx.y;
    367 
    368         if (x >= cols || y >= rows)
    369             return;
    370 
    371         int A11i = 0;
    372         int A12i = 0;
    373         int A22i = 0;
    374 
    375         for (int i = 0; i < c_winSize_y; ++i)
    376         {
    377             for (int j = 0; j < c_winSize_x; ++j)
    378             {
    379                 int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
    380                 int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
    381 
    382                 A11i += dIdx * dIdx;
    383                 A12i += dIdx * dIdy;
    384                 A22i += dIdy * dIdy;
    385             }
    386         }
    387 
    388         float A11 = A11i;
    389         float A12 = A12i;
    390         float A22 = A22i;
    391 
    392         float D = A11 * A22 - A12 * A12;
    393 
    394         if (D < numeric_limits<float>::epsilon())
    395         {
    396             if (calcErr)
    397                 err(y, x) = numeric_limits<float>::max();
    398 
    399             return;
    400         }
    401 
    402         D = 1.f / D;
    403 
    404         A11 *= D;
    405         A12 *= D;
    406         A22 *= D;
    407 
    408         float2 nextPt;
    409         nextPt.x = x + prevU(y/2, x/2) * 2.0f;
    410         nextPt.y = y + prevV(y/2, x/2) * 2.0f;
    411 
    412         for (int k = 0; k < c_iters; ++k)
    413         {
    414             if (nextPt.x < 0 || nextPt.x >= cols || nextPt.y < 0 || nextPt.y >= rows)
    415             {
    416                 if (calcErr)
    417                     err(y, x) = numeric_limits<float>::max();
    418 
    419                 return;
    420             }
    421 
    422             int b1 = 0;
    423             int b2 = 0;
    424 
    425             for (int i = 0; i < c_winSize_y; ++i)
    426             {
    427                 for (int j = 0; j < c_winSize_x; ++j)
    428                 {
    429                     int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
    430                     int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
    431 
    432                     int diff = (J - I) * 32;
    433 
    434                     int dIdx = dIdx_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
    435                     int dIdy = dIdy_patch[(threadIdx.y + i) * patchWidth + (threadIdx.x + j)];
    436 
    437                     b1 += diff * dIdx;
    438                     b2 += diff * dIdy;
    439                 }
    440             }
    441 
    442             float2 delta;
    443             delta.x = A12 * b2 - A22 * b1;
    444             delta.y = A12 * b1 - A11 * b2;
    445 
    446             nextPt.x += delta.x;
    447             nextPt.y += delta.y;
    448 
    449             if (::fabs(delta.x) < 0.01f && ::fabs(delta.y) < 0.01f)
    450                 break;
    451         }
    452 
    453         u(y, x) = nextPt.x - x;
    454         v(y, x) = nextPt.y - y;
    455 
    456         if (calcErr)
    457         {
    458             int errval = 0;
    459 
    460             for (int i = 0; i < c_winSize_y; ++i)
    461             {
    462                 for (int j = 0; j < c_winSize_x; ++j)
    463                 {
    464                     int I = I_patch[(threadIdx.y + i) * patchWidth + threadIdx.x + j];
    465                     int J = tex2D(tex_Jf, nextPt.x - c_halfWin_x + j + 0.5f, nextPt.y - c_halfWin_y + i + 0.5f);
    466 
    467                     errval += ::abs(J - I);
    468                 }
    469             }
    470 
    471             err(y, x) = static_cast<float>(errval) / (c_winSize_x * c_winSize_y);
    472         }
    473     }
    474 
    475     void loadConstants(int2 winSize, int iters, cudaStream_t stream)
    476     {
    477         cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_x, &winSize.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    478         cudaSafeCall( cudaMemcpyToSymbolAsync(c_winSize_y, &winSize.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    479 
    480         int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
    481         cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_x, &halfWin.x, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    482         cudaSafeCall( cudaMemcpyToSymbolAsync(c_halfWin_y, &halfWin.y, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    483 
    484         cudaSafeCall( cudaMemcpyToSymbolAsync(c_iters, &iters, sizeof(int), 0, cudaMemcpyHostToDevice, stream) );
    485     }
    486 
    487     void sparse1(PtrStepSzf I, PtrStepSzf J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
    488                  int level, dim3 block, dim3 patch, cudaStream_t stream)
    489     {
    490         typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
    491                                int level, dim3 block, cudaStream_t stream);
    492 
    493         static const func_t funcs[5][5] =
    494         {
    495             {sparse_caller<1, 1, 1>, sparse_caller<1, 2, 1>, sparse_caller<1, 3, 1>, sparse_caller<1, 4, 1>, sparse_caller<1, 5, 1>},
    496             {sparse_caller<1, 1, 2>, sparse_caller<1, 2, 2>, sparse_caller<1, 3, 2>, sparse_caller<1, 4, 2>, sparse_caller<1, 5, 2>},
    497             {sparse_caller<1, 1, 3>, sparse_caller<1, 2, 3>, sparse_caller<1, 3, 3>, sparse_caller<1, 4, 3>, sparse_caller<1, 5, 3>},
    498             {sparse_caller<1, 1, 4>, sparse_caller<1, 2, 4>, sparse_caller<1, 3, 4>, sparse_caller<1, 4, 4>, sparse_caller<1, 5, 4>},
    499             {sparse_caller<1, 1, 5>, sparse_caller<1, 2, 5>, sparse_caller<1, 3, 5>, sparse_caller<1, 4, 5>, sparse_caller<1, 5, 5>}
    500         };
    501 
    502         bindTexture(&tex_If, I);
    503         bindTexture(&tex_Jf, J);
    504 
    505         funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
    506             level, block, stream);
    507     }
    508 
    509     void sparse4(PtrStepSz<float4> I, PtrStepSz<float4> J, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
    510                  int level, dim3 block, dim3 patch, cudaStream_t stream)
    511     {
    512         typedef void (*func_t)(int rows, int cols, const float2* prevPts, float2* nextPts, uchar* status, float* err, int ptcount,
    513                                int level, dim3 block, cudaStream_t stream);
    514 
    515         static const func_t funcs[5][5] =
    516         {
    517             {sparse_caller<4, 1, 1>, sparse_caller<4, 2, 1>, sparse_caller<4, 3, 1>, sparse_caller<4, 4, 1>, sparse_caller<4, 5, 1>},
    518             {sparse_caller<4, 1, 2>, sparse_caller<4, 2, 2>, sparse_caller<4, 3, 2>, sparse_caller<4, 4, 2>, sparse_caller<4, 5, 2>},
    519             {sparse_caller<4, 1, 3>, sparse_caller<4, 2, 3>, sparse_caller<4, 3, 3>, sparse_caller<4, 4, 3>, sparse_caller<4, 5, 3>},
    520             {sparse_caller<4, 1, 4>, sparse_caller<4, 2, 4>, sparse_caller<4, 3, 4>, sparse_caller<4, 4, 4>, sparse_caller<4, 5, 4>},
    521             {sparse_caller<4, 1, 5>, sparse_caller<4, 2, 5>, sparse_caller<4, 3, 5>, sparse_caller<4, 4, 5>, sparse_caller<4, 5, 5>}
    522         };
    523 
    524         bindTexture(&tex_If4, I);
    525         bindTexture(&tex_Jf4, J);
    526 
    527         funcs[patch.y - 1][patch.x - 1](I.rows, I.cols, prevPts, nextPts, status, err, ptcount,
    528             level, block, stream);
    529     }
    530 
    531     void dense(PtrStepSzb I, PtrStepSzf J, PtrStepSzf u, PtrStepSzf v, PtrStepSzf prevU, PtrStepSzf prevV, PtrStepSzf err, int2 winSize, cudaStream_t stream)
    532     {
    533         dim3 block(16, 16);
    534         dim3 grid(divUp(I.cols, block.x), divUp(I.rows, block.y));
    535 
    536         bindTexture(&tex_Ib, I);
    537         bindTexture(&tex_Jf, J);
    538 
    539         int2 halfWin = make_int2((winSize.x - 1) / 2, (winSize.y - 1) / 2);
    540         const int patchWidth  = block.x + 2 * halfWin.x;
    541         const int patchHeight = block.y + 2 * halfWin.y;
    542         size_t smem_size = 3 * patchWidth * patchHeight * sizeof(int);
    543 
    544         if (err.data)
    545         {
    546             denseKernel<true><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, err, I.rows, I.cols);
    547             cudaSafeCall( cudaGetLastError() );
    548         }
    549         else
    550         {
    551             denseKernel<false><<<grid, block, smem_size, stream>>>(u, v, prevU, prevV, PtrStepf(), I.rows, I.cols);
    552             cudaSafeCall( cudaGetLastError() );
    553         }
    554 
    555         if (stream == 0)
    556             cudaSafeCall( cudaDeviceSynchronize() );
    557     }
    558 }
    559 
    560 #endif /* CUDA_DISABLER */
    561