/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/border_interpolate.hpp"

#define tx threadIdx.x
#define ty threadIdx.y
#define bx blockIdx.x
#define by blockIdx.y
#define bdx blockDim.x
#define bdy blockDim.y

#define BORDER_SIZE 5
#define MAX_KSIZE_HALF 100

namespace cv { namespace cuda { namespace device { namespace optflow_farneback
{
    __constant__ float c_g[8];
    __constant__ float c_xg[8];
    __constant__ float c_xxg[8];
    __constant__ float c_ig11, c_ig03, c_ig33, c_ig55;


    template <int polyN>
    __global__ void polynomialExpansion(
            const int height, const int width, const PtrStepf src, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * (bdx - 2*polyN) + tx - polyN;

        if (y < height)
        {
            extern __shared__ float smem[];
            volatile float *row = smem + tx;
            int xWarped = ::min(::max(x, 0), width - 1);

            row[0] = src(y, xWarped) * c_g[0];
            row[bdx] = 0.f;
            row[2*bdx] = 0.f;

            for (int k = 1; k <= polyN; ++k)
            {
                float t0 = src(::max(y - k, 0), xWarped);
                float t1 = src(::min(y + k, height - 1), xWarped);

                row[0] += c_g[k] * (t0 + t1);
                row[bdx] += c_xg[k] * (t1 - t0);
                row[2*bdx] += c_xxg[k] * (t0 + t1);
            }

            __syncthreads();

            if (tx >= polyN && tx + polyN < bdx && x < width)
            {
                float b1 = c_g[0] * row[0];
                float b3 = c_g[0] * row[bdx];
                float b5 = c_g[0] * row[2*bdx];
                float b2 = 0, b4 = 0, b6 = 0;

                for (int k = 1; k <= polyN; ++k)
                {
                    b1 += (row[k] + row[-k]) * c_g[k];
                    b4 += (row[k] + row[-k]) * c_xxg[k];
                    b2 += (row[k] - row[-k]) * c_xg[k];
                    b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
                    b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
                    b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
                }

                dst(y, xWarped) = b3*c_ig11;
                dst(height + y, xWarped) = b2*c_ig11;
                dst(2*height + y, xWarped) = b1*c_ig03 + b5*c_ig33;
                dst(3*height + y, xWarped) = b1*c_ig03 + b4*c_ig33;
                dst(4*height + y, xWarped) = b6*c_ig55;
            }
        }
    }
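
    // Layout note: the expansion coefficients are stored as five planes of size
    // height x width stacked vertically in dst (so dst needs at least 5*height rows).
    // Each block produces bdx - 2*polyN output columns; the polyN threads on either
    // side only load the horizontal halo into shared memory and are masked out of the
    // final write by the tx-range check above.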


    void setPolynomialExpansionConsts(
            int polyN, const float *g, const float *xg, const float *xxg,
            float ig11, float ig03, float ig33, float ig55)
    {
        cudaSafeCall(cudaMemcpyToSymbol(c_g, g, (polyN + 1) * sizeof(*g)));
        cudaSafeCall(cudaMemcpyToSymbol(c_xg, xg, (polyN + 1) * sizeof(*xg)));
        cudaSafeCall(cudaMemcpyToSymbol(c_xxg, xxg, (polyN + 1) * sizeof(*xxg)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig11, &ig11, sizeof(ig11)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig03, &ig03, sizeof(ig03)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig33, &ig33, sizeof(ig33)));
        cudaSafeCall(cudaMemcpyToSymbol(c_ig55, &ig55, sizeof(ig55)));
    }


    void polynomialExpansionGpu(const PtrStepSzf &src, int polyN, PtrStepSzf dst, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid(divUp(src.cols, block.x - 2*polyN), src.rows);
        int smem = 3 * block.x * sizeof(float);

        if (polyN == 5)
            polynomialExpansion<5><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);
        else if (polyN == 7)
            polynomialExpansion<7><<<grid, block, smem, stream>>>(src.rows, src.cols, src, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }
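
    // Host-side usage sketch (illustrative only; the buffers and constants below are
    // assumed to be prepared by the caller, e.g. frame, R and the g/xg/xxg/ig* values):
    //
    //     setPolynomialExpansionConsts(5, g, xg, xxg, ig11, ig03, ig33, ig55);
    //     polynomialExpansionGpu(frame, 5, R, stream);   // R must have 5 * frame.rows rows
    //
    // Note that only polyN == 5 and polyN == 7 are instantiated; other values fall
    // through without launching a kernel.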


    __constant__ float c_border[BORDER_SIZE + 1];

    __global__ void updateMatrices(
            const int height, const int width, const PtrStepf flowx, const PtrStepf flowy,
            const PtrStepf R0, const PtrStepf R1, PtrStepf M)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        if (y < height && x < width)
        {
            float dx = flowx(y, x);
            float dy = flowy(y, x);
            float fx = x + dx;
            float fy = y + dy;

            int x1 = floorf(fx);
            int y1 = floorf(fy);
            fx -= x1; fy -= y1;

            float r2, r3, r4, r5, r6;

            if (x1 >= 0 && y1 >= 0 && x1 < width - 1 && y1 < height - 1)
            {
                float a00 = (1.f - fx) * (1.f - fy);
                float a01 = fx * (1.f - fy);
                float a10 = (1.f - fx) * fy;
                float a11 = fx * fy;

                r2 = a00 * R1(y1, x1) +
                     a01 * R1(y1, x1 + 1) +
                     a10 * R1(y1 + 1, x1) +
                     a11 * R1(y1 + 1, x1 + 1);

                r3 = a00 * R1(height + y1, x1) +
                     a01 * R1(height + y1, x1 + 1) +
                     a10 * R1(height + y1 + 1, x1) +
                     a11 * R1(height + y1 + 1, x1 + 1);

                r4 = a00 * R1(2*height + y1, x1) +
                     a01 * R1(2*height + y1, x1 + 1) +
                     a10 * R1(2*height + y1 + 1, x1) +
                     a11 * R1(2*height + y1 + 1, x1 + 1);

                r5 = a00 * R1(3*height + y1, x1) +
                     a01 * R1(3*height + y1, x1 + 1) +
                     a10 * R1(3*height + y1 + 1, x1) +
                     a11 * R1(3*height + y1 + 1, x1 + 1);

                r6 = a00 * R1(4*height + y1, x1) +
                     a01 * R1(4*height + y1, x1 + 1) +
                     a10 * R1(4*height + y1 + 1, x1) +
                     a11 * R1(4*height + y1 + 1, x1 + 1);

                r4 = (R0(2*height + y, x) + r4) * 0.5f;
                r5 = (R0(3*height + y, x) + r5) * 0.5f;
                r6 = (R0(4*height + y, x) + r6) * 0.25f;
            }
            else
            {
                r2 = r3 = 0.f;
                r4 = R0(2*height + y, x);
                r5 = R0(3*height + y, x);
                r6 = R0(4*height + y, x) * 0.5f;
            }

            r2 = (R0(y, x) - r2) * 0.5f;
            r3 = (R0(height + y, x) - r3) * 0.5f;

            r2 += r4*dy + r6*dx;
            r3 += r6*dy + r5*dx;

            float scale =
                    c_border[::min(x, BORDER_SIZE)] *
                    c_border[::min(y, BORDER_SIZE)] *
                    c_border[::min(width - x - 1, BORDER_SIZE)] *
                    c_border[::min(height - y - 1, BORDER_SIZE)];

            r2 *= scale; r3 *= scale; r4 *= scale;
            r5 *= scale; r6 *= scale;

            M(y, x) = r4*r4 + r6*r6;
            M(height + y, x) = (r4 + r5)*r6;
            M(2*height + y, x) = r5*r5 + r6*r6;
            M(3*height + y, x) = r4*r2 + r6*r3;
            M(4*height + y, x) = r6*r2 + r5*r3;
        }
    }
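
    // Summary: each of the five expansion planes of R1 is sampled at (x + dx, y + dy)
    // with bilinear interpolation, combined with the corresponding R0 plane, attenuated
    // by the border taper in c_border, and accumulated into the five planes of M that
    // updateFlow consumes. If the warped position falls outside the image, only the R0
    // values are used.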


    void setUpdateMatricesConsts()
    {
        static const float border[BORDER_SIZE + 1] = {0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f};
        cudaSafeCall(cudaMemcpyToSymbol(c_border, border, (BORDER_SIZE + 1) * sizeof(*border)));
    }
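
    // The taper weights grow from 0.14 at the image border to 1.0 at a distance of
    // BORDER_SIZE pixels; updateMatrices indexes c_border with
    // min(distance_to_edge, BORDER_SIZE), so interior pixels keep full weight.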


    void updateMatricesGpu(
            const PtrStepSzf flowx, const PtrStepSzf flowy, const PtrStepSzf R0, const PtrStepSzf R1,
            PtrStepSzf M, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));

        updateMatrices<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, flowx, flowy, R0, R1, M);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    __global__ void updateFlow(
            const int height, const int width, const PtrStepf M, PtrStepf flowx, PtrStepf flowy)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        if (y < height && x < width)
        {
            float g11 = M(y, x);
            float g12 = M(height + y, x);
            float g22 = M(2*height + y, x);
            float h1 = M(3*height + y, x);
            float h2 = M(4*height + y, x);

            float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);

            flowx(y, x) = (g11*h2 - g12*h1) * detInv;
            flowy(y, x) = (g22*h1 - g12*h2) * detInv;
        }
    }
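
    // The five planes of M encode, per pixel, the 2x2 linear system
    //
    //     [ g11  g12 ] [u]   [h1]
    //     [ g12  g22 ] [v] = [h2]
    //
    // which updateFlow solves in closed form, adding 1e-3 to the determinant to guard
    // against near-singular matrices; v is written to flowx and u to flowy.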


    void updateFlowGpu(const PtrStepSzf M, PtrStepSzf flowx, PtrStepSzf flowy, cudaStream_t stream)
    {
        dim3 block(32, 8);
        dim3 grid(divUp(flowx.cols, block.x), divUp(flowx.rows, block.y));

        updateFlow<<<grid, block, 0, stream>>>(flowx.rows, flowx.cols, M, flowx, flowy);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    /*__global__ void boxFilter(
            const int height, const int width, const PtrStepf src,
            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];
        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);

        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = ::min(::max(xExt, 0), width - 1);

                row[i] = src(y, xExt);
                for (int j = 1; j <= ksizeHalf; ++j)
                    row[i] += src(::max(y - j, 0), xExt) + src(::min(y + j, height - 1), xExt);
            }

            if (x < width)
            {
                __syncthreads();

                // Horizontal pass
                row += tx + ksizeHalf;
                float res = row[0];
                for (int i = 1; i <= ksizeHalf; ++i)
                    res += row[-i] + row[i];
                dst(y, x) = res * boxAreaInv;
            }
        }
    }


    void boxFilterGpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        dim3 block(256);
        dim3 grid(divUp(src.cols, block.x), divUp(src.rows, block.y));
        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);

        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
        boxFilter<<<grid, block, smem, stream>>>(src.rows, src.cols, src, ksizeHalf, boxAreaInv, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }*/


    __global__ void boxFilter5(
            const int height, const int width, const PtrStepf src,
            const int ksizeHalf, const float boxAreaInv, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];

        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
        volatile float *row = smem + 5 * ty * smw;

        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = ::min(::max(xExt, 0), width - 1);

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    row[k*smw + i] = src(k*height + y, xExt);

                for (int j = 1; j <= ksizeHalf; ++j)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        row[k*smw + i] +=
                                src(k*height + ::max(y - j, 0), xExt) +
                                src(k*height + ::min(y + j, height - 1), xExt);
            }

            if (x < width)
            {
                __syncthreads();

                // Horizontal pass

                row += tx + ksizeHalf;
                float res[5];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    res[k] = row[k*smw];

                for (int i = 1; i <= ksizeHalf; ++i)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        res[k] += row[k*smw - i] + row[k*smw + i];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    dst(k*height + y, x) = res[k] * boxAreaInv;
            }
        }
    }
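
    // boxFilter5 smooths all five planes of the matrix image in one pass: the vertical
    // sums for each plane are staged in shared memory (five rows of bdx + 2*ksizeHalf
    // floats per thread row), and the horizontal pass reads back the staged values and
    // scales by boxAreaInv. The source is expected to be the 5*height x width image
    // produced by updateMatricesGpu.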


    void boxFilter5Gpu(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;

        dim3 block(256);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);

        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    void boxFilter5Gpu_CC11(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;

        dim3 block(128);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);

        float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
        boxFilter5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, boxAreaInv, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }
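
    // The _CC11 variant differs from boxFilter5Gpu only in the block size (128 instead
    // of 256 threads), which halves the dynamic shared-memory request; presumably this
    // is meant to fit the tighter per-block limits of compute capability 1.x devices.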


    __constant__ float c_gKer[MAX_KSIZE_HALF + 1];

    template <typename Border>
    __global__ void gaussianBlur(
            const int height, const int width, const PtrStepf src, const int ksizeHalf,
            const Border b, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];
        volatile float *row = smem + ty * (bdx + 2*ksizeHalf);

        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = b.idx_col(xExt);
                row[i] = src(y, xExt) * c_gKer[0];
                for (int j = 1; j <= ksizeHalf; ++j)
                    row[i] +=
                            (src(b.idx_row_low(y - j), xExt) +
                             src(b.idx_row_high(y + j), xExt)) * c_gKer[j];
            }

            if (x < width)
            {
                __syncthreads();

                // Horizontal pass
                row += tx + ksizeHalf;
                float res = row[0] * c_gKer[0];
                for (int i = 1; i <= ksizeHalf; ++i)
                    res += (row[-i] + row[i]) * c_gKer[i];
                dst(y, x) = res;
            }
        }
    }
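
    // Separable Gaussian: the vertical pass convolves each column with c_gKer (using
    // the Border policy for out-of-range row indices) into a shared-memory row that
    // includes a ksizeHalf halo on both sides of the block; the horizontal pass then
    // convolves that row and writes one output pixel per thread.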


    void setGaussianBlurKernel(const float *gKer, int ksizeHalf)
    {
        cudaSafeCall(cudaMemcpyToSymbol(c_gKer, gKer, (ksizeHalf + 1) * sizeof(*gKer)));
    }


    template <typename Border>
    void gaussianBlurCaller(const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows;
        int width = src.cols;

        dim3 block(256);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * block.y * sizeof(float);
        Border b(height, width);

        gaussianBlur<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    void gaussianBlurGpu(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
            0 /*gaussianBlurCaller<BrdConstant<float> >*/,
            gaussianBlurCaller<BrdReplicate<float> >,
            0 /*gaussianBlurCaller<BrdReflect<float> >*/,
            0 /*gaussianBlurCaller<BrdWrap<float> >*/,
            gaussianBlurCaller<BrdReflect101<float> >
        };

        callers[borderMode](src, ksizeHalf, dst, stream);
    }
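
    // borderMode indexes this table directly; only the entries corresponding to
    // BORDER_REPLICATE (1) and BORDER_REFLECT_101 (4) are populated, so callers are
    // assumed to pass one of those two modes.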


    template <typename Border>
    __global__ void gaussianBlur5(
            const int height, const int width, const PtrStepf src, const int ksizeHalf,
            const Border b, PtrStepf dst)
    {
        const int y = by * bdy + ty;
        const int x = bx * bdx + tx;

        extern __shared__ float smem[];

        const int smw = bdx + 2*ksizeHalf; // shared memory "width"
        volatile float *row = smem + 5 * ty * smw;

        if (y < height)
        {
            // Vertical pass
            for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
            {
                int xExt = int(bx * bdx) + i - ksizeHalf;
                xExt = b.idx_col(xExt);

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    row[k*smw + i] = src(k*height + y, xExt) * c_gKer[0];

                for (int j = 1; j <= ksizeHalf; ++j)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        row[k*smw + i] +=
                                (src(k*height + b.idx_row_low(y - j), xExt) +
                                 src(k*height + b.idx_row_high(y + j), xExt)) * c_gKer[j];
            }

            if (x < width)
            {
                __syncthreads();

                // Horizontal pass

                row += tx + ksizeHalf;
                float res[5];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    res[k] = row[k*smw] * c_gKer[0];

                for (int i = 1; i <= ksizeHalf; ++i)
                    #pragma unroll
                    for (int k = 0; k < 5; ++k)
                        res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];

                #pragma unroll
                for (int k = 0; k < 5; ++k)
                    dst(k*height + y, x) = res[k];
            }
        }
    }


    template <typename Border, int blockDimX>
    void gaussianBlur5Caller(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, cudaStream_t stream)
    {
        int height = src.rows / 5;
        int width = src.cols;

        dim3 block(blockDimX);
        dim3 grid(divUp(width, block.x), divUp(height, block.y));
        int smem = (block.x + 2*ksizeHalf) * 5 * block.y * sizeof(float);
        Border b(height, width);

        gaussianBlur5<<<grid, block, smem, stream>>>(height, width, src, ksizeHalf, b, dst);

        cudaSafeCall(cudaGetLastError());

        if (stream == 0)
            cudaSafeCall(cudaDeviceSynchronize());
    }


    void gaussianBlur5Gpu(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
            0 /*gaussianBlur5Caller<BrdConstant<float>,256>*/,
            gaussianBlur5Caller<BrdReplicate<float>,256>,
            0 /*gaussianBlur5Caller<BrdReflect<float>,256>*/,
            0 /*gaussianBlur5Caller<BrdWrap<float>,256>*/,
            gaussianBlur5Caller<BrdReflect101<float>,256>
        };

        callers[borderMode](src, ksizeHalf, dst, stream);
    }

    void gaussianBlur5Gpu_CC11(
            const PtrStepSzf src, int ksizeHalf, PtrStepSzf dst, int borderMode, cudaStream_t stream)
    {
        typedef void (*caller_t)(const PtrStepSzf, int, PtrStepSzf, cudaStream_t);

        static const caller_t callers[] =
        {
            0 /*gaussianBlur5Caller<BrdConstant<float>,128>*/,
            gaussianBlur5Caller<BrdReplicate<float>,128>,
            0 /*gaussianBlur5Caller<BrdReflect<float>,128>*/,
            0 /*gaussianBlur5Caller<BrdWrap<float>,128>*/,
            gaussianBlur5Caller<BrdReflect101<float>,128>
        };

        callers[borderMode](src, ksizeHalf, dst, stream);
    }
}}}} // namespace cv { namespace cuda { namespace device { namespace optflow_farneback


#endif /* CUDA_DISABLER */