Home | History | Annotate | Download | only in opencl
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
     14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // @Authors
     18 //    Sen Liu, swjtuls1987 (at) 126.com
     19 //
     20 // Redistribution and use in source and binary forms, with or without modification,
     21 // are permitted provided that the following conditions are met:
     22 //
     23 //   * Redistribution's of source code must retain the above copyright notice,
     24 //     this list of conditions and the following disclaimer.
     25 //
     26 //   * Redistribution's in binary form must reproduce the above copyright notice,
     27 //     this list of conditions and the following disclaimer in the documentation
     28 //     and/or other materials provided with the distribution.
     29 //
     30 //   * The name of the copyright holders may not be used to endorse or promote products
     31 //     derived from this software without specific prior written permission.
     32 //
     33 // This software is provided by the copyright holders and contributors as is and
     34 // any express or implied warranties, including, but not limited to, the implied
     35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     36 // In no event shall the Intel Corporation or contributors be liable for any direct,
     37 // indirect, incidental, special, exemplary, or consequential damages
     38 // (including, but not limited to, procurement of substitute goods or services;
     39 // loss of use, data, or profits; or business interruption) however caused
     40 // and on any theory of liability, whether in contract, strict liability,
     41 // or tort (including negligence or otherwise) arising in any way out of
     42 // the use of this software, even if advised of the possibility of such damage.
     43 //
     44 //M*/
     45 
     46 
     47 #define tx  (int)get_local_id(0)
     48 #define ty  get_local_id(1)
     49 #define bx  get_group_id(0)
     50 #define bdx (int)get_local_size(0)
     51 
     52 #define BORDER_SIZE 5
     53 #define MAX_KSIZE_HALF 100
     54 
     55 #ifndef polyN
     56 #define polyN 5
     57 #endif
     58 
     59 #if USE_DOUBLE
     60 #ifdef cl_amd_fp64
     61 #pragma OPENCL EXTENSION cl_amd_fp64:enable
     62 #elif defined (cl_khr_fp64)
     63 #pragma OPENCL EXTENSION cl_khr_fp64:enable
     64 #endif
     65 #define TYPE double
     66 #define VECTYPE double4
     67 #else
     68 #define TYPE float
     69 #define VECTYPE float4
     70 #endif
     71 
     72 __kernel void polynomialExpansion(__global __const float * src, int srcStep,
     73                                   __global float * dst, int dstStep,
     74                                   const int rows, const  int cols,
     75                                   __global __const float * c_g,
     76                                   __global __const float * c_xg,
     77                                   __global __const float * c_xxg,
     78                                   __local float * smem,
     79                                   const VECTYPE ig)
     80 {
     81     const int y = get_global_id(1);
     82     const int x = bx * (bdx - 2*polyN) + tx - polyN;
     83 
     84     int xWarped;
     85     __local float *row = smem + tx;
     86 
     87     if (y < rows && y >= 0)
     88     {
     89         xWarped = min(max(x, 0), cols - 1);
     90 
     91         row[0] = src[mad24(y, srcStep, xWarped)] * c_g[0];
     92         row[bdx] = 0.f;
     93         row[2*bdx] = 0.f;
     94 
     95 #pragma unroll
     96         for (int k = 1; k <= polyN; ++k)
     97         {
     98             float t0 = src[mad24(max(y - k, 0), srcStep, xWarped)];
     99             float t1 = src[mad24(min(y + k, rows - 1), srcStep, xWarped)];
    100 
    101             row[0] += c_g[k] * (t0 + t1);
    102             row[bdx] += c_xg[k] * (t1 - t0);
    103             row[2*bdx] += c_xxg[k] * (t0 + t1);
    104         }
    105     }
    106 
    107     barrier(CLK_LOCAL_MEM_FENCE);
    108 
    109     if (y < rows && y >= 0 && tx >= polyN && tx + polyN < bdx && x < cols)
    110     {
    111         TYPE b1 = c_g[0] * row[0];
    112         TYPE b3 = c_g[0] * row[bdx];
    113         TYPE b5 = c_g[0] * row[2*bdx];
    114         TYPE b2 = 0, b4 = 0, b6 = 0;
    115 
    116 #pragma unroll
    117         for (int k = 1; k <= polyN; ++k)
    118         {
    119             b1 += (row[k] + row[-k]) * c_g[k];
    120             b4 += (row[k] + row[-k]) * c_xxg[k];
    121             b2 += (row[k] - row[-k]) * c_xg[k];
    122             b3 += (row[k + bdx] + row[-k + bdx]) * c_g[k];
    123             b6 += (row[k + bdx] - row[-k + bdx]) * c_xg[k];
    124             b5 += (row[k + 2*bdx] + row[-k + 2*bdx]) * c_g[k];
    125         }
    126 
    127         dst[mad24(y, dstStep, xWarped)] = (float)(b3*ig.s0);
    128         dst[mad24(rows + y, dstStep, xWarped)] = (float)(b2*ig.s0);
    129         dst[mad24(2*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b5*ig.s2);
    130         dst[mad24(3*rows + y, dstStep, xWarped)] = (float)(b1*ig.s1 + b4*ig.s2);
    131         dst[mad24(4*rows + y, dstStep, xWarped)] = (float)(b6*ig.s3);
    132     }
    133 }
    134 
    135 inline int idx_row_low(const int y, const int last_row)
    136 {
    137     return abs(y) % (last_row + 1);
    138 }
    139 
    140 inline int idx_row_high(const int y, const int last_row)
    141 {
    142     return abs(last_row - abs(last_row - y)) % (last_row + 1);
    143 }
    144 
    145 inline int idx_col_low(const int x, const int last_col)
    146 {
    147     return abs(x) % (last_col + 1);
    148 }
    149 
    150 inline int idx_col_high(const int x, const int last_col)
    151 {
    152     return abs(last_col - abs(last_col - x)) % (last_col + 1);
    153 }
    154 
    155 inline int idx_col(const int x, const int last_col)
    156 {
    157     return idx_col_low(idx_col_high(x, last_col), last_col);
    158 }
    159 
    160 __kernel void gaussianBlur(__global const float * src, int srcStep,
    161                            __global float * dst, int dstStep, const int rows, const  int cols,
    162                            __global const float * c_gKer, const int ksizeHalf,
    163                            __local float * smem)
    164 {
    165     const int y = get_global_id(1);
    166     const int x = get_global_id(0);
    167 
    168     __local float *row = smem + ty * (bdx + 2*ksizeHalf);
    169 
    170     if (y < rows)
    171     {
    172         // Vertical pass
    173         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
    174         {
    175             int xExt = (int)(bx * bdx) + i - ksizeHalf;
    176             xExt = idx_col(xExt, cols - 1);
    177             row[i] = src[mad24(y, srcStep, xExt)] * c_gKer[0];
    178             for (int j = 1; j <= ksizeHalf; ++j)
    179                 row[i] += (src[mad24(idx_row_low(y - j, rows - 1), srcStep, xExt)]
    180                            + src[mad24(idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];
    181         }
    182     }
    183 
    184     barrier(CLK_LOCAL_MEM_FENCE);
    185 
    186     if (y < rows && y >= 0 && x < cols && x >= 0)
    187     {
    188         // Horizontal pass
    189         row += tx + ksizeHalf;
    190         float res = row[0] * c_gKer[0];
    191         for (int i = 1; i <= ksizeHalf; ++i)
    192             res += (row[-i] + row[i]) * c_gKer[i];
    193 
    194         dst[mad24(y, dstStep, x)] = res;
    195     }
    196 }
    197 
    198 __kernel void gaussianBlur5(__global const float * src, int srcStep,
    199                             __global float * dst, int dstStep,
    200                             const int rows, const  int cols,
    201                             __global const float * c_gKer, const int ksizeHalf,
    202                             __local float * smem)
    203 {
    204     const int y = get_global_id(1);
    205     const int x = get_global_id(0);
    206 
    207     const int smw = bdx + 2*ksizeHalf; // shared memory "cols"
    208     __local volatile float *row = smem + 5 * ty * smw;
    209 
    210     if (y < rows)
    211     {
    212         // Vertical pass
    213         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
    214         {
    215             int xExt = (int)(bx * bdx) + i - ksizeHalf;
    216             xExt = idx_col(xExt, cols - 1);
    217 
    218 #pragma unroll
    219             for (int k = 0; k < 5; ++k)
    220                 row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)] * c_gKer[0];
    221 
    222             for (int j = 1; j <= ksizeHalf; ++j)
    223 #pragma unroll
    224                 for (int k = 0; k < 5; ++k)
    225                     row[k*smw + i] +=
    226                         (src[mad24(k*rows + idx_row_low(y - j, rows - 1), srcStep, xExt)] +
    227                          src[mad24(k*rows + idx_row_high(y + j, rows - 1), srcStep, xExt)]) * c_gKer[j];
    228         }
    229     }
    230 
    231     barrier(CLK_LOCAL_MEM_FENCE);
    232 
    233     if (y < rows && y >= 0 && x < cols && x >= 0)
    234     {
    235         // Horizontal pass
    236 
    237         row += tx + ksizeHalf;
    238         float res[5];
    239 
    240 #pragma unroll
    241         for (int k = 0; k < 5; ++k)
    242             res[k] = row[k*smw] * c_gKer[0];
    243 
    244         for (int i = 1; i <= ksizeHalf; ++i)
    245 #pragma unroll
    246             for (int k = 0; k < 5; ++k)
    247                 res[k] += (row[k*smw - i] + row[k*smw + i]) * c_gKer[i];
    248 
    249 #pragma unroll
    250         for (int k = 0; k < 5; ++k)
    251             dst[mad24(k*rows + y, dstStep, x)] = res[k];
    252     }
    253 }
    254 __constant float c_border[BORDER_SIZE + 1] = { 0.14f, 0.14f, 0.4472f, 0.4472f, 0.4472f, 1.f };
    255 
    256 __kernel void updateMatrices(__global const float * flowx, int xStep,
    257                              __global const float * flowy, int yStep,
    258                              const int rows, const int cols,
    259                              __global const float * R0, int R0Step,
    260                              __global const float * R1, int R1Step,
    261                              __global float * M, int mStep)
    262 {
    263     const int y = get_global_id(1);
    264     const int x = get_global_id(0);
    265 
    266     if (y < rows && y >= 0 && x < cols && x >= 0)
    267     {
    268         float dx = flowx[mad24(y, xStep, x)];
    269         float dy = flowy[mad24(y, yStep, x)];
    270         float fx = x + dx;
    271         float fy = y + dy;
    272 
    273         int x1 = convert_int(floor(fx));
    274         int y1 = convert_int(floor(fy));
    275         fx -= x1;
    276         fy -= y1;
    277 
    278         float r2, r3, r4, r5, r6;
    279 
    280         if (x1 >= 0 && y1 >= 0 && x1 < cols - 1 && y1 < rows - 1)
    281         {
    282             float a00 = (1.f - fx) * (1.f - fy);
    283             float a01 = fx * (1.f - fy);
    284             float a10 = (1.f - fx) * fy;
    285             float a11 = fx * fy;
    286 
    287             r2 = a00 * R1[mad24(y1, R1Step, x1)] +
    288                  a01 * R1[mad24(y1, R1Step, x1 + 1)] +
    289                  a10 * R1[mad24(y1 + 1, R1Step, x1)] +
    290                  a11 * R1[mad24(y1 + 1, R1Step, x1 + 1)];
    291 
    292             r3 = a00 * R1[mad24(rows + y1, R1Step, x1)] +
    293                  a01 * R1[mad24(rows + y1, R1Step, x1 + 1)] +
    294                  a10 * R1[mad24(rows + y1 + 1, R1Step, x1)] +
    295                  a11 * R1[mad24(rows + y1 + 1, R1Step, x1 + 1)];
    296 
    297             r4 = a00 * R1[mad24(2*rows + y1, R1Step, x1)] +
    298                  a01 * R1[mad24(2*rows + y1, R1Step, x1 + 1)] +
    299                  a10 * R1[mad24(2*rows + y1 + 1, R1Step, x1)] +
    300                  a11 * R1[mad24(2*rows + y1 + 1, R1Step, x1 + 1)];
    301 
    302             r5 = a00 * R1[mad24(3*rows + y1, R1Step, x1)] +
    303                  a01 * R1[mad24(3*rows + y1, R1Step, x1 + 1)] +
    304                  a10 * R1[mad24(3*rows + y1 + 1, R1Step, x1)] +
    305                  a11 * R1[mad24(3*rows + y1 + 1, R1Step, x1 + 1)];
    306 
    307             r6 = a00 * R1[mad24(4*rows + y1, R1Step, x1)] +
    308                  a01 * R1[mad24(4*rows + y1, R1Step, x1 + 1)] +
    309                  a10 * R1[mad24(4*rows + y1 + 1, R1Step, x1)] +
    310                  a11 * R1[mad24(4*rows + y1 + 1, R1Step, x1 + 1)];
    311 
    312             r4 = (R0[mad24(2*rows + y, R0Step, x)] + r4) * 0.5f;
    313             r5 = (R0[mad24(3*rows + y, R0Step, x)] + r5) * 0.5f;
    314             r6 = (R0[mad24(4*rows + y, R0Step, x)] + r6) * 0.25f;
    315         }
    316         else
    317         {
    318             r2 = r3 = 0.f;
    319             r4 = R0[mad24(2*rows + y, R0Step, x)];
    320             r5 = R0[mad24(3*rows + y, R0Step, x)];
    321             r6 = R0[mad24(4*rows + y, R0Step, x)] * 0.5f;
    322         }
    323 
    324         r2 = (R0[mad24(y, R0Step, x)] - r2) * 0.5f;
    325         r3 = (R0[mad24(rows + y, R0Step, x)] - r3) * 0.5f;
    326 
    327         r2 += r4*dy + r6*dx;
    328         r3 += r6*dy + r5*dx;
    329 
    330         float scale =
    331             c_border[min(x, BORDER_SIZE)] *
    332             c_border[min(y, BORDER_SIZE)] *
    333             c_border[min(cols - x - 1, BORDER_SIZE)] *
    334             c_border[min(rows - y - 1, BORDER_SIZE)];
    335 
    336         r2 *= scale;
    337         r3 *= scale;
    338         r4 *= scale;
    339         r5 *= scale;
    340         r6 *= scale;
    341 
    342         M[mad24(y, mStep, x)] = r4*r4 + r6*r6;
    343         M[mad24(rows + y, mStep, x)] = (r4 + r5)*r6;
    344         M[mad24(2*rows + y, mStep, x)] = r5*r5 + r6*r6;
    345         M[mad24(3*rows + y, mStep, x)] = r4*r2 + r6*r3;
    346         M[mad24(4*rows + y, mStep, x)] = r6*r2 + r5*r3;
    347     }
    348 }
    349 
    350 __kernel void boxFilter5(__global const float * src, int srcStep,
    351                          __global float * dst, int dstStep,
    352                          const int rows, const  int cols,
    353                          const int ksizeHalf,
    354                          __local float * smem)
    355 {
    356     const int y = get_global_id(1);
    357     const int x = get_global_id(0);
    358 
    359     const float boxAreaInv = 1.f / ((1 + 2*ksizeHalf) * (1 + 2*ksizeHalf));
    360     const int smw = bdx + 2*ksizeHalf; // shared memory "width"
    361     __local float *row = smem + 5 * ty * smw;
    362 
    363     if (y < rows)
    364     {
    365         // Vertical pass
    366         for (int i = tx; i < bdx + 2*ksizeHalf; i += bdx)
    367         {
    368             int xExt = (int)(bx * bdx) + i - ksizeHalf;
    369             xExt = min(max(xExt, 0), cols - 1);
    370 
    371 #pragma unroll
    372             for (int k = 0; k < 5; ++k)
    373                 row[k*smw + i] = src[mad24(k*rows + y, srcStep, xExt)];
    374 
    375             for (int j = 1; j <= ksizeHalf; ++j)
    376 #pragma unroll
    377                 for (int k = 0; k < 5; ++k)
    378                     row[k*smw + i] +=
    379                         src[mad24(k*rows + max(y - j, 0), srcStep, xExt)] +
    380                         src[mad24(k*rows + min(y + j, rows - 1), srcStep, xExt)];
    381         }
    382     }
    383 
    384     barrier(CLK_LOCAL_MEM_FENCE);
    385 
    386     if (y < rows && y >= 0 && x < cols && x >= 0)
    387     {
    388         // Horizontal pass
    389 
    390         row += tx + ksizeHalf;
    391         float res[5];
    392 
    393 #pragma unroll
    394         for (int k = 0; k < 5; ++k)
    395             res[k] = row[k*smw];
    396 
    397         for (int i = 1; i <= ksizeHalf; ++i)
    398 #pragma unroll
    399             for (int k = 0; k < 5; ++k)
    400                 res[k] += row[k*smw - i] + row[k*smw + i];
    401 
    402 #pragma unroll
    403         for (int k = 0; k < 5; ++k)
    404             dst[mad24(k*rows + y, dstStep, x)] = res[k] * boxAreaInv;
    405     }
    406 }
    407 
    408 __kernel void updateFlow(__global const float * M, int mStep,
    409                          __global float * flowx, int xStep,
    410                          __global float * flowy, int yStep,
    411                          const int rows, const int cols)
    412 {
    413     const int y = get_global_id(1);
    414     const int x = get_global_id(0);
    415 
    416     if (y < rows && y >= 0 && x < cols && x >= 0)
    417     {
    418         float g11 = M[mad24(y, mStep, x)];
    419         float g12 = M[mad24(rows + y, mStep, x)];
    420         float g22 = M[mad24(2*rows + y, mStep, x)];
    421         float h1 =  M[mad24(3*rows + y, mStep, x)];
    422         float h2 =  M[mad24(4*rows + y, mStep, x)];
    423 
    424         float detInv = 1.f / (g11*g22 - g12*g12 + 1e-3f);
    425 
    426         flowx[mad24(y, xStep, x)] = (g11*h2 - g12*h1) * detInv;
    427         flowy[mad24(y, yStep, x)] = (g22*h1 - g12*h2) * detInv;
    428     }
    429 }
    430