Home | History | Annotate | Download | only in opencl
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
     14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // @Authors
     18 //    Zhang Ying, zhangying913 (at) gmail.com
     19 //
     20 // Redistribution and use in source and binary forms, with or without modification,
     21 // are permitted provided that the following conditions are met:
     22 //
     23 //   * Redistribution's of source code must retain the above copyright notice,
     24 //     this list of conditions and the following disclaimer.
     25 //
     26 //   * Redistribution's in binary form must reproduce the above copyright notice,
     27 //     this list of conditions and the following disclaimer in the documentation
     28 //     and/or other materials provided with the distribution.
     29 //
     30 //   * The name of the copyright holders may not be used to endorse or promote products
     31 //     derived from this software without specific prior written permission.
     32 //
     33 // This software is provided by the copyright holders and contributors as is and
     34 // any express or implied warranties, including, but not limited to, the implied
     35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     36 // In no event shall the Intel Corporation or contributors be liable for any direct,
     37 // indirect, incidental, special, exemplary, or consequential damages
     38 // (including, but not limited to, procurement of substitute goods or services;
     39 // loss of use, data, or profits; or business interruption) however caused
     40 // and on any theory of liability, whether in contract, strict liability,
     41 // or tort (including negligence or otherwise) arising in any way out of
     42 // the use of this software, even if advised of the possibility of such damage.
     43 //
     44 //M*/
     45 
     46 #ifdef DOUBLE_SUPPORT
     47 #ifdef cl_amd_fp64
     48 #pragma OPENCL EXTENSION cl_amd_fp64:enable
     49 #elif defined (cl_khr_fp64)
     50 #pragma OPENCL EXTENSION cl_khr_fp64:enable
     51 #endif
     52 #define CT double
     53 #else
     54 #define CT float
     55 #endif
     56 
     57 #define INTER_BITS 5
     58 #define INTER_TAB_SIZE (1 << INTER_BITS)
     59 #define INTER_SCALE 1.f/INTER_TAB_SIZE
     60 #define AB_BITS max(10, (int)INTER_BITS)
     61 #define AB_SCALE (1 << AB_BITS)
     62 #define INTER_REMAP_COEF_BITS 15
     63 #define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
     64 #define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))
     65 
     66 #define noconvert
     67 
     68 #ifndef ST
     69 #define ST T
     70 #endif
     71 
     72 #if cn != 3
     73 #define loadpix(addr)  *(__global const T*)(addr)
     74 #define storepix(val, addr)  *(__global T*)(addr) = val
     75 #define scalar scalar_
     76 #define pixsize (int)sizeof(T)
     77 #else
     78 #define loadpix(addr)  vload3(0, (__global const T1*)(addr))
     79 #define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
     80 #ifdef INTER_NEAREST
     81 #define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
     82 #else
     83 #define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
     84 #endif
     85 #define pixsize ((int)sizeof(T1)*3)
     86 #endif
     87 
     88 #ifdef INTER_NEAREST
     89 
     90 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
     91                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
     92                          __constant CT * M, ST scalar_)
     93 {
     94     int dx = get_global_id(0);
     95     int dy0 = get_global_id(1) * rowsPerWI;
     96 
     97     if (dx < dst_cols)
     98     {
     99         int round_delta = (AB_SCALE >> 1);
    100 
    101         int X0_ = rint(M[0] * dx * AB_SCALE);
    102         int Y0_ = rint(M[3] * dx * AB_SCALE);
    103         int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));
    104 
    105         for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
    106         {
    107             int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
    108             int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;
    109 
    110             short sx = convert_short_sat(X0 >> AB_BITS);
    111             short sy = convert_short_sat(Y0 >> AB_BITS);
    112 
    113             if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
    114             {
    115                 int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));
    116                 storepix(loadpix(srcptr + src_index), dstptr + dst_index);
    117             }
    118             else
    119                 storepix(scalar, dstptr + dst_index);
    120         }
    121     }
    122 }
    123 
    124 #elif defined INTER_LINEAR
    125 
    126 __constant float coeffs[64] =
    127 { 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
    128     0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
    129     0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
    130     0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
    131     0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
    132     0.062500f, 0.937500f, 0.031250f, 0.968750f };
    133 
    134 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
    135                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    136                          __constant CT * M, ST scalar_)
    137 {
    138     int dx = get_global_id(0);
    139     int dy0 = get_global_id(1) * rowsPerWI;
    140 
    141     if (dx < dst_cols)
    142     {
    143         int tmp = dx << AB_BITS;
    144         int X0_ = rint(M[0] * tmp);
    145         int Y0_ = rint(M[3] * tmp);
    146 
    147         for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
    148         {
    149             int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
    150             int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
    151             X0 = X0 >> (AB_BITS - INTER_BITS);
    152             Y0 = Y0 >> (AB_BITS - INTER_BITS);
    153 
    154             short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
    155             short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));
    156 
    157 #if defined AMD_DEVICE || depth > 4
    158             WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
    159             if (sx >= 0 && sx < src_cols)
    160             {
    161                 if (sy >= 0 && sy < src_rows)
    162                     v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
    163                 if (sy+1 >= 0 && sy+1 < src_rows)
    164                     v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
    165             }
    166             if (sx+1 >= 0 && sx+1 < src_cols)
    167             {
    168                 if (sy >= 0 && sy < src_rows)
    169                     v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
    170                 if (sy+1 >= 0 && sy+1 < src_rows)
    171                     v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
    172             }
    173 
    174             float taby = 1.f/INTER_TAB_SIZE*ay;
    175             float tabx = 1.f/INTER_TAB_SIZE*ax;
    176 
    177             int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
    178 
    179 #if depth <= 4
    180             int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
    181             int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
    182             int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
    183             int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );
    184 
    185             WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));
    186             storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
    187 #else
    188             float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
    189             WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
    190             storepix(convertToT(val), dstptr + dst_index);
    191 #endif
    192 #else // INTEL_DEVICE
    193             __constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);
    194 
    195             int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
    196             int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
    197 
    198             WT sum = (WT)(0), xsum;
    199             #pragma unroll
    200             for (int y = 0; y < 2; y++)
    201             {
    202                 src_index = mad24(y, src_step, src_index0);
    203                 if (sy + y >= 0 && sy + y < src_rows)
    204                 {
    205                     xsum = (WT)(0);
    206                     if (sx >= 0 && sx + 2 < src_cols)
    207                     {
    208 #if depth == 0 && cn == 1
    209                         uchar2 value = vload2(0, srcptr + src_index);
    210                         xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
    211 #else
    212                         #pragma unroll
    213                         for (int x = 0; x < 2; x++)
    214                             xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
    215 #endif
    216                     }
    217                     else
    218                     {
    219                         #pragma unroll
    220                         for (int x = 0; x < 2; x++)
    221                             xsum = fma(sx + x >= 0 && sx + x < src_cols ?
    222                                        convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
    223                     }
    224                     sum = fma(xsum, coeffs_y[y], sum);
    225                 }
    226                 else
    227                     sum = fma(scalar, coeffs_y[y], sum);
    228             }
    229 
    230             storepix(convertToT(sum), dstptr + dst_index);
    231 #endif
    232         }
    233     }
    234 }
    235 
    236 #elif defined INTER_CUBIC
    237 
    238 #ifdef AMD_DEVICE
    239 
    240 inline void interpolateCubic( float x, float* coeffs )
    241 {
    242     const float A = -0.75f;
    243 
    244     coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A);
    245     coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f);
    246     coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f);
    247     coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
    248 }
    249 
    250 #else
    251 
    252 __constant float coeffs[128] =
    253     { 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
    254     -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
    255     -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
    256     -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
    257     -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
    258     -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
    259     -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
    260     -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
    261     -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f, 0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
    262     -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
    263     -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };
    264 
    265 #endif
    266 
    267 __kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
    268                          __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    269                          __constant CT * M, ST scalar_)
    270 {
    271     int dx = get_global_id(0);
    272     int dy = get_global_id(1);
    273 
    274     if (dx < dst_cols && dy < dst_rows)
    275     {
    276         int tmp = (dx << AB_BITS);
    277         int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
    278         int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
    279 
    280         X0 = X0 >> (AB_BITS - INTER_BITS);
    281         Y0 = Y0 >> (AB_BITS - INTER_BITS);
    282 
    283         int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
    284         int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));
    285 
    286 #ifdef AMD_DEVICE
    287         WT v[16];
    288         #pragma unroll
    289         for (int y = 0; y < 4; y++)
    290         {
    291             if (sy+y >= 0 && sy+y < src_rows)
    292             {
    293                 #pragma unroll
    294                 for (int x = 0; x < 4; x++)
    295                     v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?
    296                         convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
    297             }
    298             else
    299             {
    300                 #pragma unroll
    301                 for (int x = 0; x < 4; x++)
    302                     v[mad24(y, 4, x)] = scalar;
    303             }
    304         }
    305 
    306         float tab1y[4], tab1x[4];
    307 
    308         float ayy = INTER_SCALE * ay;
    309         float axx = INTER_SCALE * ax;
    310         interpolateCubic(ayy, tab1y);
    311         interpolateCubic(axx, tab1x);
    312 
    313         int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
    314 
    315         WT sum = (WT)(0);
    316 #if depth <= 4
    317         int itab[16];
    318 
    319         #pragma unroll
    320         for (int i = 0; i < 16; i++)
    321             itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);
    322 
    323         #pragma unroll
    324         for (int i = 0; i < 16; i++)
    325             sum = mad24(v[i], itab[i], sum);
    326         storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
    327 #else
    328         #pragma unroll
    329         for (int i = 0; i < 16; i++)
    330             sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
    331         storepix(convertToT( sum ), dstptr + dst_index);
    332 #endif
    333 #else // INTEL_DEVICE
    334         __constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);
    335 
    336         int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
    337         int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));
    338 
    339         WT sum = (WT)(0), xsum;
    340         #pragma unroll
    341         for (int y = 0; y < 4; y++)
    342         {
    343             src_index = mad24(y, src_step, src_index0);
    344             if (sy + y >= 0 && sy + y < src_rows)
    345             {
    346                 xsum = (WT)(0);
    347                 if (sx >= 0 && sx + 4 < src_cols)
    348                 {
    349 #if depth == 0 && cn == 1
    350                     uchar4 value = vload4(0, srcptr + src_index);
    351                     xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
    352 #else
    353                     #pragma unroll
    354                     for (int x = 0; x < 4; x++)
    355                         xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
    356 #endif
    357                 }
    358                 else
    359                 {
    360                     #pragma unroll
    361                     for (int x = 0; x < 4; x++)
    362                         xsum = fma(sx + x >= 0 && sx + x < src_cols ?
    363                                    convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
    364                 }
    365                 sum = fma(xsum, coeffs_y[y], sum);
    366             }
    367             else
    368                 sum = fma(scalar, coeffs_y[y], sum);
    369         }
    370 
    371         storepix(convertToT(sum), dstptr + dst_index);
    372 #endif
    373     }
    374 }
    375 
    376 #endif