Home | History | Annotate | Download | only in opencl
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
     14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // @Authors
     18 //    Dachuan Zhao, dachuan (at) multicorewareinc.com
     19 //
     20 // Redistribution and use in source and binary forms, with or without modification,
     21 // are permitted provided that the following conditions are met:
     22 //
     23 //   * Redistribution's of source code must retain the above copyright notice,
     24 //     this list of conditions and the following disclaimer.
     25 //
     26 //   * Redistribution's in binary form must reproduce the above copyright notice,
     27 //     this list of conditions and the following disclaimer in the documentation
     28 //     and/or other materials provided with the distribution.
     29 //
     30 //   * The name of the copyright holders may not be used to endorse or promote products
     31 //     derived from this software without specific prior written permission.
     32 //
     33 // This software is provided by the copyright holders and contributors as is and
     34 // any express or implied warranties, including, but not limited to, the implied
     35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     36 // In no event shall the Intel Corporation or contributors be liable for any direct,
     37 // indirect, incidental, special, exemplary, or consequential damages
     38 // (including, but not limited to, procurement of substitute goods or services;
     39 // loss of use, data, or profits; or business interruption) however caused
     40 // and on any theory of liability, whether in contract, strict liability,
     41 // or tort (including negligence or otherwise) arising in any way out of
     42 // the use of this software, even if advised of the possibility of such damage.
     43 //
     44 //M*/
     45 
     46 #ifdef DOUBLE_SUPPORT
     47 #ifdef cl_amd_fp64
     48 #pragma OPENCL EXTENSION cl_amd_fp64:enable
     49 #elif defined (cl_khr_fp64)
     50 #pragma OPENCL EXTENSION cl_khr_fp64:enable
     51 #endif
     52 #endif
     53 
     54 #if defined BORDER_REPLICATE
     55 // aaaaaa|abcdefgh|hhhhhhh
     56 #define EXTRAPOLATE(x, maxV) clamp((x), 0, (maxV)-1)
     57 #elif defined BORDER_WRAP
     58 // cdefgh|abcdefgh|abcdefg
     59 #define EXTRAPOLATE(x, maxV) ( (x) + (maxV) ) % (maxV)
     60 #elif defined BORDER_REFLECT
     61 // fedcba|abcdefgh|hgfedcb
     62 #define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ), 0, (maxV)-1)
     63 #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
     64 // gfedcb|abcdefgh|gfedcba
     65 #define EXTRAPOLATE(x, maxV) clamp(min(((maxV)-1)*2-(x), max((x),-(x)) ), 0, (maxV)-1)
     66 #else
     67 #error No extrapolation method
     68 #endif
     69 
     70 #if cn != 3
     71 #define loadpix(addr)  *(__global const T*)(addr)
     72 #define storepix(val, addr)  *(__global T*)(addr) = (val)
     73 #define PIXSIZE ((int)sizeof(T))
     74 #else
     75 #define loadpix(addr)  vload3(0, (__global const T1*)(addr))
     76 #define storepix(val, addr) vstore3((val), 0, (__global T1*)(addr))
     77 #define PIXSIZE ((int)sizeof(T1)*3)
     78 #endif
     79 
     80 #define SRC(_x,_y) convertToFT(loadpix(srcData + mad24(_y, src_step, PIXSIZE * _x)))
     81 
     82 #if kercn == 4
     83 #define SRC4(_x,_y) convert_float4(vload4(0, srcData + mad24(_y, src_step, PIXSIZE * _x)))
     84 #endif
     85 
     86 #ifdef INTEL_DEVICE
     87 #define MAD(x,y,z) fma((x),(y),(z))
     88 #else
     89 #define MAD(x,y,z) mad((x),(y),(z))
     90 #endif
     91 
     92 #define LOAD_LOCAL(col_gl, col_lcl) \
     93     sum0 =     co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));         \
     94     sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0);  \
     95     temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows));                      \
     96     sum0 = MAD(co1, temp, sum0);                                            \
     97     sum1 = co3 * temp;                                                      \
     98     temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows));                  \
     99     sum0 = MAD(co2, temp, sum0);                                            \
    100     sum1 = MAD(co2, temp, sum1);                                            \
    101     temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows));                  \
    102     sum0 = MAD(co3, temp, sum0);                                            \
    103     sum1 = MAD(co1, temp, sum1);                                            \
    104     smem[0][col_lcl] = sum0;                                                \
    105     sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1);  \
    106     sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1);  \
    107     smem[1][col_lcl] = sum1;
    108 
    109 
    110 #if kercn == 4
    111 #define LOAD_LOCAL4(col_gl, col_lcl) \
    112     sum40 =     co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows));           \
    113     sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40);   \
    114     temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y, src_rows));                       \
    115     sum40 = MAD(co1, temp4, sum40);                                             \
    116     sum41 = co3 * temp4;                                                        \
    117     temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 1, src_rows));                   \
    118     sum40 = MAD(co2, temp4, sum40);                                             \
    119     sum41 = MAD(co2, temp4, sum41);                                             \
    120     temp4 = SRC4(col_gl,  EXTRAPOLATE_(src_y + 2, src_rows));                   \
    121     sum40 = MAD(co3, temp4, sum40);                                             \
    122     sum41 = MAD(co1, temp4, sum41);                                             \
    123     vstore4(sum40, col_lcl, (__local float*) &smem[0][2]);                      \
    124     sum41 = MAD(co2, SRC4(col_gl,  EXTRAPOLATE_(src_y + 3, src_rows)), sum41);  \
    125     sum41 = MAD(co3, SRC4(col_gl,  EXTRAPOLATE_(src_y + 4, src_rows)), sum41);  \
    126     vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
    127 #endif
    128 
    129 #define noconvert
    130 
    131 __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
    132                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
    133 {
    134     const int x = get_global_id(0)*kercn;
    135     const int y = 2*get_global_id(1);
    136 
    137     __local FT smem[2][LOCAL_SIZE + 4];
    138     __global uchar * dstData = dst + dst_offset;
    139     __global const uchar * srcData = src + src_offset;
    140 
    141     FT sum0, sum1, temp;
    142     FT co1 = 0.375f;
    143     FT co2 = 0.25f;
    144     FT co3 = 0.0625f;
    145 
    146     const int src_y = 2*y;
    147     int col;
    148 
    149     if (src_y >= 2 && src_y < src_rows - 4)
    150     {
    151 #define EXTRAPOLATE_(val, maxVal)   val
    152 #if kercn == 1
    153         col = EXTRAPOLATE(x, src_cols);
    154         LOAD_LOCAL(col, 2 + get_local_id(0))
    155 #else
    156         if (x < src_cols-4)
    157         {
    158             float4 sum40, sum41, temp4;
    159             LOAD_LOCAL4(x, get_local_id(0))
    160         }
    161         else
    162         {
    163             for (int i=0; i<4; i++)
    164             {
    165                 col = EXTRAPOLATE(x+i, src_cols);
    166                 LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
    167             }
    168         }
    169 #endif
    170         if (get_local_id(0) < 2)
    171         {
    172             col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
    173             LOAD_LOCAL(col, get_local_id(0))
    174         }
    175         else if (get_local_id(0) < 4)
    176         {
    177             col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
    178             LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
    179         }
    180     }
    181     else // need extrapolate y
    182     {
    183 #define EXTRAPOLATE_(val, maxVal)   EXTRAPOLATE(val, maxVal)
    184 #if kercn == 1
    185         col = EXTRAPOLATE(x, src_cols);
    186         LOAD_LOCAL(col, 2 + get_local_id(0))
    187 #else
    188         if (x < src_cols-4)
    189         {
    190             float4 sum40, sum41, temp4;
    191             LOAD_LOCAL4(x, get_local_id(0))
    192         }
    193         else
    194         {
    195             for (int i=0; i<4; i++)
    196             {
    197                 col = EXTRAPOLATE(x+i, src_cols);
    198                 LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
    199             }
    200         }
    201 #endif
    202         if (get_local_id(0) < 2)
    203         {
    204             col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
    205             LOAD_LOCAL(col, get_local_id(0))
    206         }
    207         else if (get_local_id(0) < 4)
    208         {
    209             col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);
    210             LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
    211         }
    212     }
    213 
    214     barrier(CLK_LOCAL_MEM_FENCE);
    215 
    216 #if kercn == 1
    217     if (get_local_id(0) < LOCAL_SIZE / 2)
    218     {
    219         const int tid2 = get_local_id(0) * 2;
    220 
    221         const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;
    222 
    223         if (dst_x < dst_cols)
    224         {
    225             for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
    226             {
    227 #if cn == 1
    228 #if fdepth <= 5
    229                 FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
    230 #else
    231                 FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
    232 #endif
    233 #else
    234                 FT sum = co3 * smem[yin - y][2 + tid2 - 2];
    235                 sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
    236                 sum = MAD(co1, smem[yin - y][2 + tid2    ], sum);
    237                 sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
    238 #endif
    239                 sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
    240                 storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
    241             }
    242         }
    243     }
    244 #else
    245     int tid4 = get_local_id(0) * 4;
    246     int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
    247     if (dst_x < dst_cols - 1)
    248     {
    249         for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
    250         {
    251 
    252             FT sum =  co3* smem[yin - y][2 + tid4 + 2];
    253             sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
    254             sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
    255             sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
    256             sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
    257             storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
    258 
    259             dst_x ++;
    260             sum =     co3* smem[yin - y][2 + tid4 + 4];
    261             sum = MAD(co3, smem[yin - y][2 + tid4    ], sum);
    262             sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
    263             sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
    264             sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
    265             storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
    266             dst_x --;
    267         }
    268 
    269     }
    270     else if (dst_x < dst_cols)
    271     {
    272         for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
    273         {
    274             FT sum =  co3* smem[yin - y][2 + tid4 + 2];
    275             sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
    276             sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
    277             sum = MAD(co1, smem[yin - y][2 + tid4    ], sum);
    278             sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
    279 
    280             storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
    281         }
    282     }
    283 #endif
    284 
    285 }
    286