Home | History | Annotate | Download | only in opencl
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2014, Intel Corporation, all rights reserved.
     14 // Third party copyrights are property of their respective owners.
     15 //
     16 // Redistribution and use in source and binary forms, with or without modification,
     17 // are permitted provided that the following conditions are met:
     18 //
     19 //   * Redistribution's of source code must retain the above copyright notice,
     20 //     this list of conditions and the following disclaimer.
     21 //
     22 //   * Redistribution's in binary form must reproduce the above copyright notice,
     23 //     this list of conditions and the following disclaimer in the documentation
     24 //     and/or other materials provided with the distribution.
     25 //
     26 //   * The name of the copyright holders may not be used to endorse or promote products
     27 //     derived from this software without specific prior written permission.
     28 //
     29 // This software is provided by the copyright holders and contributors "as is" and
     30 // any express or implied warranties, including, but not limited to, the implied
     31 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     32 // In no event shall the Intel Corporation or contributors be liable for any direct,
     33 // indirect, incidental, special, exemplary, or consequential damages
     34 // (including, but not limited to, procurement of substitute goods or services;
     35 // loss of use, data, or profits; or business interruption) however caused
     36 // and on any theory of liability, whether in contract, strict liability,
     37 // or tort (including negligence or otherwise) arising in any way out of
     38 // the use of this software, even if advised of the possibility of such damage.
     39 //
     40 //M*/
     41 
     42 ///////////////////////////////////////////////////////////////////////////////////////////////////
     43 /////////////////////////////////Macro for border type////////////////////////////////////////////
     44 /////////////////////////////////////////////////////////////////////////////////////////////////
     45 
     46 #ifdef BORDER_CONSTANT
     47 // CCCCCC|abcdefgh|CCCCCCC
     48 #define EXTRAPOLATE(x, maxV)
     49 #elif defined BORDER_REPLICATE
     50 // aaaaaa|abcdefgh|hhhhhhh
     51 #define EXTRAPOLATE(x, maxV) \
     52     { \
     53         (x) = clamp((x), 0, (maxV)-1); \
     54     }
     55 #elif defined BORDER_WRAP
     56 // cdefgh|abcdefgh|abcdefg
     57 #define EXTRAPOLATE(x, maxV) \
     58     { \
     59         (x) = ( (x) + (maxV) ) % (maxV); \
     60     }
     61 #elif defined BORDER_REFLECT
     62 // fedcba|abcdefgh|hgfedcb
     63 #define EXTRAPOLATE(x, maxV) \
     64     { \
     65         (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \
     66     }
     67 #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101
     68 // gfedcb|abcdefgh|gfedcba
     69 #define EXTRAPOLATE(x, maxV) \
     70     { \
     71         (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \
     72     }
     73 #else
     74 #error No extrapolation method
     75 #endif
     76 
     77 #if CN != 3
     78 #define loadpix(addr) *(__global const srcT *)(addr)
     79 #define storepix(val, addr)  *(__global dstT *)(addr) = val
     80 #define SRCSIZE (int)sizeof(srcT)
     81 #define DSTSIZE (int)sizeof(dstT)
     82 #else
     83 #define loadpix(addr)  vload3(0, (__global const srcT1 *)(addr))
     84 #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr))
     85 #define SRCSIZE (int)sizeof(srcT1)*3
     86 #define DSTSIZE (int)sizeof(dstT1)*3
     87 #endif
     88 
     89 #define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x)))
     90 
     91 #ifdef BORDER_CONSTANT
     92 // CCCCCC|abcdefgh|CCCCCCC
     93 #define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y))
     94 #else
     95 #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y))
     96 #endif
     97 
     98 #define noconvert
     99 
    100 // horizontal and vertical filter kernels
    101 // should be defined on host during compile time to avoid overhead
    102 #define DIG(a) a,
    103 __constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X };
    104 __constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y };
    105 
    106 __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width,
    107                          __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta)
    108 {
    109     // RADIUSX, RADIUSY are filter dimensions
    110     // BLK_X, BLK_Y are local wrogroup sizes
    111     // all these should be defined on host during compile time
    112     // first lsmem array for source pixels used in first pass,
    113     // second lsmemDy for storing first pass results
    114     __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX];
    115     __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX];
    116 
    117     // get local and global ids - used as image and local memory array indexes
    118     int lix = get_local_id(0);
    119     int liy = get_local_id(1);
    120 
    121     int x = get_global_id(0);
    122 
    123     // calculate pixel position in source image taking image offset into account
    124     int srcX = x + srcOffsetX - RADIUSX;
    125 
    126     // extrapolate coordinates, if needed
    127     // and read my own source pixel into local memory
    128     // with account for extra border pixels, which will be read by starting workitems
    129     int clocY = liy;
    130     do
    131     {
    132         int yb = clocY + srcOffsetY - RADIUSY;
    133         EXTRAPOLATE(yb, (height));
    134 
    135         int clocX = lix;
    136         int cSrcX = srcX;
    137         do
    138         {
    139             int xb = cSrcX;
    140             EXTRAPOLATE(xb,(width));
    141             lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 );
    142 
    143             clocX += BLK_X;
    144             cSrcX += BLK_X;
    145         }
    146         while(clocX < BLK_X+(RADIUSX*2));
    147 
    148         clocY += BLK_Y;
    149     }
    150     while (clocY < BLK_Y+(RADIUSY*2));
    151     barrier(CLK_LOCAL_MEM_FENCE);
    152 
    153     for (int y = 0; y < dst_rows; y+=BLK_Y)
    154     {
    155         // do vertical filter pass
    156         // and store intermediate results to second local memory array
    157         int i, clocX = lix;
    158         WT sum = (WT) 0;
    159         do
    160         {
    161             sum = (WT) 0;
    162             for (i=0; i<=2*RADIUSY; i++)
    163 #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
    164                 sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum);
    165 #else
    166                 sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum);
    167 #endif
    168             lsmemDy[liy][clocX] = sum;
    169             clocX += BLK_X;
    170         }
    171         while(clocX < BLK_X+(RADIUSX*2));
    172         barrier(CLK_LOCAL_MEM_FENCE);
    173 
    174         // if this pixel happened to be out of image borders because of global size rounding,
    175         // then just return
    176         if ((x < dst_cols) && (y + liy < dst_rows))
    177         {
    178             // do second horizontal filter pass
    179             // and calculate final result
    180             sum = 0.0f;
    181             for (i=0; i<=2*RADIUSX; i++)
    182 #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE)
    183                 sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
    184 #else
    185                 sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum);
    186 #endif
    187 
    188 #ifdef INTEGER_ARITHMETIC
    189 #ifdef INTEL_DEVICE
    190             sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS);
    191 #else
    192             sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS;
    193 #endif
    194 #endif
    195             // store result into destination image
    196             storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset)));
    197         }
    198 
    199         for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y)
    200         {
    201             int clocX = i % (BLK_X+(RADIUSX*2));
    202             int clocY = i / (BLK_X+(RADIUSX*2));
    203             lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX];
    204         }
    205         barrier(CLK_LOCAL_MEM_FENCE);
    206 
    207         int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY;
    208         EXTRAPOLATE(yb, (height));
    209 
    210         clocX = lix;
    211         int cSrcX = x + srcOffsetX - RADIUSX;
    212         do
    213         {
    214             int xb = cSrcX;
    215             EXTRAPOLATE(xb,(width));
    216             lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 );
    217 
    218             clocX += BLK_X;
    219             cSrcX += BLK_X;
    220         }
    221         while(clocX < BLK_X+(RADIUSX*2));
    222         barrier(CLK_LOCAL_MEM_FENCE);
    223     }
    224 
    225 }
    226