1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2014, Intel Corporation, all rights reserved. 14 // Third party copyrights are property of their respective owners. 15 // 16 // Redistribution and use in source and binary forms, with or without modification, 17 // are permitted provided that the following conditions are met: 18 // 19 // * Redistribution's of source code must retain the above copyright notice, 20 // this list of conditions and the following disclaimer. 21 // 22 // * Redistribution's in binary form must reproduce the above copyright notice, 23 // this list of conditions and the following disclaimer in the documentation 24 // and/or other materials provided with the distribution. 25 // 26 // * The name of the copyright holders may not be used to endorse or promote products 27 // derived from this software without specific prior written permission. 28 // 29 // This software is provided by the copyright holders and contributors "as is" and 30 // any express or implied warranties, including, but not limited to, the implied 31 // warranties of merchantability and fitness for a particular purpose are disclaimed. 32 // In no event shall the Intel Corporation or contributors be liable for any direct, 33 // indirect, incidental, special, exemplary, or consequential damages 34 // (including, but not limited to, procurement of substitute goods or services; 35 // loss of use, data, or profits; or business interruption) however caused 36 // and on any theory of liability, whether in contract, strict liability, 37 // or tort (including negligence or otherwise) arising in any way out of 38 // the use of this software, even if advised of the possibility of such damage. 39 // 40 //M*/ 41 42 /////////////////////////////////////////////////////////////////////////////////////////////////// 43 /////////////////////////////////Macro for border type//////////////////////////////////////////// 44 ///////////////////////////////////////////////////////////////////////////////////////////////// 45 46 #ifdef BORDER_CONSTANT 47 // CCCCCC|abcdefgh|CCCCCCC 48 #define EXTRAPOLATE(x, maxV) 49 #elif defined BORDER_REPLICATE 50 // aaaaaa|abcdefgh|hhhhhhh 51 #define EXTRAPOLATE(x, maxV) \ 52 { \ 53 (x) = clamp((x), 0, (maxV)-1); \ 54 } 55 #elif defined BORDER_WRAP 56 // cdefgh|abcdefgh|abcdefg 57 #define EXTRAPOLATE(x, maxV) \ 58 { \ 59 (x) = ( (x) + (maxV) ) % (maxV); \ 60 } 61 #elif defined BORDER_REFLECT 62 // fedcba|abcdefgh|hgfedcb 63 #define EXTRAPOLATE(x, maxV) \ 64 { \ 65 (x) = min(((maxV)-1)*2-(x)+1, max((x),-(x)-1) ); \ 66 } 67 #elif defined BORDER_REFLECT_101 || defined BORDER_REFLECT101 68 // gfedcb|abcdefgh|gfedcba 69 #define EXTRAPOLATE(x, maxV) \ 70 { \ 71 (x) = min(((maxV)-1)*2-(x), max((x),-(x)) ); \ 72 } 73 #else 74 #error No extrapolation method 75 #endif 76 77 #if CN != 3 78 #define loadpix(addr) *(__global const srcT *)(addr) 79 #define storepix(val, addr) *(__global dstT *)(addr) = val 80 #define SRCSIZE (int)sizeof(srcT) 81 #define DSTSIZE (int)sizeof(dstT) 82 #else 83 #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) 84 #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) 85 #define SRCSIZE (int)sizeof(srcT1)*3 86 #define DSTSIZE (int)sizeof(dstT1)*3 87 #endif 88 89 #define SRC(_x,_y) convertToWT(loadpix(Src + mad24(_y, src_step, SRCSIZE * _x))) 90 91 #ifdef BORDER_CONSTANT 92 // CCCCCC|abcdefgh|CCCCCCC 93 #define ELEM(_x,_y,r_edge,t_edge,const_v) (_x)<0 | (_x) >= (r_edge) | (_y)<0 | (_y) >= (t_edge) ? (const_v) : SRC((_x),(_y)) 94 #else 95 #define ELEM(_x,_y,r_edge,t_edge,const_v) SRC((_x),(_y)) 96 #endif 97 98 #define noconvert 99 100 // horizontal and vertical filter kernels 101 // should be defined on host during compile time to avoid overhead 102 #define DIG(a) a, 103 __constant WT1 mat_kernelX[] = { KERNEL_MATRIX_X }; 104 __constant WT1 mat_kernelY[] = { KERNEL_MATRIX_Y }; 105 106 __kernel void sep_filter(__global uchar* Src, int src_step, int srcOffsetX, int srcOffsetY, int height, int width, 107 __global uchar* Dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, float delta) 108 { 109 // RADIUSX, RADIUSY are filter dimensions 110 // BLK_X, BLK_Y are local wrogroup sizes 111 // all these should be defined on host during compile time 112 // first lsmem array for source pixels used in first pass, 113 // second lsmemDy for storing first pass results 114 __local WT lsmem[BLK_Y + 2 * RADIUSY][BLK_X + 2 * RADIUSX]; 115 __local WT lsmemDy[BLK_Y][BLK_X + 2 * RADIUSX]; 116 117 // get local and global ids - used as image and local memory array indexes 118 int lix = get_local_id(0); 119 int liy = get_local_id(1); 120 121 int x = get_global_id(0); 122 123 // calculate pixel position in source image taking image offset into account 124 int srcX = x + srcOffsetX - RADIUSX; 125 126 // extrapolate coordinates, if needed 127 // and read my own source pixel into local memory 128 // with account for extra border pixels, which will be read by starting workitems 129 int clocY = liy; 130 do 131 { 132 int yb = clocY + srcOffsetY - RADIUSY; 133 EXTRAPOLATE(yb, (height)); 134 135 int clocX = lix; 136 int cSrcX = srcX; 137 do 138 { 139 int xb = cSrcX; 140 EXTRAPOLATE(xb,(width)); 141 lsmem[clocY][clocX] = ELEM(xb, yb, (width), (height), 0 ); 142 143 clocX += BLK_X; 144 cSrcX += BLK_X; 145 } 146 while(clocX < BLK_X+(RADIUSX*2)); 147 148 clocY += BLK_Y; 149 } 150 while (clocY < BLK_Y+(RADIUSY*2)); 151 barrier(CLK_LOCAL_MEM_FENCE); 152 153 for (int y = 0; y < dst_rows; y+=BLK_Y) 154 { 155 // do vertical filter pass 156 // and store intermediate results to second local memory array 157 int i, clocX = lix; 158 WT sum = (WT) 0; 159 do 160 { 161 sum = (WT) 0; 162 for (i=0; i<=2*RADIUSY; i++) 163 #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) 164 sum = mad24(lsmem[liy + i][clocX], mat_kernelY[i], sum); 165 #else 166 sum = mad(lsmem[liy + i][clocX], mat_kernelY[i], sum); 167 #endif 168 lsmemDy[liy][clocX] = sum; 169 clocX += BLK_X; 170 } 171 while(clocX < BLK_X+(RADIUSX*2)); 172 barrier(CLK_LOCAL_MEM_FENCE); 173 174 // if this pixel happened to be out of image borders because of global size rounding, 175 // then just return 176 if ((x < dst_cols) && (y + liy < dst_rows)) 177 { 178 // do second horizontal filter pass 179 // and calculate final result 180 sum = 0.0f; 181 for (i=0; i<=2*RADIUSX; i++) 182 #if (defined(INTEGER_ARITHMETIC) && !INTEL_DEVICE) 183 sum = mad24(lsmemDy[liy][lix+i], mat_kernelX[i], sum); 184 #else 185 sum = mad(lsmemDy[liy][lix+i], mat_kernelX[i], sum); 186 #endif 187 188 #ifdef INTEGER_ARITHMETIC 189 #ifdef INTEL_DEVICE 190 sum = (sum + (1 << (SHIFT_BITS-1))) / (1 << SHIFT_BITS); 191 #else 192 sum = (sum + (1 << (SHIFT_BITS-1))) >> SHIFT_BITS; 193 #endif 194 #endif 195 // store result into destination image 196 storepix(convertToDstT(sum + (WT)(delta)), Dst + mad24(y + liy, dst_step, mad24(x, DSTSIZE, dst_offset))); 197 } 198 199 for (int i = liy * BLK_X + lix; i < (RADIUSY*2) * (BLK_X+(RADIUSX*2)); i += BLK_X * BLK_Y) 200 { 201 int clocX = i % (BLK_X+(RADIUSX*2)); 202 int clocY = i / (BLK_X+(RADIUSX*2)); 203 lsmem[clocY][clocX] = lsmem[clocY + BLK_Y][clocX]; 204 } 205 barrier(CLK_LOCAL_MEM_FENCE); 206 207 int yb = y + liy + BLK_Y + srcOffsetY + RADIUSY; 208 EXTRAPOLATE(yb, (height)); 209 210 clocX = lix; 211 int cSrcX = x + srcOffsetX - RADIUSX; 212 do 213 { 214 int xb = cSrcX; 215 EXTRAPOLATE(xb,(width)); 216 lsmem[liy + 2*RADIUSY][clocX] = ELEM(xb, yb, (width), (height), 0 ); 217 218 clocX += BLK_X; 219 cSrcX += BLK_X; 220 } 221 while(clocX < BLK_X+(RADIUSX*2)); 222 barrier(CLK_LOCAL_MEM_FENCE); 223 } 224 225 } 226