/*M///////////////////////////////////////////////////////////////////////////////////////
//
// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
// By downloading, copying, installing or using the software you agree to this license.
// If you do not agree to this license, do not download, install,
// copy or use the software.
//
//
// License Agreement
// For Open Source Computer Vision Library
//
// Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
// Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// @Authors
// Zhang Ying, zhangying913 (at) gmail.com
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
// * Redistribution's of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// * Redistribution's in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// * The name of the copyright holders may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors as is and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

// OpenCL kernels implementing an affine warp (cv::warpAffine). Exactly one of
// INTER_NEAREST / INTER_LINEAR / INTER_CUBIC is defined by the host at build
// time and selects which warpAffine kernel below gets compiled.
// NOTE(review): the macros T, T1, WT, cn, depth, rowsPerWI, convertToT,
// convertToWT, AMD_DEVICE / INTEL_DEVICE are supplied via host compiler
// options and are not visible in this file — confirm against the host code.

// CT is the element type of the 2x3 affine matrix M; double is used only when
// the device advertises an fp64 extension and the host enabled DOUBLE_SUPPORT.
#ifdef DOUBLE_SUPPORT
#ifdef cl_amd_fp64
#pragma OPENCL EXTENSION cl_amd_fp64:enable
#elif defined (cl_khr_fp64)
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#endif
#define CT double
#else
#define CT float
#endif

// Fixed-point parameters shared by all interpolation modes:
//   INTER_BITS / INTER_TAB_SIZE  sub-pixel resolution (2^INTER_BITS positions)
//   AB_BITS / AB_SCALE           fractional bits / scale of the mapped coords
//   INTER_REMAP_COEF_*           fixed-point scale of the blend weights
//   ROUND_DELTA                  rounding bias added before dropping extra bits
#define INTER_BITS 5
#define INTER_TAB_SIZE (1 << INTER_BITS)
#define INTER_SCALE 1.f/INTER_TAB_SIZE
#define AB_BITS max(10, (int)INTER_BITS)
#define AB_SCALE (1 << AB_BITS)
#define INTER_REMAP_COEF_BITS 15
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
#define ROUND_DELTA (1 << (AB_BITS - INTER_BITS - 1))

#define noconvert

// ST is the type of the border-value argument; defaults to the pixel type T.
#ifndef ST
#define ST T
#endif

// Pixel access helpers. 3-channel pixels have no padding byte, so they are
// loaded/stored element-wise with vload3/vstore3 instead of as a vector T;
// 'scalar' adapts the 4-component border value scalar_ accordingly.
#if cn != 3
#define loadpix(addr) *(__global const T*)(addr)
#define storepix(val, addr) *(__global T*)(addr) = val
#define scalar scalar_
#define pixsize (int)sizeof(T)
#else
#define loadpix(addr) vload3(0, (__global const T1*)(addr))
#define storepix(val, addr) vstore3(val, 0, (__global T1*)(addr))
#ifdef INTER_NEAREST
#define scalar (T)(scalar_.x, scalar_.y, scalar_.z)
#else
#define scalar (WT)(scalar_.x, scalar_.y, scalar_.z)
#endif
#define pixsize ((int)sizeof(T1)*3)
#endif

#ifdef INTER_NEAREST

// Nearest-neighbour warp. Each work-item processes one column dx and
// rowsPerWI consecutive rows. The source position
//   (X, Y) = (M[0]*dx + M[1]*dy + M[2], M[3]*dx + M[4]*dy + M[5])
// is evaluated in AB_BITS fixed point and rounded to the nearest integer
// pixel; destinations mapping outside the source get the border value.
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant CT * M, ST scalar_)
{
    int dx = get_global_id(0);
    int dy0 = get_global_id(1) * rowsPerWI;

    if (dx < dst_cols)
    {
        int round_delta = (AB_SCALE >> 1); // +0.5 in fixed point -> round to nearest

        // Column-dependent part of the mapping; invariant over the row loop.
        int X0_ = rint(M[0] * dx * AB_SCALE);
        int Y0_ = rint(M[3] * dx * AB_SCALE);
        int dst_index = mad24(dy0, dst_step, mad24(dx, pixsize, dst_offset));

        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy, dst_index += dst_step)
        {
            int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + round_delta;
            int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + round_delta;

            // Drop the fractional bits to get the integer source pixel.
            short sx = convert_short_sat(X0 >> AB_BITS);
            short sy = convert_short_sat(Y0 >> AB_BITS);

            if (sx >= 0 && sx < src_cols && sy >= 0 && sy < src_rows)
            {
                int src_index = mad24(sy, src_step, mad24(sx, pixsize, src_offset));
                storepix(loadpix(srcptr + src_index), dstptr + dst_index);
            }
            else
                storepix(scalar, dstptr + dst_index); // border value
        }
    }
}

#elif defined INTER_LINEAR

// Precomputed bilinear weight pairs for the 32 (= INTER_TAB_SIZE) quantized
// sub-pixel offsets: coeffs[2*i] = 1 - i/32, coeffs[2*i + 1] = i/32.
// Used only by the table-driven (non-AMD) path below.
__constant float coeffs[64] =
{ 1.000000f, 0.000000f, 0.968750f, 0.031250f, 0.937500f, 0.062500f, 0.906250f, 0.093750f, 0.875000f, 0.125000f, 0.843750f, 0.156250f,
  0.812500f, 0.187500f, 0.781250f, 0.218750f, 0.750000f, 0.250000f, 0.718750f, 0.281250f, 0.687500f, 0.312500f, 0.656250f, 0.343750f,
  0.625000f, 0.375000f, 0.593750f, 0.406250f, 0.562500f, 0.437500f, 0.531250f, 0.468750f, 0.500000f, 0.500000f, 0.468750f, 0.531250f,
  0.437500f, 0.562500f, 0.406250f, 0.593750f, 0.375000f, 0.625000f, 0.343750f, 0.656250f, 0.312500f, 0.687500f, 0.281250f, 0.718750f,
  0.250000f, 0.750000f, 0.218750f, 0.781250f, 0.187500f, 0.812500f, 0.156250f, 0.843750f, 0.125000f, 0.875000f, 0.093750f, 0.906250f,
  0.062500f, 0.937500f, 0.031250f, 0.968750f };

// Bilinear warp. Each work-item processes one column dx and rowsPerWI rows;
// the mapped coordinate is split into an integer pixel (sx, sy) and a
// quantized sub-pixel offset (ax, ay) with INTER_BITS fractional bits.
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant CT * M, ST scalar_)
{
    int dx = get_global_id(0);
    int dy0 = get_global_id(1) * rowsPerWI;

    if (dx < dst_cols)
    {
        // dx * AB_SCALE, done as a shift since AB_SCALE == 1 << AB_BITS.
        int tmp = dx << AB_BITS;
        int X0_ = rint(M[0] * tmp);
        int Y0_ = rint(M[3] * tmp);

        for (int dy = dy0, dy1 = min(dst_rows, dy0 + rowsPerWI); dy < dy1; ++dy)
        {
            // Mapped coordinate in AB_BITS fixed point, reduced to INTER_BITS
            // fractional bits: (sx, sy) integer pixel, (ax, ay) sub-pixel index.
            int X0 = X0_ + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
            int Y0 = Y0_ + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;
            X0 = X0 >> (AB_BITS - INTER_BITS);
            Y0 = Y0 >> (AB_BITS - INTER_BITS);

            short sx = convert_short_sat(X0 >> INTER_BITS), sy = convert_short_sat(Y0 >> INTER_BITS);
            short ax = convert_short(X0 & (INTER_TAB_SIZE-1)), ay = convert_short(Y0 & (INTER_TAB_SIZE-1));

#if defined AMD_DEVICE || depth > 4
            // Generic path: gather the 2x2 neighbourhood (border value for any
            // out-of-range sample), then blend the four taps.
            WT v0 = scalar, v1 = scalar, v2 = scalar, v3 = scalar;
            if (sx >= 0 && sx < src_cols)
            {
                if (sy >= 0 && sy < src_rows)
                    v0 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx, pixsize, src_offset))));
                if (sy+1 >= 0 && sy+1 < src_rows)
                    v2 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx, pixsize, src_offset))));
            }
            if (sx+1 >= 0 && sx+1 < src_cols)
            {
                if (sy >= 0 && sy < src_rows)
                    v1 = convertToWT(loadpix(srcptr + mad24(sy, src_step, mad24(sx+1, pixsize, src_offset))));
                if (sy+1 >= 0 && sy+1 < src_rows)
                    v3 = convertToWT(loadpix(srcptr + mad24(sy+1, src_step, mad24(sx+1, pixsize, src_offset))));
            }

            // Fractional position in [0, 1) recovered from the quantized index.
            float taby = 1.f/INTER_TAB_SIZE*ay;
            float tabx = 1.f/INTER_TAB_SIZE*ax;

            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

#if depth <= 4
            // Integer pixel types: blend with 15-bit fixed-point weights and
            // round on the final right shift.
            int itab0 = convert_short_sat_rte( (1.0f-taby)*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
            int itab1 = convert_short_sat_rte( (1.0f-taby)*tabx * INTER_REMAP_COEF_SCALE );
            int itab2 = convert_short_sat_rte( taby*(1.0f-tabx) * INTER_REMAP_COEF_SCALE );
            int itab3 = convert_short_sat_rte( taby*tabx * INTER_REMAP_COEF_SCALE );

            WT val = mad24(v0, itab0, mad24(v1, itab1, mad24(v2, itab2, v3 * itab3)));
            storepix(convertToT((val + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS), dstptr + dst_index);
#else
            // Floating-point pixel types: blend directly in floating point.
            float tabx2 = 1.0f - tabx, taby2 = 1.0f - taby;
            WT val = fma(tabx2, fma(v0, taby2, v2 * taby), tabx * fma(v1, taby2, v3 * taby));
            storepix(convertToT(val), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
            // Table-driven path: weights come from the precomputed coeffs
            // table; the fully-inside case for uchar/1-channel uses a single
            // 2-byte vector load per row.
            __constant float * coeffs_y = coeffs + (ay << 1), * coeffs_x = coeffs + (ax << 1);

            int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
            int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

            WT sum = (WT)(0), xsum;
            #pragma unroll
            for (int y = 0; y < 2; y++)
            {
                src_index = mad24(y, src_step, src_index0);
                if (sy + y >= 0 && sy + y < src_rows)
                {
                    xsum = (WT)(0);
                    if (sx >= 0 && sx + 2 < src_cols)
                    {
#if depth == 0 && cn == 1
                        uchar2 value = vload2(0, srcptr + src_index);
                        xsum = dot(convert_float2(value), (float2)(coeffs_x[0], coeffs_x[1]));
#else
                        #pragma unroll
                        for (int x = 0; x < 2; x++)
                            xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
                    }
                    else
                    {
                        // Row partially outside: substitute the border value
                        // for the out-of-range tap(s).
                        #pragma unroll
                        for (int x = 0; x < 2; x++)
                            xsum = fma(sx + x >= 0 && sx + x < src_cols ?
                                       convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
                    }
                    sum = fma(xsum, coeffs_y[y], sum);
                }
                else
                    sum = fma(scalar, coeffs_y[y], sum); // whole row outside the image
            }

            storepix(convertToT(sum), dstptr + dst_index);
#endif
        }
    }
}

#elif defined INTER_CUBIC

#ifdef AMD_DEVICE

// Bicubic tap weights with A = -0.75 (same convention as OpenCV's CPU
// getCubicCoeffs) for fractional position x in [0, 1); writes the 4 weights
// into coeffs. The last weight is derived so the four sum exactly to 1.
inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = fma(fma(fma(A, (x + 1.f), - 5.0f*A), (x + 1.f), 8.0f*A), x + 1.f, - 4.0f*A);
    coeffs[1] = fma(fma(A + 2.f, x, - (A + 3.f)), x*x, 1.f);
    coeffs[2] = fma(fma(A + 2.f, 1.f - x, - (A + 3.f)), (1.f - x)*(1.f - x), 1.f);
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

#else

// Precomputed bicubic weights for the table-driven (non-AMD) path: 4 taps for
// each of the 32 quantized sub-pixel offsets, coeffs[4*i .. 4*i+3] being the
// weights at fractional position i/32.
__constant float coeffs[128] =
{ 0.000000f, 1.000000f, 0.000000f, 0.000000f, -0.021996f, 0.997841f, 0.024864f, -0.000710f, -0.041199f, 0.991516f, 0.052429f, -0.002747f,
  -0.057747f, 0.981255f, 0.082466f, -0.005974f, -0.071777f, 0.967285f, 0.114746f, -0.010254f, -0.083427f, 0.949837f, 0.149040f, -0.015450f,
  -0.092834f, 0.929138f, 0.185120f, -0.021423f, -0.100136f, 0.905418f, 0.222755f, -0.028038f, -0.105469f, 0.878906f, 0.261719f, -0.035156f,
  -0.108971f, 0.849831f, 0.301781f, -0.042641f, -0.110779f, 0.818420f, 0.342712f, -0.050354f, -0.111031f, 0.784904f, 0.384285f, -0.058159f,
  -0.109863f, 0.749512f, 0.426270f, -0.065918f, -0.107414f, 0.712471f, 0.468437f, -0.073494f, -0.103821f, 0.674011f, 0.510559f, -0.080750f,
  -0.099220f, 0.634361f, 0.552406f, -0.087547f, -0.093750f, 0.593750f, 0.593750f, -0.093750f, -0.087547f, 0.552406f, 0.634361f, -0.099220f,
  -0.080750f, 0.510559f, 0.674011f, -0.103821f, -0.073494f, 0.468437f, 0.712471f, -0.107414f, -0.065918f, 0.426270f, 0.749512f, -0.109863f,
  -0.058159f, 0.384285f, 0.784904f, -0.111031f, -0.050354f, 0.342712f, 0.818420f, -0.110779f, -0.042641f, 0.301781f, 0.849831f, -0.108971f,
  -0.035156f, 0.261719f, 0.878906f, -0.105469f, -0.028038f,
  0.222755f, 0.905418f, -0.100136f, -0.021423f, 0.185120f, 0.929138f, -0.092834f,
  -0.015450f, 0.149040f, 0.949837f, -0.083427f, -0.010254f, 0.114746f, 0.967285f, -0.071777f, -0.005974f, 0.082466f, 0.981255f, -0.057747f,
  -0.002747f, 0.052429f, 0.991516f, -0.041199f, -0.000710f, 0.024864f, 0.997841f, -0.021996f };

#endif

// Bicubic warp. One work-item per destination pixel. The mapped coordinate is
// split into an integer anchor (sx, sy) — shifted by -1 so the 4x4 support
// window is centred on the sample — and a quantized sub-pixel offset (ax, ay).
__kernel void warpAffine(__global const uchar * srcptr, int src_step, int src_offset, int src_rows, int src_cols,
                         __global uchar * dstptr, int dst_step, int dst_offset, int dst_rows, int dst_cols,
                         __constant CT * M, ST scalar_)
{
    int dx = get_global_id(0);
    int dy = get_global_id(1);

    if (dx < dst_cols && dy < dst_rows)
    {
        // Fixed-point evaluation of the affine map, then reduction to
        // INTER_BITS fractional bits (see the INTER_LINEAR kernel).
        int tmp = (dx << AB_BITS);
        int X0 = rint(M[0] * tmp) + rint(fma(M[1], dy, M[2]) * AB_SCALE) + ROUND_DELTA;
        int Y0 = rint(M[3] * tmp) + rint(fma(M[4], dy, M[5]) * AB_SCALE) + ROUND_DELTA;

        X0 = X0 >> (AB_BITS - INTER_BITS);
        Y0 = Y0 >> (AB_BITS - INTER_BITS);

        int sx = (short)(X0 >> INTER_BITS) - 1, sy = (short)(Y0 >> INTER_BITS) - 1;
        int ay = (short)(Y0 & (INTER_TAB_SIZE - 1)), ax = (short)(X0 & (INTER_TAB_SIZE - 1));

#ifdef AMD_DEVICE
        // Gather the full 4x4 neighbourhood into v (row-major), using the
        // border value for any out-of-range sample.
        WT v[16];
        #pragma unroll
        for (int y = 0; y < 4; y++)
        {
            if (sy+y >= 0 && sy+y < src_rows)
            {
                #pragma unroll
                for (int x = 0; x < 4; x++)
                    v[mad24(y, 4, x)] = sx+x >= 0 && sx+x < src_cols ?
                        convertToWT(loadpix(srcptr + mad24(sy+y, src_step, mad24(sx+x, pixsize, src_offset)))) : scalar;
            }
            else
            {
                #pragma unroll
                for (int x = 0; x < 4; x++)
                    v[mad24(y, 4, x)] = scalar;
            }
        }

        // Compute the separable tap weights from the fractional offsets.
        float tab1y[4], tab1x[4];

        float ayy = INTER_SCALE * ay;
        float axx = INTER_SCALE * ax;
        interpolateCubic(ayy, tab1y);
        interpolateCubic(axx, tab1x);

        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

        WT sum = (WT)(0);
#if depth <= 4
        // Integer pixel types: accumulate with 15-bit fixed-point weights and
        // round on the final right shift.
        int itab[16];

        #pragma unroll
        for (int i = 0; i < 16; i++)
            itab[i] = rint(tab1y[(i>>2)] * tab1x[(i&3)] * INTER_REMAP_COEF_SCALE);

        #pragma unroll
        for (int i = 0; i < 16; i++)
            sum = mad24(v[i], itab[i], sum);
        storepix(convertToT( (sum + (1 << (INTER_REMAP_COEF_BITS-1))) >> INTER_REMAP_COEF_BITS ), dstptr + dst_index);
#else
        // Floating-point pixel types: accumulate directly in floating point.
        #pragma unroll
        for (int i = 0; i < 16; i++)
            sum = fma(v[i], tab1y[(i>>2)] * tab1x[(i&3)], sum);
        storepix(convertToT( sum ), dstptr + dst_index);
#endif
#else // INTEL_DEVICE
        // Table-driven path: weights come from the precomputed coeffs table;
        // the fully-inside case for uchar/1-channel uses one 4-byte vector
        // load per row.
        __constant float * coeffs_y = coeffs + (ay << 2), * coeffs_x = coeffs + (ax << 2);

        int src_index0 = mad24(sy, src_step, mad24(sx, pixsize, src_offset)), src_index;
        int dst_index = mad24(dy, dst_step, mad24(dx, pixsize, dst_offset));

        WT sum = (WT)(0), xsum;
        #pragma unroll
        for (int y = 0; y < 4; y++)
        {
            src_index = mad24(y, src_step, src_index0);
            if (sy + y >= 0 && sy + y < src_rows)
            {
                xsum = (WT)(0);
                if (sx >= 0 && sx + 4 < src_cols)
                {
#if depth == 0 && cn == 1
                    uchar4 value = vload4(0, srcptr + src_index);
                    xsum = dot(convert_float4(value), (float4)(coeffs_x[0], coeffs_x[1], coeffs_x[2], coeffs_x[3]));
#else
                    #pragma unroll
                    for (int x = 0; x < 4; x++)
                        xsum = fma(convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))), coeffs_x[x], xsum);
#endif
                }
                else
                {
                    // Row partially outside: substitute the border value for
                    // the out-of-range tap(s).
                    #pragma unroll
                    for (int x = 0; x < 4; x++)
                        xsum = fma(sx + x >= 0 && sx + x < src_cols ?
                                   convertToWT(loadpix(srcptr + mad24(x, pixsize, src_index))) : scalar, coeffs_x[x], xsum);
                }
                sum = fma(xsum, coeffs_y[y], sum);
            }
            else
                sum = fma(scalar, coeffs_y[y], sum); // whole row outside the image
        }

        storepix(convertToT(sum), dstptr + dst_index);
#endif
    }
}

#endif