1 // License Agreement 2 // For Open Source Computer Vision Library 3 // 4 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 5 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 6 // Third party copyrights are property of their respective owners. 7 // 8 // Redistribution and use in source and binary forms, with or without modification, 9 // are permitted provided that the following conditions are met: 10 // 11 // * Redistribution's of source code must retain the above copyright notice, 12 // this list of conditions and the following disclaimer. 13 // 14 // * Redistribution's in binary form must reproduce the above copyright notice, 15 // this list of conditions and the following disclaimer in the documentation 16 // and/or other materials provided with the distribution. 17 // 18 // * The name of the copyright holders may not be used to endorse or promote products 19 // derived from this software without specific prior written permission. 20 // 21 // This software is provided by the copyright holders and contributors as is and 22 // any express or implied warranties, including, but not limited to, the implied 23 // warranties of merchantability and fitness for a particular purpose are disclaimed. 24 // In no event shall the Intel Corporation or contributors be liable for any direct, 25 // indirect, incidental, special, exemplary, or consequential damages 26 // (including, but not limited to, procurement of substitute goods or services; 27 // loss of use, data, or profits; or business interruption) however caused 28 // and on any theory of liability, whether in contract, strict liability, 29 // or tort (including negligence or otherwise) arising in any way out of 30 // the use of this software, even if advised of the possibility of such damage. 31 32 #if cn != 3 33 #define loadpix(addr) *(__global const T *)(addr) 34 #define TSIZE (int)sizeof(T) 35 #else 36 #define loadpix(addr) vload3(0, (__global const T1 *)(addr)) 37 #define TSIZE ((int)sizeof(T1)*3) 38 #endif 39 40 #define SQSUMS_PTR(ox, oy) mad24(y + oy, src_sqsums_step, mad24(x + ox, cn, src_sqsums_offset)) 41 #define SUMS_PTR(ox, oy) mad24(y + oy, src_sums_step, mad24(x + ox, cn, src_sums_offset)) 42 #define SUMS(ox, oy) mad24(y+oy, src_sums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sums_offset)) 43 #define SQ_SUMS(ox, oy) mad24(y+oy, src_sqsums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sqsums_offset)) 44 45 inline float normAcc(float num, float denum) 46 { 47 if (fabs(num) < denum) 48 return num / denum; 49 if (fabs(num) < denum * 1.125f) 50 return num > 0 ? 1 : -1; 51 return 0; 52 } 53 54 inline float normAcc_SQDIFF(float num, float denum) 55 { 56 if (fabs(num) < denum) 57 return num / denum; 58 if (fabs(num) < denum * 1.125f) 59 return num > 0 ? 1 : -1; 60 return 1; 61 } 62 63 #define noconvert 64 65 #if cn == 1 66 #define convertToDT(value) (float)(value) 67 #elif cn == 2 68 #define convertToDT(value) (float)(value.x + value.y) 69 #elif cn == 3 70 #define convertToDT(value) (float)(value.x + value.y + value.z) 71 #elif cn == 4 72 #define convertToDT(value) (float)(value.x + value.y + value.z + value.w) 73 #else 74 #error "cn should be 1-4" 75 #endif 76 77 #ifdef CALC_SUM 78 79 __kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset, 80 int cols, int total, __global float * dst) 81 { 82 int lid = get_local_id(0), id = get_global_id(0); 83 84 __local WT localmem[WGS2_ALIGNED]; 85 WT accumulator = (WT)(0), tmp; 86 87 for ( ; id < total; id += WGS) 88 { 89 int src_index = mad24(id / cols, src_step, mad24(id % cols, TSIZE, src_offset)); 90 T src = loadpix(srcptr + src_index); 91 92 tmp = convertToWT(src); 93 94 accumulator = mad(tmp, tmp, accumulator); 95 } 96 97 if (lid < WGS2_ALIGNED) 98 localmem[lid] = accumulator; 99 barrier(CLK_LOCAL_MEM_FENCE); 100 101 if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED) 102 localmem[lid - WGS2_ALIGNED] += accumulator; 103 barrier(CLK_LOCAL_MEM_FENCE); 104 105 for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1) 106 { 107 if (lid < lsize) 108 { 109 int lid2 = lsize + lid; 110 localmem[lid] += localmem[lid2]; 111 } 112 barrier(CLK_LOCAL_MEM_FENCE); 113 } 114 115 if (lid == 0) 116 dst[0] = convertToDT(localmem[0]); 117 } 118 119 #elif defined FIRST_CHANNEL 120 121 __kernel void extractFirstChannel( const __global uchar* img, int img_step, int img_offset, 122 __global uchar* res, int res_step, int res_offset, int rows, int cols) 123 { 124 int x = get_global_id(0); 125 int y = get_global_id(1)*PIX_PER_WI_Y; 126 127 if(x < cols ) 128 { 129 #pragma unroll 130 for (int cy=0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y) 131 { 132 T1 image = *(__global const T1*)(img + mad24(y, img_step, mad24(x, (int)sizeof(T1)*cn, img_offset)));; 133 int res_idx = mad24(y, res_step, mad24(x, (int)sizeof(float), res_offset)); 134 *(__global float *)(res + res_idx) = image; 135 } 136 } 137 } 138 139 #elif defined CCORR 140 141 #if cn==1 && PIX_PER_WI_X==4 142 143 __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset, 144 __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, 145 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) 146 { 147 int x0 = get_global_id(0)*PIX_PER_WI_X; 148 int y = get_global_id(1); 149 150 if (y < dst_rows) 151 { 152 if (x0 + PIX_PER_WI_X <= dst_cols) 153 { 154 WT sum = (WT)(0); 155 156 int ind = mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset)); 157 __global const T1 * template = (__global const T1*)(templateptr + template_offset); 158 159 for (int i = 0; i < template_rows; ++i) 160 { 161 for (int j = 0; j < template_cols; ++j) 162 { 163 T temp = (T)(template[j]); 164 T src = vload4(0, (__global const T1*)(srcptr + ind + j*(int)sizeof(T1))); 165 166 sum = mad(convertToWT(src), convertToWT(temp), sum); 167 168 } 169 ind += src_step; 170 template = (__global const T1 *)((__global const uchar *)template + template_step); 171 } 172 173 T temp = (T)(template[0]); 174 int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset)); 175 *(__global float4 *)(dst + dst_idx) = convert_float4(sum); 176 } 177 else 178 { 179 WT1 sum [PIX_PER_WI_X]; 180 #pragma unroll 181 for (int i=0; i < PIX_PER_WI_X; i++) sum[i] = 0; 182 183 __global const T1 * src = (__global const T1 *)(srcptr + mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset))); 184 __global const T1 * template = (__global const T1 *)(templateptr + template_offset); 185 186 for (int i = 0; i < template_rows; ++i) 187 { 188 for (int j = 0; j < template_cols; ++j) 189 { 190 #pragma unroll 191 for (int cx=0, x = x0; cx < PIX_PER_WI_X && x < dst_cols; ++cx, ++x) 192 { 193 sum[cx] = mad(convertToWT1(src[j+cx]), convertToWT1(template[j]), sum[cx]); 194 } 195 } 196 197 src = (__global const T1 *)((__global const uchar *)src + src_step); 198 template = (__global const T1 *)((__global const uchar *)template + template_step); 199 } 200 201 #pragma unroll 202 for (int cx=0; cx < PIX_PER_WI_X && x0 < dst_cols; ++cx, ++x0) 203 { 204 int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset)); 205 *(__global float *)(dst + dst_idx) = convertToDT(sum[cx]); 206 } 207 } 208 } 209 } 210 211 #else 212 213 __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset, 214 __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, 215 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) 216 { 217 int x = get_global_id(0); 218 int y = get_global_id(1); 219 220 if (x < dst_cols && y < dst_rows) 221 { 222 WT sum = (WT)(0); 223 224 for (int i = 0; i < template_rows; ++i) 225 { 226 for (int j = 0; j < template_cols; ++j) 227 { 228 T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset))); 229 T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset))); 230 231 sum = mad(convertToWT(src), convertToWT(template), sum); 232 } 233 } 234 235 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 236 *(__global float *)(dst + dst_idx) = convertToDT(sum); 237 } 238 } 239 #endif 240 241 #elif defined CCORR_NORMED 242 243 __kernel void matchTemplate_CCORR_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, 244 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 245 int template_rows, int template_cols, __global const float * template_sqsum) 246 { 247 int x = get_global_id(0); 248 int y = get_global_id(1); 249 250 if (x < dst_cols && y < dst_rows) 251 { 252 __global const float * sqsum = (__global const float *)(src_sqsums); 253 254 src_sqsums_step /= sizeof(float); 255 src_sqsums_offset /= sizeof(float); 256 float image_sqsum_ = (float)(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)] - 257 sqsum[SQSUMS_PTR(0, template_rows)] + sqsum[SQSUMS_PTR(0, 0)]); 258 259 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 260 __global float * dstult = (__global float *)(dst + dst_idx); 261 *dstult = normAcc(*dstult, sqrt(image_sqsum_ * template_sqsum[0])); 262 } 263 } 264 265 #elif defined SQDIFF 266 267 __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset, 268 __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols, 269 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols) 270 { 271 int x = get_global_id(0); 272 int y = get_global_id(1); 273 274 if (x < dst_cols && y < dst_rows) 275 { 276 WT sum = (WT)(0), value; 277 278 for (int i = 0; i < template_rows; ++i) 279 { 280 for (int j = 0; j < template_cols; ++j) 281 { 282 T src = loadpix(srcptr + mad24(y+i, src_step, mad24(x+j, TSIZE, src_offset))); 283 T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset))); 284 285 value = convertToWT(src) - convertToWT(template); 286 287 sum = mad(value, value, sum); 288 } 289 } 290 291 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 292 *(__global float *)(dst + dst_idx) = convertToDT(sum); 293 } 294 } 295 296 #elif defined SQDIFF_PREPARED 297 298 __kernel void matchTemplate_Prepared_SQDIFF(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, 299 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 300 int template_rows, int template_cols, __global const float * template_sqsum) 301 { 302 int x = get_global_id(0); 303 int y = get_global_id(1); 304 305 if (x < dst_cols && y < dst_rows) 306 { 307 src_sqsums_step /= sizeof(float); 308 src_sqsums_offset /= sizeof(float); 309 310 __global const float * sqsum = (__global const float *)(src_sqsums); 311 float image_sqsum_ = (float)( 312 (sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) - 313 (sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)])); 314 float template_sqsum_value = template_sqsum[0]; 315 316 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 317 __global float * dstult = (__global float *)(dst + dst_idx); 318 *dstult = image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value; 319 } 320 } 321 322 #elif defined SQDIFF_NORMED 323 324 __kernel void matchTemplate_SQDIFF_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, 325 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 326 int template_rows, int template_cols, __global const float * template_sqsum) 327 { 328 int x = get_global_id(0); 329 int y = get_global_id(1); 330 331 if (x < dst_cols && y < dst_rows) 332 { 333 src_sqsums_step /= sizeof(float); 334 src_sqsums_offset /= sizeof(float); 335 336 __global const float * sqsum = (__global const float *)(src_sqsums); 337 float image_sqsum_ = (float)( 338 (sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) - 339 (sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)])); 340 float template_sqsum_value = template_sqsum[0]; 341 342 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 343 __global float * dstult = (__global float *)(dst + dst_idx); 344 *dstult = normAcc_SQDIFF(image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value, sqrt(image_sqsum_ * template_sqsum_value)); 345 } 346 } 347 348 #elif defined CCOEFF 349 350 #if cn == 1 351 352 __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, 353 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 354 int template_rows, int template_cols, float template_sum) 355 { 356 int x = get_global_id(0); 357 int y = get_global_id(1); 358 359 if (x < dst_cols && y < dst_rows) 360 { 361 __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); 362 363 int step = src_sums_step/(int)sizeof(T); 364 365 T image_sum = (T)(0), value; 366 367 value = (T)(sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]); 368 369 image_sum = mad(value, template_sum , image_sum); 370 371 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 372 *(__global float *)(dst + dst_idx) -= convertToDT(image_sum); 373 } 374 } 375 376 #elif cn==3 377 378 __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, 379 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 380 int template_rows, int template_cols, float4 template_sum) 381 { 382 int x = get_global_id(0); 383 int y = get_global_id(1); 384 385 if (x < dst_cols && y < dst_rows) 386 { 387 T image_sum = (T)(0), value, temp_sum; 388 389 temp_sum.x = template_sum.x; 390 temp_sum.y = template_sum.y; 391 temp_sum.z = template_sum.z; 392 393 value = vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, template_rows))); 394 value -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, template_rows))); 395 value -= vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, 0))); 396 value += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0))); 397 398 image_sum = mad(value, temp_sum , 0); 399 400 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 401 *(__global float *)(dst + dst_idx) -= convertToDT(image_sum); 402 } 403 } 404 405 #elif (cn==2 || cn==4) 406 407 __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, 408 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 409 int template_rows, int template_cols, float4 template_sum) 410 { 411 int x = get_global_id(0); 412 int y = get_global_id(1); 413 414 if (x < dst_cols && y < dst_rows) 415 { 416 __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); 417 418 int step = src_sums_step/(int)sizeof(T); 419 420 T image_sum = (T)(0), value, temp_sum; 421 422 #if cn==2 423 temp_sum.x = template_sum.x; 424 temp_sum.y = template_sum.y; 425 #else 426 temp_sum = template_sum; 427 #endif 428 429 value = (sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]); 430 431 image_sum = mad(value, temp_sum , image_sum); 432 433 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 434 *(__global float *)(dst + dst_idx) -= convertToDT(image_sum); 435 } 436 } 437 438 #else 439 #error "cn should be 1-4" 440 #endif 441 442 #elif defined CCOEFF_NORMED 443 444 #if cn == 1 445 446 __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, 447 __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, 448 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 449 int t_rows, int t_cols, float weight, float template_sum, float template_sqsum) 450 { 451 int x = get_global_id(0); 452 int y = get_global_id(1); 453 454 float sum_[2]; 455 float sqsum_[2]; 456 457 458 if (x < dst_cols && y < dst_rows) 459 { 460 int step = src_sums_step/(int)sizeof(T); 461 462 __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); 463 __global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset))); 464 465 T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0]; 466 T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0]; 467 468 float num = convertToDT(mad(value_sum, template_sum, 0)); 469 470 value_sqsum -= weight * value_sum * value_sum; 471 float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); 472 473 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 474 __global float * dstult = (__global float *)(dst+dst_idx); 475 *dstult = normAcc((*dstult) - num, denum); 476 } 477 } 478 479 #elif cn==3 480 481 __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, 482 __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, 483 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 484 int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum) 485 { 486 int x = get_global_id(0); 487 int y = get_global_id(1); 488 489 if (x < dst_cols && y < dst_rows) 490 { 491 int step = src_sums_step/(int)sizeof(T); 492 493 T temp_sum, value_sum, value_sqsum; 494 495 temp_sum.x = template_sum.x; 496 temp_sum.y = template_sum.y; 497 temp_sum.z = template_sum.z; 498 499 value_sum = vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, t_rows))); 500 value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, t_rows))); 501 value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, 0))); 502 value_sum += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0))); 503 504 value_sqsum = vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, t_rows))); 505 value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, t_rows))); 506 value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, 0))); 507 value_sqsum += vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, 0))); 508 509 float num = convertToDT(mad(value_sum, temp_sum, 0)); 510 511 value_sqsum -= weight * value_sum * value_sum; 512 float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); 513 514 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 515 __global float * dstult = (__global float *)(dst+dst_idx); 516 *dstult = normAcc((*dstult) - num, denum); 517 } 518 } 519 520 #elif (cn==2 || cn==4) 521 522 __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset, 523 __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset, 524 __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols, 525 int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum) 526 { 527 int x = get_global_id(0); 528 int y = get_global_id(1); 529 530 if (x < dst_cols && y < dst_rows) 531 { 532 int step = src_sums_step/(int)sizeof(T); 533 534 T temp_sum; 535 536 __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset))); 537 __global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset))); 538 539 T value_sum = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0]; 540 T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0]; 541 542 #if cn==2 543 temp_sum.x = template_sum.x; 544 temp_sum.y = template_sum.y; 545 #else 546 temp_sum = template_sum; 547 #endif 548 549 float num = convertToDT(mad(value_sum, temp_sum, 0)); 550 551 value_sqsum -= weight * value_sum * value_sum; 552 float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0)); 553 554 int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset)); 555 __global float * dstult = (__global float *)(dst+dst_idx); 556 *dstult = normAcc((*dstult) - num, denum); 557 } 558 } 559 560 #else 561 #error "cn should be 1-4" 562 #endif 563 564 #endif 565