1 //////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved. 14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // @Authors 18 // Shengen Yan,yanshengen (a] gmail.com 19 // 20 // Redistribution and use in source and binary forms, with or without modification, 21 // are permitted provided that the following conditions are met: 22 // 23 // * Redistribution's of source code must retain the above copyright notice, 24 // this list of conditions and the following disclaimer. 25 // 26 // * Redistribution's in binary form must reproduce the above copyright notice, 27 // this list of conditions and the following disclaimer in the documentation 28 // and/or other materials provided with the distribution. 29 // 30 // * The name of the copyright holders may not be used to endorse or promote products 31 // derived from this software without specific prior written permission. 32 // 33 // This software is provided by the copyright holders and contributors as is and 34 // any express or implied warranties, including, but not limited to, the implied 35 // warranties of merchantability and fitness for a particular purpose are disclaimed. 36 // In no event shall the Intel Corporation or contributors be liable for any direct, 37 // indirect, incidental, special, exemplary, or consequential damages 38 // (including, but not limited to, procurement of substitute goods or services; 39 // loss of use, data, or profits; or business interruption) however caused 40 // and on any theory of liability, whether in contract, strict liability, 41 // or tort (including negligence or otherwise) arising in any way out of 42 // the use of this software, even if advised of the possibility of such damage. 43 // 44 45 #ifdef DOUBLE_SUPPORT 46 #ifdef cl_amd_fp64 47 #pragma OPENCL EXTENSION cl_amd_fp64:enable 48 #elif defined (cl_khr_fp64) 49 #pragma OPENCL EXTENSION cl_khr_fp64:enable 50 #endif 51 #endif 52 53 #if defined OP_NORM_INF_MASK 54 55 #ifdef DEPTH_0 56 #define MIN_VAL 0 57 #define MAX_VAL 255 58 #elif defined DEPTH_1 59 #define MIN_VAL -128 60 #define MAX_VAL 127 61 #elif defined DEPTH_2 62 #define MIN_VAL 0 63 #define MAX_VAL 65535 64 #elif defined DEPTH_3 65 #define MIN_VAL -32768 66 #define MAX_VAL 32767 67 #elif defined DEPTH_4 68 #define MIN_VAL INT_MIN 69 #define MAX_VAL INT_MAX 70 #elif defined DEPTH_5 71 #define MIN_VAL (-FLT_MAX) 72 #define MAX_VAL FLT_MAX 73 #elif defined DEPTH_6 74 #define MIN_VAL (-DBL_MAX) 75 #define MAX_VAL DBL_MAX 76 #endif 77 78 #define dstT srcT 79 #define dstT1 srcT1 80 81 #endif // min/max stuff 82 83 #define noconvert 84 85 #ifndef kercn 86 #define kercn 1 87 #endif 88 89 #ifdef HAVE_MASK_CONT 90 #define MASK_INDEX int mask_index = id + mask_offset; 91 #else 92 #define MASK_INDEX int mask_index = mad24(id / cols, mask_step, mask_offset + (id % cols)) 93 #endif 94 95 #if cn != 3 96 #define loadpix(addr) *(__global const srcT *)(addr) 97 #define storepix(val, addr) *(__global dstT *)(addr) = val 98 #if kercn == 1 99 #define srcTSIZE (int)sizeof(srcT) 100 #else 101 #define srcTSIZE (int)sizeof(srcT1) 102 #endif 103 #define dstTSIZE (int)sizeof(dstT) 104 #else 105 #define loadpix(addr) vload3(0, (__global const srcT1 *)(addr)) 106 #define storepix(val, addr) vstore3(val, 0, (__global dstT1 *)(addr)) 107 #define srcTSIZE ((int)sizeof(srcT1)*3) 108 #define dstTSIZE ((int)sizeof(dstT1)*3) 109 #endif 110 111 #if ddepth <= 4 112 #define SUM_ABS(a) convertFromU(abs(a)) 113 #define SUM_ABS2(a, b) convertFromU(abs_diff(a, b)) 114 #else 115 #define SUM_ABS(a) fabs(a) 116 #define SUM_ABS2(a, b) fabs(a - b) 117 #endif 118 119 #ifdef HAVE_MASK 120 #ifdef HAVE_SRC2 121 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset, __global const uchar * src2ptr, int src2_step, int src2_offset 122 #else 123 #define EXTRA_PARAMS , __global const uchar * mask, int mask_step, int mask_offset 124 #endif 125 #else 126 #ifdef HAVE_SRC2 127 #define EXTRA_PARAMS , __global const uchar * src2ptr, int src2_step, int src2_offset 128 #else 129 #define EXTRA_PARAMS 130 #endif 131 #endif 132 133 // accumulative reduction stuff 134 #if defined OP_SUM || defined OP_SUM_ABS || defined OP_SUM_SQR || defined OP_DOT 135 136 #ifdef OP_DOT 137 #if ddepth <= 4 138 #define FUNC(a, b, c) a = mad24(b, c, a) 139 #else 140 #define FUNC(a, b, c) a = mad(b, c, a) 141 #endif 142 143 #elif defined OP_SUM 144 #define FUNC(a, b) a += b 145 146 #elif defined OP_SUM_ABS 147 #define FUNC(a, b) a += SUM_ABS(b) 148 149 #elif defined OP_SUM_SQR 150 #if ddepth <= 4 151 #define FUNC(a, b) a = mad24(b, b, a) 152 #else 153 #define FUNC(a, b) a = mad(b, b, a) 154 #endif 155 #endif 156 157 #ifdef OP_CALC2 158 #define DECLARE_LOCAL_MEM \ 159 __local dstT localmem[WGS2_ALIGNED], localmem2[WGS2_ALIGNED] 160 #define DEFINE_ACCUMULATOR \ 161 dstT accumulator = (dstT)(0), accumulator2 = (dstT)(0) 162 #else 163 #define DECLARE_LOCAL_MEM \ 164 __local dstT localmem[WGS2_ALIGNED] 165 #define DEFINE_ACCUMULATOR \ 166 dstT accumulator = (dstT)(0) 167 #endif 168 169 #ifdef HAVE_SRC2 170 #ifdef OP_CALC2 171 #define PROCESS_ELEMS \ 172 dstT temp = convertToDT(loadpix(srcptr + src_index)); \ 173 dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 174 temp = SUM_ABS2(temp, temp2); \ 175 temp2 = SUM_ABS(temp2); \ 176 FUNC(accumulator2, temp2); \ 177 FUNC(accumulator, temp) 178 #else 179 #define PROCESS_ELEMS \ 180 dstT temp = convertToDT(loadpix(srcptr + src_index)); \ 181 dstT temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 182 temp = SUM_ABS2(temp, temp2); \ 183 FUNC(accumulator, temp) 184 #endif 185 #else 186 #define PROCESS_ELEMS \ 187 dstT temp = convertToDT(loadpix(srcptr + src_index)); \ 188 FUNC(accumulator, temp) 189 #endif 190 191 #ifdef HAVE_MASK 192 #define REDUCE_GLOBAL \ 193 MASK_INDEX; \ 194 if (mask[mask_index]) \ 195 { \ 196 PROCESS_ELEMS; \ 197 } 198 #elif defined OP_DOT 199 200 #ifdef HAVE_SRC2_CONT 201 #define SRC2_INDEX int src2_index = mad24(id, srcTSIZE, src2_offset); 202 #else 203 #define SRC2_INDEX int src2_index = mad24(id / cols, src2_step, mad24(id % cols, srcTSIZE, src2_offset)) 204 #endif 205 206 #if kercn == 1 207 #define REDUCE_GLOBAL \ 208 SRC2_INDEX; \ 209 dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 210 FUNC(accumulator, temp, temp2) 211 #elif kercn == 2 212 #define REDUCE_GLOBAL \ 213 SRC2_INDEX; \ 214 dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 215 FUNC(accumulator, temp.s0, temp2.s0); \ 216 FUNC(accumulator, temp.s1, temp2.s1) 217 #elif kercn == 4 218 #define REDUCE_GLOBAL \ 219 SRC2_INDEX; \ 220 dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 221 FUNC(accumulator, temp.s0, temp2.s0); \ 222 FUNC(accumulator, temp.s1, temp2.s1); \ 223 FUNC(accumulator, temp.s2, temp2.s2); \ 224 FUNC(accumulator, temp.s3, temp2.s3) 225 #elif kercn == 8 226 #define REDUCE_GLOBAL \ 227 SRC2_INDEX; \ 228 dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 229 FUNC(accumulator, temp.s0, temp2.s0); \ 230 FUNC(accumulator, temp.s1, temp2.s1); \ 231 FUNC(accumulator, temp.s2, temp2.s2); \ 232 FUNC(accumulator, temp.s3, temp2.s3); \ 233 FUNC(accumulator, temp.s4, temp2.s4); \ 234 FUNC(accumulator, temp.s5, temp2.s5); \ 235 FUNC(accumulator, temp.s6, temp2.s6); \ 236 FUNC(accumulator, temp.s7, temp2.s7) 237 #elif kercn == 16 238 #define REDUCE_GLOBAL \ 239 SRC2_INDEX; \ 240 dstTK temp = convertToDT(loadpix(srcptr + src_index)), temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 241 FUNC(accumulator, temp.s0, temp2.s0); \ 242 FUNC(accumulator, temp.s1, temp2.s1); \ 243 FUNC(accumulator, temp.s2, temp2.s2); \ 244 FUNC(accumulator, temp.s3, temp2.s3); \ 245 FUNC(accumulator, temp.s4, temp2.s4); \ 246 FUNC(accumulator, temp.s5, temp2.s5); \ 247 FUNC(accumulator, temp.s6, temp2.s6); \ 248 FUNC(accumulator, temp.s7, temp2.s7); \ 249 FUNC(accumulator, temp.s8, temp2.s8); \ 250 FUNC(accumulator, temp.s9, temp2.s9); \ 251 FUNC(accumulator, temp.sA, temp2.sA); \ 252 FUNC(accumulator, temp.sB, temp2.sB); \ 253 FUNC(accumulator, temp.sC, temp2.sC); \ 254 FUNC(accumulator, temp.sD, temp2.sD); \ 255 FUNC(accumulator, temp.sE, temp2.sE); \ 256 FUNC(accumulator, temp.sF, temp2.sF) 257 #endif 258 259 #else // sum or norm with 2 args 260 #ifdef HAVE_SRC2 261 #ifdef OP_CALC2 // norm relative 262 #if kercn == 1 263 #define REDUCE_GLOBAL \ 264 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 265 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 266 temp = SUM_ABS2(temp, temp2); \ 267 temp2 = SUM_ABS(temp2); \ 268 FUNC(accumulator, temp); \ 269 FUNC(accumulator2, temp2) 270 #elif kercn == 2 271 #define REDUCE_GLOBAL \ 272 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 273 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 274 temp = SUM_ABS2(temp, temp2); \ 275 temp2 = SUM_ABS(temp2); \ 276 FUNC(accumulator, temp.s0); \ 277 FUNC(accumulator, temp.s1); \ 278 FUNC(accumulator2, temp2.s0); \ 279 FUNC(accumulator2, temp2.s1) 280 #elif kercn == 4 281 #define REDUCE_GLOBAL \ 282 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 283 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 284 temp = SUM_ABS2(temp, temp2); \ 285 temp2 = SUM_ABS(temp2); \ 286 FUNC(accumulator, temp.s0); \ 287 FUNC(accumulator, temp.s1); \ 288 FUNC(accumulator, temp.s2); \ 289 FUNC(accumulator, temp.s3); \ 290 FUNC(accumulator2, temp2.s0); \ 291 FUNC(accumulator2, temp2.s1); \ 292 FUNC(accumulator2, temp2.s2); \ 293 FUNC(accumulator2, temp2.s3) 294 #elif kercn == 8 295 #define REDUCE_GLOBAL \ 296 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 297 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 298 temp = SUM_ABS2(temp, temp2); \ 299 temp2 = SUM_ABS(temp2); \ 300 FUNC(accumulator, temp.s0); \ 301 FUNC(accumulator, temp.s1); \ 302 FUNC(accumulator, temp.s2); \ 303 FUNC(accumulator, temp.s3); \ 304 FUNC(accumulator, temp.s4); \ 305 FUNC(accumulator, temp.s5); \ 306 FUNC(accumulator, temp.s6); \ 307 FUNC(accumulator, temp.s7); \ 308 FUNC(accumulator2, temp2.s0); \ 309 FUNC(accumulator2, temp2.s1); \ 310 FUNC(accumulator2, temp2.s2); \ 311 FUNC(accumulator2, temp2.s3); \ 312 FUNC(accumulator2, temp2.s4); \ 313 FUNC(accumulator2, temp2.s5); \ 314 FUNC(accumulator2, temp2.s6); \ 315 FUNC(accumulator2, temp2.s7) 316 #elif kercn == 16 317 #define REDUCE_GLOBAL \ 318 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 319 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 320 temp = SUM_ABS2(temp, temp2); \ 321 temp2 = SUM_ABS(temp2); \ 322 FUNC(accumulator, temp.s0); \ 323 FUNC(accumulator, temp.s1); \ 324 FUNC(accumulator, temp.s2); \ 325 FUNC(accumulator, temp.s3); \ 326 FUNC(accumulator, temp.s4); \ 327 FUNC(accumulator, temp.s5); \ 328 FUNC(accumulator, temp.s6); \ 329 FUNC(accumulator, temp.s7); \ 330 FUNC(accumulator, temp.s8); \ 331 FUNC(accumulator, temp.s9); \ 332 FUNC(accumulator, temp.sA); \ 333 FUNC(accumulator, temp.sB); \ 334 FUNC(accumulator, temp.sC); \ 335 FUNC(accumulator, temp.sD); \ 336 FUNC(accumulator, temp.sE); \ 337 FUNC(accumulator, temp.sF); \ 338 FUNC(accumulator2, temp2.s0); \ 339 FUNC(accumulator2, temp2.s1); \ 340 FUNC(accumulator2, temp2.s2); \ 341 FUNC(accumulator2, temp2.s3); \ 342 FUNC(accumulator2, temp2.s4); \ 343 FUNC(accumulator2, temp2.s5); \ 344 FUNC(accumulator2, temp2.s6); \ 345 FUNC(accumulator2, temp2.s7); \ 346 FUNC(accumulator2, temp2.s8); \ 347 FUNC(accumulator2, temp2.s9); \ 348 FUNC(accumulator2, temp2.sA); \ 349 FUNC(accumulator2, temp2.sB); \ 350 FUNC(accumulator2, temp2.sC); \ 351 FUNC(accumulator2, temp2.sD); \ 352 FUNC(accumulator2, temp2.sE); \ 353 FUNC(accumulator2, temp2.sF) 354 #endif 355 #else // norm with 2 args 356 #if kercn == 1 357 #define REDUCE_GLOBAL \ 358 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 359 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 360 temp = SUM_ABS2(temp, temp2); \ 361 FUNC(accumulator, temp) 362 #elif kercn == 2 363 #define REDUCE_GLOBAL \ 364 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 365 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 366 temp = SUM_ABS2(temp, temp2); \ 367 FUNC(accumulator, temp.s0); \ 368 FUNC(accumulator, temp.s1) 369 #elif kercn == 4 370 #define REDUCE_GLOBAL \ 371 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 372 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 373 temp = SUM_ABS2(temp, temp2); \ 374 FUNC(accumulator, temp.s0); \ 375 FUNC(accumulator, temp.s1); \ 376 FUNC(accumulator, temp.s2); \ 377 FUNC(accumulator, temp.s3) 378 #elif kercn == 8 379 #define REDUCE_GLOBAL \ 380 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 381 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 382 temp = SUM_ABS2(temp, temp2); \ 383 FUNC(accumulator, temp.s0); \ 384 FUNC(accumulator, temp.s1); \ 385 FUNC(accumulator, temp.s2); \ 386 FUNC(accumulator, temp.s3); \ 387 FUNC(accumulator, temp.s4); \ 388 FUNC(accumulator, temp.s5); \ 389 FUNC(accumulator, temp.s6); \ 390 FUNC(accumulator, temp.s7) 391 #elif kercn == 16 392 #define REDUCE_GLOBAL \ 393 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 394 dstTK temp2 = convertToDT(loadpix(src2ptr + src2_index)); \ 395 temp = SUM_ABS2(temp, temp2); \ 396 FUNC(accumulator, temp.s0); \ 397 FUNC(accumulator, temp.s1); \ 398 FUNC(accumulator, temp.s2); \ 399 FUNC(accumulator, temp.s3); \ 400 FUNC(accumulator, temp.s4); \ 401 FUNC(accumulator, temp.s5); \ 402 FUNC(accumulator, temp.s6); \ 403 FUNC(accumulator, temp.s7); \ 404 FUNC(accumulator, temp.s8); \ 405 FUNC(accumulator, temp.s9); \ 406 FUNC(accumulator, temp.sA); \ 407 FUNC(accumulator, temp.sB); \ 408 FUNC(accumulator, temp.sC); \ 409 FUNC(accumulator, temp.sD); \ 410 FUNC(accumulator, temp.sE); \ 411 FUNC(accumulator, temp.sF) 412 #endif 413 #endif 414 415 #else // sum 416 #if kercn == 1 417 #define REDUCE_GLOBAL \ 418 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 419 FUNC(accumulator, temp) 420 #elif kercn == 2 421 #define REDUCE_GLOBAL \ 422 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 423 FUNC(accumulator, temp.s0); \ 424 FUNC(accumulator, temp.s1) 425 #elif kercn == 4 426 #define REDUCE_GLOBAL \ 427 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 428 FUNC(accumulator, temp.s0); \ 429 FUNC(accumulator, temp.s1); \ 430 FUNC(accumulator, temp.s2); \ 431 FUNC(accumulator, temp.s3) 432 #elif kercn == 8 433 #define REDUCE_GLOBAL \ 434 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 435 FUNC(accumulator, temp.s0); \ 436 FUNC(accumulator, temp.s1); \ 437 FUNC(accumulator, temp.s2); \ 438 FUNC(accumulator, temp.s3); \ 439 FUNC(accumulator, temp.s4); \ 440 FUNC(accumulator, temp.s5); \ 441 FUNC(accumulator, temp.s6); \ 442 FUNC(accumulator, temp.s7) 443 #elif kercn == 16 444 #define REDUCE_GLOBAL \ 445 dstTK temp = convertToDT(loadpix(srcptr + src_index)); \ 446 FUNC(accumulator, temp.s0); \ 447 FUNC(accumulator, temp.s1); \ 448 FUNC(accumulator, temp.s2); \ 449 FUNC(accumulator, temp.s3); \ 450 FUNC(accumulator, temp.s4); \ 451 FUNC(accumulator, temp.s5); \ 452 FUNC(accumulator, temp.s6); \ 453 FUNC(accumulator, temp.s7); \ 454 FUNC(accumulator, temp.s8); \ 455 FUNC(accumulator, temp.s9); \ 456 FUNC(accumulator, temp.sA); \ 457 FUNC(accumulator, temp.sB); \ 458 FUNC(accumulator, temp.sC); \ 459 FUNC(accumulator, temp.sD); \ 460 FUNC(accumulator, temp.sE); \ 461 FUNC(accumulator, temp.sF) 462 #endif 463 #endif 464 #endif 465 466 #ifdef OP_CALC2 467 #define SET_LOCAL_1 \ 468 localmem[lid] = accumulator; \ 469 localmem2[lid] = accumulator2 470 #define REDUCE_LOCAL_1 \ 471 localmem[lid - WGS2_ALIGNED] += accumulator; \ 472 localmem2[lid - WGS2_ALIGNED] += accumulator2 473 #define REDUCE_LOCAL_2 \ 474 localmem[lid] += localmem[lid2]; \ 475 localmem2[lid] += localmem2[lid2] 476 #define CALC_RESULT \ 477 storepix(localmem[0], dstptr + dstTSIZE * gid); \ 478 storepix(localmem2[0], dstptr + mad24(groupnum, dstTSIZE, dstTSIZE * gid)) 479 #else 480 #define SET_LOCAL_1 \ 481 localmem[lid] = accumulator 482 #define REDUCE_LOCAL_1 \ 483 localmem[lid - WGS2_ALIGNED] += accumulator 484 #define REDUCE_LOCAL_2 \ 485 localmem[lid] += localmem[lid2] 486 #define CALC_RESULT \ 487 storepix(localmem[0], dstptr + dstTSIZE * gid) 488 #endif 489 490 // countNonZero stuff 491 #elif defined OP_COUNT_NON_ZERO 492 #define dstT int 493 #define DECLARE_LOCAL_MEM \ 494 __local dstT localmem[WGS2_ALIGNED] 495 #define DEFINE_ACCUMULATOR \ 496 dstT accumulator = (dstT)(0); \ 497 srcT1 zero = (srcT1)(0), one = (srcT1)(1) 498 #if kercn == 1 499 #define REDUCE_GLOBAL \ 500 accumulator += loadpix(srcptr + src_index) == zero ? zero : one 501 #elif kercn == 2 502 #define REDUCE_GLOBAL \ 503 srcT value = loadpix(srcptr + src_index); \ 504 accumulator += value.s0 == zero ? zero : one; \ 505 accumulator += value.s1 == zero ? zero : one 506 #elif kercn == 4 507 #define REDUCE_GLOBAL \ 508 srcT value = loadpix(srcptr + src_index); \ 509 accumulator += value.s0 == zero ? zero : one; \ 510 accumulator += value.s1 == zero ? zero : one; \ 511 accumulator += value.s2 == zero ? zero : one; \ 512 accumulator += value.s3 == zero ? zero : one 513 #elif kercn == 8 514 #define REDUCE_GLOBAL \ 515 srcT value = loadpix(srcptr + src_index); \ 516 accumulator += value.s0 == zero ? zero : one; \ 517 accumulator += value.s1 == zero ? zero : one; \ 518 accumulator += value.s2 == zero ? zero : one; \ 519 accumulator += value.s3 == zero ? zero : one; \ 520 accumulator += value.s4 == zero ? zero : one; \ 521 accumulator += value.s5 == zero ? zero : one; \ 522 accumulator += value.s6 == zero ? zero : one; \ 523 accumulator += value.s7 == zero ? zero : one 524 #elif kercn == 16 525 #define REDUCE_GLOBAL \ 526 srcT value = loadpix(srcptr + src_index); \ 527 accumulator += value.s0 == zero ? zero : one; \ 528 accumulator += value.s1 == zero ? zero : one; \ 529 accumulator += value.s2 == zero ? zero : one; \ 530 accumulator += value.s3 == zero ? zero : one; \ 531 accumulator += value.s4 == zero ? zero : one; \ 532 accumulator += value.s5 == zero ? zero : one; \ 533 accumulator += value.s6 == zero ? zero : one; \ 534 accumulator += value.s7 == zero ? zero : one; \ 535 accumulator += value.s8 == zero ? zero : one; \ 536 accumulator += value.s9 == zero ? zero : one; \ 537 accumulator += value.sA == zero ? zero : one; \ 538 accumulator += value.sB == zero ? zero : one; \ 539 accumulator += value.sC == zero ? zero : one; \ 540 accumulator += value.sD == zero ? zero : one; \ 541 accumulator += value.sE == zero ? zero : one; \ 542 accumulator += value.sF == zero ? zero : one 543 #endif 544 545 #define SET_LOCAL_1 \ 546 localmem[lid] = accumulator 547 #define REDUCE_LOCAL_1 \ 548 localmem[lid - WGS2_ALIGNED] += accumulator 549 #define REDUCE_LOCAL_2 \ 550 localmem[lid] += localmem[lid2] 551 #define CALC_RESULT \ 552 storepix(localmem[0], dstptr + dstTSIZE * gid) 553 554 #else 555 #error "No operation" 556 #endif 557 558 #ifdef OP_DOT 559 #undef EXTRA_PARAMS 560 #define EXTRA_PARAMS , __global uchar * src2ptr, int src2_step, int src2_offset 561 #endif 562 563 __kernel void reduce(__global const uchar * srcptr, int src_step, int src_offset, int cols, 564 int total, int groupnum, __global uchar * dstptr EXTRA_PARAMS) 565 { 566 int lid = get_local_id(0); 567 int gid = get_group_id(0); 568 int id = get_global_id(0) * kercn; 569 570 srcptr += src_offset; 571 #ifdef HAVE_SRC2 572 src2ptr += src2_offset; 573 #endif 574 575 DECLARE_LOCAL_MEM; 576 DEFINE_ACCUMULATOR; 577 578 for (int grain = groupnum * WGS * kercn; id < total; id += grain) 579 { 580 #ifdef HAVE_SRC_CONT 581 int src_index = id * srcTSIZE; 582 #else 583 int src_index = mad24(id / cols, src_step, mul24(id % cols, srcTSIZE)); 584 #endif 585 #ifdef HAVE_SRC2 586 #ifdef HAVE_SRC2_CONT 587 int src2_index = id * srcTSIZE; 588 #else 589 int src2_index = mad24(id / cols, src2_step, mul24(id % cols, srcTSIZE)); 590 #endif 591 #endif 592 REDUCE_GLOBAL; 593 } 594 595 if (lid < WGS2_ALIGNED) 596 { 597 SET_LOCAL_1; 598 } 599 barrier(CLK_LOCAL_MEM_FENCE); 600 601 if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED) 602 { 603 REDUCE_LOCAL_1; 604 } 605 barrier(CLK_LOCAL_MEM_FENCE); 606 607 for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1) 608 { 609 if (lid < lsize) 610 { 611 int lid2 = lsize + lid; 612 REDUCE_LOCAL_2; 613 } 614 barrier(CLK_LOCAL_MEM_FENCE); 615 } 616 617 if (lid == 0) 618 { 619 CALC_RESULT; 620 } 621 } 622