1 ///////////////////////////// OpenCL kernels for face detection ////////////////////////////// 2 ////////////////////////////// see the opencv/doc/license.txt /////////////////////////////// 3 4 // 5 // the code has been derived from the OpenCL Haar cascade kernel by 6 // 7 // Niko Li, newlife20080214 (a] gmail.com 8 // Wang Weiyan, wangweiyanster (a] gmail.com 9 // Jia Haipeng, jiahaipeng95 (a] gmail.com 10 // Nathan, liujun (a] multicorewareinc.com 11 // Peng Xiao, pengxiao (a] outlook.com 12 // Erping Pang, erping (a] multicorewareinc.com 13 // 14 15 #ifdef HAAR 16 typedef struct __attribute__((aligned(4))) OptHaarFeature 17 { 18 int4 ofs[3] __attribute__((aligned (4))); 19 float4 weight __attribute__((aligned (4))); 20 } 21 OptHaarFeature; 22 #endif 23 24 #ifdef LBP 25 typedef struct __attribute__((aligned(4))) OptLBPFeature 26 { 27 int16 ofs __attribute__((aligned (4))); 28 } 29 OptLBPFeature; 30 #endif 31 32 typedef struct __attribute__((aligned(4))) Stump 33 { 34 float4 st __attribute__((aligned (4))); 35 } 36 Stump; 37 38 typedef struct __attribute__((aligned(4))) Node 39 { 40 int4 n __attribute__((aligned (4))); 41 } 42 Node; 43 44 typedef struct __attribute__((aligned (4))) Stage 45 { 46 int first __attribute__((aligned (4))); 47 int ntrees __attribute__((aligned (4))); 48 float threshold __attribute__((aligned (4))); 49 } 50 Stage; 51 52 typedef struct __attribute__((aligned (4))) ScaleData 53 { 54 float scale __attribute__((aligned (4))); 55 int szi_width __attribute__((aligned (4))); 56 int szi_height __attribute__((aligned (4))); 57 int layer_ofs __attribute__((aligned (4))); 58 int ystep __attribute__((aligned (4))); 59 } 60 ScaleData; 61 62 #ifndef SUM_BUF_SIZE 63 #define SUM_BUF_SIZE 0 64 #endif 65 66 #ifndef NODE_COUNT 67 #define NODE_COUNT 1 68 #endif 69 70 #ifdef HAAR 71 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1))) 72 void runHaarClassifier( 73 int nscales, __global const ScaleData* scaleData, 74 __global const int* sum, 75 int _sumstep, int sumoffset, 76 __global const OptHaarFeature* optfeatures, 77 __global const Stage* stages, 78 __global const Node* nodes, 79 __global const float* leaves0, 80 81 volatile __global int* facepos, 82 int4 normrect, int sqofs, int2 windowsize) 83 { 84 int lx = get_local_id(0); 85 int ly = get_local_id(1); 86 int groupIdx = get_group_id(0); 87 int i, ngroups = get_global_size(0)/LOCAL_SIZE_X; 88 int scaleIdx, tileIdx, stageIdx; 89 int sumstep = (int)(_sumstep/sizeof(int)); 90 int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x), 91 mad24(normrect.y, sumstep, normrect.x + normrect.z), 92 mad24(normrect.y + normrect.w, sumstep, normrect.x), 93 mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z)); 94 int normarea = normrect.z * normrect.w; 95 float invarea = 1.f/normarea; 96 int lidx = ly*LOCAL_SIZE_X + lx; 97 98 #if SUM_BUF_SIZE > 0 99 int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x), 100 mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z), 101 mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x), 102 mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z)); 103 #else 104 int4 nofs = nofs0; 105 #endif 106 #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y) 107 __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1]; 108 #if SUM_BUF_SIZE > 0 109 __local int* ibuf = lstore; 110 __local int* lcount = ibuf + SUM_BUF_SIZE; 111 #else 112 __local int* lcount = lstore; 113 #endif 114 __local float* lnf = (__local float*)(lcount + 1); 115 __local float* lpartsum = lnf + LOCAL_SIZE; 116 __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE); 117 118 for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) 119 { 120 __global const ScaleData* s = scaleData + scaleIdx; 121 int ystep = s->ystep; 122 int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0)); 123 int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X, 124 (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y); 125 int totalTiles = ntiles.x*ntiles.y; 126 127 for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups ) 128 { 129 int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X; 130 int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y; 131 int ix = lx, iy = ly; 132 __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs; 133 __global const int* psum1 = psum0 + mad24(iy, sumstep, ix); 134 135 if( ix0 >= worksize.x || iy0 >= worksize.y ) 136 continue; 137 #if SUM_BUF_SIZE > 0 138 for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 ) 139 { 140 int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP; 141 vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i); 142 } 143 #endif 144 145 if( lidx == 0 ) 146 lcount[0] = 0; 147 barrier(CLK_LOCAL_MEM_FENCE); 148 149 if( ix0 + ix < worksize.x && iy0 + iy < worksize.y ) 150 { 151 #if NODE_COUNT==1 152 __global const Stump* stump = (__global const Stump*)nodes; 153 #else 154 __global const Node* node = nodes; 155 __global const float* leaves = leaves0; 156 #endif 157 #if SUM_BUF_SIZE > 0 158 __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix); 159 #else 160 __global const int* psum = psum1; 161 #endif 162 163 __global const int* psqsum = (__global const int*)(psum1 + sqofs); 164 float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea; 165 float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea; 166 float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f)); 167 nf = nf > 0 ? nf : 1.f; 168 169 for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ ) 170 { 171 int ntrees = stages[stageIdx].ntrees; 172 float s = 0.f; 173 #if NODE_COUNT==1 174 for( i = 0; i < ntrees; i++ ) 175 { 176 float4 st = stump[i].st; 177 __global const OptHaarFeature* f = optfeatures + as_int(st.x); 178 float4 weight = f->weight; 179 180 int4 ofs = f->ofs[0]; 181 sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; 182 ofs = f->ofs[1]; 183 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); 184 if( weight.z > 0 ) 185 { 186 ofs = f->ofs[2]; 187 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); 188 } 189 190 s += (sval < st.y*nf) ? st.z : st.w; 191 } 192 stump += ntrees; 193 #else 194 for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 ) 195 { 196 int idx = 0; 197 do 198 { 199 int4 n = node[idx].n; 200 __global const OptHaarFeature* f = optfeatures + n.x; 201 float4 weight = f->weight; 202 203 int4 ofs = f->ofs[0]; 204 205 sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; 206 ofs = f->ofs[1]; 207 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); 208 if( weight.z > 0 ) 209 { 210 ofs = f->ofs[2]; 211 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); 212 } 213 214 idx = (sval < as_float(n.y)*nf) ? n.z : n.w; 215 } 216 while(idx > 0); 217 s += leaves[-idx]; 218 } 219 #endif 220 221 if( s < stages[stageIdx].threshold ) 222 break; 223 } 224 225 if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) ) 226 { 227 int count = atomic_inc(lcount); 228 lbuf[count] = (int)(ix | (iy << 8)); 229 lnf[count] = nf; 230 } 231 } 232 233 for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ ) 234 { 235 barrier(CLK_LOCAL_MEM_FENCE); 236 int nrects = lcount[0]; 237 238 if( nrects == 0 ) 239 break; 240 barrier(CLK_LOCAL_MEM_FENCE); 241 if( lidx == 0 ) 242 lcount[0] = 0; 243 244 { 245 #if NODE_COUNT == 1 246 __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first; 247 #else 248 __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT; 249 __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1); 250 #endif 251 int nparts = LOCAL_SIZE / nrects; 252 int ntrees = stages[stageIdx].ntrees; 253 int ntrees_p = (ntrees + nparts - 1)/nparts; 254 int nr = lidx / nparts; 255 int partidx = -1, idxval = 0; 256 float partsum = 0.f, nf = 0.f; 257 258 if( nr < nrects ) 259 { 260 partidx = lidx % nparts; 261 idxval = lbuf[nr]; 262 nf = lnf[nr]; 263 264 { 265 int ntrees0 = ntrees_p*partidx; 266 int ntrees1 = min(ntrees0 + ntrees_p, ntrees); 267 int ix1 = idxval & 255, iy1 = idxval >> 8; 268 #if SUM_BUF_SIZE > 0 269 __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1); 270 #else 271 __global const int* psum = psum0 + mad24(iy1, sumstep, ix1); 272 #endif 273 274 #if NODE_COUNT == 1 275 for( i = ntrees0; i < ntrees1; i++ ) 276 { 277 float4 st = stump[i].st; 278 __global const OptHaarFeature* f = optfeatures + as_int(st.x); 279 float4 weight = f->weight; 280 281 int4 ofs = f->ofs[0]; 282 float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; 283 ofs = f->ofs[1]; 284 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); 285 //if( weight.z > 0 ) 286 if( fabs(weight.z) > 0 ) 287 { 288 ofs = f->ofs[2]; 289 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); 290 } 291 292 partsum += (sval < st.y*nf) ? st.z : st.w; 293 } 294 #else 295 for( i = ntrees0; i < ntrees1; i++ ) 296 { 297 int idx = 0; 298 do 299 { 300 int4 n = node[i*2 + idx].n; 301 __global const OptHaarFeature* f = optfeatures + n.x; 302 float4 weight = f->weight; 303 int4 ofs = f->ofs[0]; 304 305 float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x; 306 ofs = f->ofs[1]; 307 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval); 308 if( weight.z > 0 ) 309 { 310 ofs = f->ofs[2]; 311 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval); 312 } 313 314 idx = (sval < as_float(n.y)*nf) ? n.z : n.w; 315 } 316 while(idx > 0); 317 partsum += leaves[i*3-idx]; 318 } 319 #endif 320 } 321 } 322 lpartsum[lidx] = partsum; 323 barrier(CLK_LOCAL_MEM_FENCE); 324 325 if( partidx == 0 ) 326 { 327 float s = lpartsum[nr*nparts]; 328 for( i = 1; i < nparts; i++ ) 329 s += lpartsum[i + nr*nparts]; 330 if( s >= stages[stageIdx].threshold ) 331 { 332 int count = atomic_inc(lcount); 333 lbuf[count] = idxval; 334 lnf[count] = nf; 335 } 336 } 337 } 338 } 339 340 barrier(CLK_LOCAL_MEM_FENCE); 341 if( stageIdx == N_STAGES ) 342 { 343 int nrects = lcount[0]; 344 if( lidx < nrects ) 345 { 346 int nfaces = atomic_inc(facepos); 347 if( nfaces < MAX_FACES ) 348 { 349 volatile __global int* face = facepos + 1 + nfaces*3; 350 int val = lbuf[lidx]; 351 face[0] = scaleIdx; 352 face[1] = ix0 + (val & 255); 353 face[2] = iy0 + (val >> 8); 354 } 355 } 356 } 357 } 358 } 359 } 360 #endif 361 362 #ifdef LBP 363 #undef CALC_SUM_OFS_ 364 #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \ 365 ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3]) 366 367 __kernel void runLBPClassifierStumpSimple( 368 int nscales, __global const ScaleData* scaleData, 369 __global const int* sum, 370 int _sumstep, int sumoffset, 371 __global const OptLBPFeature* optfeatures, 372 __global const Stage* stages, 373 __global const Stump* stumps, 374 __global const int* bitsets, 375 int bitsetSize, 376 377 volatile __global int* facepos, 378 int2 windowsize) 379 { 380 int lx = get_local_id(0); 381 int ly = get_local_id(1); 382 int local_size_x = get_local_size(0); 383 int local_size_y = get_local_size(1); 384 int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0); 385 int ngroups = get_num_groups(0)*get_num_groups(1); 386 int scaleIdx, tileIdx, stageIdx; 387 int sumstep = (int)(_sumstep/sizeof(int)); 388 389 for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) 390 { 391 __global const ScaleData* s = scaleData + scaleIdx; 392 int ystep = s->ystep; 393 int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0)); 394 int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x, 395 (worksize.y/ystep + local_size_y-1)/local_size_y); 396 int totalTiles = ntiles.x*ntiles.y; 397 398 for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups ) 399 { 400 int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep; 401 int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep; 402 403 if( ix < worksize.x && iy < worksize.y ) 404 { 405 __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs; 406 __global const Stump* stump = stumps; 407 __global const int* bitset = bitsets; 408 409 for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ ) 410 { 411 int i, ntrees = stages[stageIdx].ntrees; 412 float s = 0.f; 413 for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize ) 414 { 415 float4 st = stump->st; 416 __global const OptLBPFeature* f = optfeatures + as_int(st.x); 417 int16 ofs = f->ofs; 418 419 int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); 420 421 int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 422 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 423 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 424 425 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 426 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 427 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 428 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 429 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 430 431 s += (bitset[idx] & (1 << mask)) ? st.z : st.w; 432 } 433 434 if( s < stages[stageIdx].threshold ) 435 break; 436 } 437 438 if( stageIdx == N_STAGES ) 439 { 440 int nfaces = atomic_inc(facepos); 441 if( nfaces < MAX_FACES ) 442 { 443 volatile __global int* face = facepos + 1 + nfaces*3; 444 face[0] = scaleIdx; 445 face[1] = ix; 446 face[2] = iy; 447 } 448 } 449 } 450 } 451 } 452 } 453 454 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1))) 455 void runLBPClassifierStump( 456 int nscales, __global const ScaleData* scaleData, 457 __global const int* sum, 458 int _sumstep, int sumoffset, 459 __global const OptLBPFeature* optfeatures, 460 __global const Stage* stages, 461 __global const Stump* stumps, 462 __global const int* bitsets, 463 int bitsetSize, 464 465 volatile __global int* facepos, 466 int2 windowsize) 467 { 468 int lx = get_local_id(0); 469 int ly = get_local_id(1); 470 int groupIdx = get_group_id(0); 471 int i, ngroups = get_global_size(0)/LOCAL_SIZE_X; 472 int scaleIdx, tileIdx, stageIdx; 473 int sumstep = (int)(_sumstep/sizeof(int)); 474 int lidx = ly*LOCAL_SIZE_X + lx; 475 476 #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y) 477 __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1]; 478 #if SUM_BUF_SIZE > 0 479 __local int* ibuf = lstore; 480 __local int* lcount = ibuf + SUM_BUF_SIZE; 481 #else 482 __local int* lcount = lstore; 483 #endif 484 __local float* lpartsum = (__local float*)(lcount + 1); 485 __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE); 486 487 for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- ) 488 { 489 __global const ScaleData* s = scaleData + scaleIdx; 490 int ystep = s->ystep; 491 int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0)); 492 int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X, 493 (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y); 494 int totalTiles = ntiles.x*ntiles.y; 495 496 for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups ) 497 { 498 int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X; 499 int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y; 500 int ix = lx, iy = ly; 501 __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs; 502 503 if( ix0 >= worksize.x || iy0 >= worksize.y ) 504 continue; 505 #if SUM_BUF_SIZE > 0 506 for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 ) 507 { 508 int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP; 509 vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i); 510 } 511 barrier(CLK_LOCAL_MEM_FENCE); 512 #endif 513 514 if( lidx == 0 ) 515 lcount[0] = 0; 516 barrier(CLK_LOCAL_MEM_FENCE); 517 518 if( ix0 + ix < worksize.x && iy0 + iy < worksize.y ) 519 { 520 __global const Stump* stump = stumps; 521 __global const int* bitset = bitsets; 522 #if SUM_BUF_SIZE > 0 523 __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix); 524 #else 525 __global const int* p = psum0 + mad24(iy, sumstep, ix); 526 #endif 527 528 for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ ) 529 { 530 int ntrees = stages[stageIdx].ntrees; 531 float s = 0.f; 532 for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize ) 533 { 534 float4 st = stump->st; 535 __global const OptLBPFeature* f = optfeatures + as_int(st.x); 536 int16 ofs = f->ofs; 537 538 int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); 539 540 int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 541 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 542 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 543 544 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 545 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 546 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 547 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 548 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 549 550 s += (bitset[idx] & (1 << mask)) ? st.z : st.w; 551 } 552 553 if( s < stages[stageIdx].threshold ) 554 break; 555 } 556 557 if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) ) 558 { 559 int count = atomic_inc(lcount); 560 lbuf[count] = (int)(ix | (iy << 8)); 561 } 562 } 563 564 for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ ) 565 { 566 int nrects = lcount[0]; 567 568 barrier(CLK_LOCAL_MEM_FENCE); 569 if( nrects == 0 ) 570 break; 571 if( lidx == 0 ) 572 lcount[0] = 0; 573 574 { 575 __global const Stump* stump = stumps + stages[stageIdx].first; 576 __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize; 577 int nparts = LOCAL_SIZE / nrects; 578 int ntrees = stages[stageIdx].ntrees; 579 int ntrees_p = (ntrees + nparts - 1)/nparts; 580 int nr = lidx / nparts; 581 int partidx = -1, idxval = 0; 582 float partsum = 0.f, nf = 0.f; 583 584 if( nr < nrects ) 585 { 586 partidx = lidx % nparts; 587 idxval = lbuf[nr]; 588 589 { 590 int ntrees0 = ntrees_p*partidx; 591 int ntrees1 = min(ntrees0 + ntrees_p, ntrees); 592 int ix1 = idxval & 255, iy1 = idxval >> 8; 593 #if SUM_BUF_SIZE > 0 594 __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1); 595 #else 596 __global const int* p = psum0 + mad24(iy1, sumstep, ix1); 597 #endif 598 599 for( i = ntrees0; i < ntrees1; i++ ) 600 { 601 float4 st = stump[i].st; 602 __global const OptLBPFeature* f = optfeatures + as_int(st.x); 603 int16 ofs = f->ofs; 604 605 #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \ 606 ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3]) 607 608 int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p ); 609 610 int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0 611 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1 612 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2 613 614 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5 615 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0); // 8 616 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0); // 7 617 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0); // 6 618 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0); // 7 619 620 partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w; 621 } 622 } 623 } 624 lpartsum[lidx] = partsum; 625 barrier(CLK_LOCAL_MEM_FENCE); 626 627 if( partidx == 0 ) 628 { 629 float s = lpartsum[nr*nparts]; 630 for( i = 1; i < nparts; i++ ) 631 s += lpartsum[i + nr*nparts]; 632 if( s >= stages[stageIdx].threshold ) 633 { 634 int count = atomic_inc(lcount); 635 lbuf[count] = idxval; 636 } 637 } 638 } 639 } 640 641 barrier(CLK_LOCAL_MEM_FENCE); 642 if( stageIdx == N_STAGES ) 643 { 644 int nrects = lcount[0]; 645 if( lidx < nrects ) 646 { 647 int nfaces = atomic_inc(facepos); 648 if( nfaces < MAX_FACES ) 649 { 650 volatile __global int* face = facepos + 1 + nfaces*3; 651 int val = lbuf[lidx]; 652 face[0] = scaleIdx; 653 face[1] = ix0 + (val & 255); 654 face[2] = iy0 + (val >> 8); 655 } 656 } 657 } 658 } 659 } 660 } 661 #endif 662