Home | History | Annotate | Download | only in opencl
      1 ///////////////////////////// OpenCL kernels for face detection //////////////////////////////
      2 ////////////////////////////// see the opencv/doc/license.txt ///////////////////////////////
      3 
      4 //
      5 // the code has been derived from the OpenCL Haar cascade kernel by
      6 //
      7 //    Niko Li, newlife20080214 (a] gmail.com
      8 //    Wang Weiyan, wangweiyanster (a] gmail.com
      9 //    Jia Haipeng, jiahaipeng95 (a] gmail.com
     10 //    Nathan, liujun (a] multicorewareinc.com
     11 //    Peng Xiao, pengxiao (a] outlook.com
     12 //    Erping Pang, erping (a] multicorewareinc.com
     13 //
     14 
     15 #ifdef HAAR
     16 typedef struct __attribute__((aligned(4))) OptHaarFeature
     17 {
     18     int4 ofs[3] __attribute__((aligned (4)));
     19     float4 weight __attribute__((aligned (4)));
     20 }
     21 OptHaarFeature;
     22 #endif
     23 
     24 #ifdef LBP
     25 typedef struct __attribute__((aligned(4))) OptLBPFeature
     26 {
     27     int16 ofs __attribute__((aligned (4)));
     28 }
     29 OptLBPFeature;
     30 #endif
     31 
     32 typedef struct __attribute__((aligned(4))) Stump
     33 {
     34     float4 st __attribute__((aligned (4)));
     35 }
     36 Stump;
     37 
     38 typedef struct __attribute__((aligned(4))) Node
     39 {
     40     int4 n __attribute__((aligned (4)));
     41 }
     42 Node;
     43 
     44 typedef struct __attribute__((aligned (4))) Stage
     45 {
     46     int first __attribute__((aligned (4)));
     47     int ntrees __attribute__((aligned (4)));
     48     float threshold __attribute__((aligned (4)));
     49 }
     50 Stage;
     51 
     52 typedef struct __attribute__((aligned (4))) ScaleData
     53 {
     54     float scale __attribute__((aligned (4)));
     55     int szi_width __attribute__((aligned (4)));
     56     int szi_height __attribute__((aligned (4)));
     57     int layer_ofs __attribute__((aligned (4)));
     58     int ystep __attribute__((aligned (4)));
     59 }
     60 ScaleData;
     61 
     62 #ifndef SUM_BUF_SIZE
     63 #define SUM_BUF_SIZE 0
     64 #endif
     65 
     66 #ifndef NODE_COUNT
     67 #define NODE_COUNT 1
     68 #endif
     69 
     70 #ifdef HAAR
     71 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
     72 void runHaarClassifier(
     73     int nscales, __global const ScaleData* scaleData,
     74     __global const int* sum,
     75     int _sumstep, int sumoffset,
     76     __global const OptHaarFeature* optfeatures,
     77     __global const Stage* stages,
     78     __global const Node* nodes,
     79     __global const float* leaves0,
     80 
     81     volatile __global int* facepos,
     82     int4 normrect, int sqofs, int2 windowsize)
     83 {
     84     int lx = get_local_id(0);
     85     int ly = get_local_id(1);
     86     int groupIdx = get_group_id(0);
     87     int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
     88     int scaleIdx, tileIdx, stageIdx;
     89     int sumstep = (int)(_sumstep/sizeof(int));
     90     int4 nofs0 = (int4)(mad24(normrect.y, sumstep, normrect.x),
     91                         mad24(normrect.y, sumstep, normrect.x + normrect.z),
     92                         mad24(normrect.y + normrect.w, sumstep, normrect.x),
     93                         mad24(normrect.y + normrect.w, sumstep, normrect.x + normrect.z));
     94     int normarea = normrect.z * normrect.w;
     95     float invarea = 1.f/normarea;
     96     int lidx = ly*LOCAL_SIZE_X + lx;
     97 
     98     #if SUM_BUF_SIZE > 0
     99     int4 nofs = (int4)(mad24(normrect.y, SUM_BUF_STEP, normrect.x),
    100                        mad24(normrect.y, SUM_BUF_STEP, normrect.x + normrect.z),
    101                        mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x),
    102                        mad24(normrect.y + normrect.w, SUM_BUF_STEP, normrect.x + normrect.z));
    103     #else
    104     int4 nofs = nofs0;
    105     #endif
    106     #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
    107     __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*5/2+1];
    108     #if SUM_BUF_SIZE > 0
    109     __local int* ibuf = lstore;
    110     __local int* lcount = ibuf + SUM_BUF_SIZE;
    111     #else
    112     __local int* lcount = lstore;
    113     #endif
    114     __local float* lnf = (__local float*)(lcount + 1);
    115     __local float* lpartsum = lnf + LOCAL_SIZE;
    116     __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
    117 
    118     for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
    119     {
    120         __global const ScaleData* s = scaleData + scaleIdx;
    121         int ystep = s->ystep;
    122         int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
    123         int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
    124                              (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
    125         int totalTiles = ntiles.x*ntiles.y;
    126 
    127         for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
    128         {
    129             int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
    130             int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
    131             int ix = lx, iy = ly;
    132             __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
    133             __global const int* psum1 = psum0 + mad24(iy, sumstep, ix);
    134 
    135             if( ix0 >= worksize.x || iy0 >= worksize.y )
    136                 continue;
    137             #if SUM_BUF_SIZE > 0
    138             for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
    139             {
    140                 int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
    141                 vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
    142             }
    143             #endif
    144 
    145             if( lidx == 0 )
    146                 lcount[0] = 0;
    147             barrier(CLK_LOCAL_MEM_FENCE);
    148 
    149             if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
    150             {
    151                 #if NODE_COUNT==1
    152                 __global const Stump* stump = (__global const Stump*)nodes;
    153                 #else
    154                 __global const Node* node = nodes;
    155                 __global const float* leaves = leaves0;
    156                 #endif
    157                 #if SUM_BUF_SIZE > 0
    158                 __local const int* psum = ibuf + mad24(iy, SUM_BUF_STEP, ix);
    159                 #else
    160                 __global const int* psum = psum1;
    161                 #endif
    162 
    163                 __global const int* psqsum = (__global const int*)(psum1 + sqofs);
    164                 float sval = (psum[nofs.x] - psum[nofs.y] - psum[nofs.z] + psum[nofs.w])*invarea;
    165                 float sqval = (psqsum[nofs0.x] - psqsum[nofs0.y] - psqsum[nofs0.z] + psqsum[nofs0.w])*invarea;
    166                 float nf = (float)normarea * sqrt(max(sqval - sval * sval, 0.f));
    167                 nf = nf > 0 ? nf : 1.f;
    168 
    169                 for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
    170                 {
    171                     int ntrees = stages[stageIdx].ntrees;
    172                     float s = 0.f;
    173                     #if NODE_COUNT==1
    174                     for( i = 0; i < ntrees; i++ )
    175                     {
    176                         float4 st = stump[i].st;
    177                         __global const OptHaarFeature* f = optfeatures + as_int(st.x);
    178                         float4 weight = f->weight;
    179 
    180                         int4 ofs = f->ofs[0];
    181                         sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
    182                         ofs = f->ofs[1];
    183                         sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
    184                         if( weight.z > 0 )
    185                         {
    186                             ofs = f->ofs[2];
    187                             sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
    188                         }
    189 
    190                         s += (sval < st.y*nf) ? st.z : st.w;
    191                     }
    192                     stump += ntrees;
    193                     #else
    194                     for( i = 0; i < ntrees; i++, node += NODE_COUNT, leaves += NODE_COUNT+1 )
    195                     {
    196                         int idx = 0;
    197                         do
    198                         {
    199                             int4 n = node[idx].n;
    200                             __global const OptHaarFeature* f = optfeatures + n.x;
    201                             float4 weight = f->weight;
    202 
    203                             int4 ofs = f->ofs[0];
    204 
    205                             sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
    206                             ofs = f->ofs[1];
    207                             sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
    208                             if( weight.z > 0 )
    209                             {
    210                                 ofs = f->ofs[2];
    211                                 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
    212                             }
    213 
    214                             idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
    215                         }
    216                         while(idx > 0);
    217                         s += leaves[-idx];
    218                     }
    219                     #endif
    220 
    221                     if( s < stages[stageIdx].threshold )
    222                         break;
    223                 }
    224 
    225                 if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
    226                 {
    227                     int count = atomic_inc(lcount);
    228                     lbuf[count] = (int)(ix | (iy << 8));
    229                     lnf[count] = nf;
    230                 }
    231             }
    232 
    233             for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
    234             {
    235                 barrier(CLK_LOCAL_MEM_FENCE);
    236                 int nrects = lcount[0];
    237 
    238                 if( nrects == 0 )
    239                     break;
    240                 barrier(CLK_LOCAL_MEM_FENCE);
    241                 if( lidx == 0 )
    242                     lcount[0] = 0;
    243 
    244                 {
    245                     #if NODE_COUNT == 1
    246                     __global const Stump* stump = (__global const Stump*)nodes + stages[stageIdx].first;
    247                     #else
    248                     __global const Node* node = nodes + stages[stageIdx].first*NODE_COUNT;
    249                     __global const float* leaves = leaves0 + stages[stageIdx].first*(NODE_COUNT+1);
    250                     #endif
    251                     int nparts = LOCAL_SIZE / nrects;
    252                     int ntrees = stages[stageIdx].ntrees;
    253                     int ntrees_p = (ntrees + nparts - 1)/nparts;
    254                     int nr = lidx / nparts;
    255                     int partidx = -1, idxval = 0;
    256                     float partsum = 0.f, nf = 0.f;
    257 
    258                     if( nr < nrects )
    259                     {
    260                         partidx = lidx % nparts;
    261                         idxval = lbuf[nr];
    262                         nf = lnf[nr];
    263 
    264                         {
    265                         int ntrees0 = ntrees_p*partidx;
    266                         int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
    267                         int ix1 = idxval & 255, iy1 = idxval >> 8;
    268                         #if SUM_BUF_SIZE > 0
    269                         __local const int* psum = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
    270                         #else
    271                         __global const int* psum = psum0 + mad24(iy1, sumstep, ix1);
    272                         #endif
    273 
    274                         #if NODE_COUNT == 1
    275                         for( i = ntrees0; i < ntrees1; i++ )
    276                         {
    277                             float4 st = stump[i].st;
    278                             __global const OptHaarFeature* f = optfeatures + as_int(st.x);
    279                             float4 weight = f->weight;
    280 
    281                             int4 ofs = f->ofs[0];
    282                             float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
    283                             ofs = f->ofs[1];
    284                             sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
    285                             //if( weight.z > 0 )
    286                             if( fabs(weight.z) > 0 )
    287                             {
    288                                 ofs = f->ofs[2];
    289                                 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
    290                             }
    291 
    292                             partsum += (sval < st.y*nf) ? st.z : st.w;
    293                         }
    294                         #else
    295                         for( i = ntrees0; i < ntrees1; i++ )
    296                         {
    297                             int idx = 0;
    298                             do
    299                             {
    300                                 int4 n = node[i*2 + idx].n;
    301                                 __global const OptHaarFeature* f = optfeatures + n.x;
    302                                 float4 weight = f->weight;
    303                                 int4 ofs = f->ofs[0];
    304 
    305                                 float sval = (psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w])*weight.x;
    306                                 ofs = f->ofs[1];
    307                                 sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.y, sval);
    308                                 if( weight.z > 0 )
    309                                 {
    310                                     ofs = f->ofs[2];
    311                                     sval = mad((psum[ofs.x] - psum[ofs.y] - psum[ofs.z] + psum[ofs.w]), weight.z, sval);
    312                                 }
    313 
    314                                 idx = (sval < as_float(n.y)*nf) ? n.z : n.w;
    315                             }
    316                             while(idx > 0);
    317                             partsum += leaves[i*3-idx];
    318                         }
    319                         #endif
    320                         }
    321                     }
    322                     lpartsum[lidx] = partsum;
    323                     barrier(CLK_LOCAL_MEM_FENCE);
    324 
    325                     if( partidx == 0 )
    326                     {
    327                         float s = lpartsum[nr*nparts];
    328                         for( i = 1; i < nparts; i++ )
    329                             s += lpartsum[i + nr*nparts];
    330                         if( s >= stages[stageIdx].threshold )
    331                         {
    332                             int count = atomic_inc(lcount);
    333                             lbuf[count] = idxval;
    334                             lnf[count] = nf;
    335                         }
    336                     }
    337                 }
    338             }
    339 
    340             barrier(CLK_LOCAL_MEM_FENCE);
    341             if( stageIdx == N_STAGES )
    342             {
    343                 int nrects = lcount[0];
    344                 if( lidx < nrects )
    345                 {
    346                     int nfaces = atomic_inc(facepos);
    347                     if( nfaces < MAX_FACES )
    348                     {
    349                         volatile __global int* face = facepos + 1 + nfaces*3;
    350                         int val = lbuf[lidx];
    351                         face[0] = scaleIdx;
    352                         face[1] = ix0 + (val & 255);
    353                         face[2] = iy0 + (val >> 8);
    354                     }
    355                 }
    356             }
    357         }
    358     }
    359 }
    360 #endif
    361 
    362 #ifdef LBP
    363 #undef CALC_SUM_OFS_
    364 #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
    365     ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
    366 
    367 __kernel void runLBPClassifierStumpSimple(
    368     int nscales, __global const ScaleData* scaleData,
    369     __global const int* sum,
    370     int _sumstep, int sumoffset,
    371     __global const OptLBPFeature* optfeatures,
    372     __global const Stage* stages,
    373     __global const Stump* stumps,
    374     __global const int* bitsets,
    375     int bitsetSize,
    376 
    377     volatile __global int* facepos,
    378     int2 windowsize)
    379 {
    380     int lx = get_local_id(0);
    381     int ly = get_local_id(1);
    382     int local_size_x = get_local_size(0);
    383     int local_size_y = get_local_size(1);
    384     int groupIdx = get_group_id(1)*get_num_groups(0) + get_group_id(0);
    385     int ngroups = get_num_groups(0)*get_num_groups(1);
    386     int scaleIdx, tileIdx, stageIdx;
    387     int sumstep = (int)(_sumstep/sizeof(int));
    388 
    389     for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
    390     {
    391         __global const ScaleData* s = scaleData + scaleIdx;
    392         int ystep = s->ystep;
    393         int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
    394         int2 ntiles = (int2)((worksize.x/ystep + local_size_x-1)/local_size_x,
    395                              (worksize.y/ystep + local_size_y-1)/local_size_y);
    396         int totalTiles = ntiles.x*ntiles.y;
    397 
    398         for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
    399         {
    400             int iy = mad24((tileIdx / ntiles.x), local_size_y, ly) * ystep;
    401             int ix = mad24((tileIdx % ntiles.x), local_size_x, lx) * ystep;
    402 
    403             if( ix < worksize.x && iy < worksize.y )
    404             {
    405                 __global const int* p = sum + mad24(iy, sumstep, ix) + s->layer_ofs;
    406                 __global const Stump* stump = stumps;
    407                 __global const int* bitset = bitsets;
    408 
    409                 for( stageIdx = 0; stageIdx < N_STAGES; stageIdx++ )
    410                 {
    411                     int i, ntrees = stages[stageIdx].ntrees;
    412                     float s = 0.f;
    413                     for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
    414                     {
    415                         float4 st = stump->st;
    416                         __global const OptLBPFeature* f = optfeatures + as_int(st.x);
    417                         int16 ofs = f->ofs;
    418 
    419                         int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
    420 
    421                         int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
    422                         idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
    423                         idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
    424 
    425                         mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
    426                         mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
    427                         mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
    428                         mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
    429                         mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
    430 
    431                         s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
    432                     }
    433 
    434                     if( s < stages[stageIdx].threshold )
    435                         break;
    436                 }
    437 
    438                 if( stageIdx == N_STAGES )
    439                 {
    440                     int nfaces = atomic_inc(facepos);
    441                     if( nfaces < MAX_FACES )
    442                     {
    443                         volatile __global int* face = facepos + 1 + nfaces*3;
    444                         face[0] = scaleIdx;
    445                         face[1] = ix;
    446                         face[2] = iy;
    447                     }
    448                 }
    449             }
    450         }
    451     }
    452 }
    453 
    454 __kernel __attribute__((reqd_work_group_size(LOCAL_SIZE_X,LOCAL_SIZE_Y,1)))
    455 void runLBPClassifierStump(
    456     int nscales, __global const ScaleData* scaleData,
    457     __global const int* sum,
    458     int _sumstep, int sumoffset,
    459     __global const OptLBPFeature* optfeatures,
    460     __global const Stage* stages,
    461     __global const Stump* stumps,
    462     __global const int* bitsets,
    463     int bitsetSize,
    464 
    465     volatile __global int* facepos,
    466     int2 windowsize)
    467 {
    468     int lx = get_local_id(0);
    469     int ly = get_local_id(1);
    470     int groupIdx = get_group_id(0);
    471     int i, ngroups = get_global_size(0)/LOCAL_SIZE_X;
    472     int scaleIdx, tileIdx, stageIdx;
    473     int sumstep = (int)(_sumstep/sizeof(int));
    474     int lidx = ly*LOCAL_SIZE_X + lx;
    475 
    476     #define LOCAL_SIZE (LOCAL_SIZE_X*LOCAL_SIZE_Y)
    477     __local int lstore[SUM_BUF_SIZE + LOCAL_SIZE*3/2+1];
    478     #if SUM_BUF_SIZE > 0
    479     __local int* ibuf = lstore;
    480     __local int* lcount = ibuf + SUM_BUF_SIZE;
    481     #else
    482     __local int* lcount = lstore;
    483     #endif
    484     __local float* lpartsum = (__local float*)(lcount + 1);
    485     __local short* lbuf = (__local short*)(lpartsum + LOCAL_SIZE);
    486 
    487     for( scaleIdx = nscales-1; scaleIdx >= 0; scaleIdx-- )
    488     {
    489         __global const ScaleData* s = scaleData + scaleIdx;
    490         int ystep = s->ystep;
    491         int2 worksize = (int2)(max(s->szi_width - windowsize.x, 0), max(s->szi_height - windowsize.y, 0));
    492         int2 ntiles = (int2)((worksize.x + LOCAL_SIZE_X-1)/LOCAL_SIZE_X,
    493                              (worksize.y + LOCAL_SIZE_Y-1)/LOCAL_SIZE_Y);
    494         int totalTiles = ntiles.x*ntiles.y;
    495 
    496         for( tileIdx = groupIdx; tileIdx < totalTiles; tileIdx += ngroups )
    497         {
    498             int ix0 = (tileIdx % ntiles.x)*LOCAL_SIZE_X;
    499             int iy0 = (tileIdx / ntiles.x)*LOCAL_SIZE_Y;
    500             int ix = lx, iy = ly;
    501             __global const int* psum0 = sum + mad24(iy0, sumstep, ix0) + s->layer_ofs;
    502 
    503             if( ix0 >= worksize.x || iy0 >= worksize.y )
    504                 continue;
    505             #if SUM_BUF_SIZE > 0
    506             for( i = lidx*4; i < SUM_BUF_SIZE; i += LOCAL_SIZE_X*LOCAL_SIZE_Y*4 )
    507             {
    508                 int dy = i/SUM_BUF_STEP, dx = i - dy*SUM_BUF_STEP;
    509                 vstore4(vload4(0, psum0 + mad24(dy, sumstep, dx)), 0, ibuf+i);
    510             }
    511             barrier(CLK_LOCAL_MEM_FENCE);
    512             #endif
    513 
    514             if( lidx == 0 )
    515                 lcount[0] = 0;
    516             barrier(CLK_LOCAL_MEM_FENCE);
    517 
    518             if( ix0 + ix < worksize.x && iy0 + iy < worksize.y )
    519             {
    520                 __global const Stump* stump = stumps;
    521                 __global const int* bitset = bitsets;
    522                 #if SUM_BUF_SIZE > 0
    523                 __local const int* p = ibuf + mad24(iy, SUM_BUF_STEP, ix);
    524                 #else
    525                 __global const int* p = psum0 + mad24(iy, sumstep, ix);
    526                 #endif
    527 
    528                 for( stageIdx = 0; stageIdx < SPLIT_STAGE; stageIdx++ )
    529                 {
    530                     int ntrees = stages[stageIdx].ntrees;
    531                     float s = 0.f;
    532                     for( i = 0; i < ntrees; i++, stump++, bitset += bitsetSize )
    533                     {
    534                         float4 st = stump->st;
    535                         __global const OptLBPFeature* f = optfeatures + as_int(st.x);
    536                         int16 ofs = f->ofs;
    537 
    538                         int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
    539 
    540                         int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
    541                         idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
    542                         idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
    543 
    544                         mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
    545                         mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
    546                         mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
    547                         mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
    548                         mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
    549 
    550                         s += (bitset[idx] & (1 << mask)) ? st.z : st.w;
    551                     }
    552 
    553                     if( s < stages[stageIdx].threshold )
    554                         break;
    555                 }
    556 
    557                 if( stageIdx == SPLIT_STAGE && (ystep == 1 || ((ix | iy) & 1) == 0) )
    558                 {
    559                     int count = atomic_inc(lcount);
    560                     lbuf[count] = (int)(ix | (iy << 8));
    561                 }
    562             }
    563 
    564             for( stageIdx = SPLIT_STAGE; stageIdx < N_STAGES; stageIdx++ )
    565             {
    566                 int nrects = lcount[0];
    567 
    568                 barrier(CLK_LOCAL_MEM_FENCE);
    569                 if( nrects == 0 )
    570                     break;
    571                 if( lidx == 0 )
    572                     lcount[0] = 0;
    573 
    574                 {
    575                     __global const Stump* stump = stumps + stages[stageIdx].first;
    576                     __global const int* bitset = bitsets + stages[stageIdx].first*bitsetSize;
    577                     int nparts = LOCAL_SIZE / nrects;
    578                     int ntrees = stages[stageIdx].ntrees;
    579                     int ntrees_p = (ntrees + nparts - 1)/nparts;
    580                     int nr = lidx / nparts;
    581                     int partidx = -1, idxval = 0;
    582                     float partsum = 0.f, nf = 0.f;
    583 
    584                     if( nr < nrects )
    585                     {
    586                         partidx = lidx % nparts;
    587                         idxval = lbuf[nr];
    588 
    589                         {
    590                             int ntrees0 = ntrees_p*partidx;
    591                             int ntrees1 = min(ntrees0 + ntrees_p, ntrees);
    592                             int ix1 = idxval & 255, iy1 = idxval >> 8;
    593                             #if SUM_BUF_SIZE > 0
    594                             __local const int* p = ibuf + mad24(iy1, SUM_BUF_STEP, ix1);
    595                             #else
    596                             __global const int* p = psum0 + mad24(iy1, sumstep, ix1);
    597                             #endif
    598 
    599                             for( i = ntrees0; i < ntrees1; i++ )
    600                             {
    601                                 float4 st = stump[i].st;
    602                                 __global const OptLBPFeature* f = optfeatures + as_int(st.x);
    603                                 int16 ofs = f->ofs;
    604 
    605                                 #define CALC_SUM_OFS_(p0, p1, p2, p3, ptr) \
    606                                     ((ptr)[p0] - (ptr)[p1] - (ptr)[p2] + (ptr)[p3])
    607 
    608                                 int cval = CALC_SUM_OFS_( ofs.s5, ofs.s6, ofs.s9, ofs.sa, p );
    609 
    610                                 int mask, idx = (CALC_SUM_OFS_( ofs.s0, ofs.s1, ofs.s4, ofs.s5, p ) >= cval ? 4 : 0); // 0
    611                                 idx |= (CALC_SUM_OFS_( ofs.s1, ofs.s2, ofs.s5, ofs.s6, p ) >= cval ? 2 : 0); // 1
    612                                 idx |= (CALC_SUM_OFS_( ofs.s2, ofs.s3, ofs.s6, ofs.s7, p ) >= cval ? 1 : 0); // 2
    613 
    614                                 mask = (CALC_SUM_OFS_( ofs.s6, ofs.s7, ofs.sa, ofs.sb, p ) >= cval ? 16 : 0); // 5
    615                                 mask |= (CALC_SUM_OFS_( ofs.sa, ofs.sb, ofs.se, ofs.sf, p ) >= cval ? 8 : 0);  // 8
    616                                 mask |= (CALC_SUM_OFS_( ofs.s9, ofs.sa, ofs.sd, ofs.se, p ) >= cval ? 4 : 0);  // 7
    617                                 mask |= (CALC_SUM_OFS_( ofs.s8, ofs.s9, ofs.sc, ofs.sd, p ) >= cval ? 2 : 0);  // 6
    618                                 mask |= (CALC_SUM_OFS_( ofs.s4, ofs.s5, ofs.s8, ofs.s9, p ) >= cval ? 1 : 0);  // 7
    619 
    620                                 partsum += (bitset[i*bitsetSize + idx] & (1 << mask)) ? st.z : st.w;
    621                             }
    622                         }
    623                     }
    624                     lpartsum[lidx] = partsum;
    625                     barrier(CLK_LOCAL_MEM_FENCE);
    626 
    627                     if( partidx == 0 )
    628                     {
    629                         float s = lpartsum[nr*nparts];
    630                         for( i = 1; i < nparts; i++ )
    631                             s += lpartsum[i + nr*nparts];
    632                         if( s >= stages[stageIdx].threshold )
    633                         {
    634                             int count = atomic_inc(lcount);
    635                             lbuf[count] = idxval;
    636                         }
    637                     }
    638                 }
    639             }
    640 
    641             barrier(CLK_LOCAL_MEM_FENCE);
    642             if( stageIdx == N_STAGES )
    643             {
    644                 int nrects = lcount[0];
    645                 if( lidx < nrects )
    646                 {
    647                     int nfaces = atomic_inc(facepos);
    648                     if( nfaces < MAX_FACES )
    649                     {
    650                         volatile __global int* face = facepos + 1 + nfaces*3;
    651                         int val = lbuf[lidx];
    652                         face[0] = scaleIdx;
    653                         face[1] = ix0 + (val & 255);
    654                         face[2] = iy0 + (val >> 8);
    655                     }
    656                 }
    657             }
    658         }
    659     }
    660 }
    661 #endif
    662