Home | History | Annotate | Download | only in opencl
      1 /* See LICENSE file in the root OpenCV directory */
      2 
      3 #if TILE_SIZE != 32
      4 #error "TILE SIZE should be 32"
      5 #endif
      6 
      7 
      8 __kernel void moments(__global const uchar* src, int src_step, int src_offset,
      9     int src_rows, int src_cols, __global int* mom0, int xtiles)
     10 {
     11     int x0 = get_global_id(0);
     12     int y0 = get_group_id(1);
     13     int x, y = get_local_id(1);
     14     int x_min = x0*TILE_SIZE;
     15     int ypix = y0*TILE_SIZE + y;
     16     __local int mom[TILE_SIZE][10];
     17 
     18     if (x_min < src_cols && y0*TILE_SIZE < src_rows)
     19     {
     20         if (ypix < src_rows)
     21         {
     22             int x_max = min(src_cols - x_min, TILE_SIZE);
     23             __global const uchar* ptr = src + src_offset + ypix*src_step + x_min;
     24             int4 S = (int4)(0, 0, 0, 0), p;
     25 
     26 #define SUM_ELEM(elem, ofs) \
     27     (int4)(1, (ofs), (ofs)*(ofs), (ofs)*(ofs)*(ofs))*elem
     28 
     29             x = x_max & -4;
     30             if (x_max >= 4)
     31             {
     32                 p = convert_int4(vload4(0, ptr));
     33 #ifdef OP_MOMENTS_BINARY
     34                 p = min(p, 1);
     35 #endif
     36                 S += (int4)(p.s0, 0, 0, 0) + (int4)(p.s1, p.s1, p.s1, p.s1) +
     37                     (int4)(p.s2, p.s2 * 2, p.s2 * 4, p.s2 * 8) + (int4)(p.s3, p.s3 * 3, p.s3 * 9, p.s3 * 27);
     38                 //SUM_ELEM(p.s0, 0) + SUM_ELEM(p.s1, 1) + SUM_ELEM(p.s2, 2) + SUM_ELEM(p.s3, 3);
     39 
     40                 if (x_max >= 8)
     41                 {
     42                     p = convert_int4(vload4(0, ptr + 4));
     43 #ifdef OP_MOMENTS_BINARY
     44                     p = min(p, 1);
     45 #endif
     46                     S += (int4)(p.s0, p.s0 * 4, p.s0 * 16, p.s0 * 64) + (int4)(p.s1, p.s1 * 5, p.s1 * 25, p.s1 * 125) +
     47                         (int4)(p.s2, p.s2 * 6, p.s2 * 36, p.s2 * 216) + (int4)(p.s3, p.s3 * 7, p.s3 * 49, p.s3 * 343);
     48                     //SUM_ELEM(p.s0, 4) + SUM_ELEM(p.s1, 5) + SUM_ELEM(p.s2, 6) + SUM_ELEM(p.s3, 7);
     49 
     50                     if (x_max >= 12)
     51                     {
     52                         p = convert_int4(vload4(0, ptr + 8));
     53 #ifdef OP_MOMENTS_BINARY
     54                         p = min(p, 1);
     55 #endif
     56                         S += (int4)(p.s0, p.s0 * 8, p.s0 * 64, p.s0 * 512) + (int4)(p.s1, p.s1 * 9, p.s1 * 81, p.s1 * 729) +
     57                             (int4)(p.s2, p.s2 * 10, p.s2 * 100, p.s2 * 1000) + (int4)(p.s3, p.s3 * 11, p.s3 * 121, p.s3 * 1331);
     58                         //SUM_ELEM(p.s0, 8) + SUM_ELEM(p.s1, 9) + SUM_ELEM(p.s2, 10) + SUM_ELEM(p.s3, 11);
     59 
     60                         if (x_max >= 16)
     61                         {
     62                             p = convert_int4(vload4(0, ptr + 12));
     63 #ifdef OP_MOMENTS_BINARY
     64                             p = min(p, 1);
     65 #endif
     66                             S += (int4)(p.s0, p.s0 * 12, p.s0 * 144, p.s0 * 1728) + (int4)(p.s1, p.s1 * 13, p.s1 * 169, p.s1 * 2197) +
     67                                 (int4)(p.s2, p.s2 * 14, p.s2 * 196, p.s2 * 2744) + (int4)(p.s3, p.s3 * 15, p.s3 * 225, p.s3 * 3375);
     68                             //SUM_ELEM(p.s0, 12) + SUM_ELEM(p.s1, 13) + SUM_ELEM(p.s2, 14) + SUM_ELEM(p.s3, 15);
     69                         }
     70                     }
     71                 }
     72             }
     73 
     74             if (x_max >= 20)
     75             {
     76                 p = convert_int4(vload4(0, ptr + 16));
     77 #ifdef OP_MOMENTS_BINARY
     78                 p = min(p, 1);
     79 #endif
     80                 S += (int4)(p.s0, p.s0 * 16, p.s0 * 256, p.s0 * 4096) + (int4)(p.s1, p.s1 * 17, p.s1 * 289, p.s1 * 4913) +
     81                     (int4)(p.s2, p.s2 * 18, p.s2 * 324, p.s2 * 5832) + (int4)(p.s3, p.s3 * 19, p.s3 * 361, p.s3 * 6859);
     82                 //SUM_ELEM(p.s0, 16) + SUM_ELEM(p.s1, 17) + SUM_ELEM(p.s2, 18) + SUM_ELEM(p.s3, 19);
     83 
     84                 if (x_max >= 24)
     85                 {
     86                     p = convert_int4(vload4(0, ptr + 20));
     87 #ifdef OP_MOMENTS_BINARY
     88                     p = min(p, 1);
     89 #endif
     90                     S += (int4)(p.s0, p.s0 * 20, p.s0 * 400, p.s0 * 8000) + (int4)(p.s1, p.s1 * 21, p.s1 * 441, p.s1 * 9261) +
     91                         (int4)(p.s2, p.s2 * 22, p.s2 * 484, p.s2 * 10648) + (int4)(p.s3, p.s3 * 23, p.s3 * 529, p.s3 * 12167);
     92                     //SUM_ELEM(p.s0, 20) + SUM_ELEM(p.s1, 21) + SUM_ELEM(p.s2, 22) + SUM_ELEM(p.s3, 23);
     93 
     94                     if (x_max >= 28)
     95                     {
     96                         p = convert_int4(vload4(0, ptr + 24));
     97 #ifdef OP_MOMENTS_BINARY
     98                         p = min(p, 1);
     99 #endif
    100                         S += (int4)(p.s0, p.s0 * 24, p.s0 * 576, p.s0 * 13824) + (int4)(p.s1, p.s1 * 25, p.s1 * 625, p.s1 * 15625) +
    101                             (int4)(p.s2, p.s2 * 26, p.s2 * 676, p.s2 * 17576) + (int4)(p.s3, p.s3 * 27, p.s3 * 729, p.s3 * 19683);
    102                         //SUM_ELEM(p.s0, 24) + SUM_ELEM(p.s1, 25) + SUM_ELEM(p.s2, 26) + SUM_ELEM(p.s3, 27);
    103 
    104                         if (x_max >= 32)
    105                         {
    106                             p = convert_int4(vload4(0, ptr + 28));
    107 #ifdef OP_MOMENTS_BINARY
    108                             p = min(p, 1);
    109 #endif
    110                             S += (int4)(p.s0, p.s0 * 28, p.s0 * 784, p.s0 * 21952) + (int4)(p.s1, p.s1 * 29, p.s1 * 841, p.s1 * 24389) +
    111                                 (int4)(p.s2, p.s2 * 30, p.s2 * 900, p.s2 * 27000) + (int4)(p.s3, p.s3 * 31, p.s3 * 961, p.s3 * 29791);
    112                             //SUM_ELEM(p.s0, 28) + SUM_ELEM(p.s1, 29) + SUM_ELEM(p.s2, 30) + SUM_ELEM(p.s3, 31);
    113                         }
    114                     }
    115                 }
    116             }
    117 
    118             if (x < x_max)
    119             {
    120                 int ps = ptr[x];
    121 #ifdef OP_MOMENTS_BINARY
    122                 ps = min(ps, 1);
    123 #endif
    124                 S += SUM_ELEM(ps, x);
    125                 if (x + 1 < x_max)
    126                 {
    127                     ps = ptr[x + 1];
    128 #ifdef OP_MOMENTS_BINARY
    129                     ps = min(ps, 1);
    130 #endif
    131                     S += SUM_ELEM(ps, x + 1);
    132                     if (x + 2 < x_max)
    133                     {
    134                         ps = ptr[x + 2];
    135 #ifdef OP_MOMENTS_BINARY
    136                         ps = min(ps, 1);
    137 #endif
    138                         S += SUM_ELEM(ps, x + 2);
    139                     }
    140                 }
    141             }
    142 
    143             int sy = y*y;
    144 
    145             mom[y][0] = S.s0;
    146             mom[y][1] = S.s1;
    147             mom[y][2] = y*S.s0;
    148             mom[y][3] = S.s2;
    149             mom[y][4] = y*S.s1;
    150             mom[y][5] = sy*S.s0;
    151             mom[y][6] = S.s3;
    152             mom[y][7] = y*S.s2;
    153             mom[y][8] = sy*S.s1;
    154             mom[y][9] = y*sy*S.s0;
    155         }
    156         else
    157             mom[y][0] = mom[y][1] = mom[y][2] = mom[y][3] = mom[y][4] =
    158             mom[y][5] = mom[y][6] = mom[y][7] = mom[y][8] = mom[y][9] = 0;
    159         barrier(CLK_LOCAL_MEM_FENCE);
    160 
    161 #define REDUCE(d) \
    162         if (y < d) \
    163         { \
    164         mom[y][0] += mom[y + d][0]; \
    165         mom[y][1] += mom[y + d][1]; \
    166         mom[y][2] += mom[y + d][2]; \
    167         mom[y][3] += mom[y + d][3]; \
    168         mom[y][4] += mom[y + d][4]; \
    169         mom[y][5] += mom[y + d][5]; \
    170         mom[y][6] += mom[y + d][6]; \
    171         mom[y][7] += mom[y + d][7]; \
    172         mom[y][8] += mom[y + d][8]; \
    173         mom[y][9] += mom[y + d][9]; \
    174         } \
    175         barrier(CLK_LOCAL_MEM_FENCE)
    176 
    177         REDUCE(16);
    178         REDUCE(8);
    179         REDUCE(4);
    180         REDUCE(2);
    181 
    182         if (y < 10)
    183         {
    184             __global int* momout = mom0 + (y0*xtiles + x0) * 10;
    185             momout[y] = mom[0][y] + mom[1][y];
    186         }
    187     }
    188 }
    189