Home | History | Annotate | Download | only in opencl
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 // This file is part of OpenCV project.
      3 // It is subject to the license terms in the LICENSE file found in the top-level directory
      4 // of this distribution and at http://opencv.org/license.html.
      5 // Copyright (C) 2014, Itseez, Inc., all rights reserved.
      6 // Third party copyrights are property of their respective owners.
      7 //M*/
      8 
      9 #ifdef DOUBLE_SUPPORT
     10 #ifdef cl_amd_fp64
     11 #pragma OPENCL EXTENSION cl_amd_fp64:enable
     12 #elif defined (cl_khr_fp64)
     13 #pragma OPENCL EXTENSION cl_khr_fp64:enable
     14 #endif
     15 #endif
     16 
     17 #ifndef LOCAL_SUM_SIZE
     18 #define LOCAL_SUM_SIZE      16
     19 #endif
     20 
     21 #define LOCAL_SUM_STRIDE    (LOCAL_SUM_SIZE + 1)
     22 
     23 
     24 kernel void integral_sum_cols(__global const uchar *src_ptr, int src_step, int src_offset, int rows, int cols,
     25                               __global uchar *buf_ptr, int buf_step, int buf_offset
     26 #ifdef SUM_SQUARE
     27                               ,__global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset
     28 #endif
     29                               )
     30 {
     31     __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
     32 #ifdef SUM_SQUARE
     33     __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
     34 #endif
     35     int lid = get_local_id(0);
     36     int gid = get_group_id(0);
     37 
     38     int x = get_global_id(0);
     39     int src_index = x + src_offset;
     40 
     41     sumT accum = 0;
     42 #ifdef SUM_SQUARE
     43     sumSQT accum_sq = 0;
     44 #endif
     45     for (int y = 0; y < rows; y += LOCAL_SUM_SIZE)
     46     {
     47         int lsum_index = lid;
     48         #pragma unroll
     49         for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, src_index+=src_step, lsum_index += LOCAL_SUM_STRIDE)
     50         {
     51             if ((x < cols) && (y + yin < rows))
     52             {
     53                 __global const uchar *src = src_ptr + src_index;
     54                 accum += src[0];
     55 #ifdef SUM_SQUARE
     56                 sumSQT temp = src[0] * src[0];
     57                 accum_sq += temp;
     58 #endif
     59             }
     60             lm_sum[lsum_index] = accum;
     61 #ifdef SUM_SQUARE
     62             lm_sum_sq[lsum_index] = accum_sq;
     63 #endif
     64         }
     65         barrier(CLK_LOCAL_MEM_FENCE);
     66 
     67         //int buf_index = buf_offset + buf_step * LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
     68         int buf_index = mad24(buf_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, buf_offset));
     69 #ifdef SUM_SQUARE
     70         int buf_sq_index = mad24(buf_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, buf_sq_offset));
     71 #endif
     72 
     73         lsum_index = LOCAL_SUM_STRIDE * lid;
     74         #pragma unroll
     75         for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index ++)
     76         {
     77             __global sumT *buf = (__global sumT *)(buf_ptr + buf_index);
     78             buf[0] = lm_sum[lsum_index];
     79             buf_index += buf_step;
     80 #ifdef SUM_SQUARE
     81             __global sumSQT *bufsq = (__global sumSQT *)(buf_sq_ptr + buf_sq_index);
     82             bufsq[0] = lm_sum_sq[lsum_index];
     83             buf_sq_index += buf_sq_step;
     84 #endif
     85         }
     86         barrier(CLK_LOCAL_MEM_FENCE);
     87     }
     88 }
     89 
     90 kernel void integral_sum_rows(__global const uchar *buf_ptr, int buf_step, int buf_offset,
     91 #ifdef SUM_SQUARE
     92                               __global uchar *buf_sq_ptr, int buf_sq_step, int buf_sq_offset,
     93 #endif
     94                               __global uchar *dst_ptr, int dst_step, int dst_offset, int rows, int cols
     95 #ifdef SUM_SQUARE
     96                               ,__global uchar *dst_sq_ptr, int dst_sq_step, int dst_sq_offset
     97 #endif
     98                               )
     99 {
    100     __local sumT lm_sum[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
    101 #ifdef SUM_SQUARE
    102     __local sumSQT lm_sum_sq[LOCAL_SUM_STRIDE * LOCAL_SUM_SIZE];
    103 #endif
    104     int lid = get_local_id(0);
    105     int gid = get_group_id(0);
    106 
    107     int gs = get_global_size(0);
    108 
    109     int x = get_global_id(0);
    110 
    111     __global sumT *dst = (__global sumT *)(dst_ptr + dst_offset);
    112     for (int xin = x; xin < cols; xin += gs)
    113     {
    114         dst[xin] = 0;
    115     }
    116     dst_offset += dst_step;
    117 
    118     if (x < rows - 1)
    119     {
    120         dst = (__global sumT *)(dst_ptr + mad24(x, dst_step, dst_offset));
    121         dst[0] = 0;
    122     }
    123 
    124     int buf_index = mad24((int)sizeof(sumT), x, buf_offset);
    125     sumT accum = 0;
    126 
    127 #ifdef SUM_SQUARE
    128     __global sumSQT *dst_sq = (__global sumT *)(dst_sq_ptr + dst_sq_offset);
    129     for (int xin = x; xin < cols; xin += gs)
    130     {
    131         dst_sq[xin] = 0;
    132     }
    133     dst_sq_offset += dst_sq_step;
    134 
    135     if (x < rows - 1)
    136     {
    137         dst_sq = (__global sumSQT *)(dst_sq_ptr + mad24(x, dst_sq_step, dst_sq_offset));
    138         dst_sq[0] = 0;
    139     }
    140 
    141     int buf_sq_index = mad24((int)sizeof(sumSQT), x, buf_sq_offset);
    142     sumSQT accum_sq = 0;
    143 #endif
    144 
    145     for (int y = 1; y < cols; y += LOCAL_SUM_SIZE)
    146     {
    147         int lsum_index = lid;
    148         #pragma unroll
    149         for (int yin = 0; yin < LOCAL_SUM_SIZE; yin++, lsum_index += LOCAL_SUM_STRIDE)
    150         {
    151             __global const sumT *buf = (__global const sumT *)(buf_ptr + buf_index);
    152             accum += buf[0];
    153             lm_sum[lsum_index] = accum;
    154             buf_index += buf_step;
    155 #ifdef SUM_SQUARE
    156             __global const sumSQT *buf_sq = (__global const sumSQT *)(buf_sq_ptr + buf_sq_index);
    157             accum_sq += buf_sq[0];
    158             lm_sum_sq[lsum_index] = accum_sq;
    159             buf_sq_index += buf_sq_step;
    160 #endif
    161         }
    162         barrier(CLK_LOCAL_MEM_FENCE);
    163 
    164         if (y + lid < cols)
    165         {
    166             //int dst_index = dst_offset + dst_step *  LOCAL_SUM_COLS * gid + sizeof(sumT) * y + sizeof(sumT) * lid;
    167             int dst_index = mad24(dst_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumT), y + lid, dst_offset));
    168 #ifdef SUM_SQUARE
    169             int dst_sq_index = mad24(dst_sq_step, LOCAL_SUM_SIZE * gid, mad24((int)sizeof(sumSQT), y + lid, dst_sq_offset));
    170 #endif
    171             lsum_index = LOCAL_SUM_STRIDE * lid;
    172             int yin_max = min(rows - 1 -  LOCAL_SUM_SIZE * gid, LOCAL_SUM_SIZE);
    173             #pragma unroll
    174             for (int yin = 0; yin < yin_max; yin++, lsum_index++)
    175             {
    176                 dst = (__global sumT *)(dst_ptr + dst_index);
    177                 dst[0] = lm_sum[lsum_index];
    178                 dst_index += dst_step;
    179 #ifdef SUM_SQUARE
    180                 dst_sq = (__global sumSQT *)(dst_sq_ptr + dst_sq_index);
    181                 dst_sq[0] = lm_sum_sq[lsum_index];
    182                 dst_sq_index += dst_sq_step;
    183 #endif
    184             }
    185         }
    186         barrier(CLK_LOCAL_MEM_FENCE);
    187     }
    188 }
    189