Home | History | Annotate | Download | only in opencl
      1 //                           License Agreement
      2 //                For Open Source Computer Vision Library
      3 //
      4 // Copyright (C) 2010-2012, Institute Of Software Chinese Academy Of Science, all rights reserved.
      5 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
      6 // Third party copyrights are property of their respective owners.
      7 //
      8 // Redistribution and use in source and binary forms, with or without modification,
      9 // are permitted provided that the following conditions are met:
     10 //
     11 //   * Redistribution's of source code must retain the above copyright notice,
     12 //     this list of conditions and the following disclaimer.
     13 //
     14 //   * Redistribution's in binary form must reproduce the above copyright notice,
     15 //     this list of conditions and the following disclaimer in the documentation
     16 //     and/or other materials provided with the distribution.
     17 //
     18 //   * The name of the copyright holders may not be used to endorse or promote products
     19 //     derived from this software without specific prior written permission.
     20 //
     21 // This software is provided by the copyright holders and contributors as is and
     22 // any express or implied warranties, including, but not limited to, the implied
     23 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     24 // In no event shall the Intel Corporation or contributors be liable for any direct,
     25 // indirect, incidental, special, exemplary, or consequential damages
     26 // (including, but not limited to, procurement of substitute goods or services;
     27 // loss of use, data, or profits; or business interruption) however caused
     28 // and on any theory of liability, whether in contract, strict liability,
     29 // or tort (including negligence or otherwise) arising in any way out of
     30 // the use of this software, even if advised of the possibility of such damage.
     31 
     32 #if cn != 3
     33 #define loadpix(addr) *(__global const T *)(addr)
     34 #define TSIZE (int)sizeof(T)
     35 #else
     36 #define loadpix(addr) vload3(0, (__global const T1 *)(addr))
     37 #define TSIZE ((int)sizeof(T1)*3)
     38 #endif
     39 
     40 #define SQSUMS_PTR(ox, oy) mad24(y + oy, src_sqsums_step, mad24(x + ox, cn, src_sqsums_offset))
     41 #define SUMS_PTR(ox, oy) mad24(y + oy, src_sums_step, mad24(x + ox, cn, src_sums_offset))
     42 #define SUMS(ox, oy)    mad24(y+oy, src_sums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sums_offset))
     43 #define SQ_SUMS(ox, oy) mad24(y+oy, src_sqsums_step, mad24(x+ox, (int)sizeof(T1)*cn, src_sqsums_offset))
     44 
     45 inline float normAcc(float num, float denum)
     46 {
     47     if (fabs(num) < denum)
     48         return num / denum;
     49     if (fabs(num) < denum * 1.125f)
     50         return num > 0 ? 1 : -1;
     51     return 0;
     52 }
     53 
     54 inline float normAcc_SQDIFF(float num, float denum)
     55 {
     56     if (fabs(num) < denum)
     57         return num / denum;
     58     if (fabs(num) < denum * 1.125f)
     59         return num > 0 ? 1 : -1;
     60     return 1;
     61 }
     62 
     63 #define noconvert
     64 
     65 #if cn == 1
     66 #define convertToDT(value) (float)(value)
     67 #elif cn == 2
     68 #define convertToDT(value) (float)(value.x + value.y)
     69 #elif cn == 3
     70 #define convertToDT(value) (float)(value.x + value.y + value.z)
     71 #elif cn == 4
     72 #define convertToDT(value) (float)(value.x + value.y + value.z + value.w)
     73 #else
     74 #error "cn should be 1-4"
     75 #endif
     76 
     77 #ifdef CALC_SUM
     78 
     79 __kernel void calcSum(__global const uchar * srcptr, int src_step, int src_offset,
     80                       int cols, int total, __global float * dst)
     81 {
     82     int lid = get_local_id(0), id = get_global_id(0);
     83 
     84     __local WT localmem[WGS2_ALIGNED];
     85     WT accumulator = (WT)(0), tmp;
     86 
     87     for ( ; id < total; id += WGS)
     88     {
     89         int src_index = mad24(id / cols, src_step, mad24(id % cols, TSIZE, src_offset));
     90         T src = loadpix(srcptr + src_index);
     91 
     92         tmp = convertToWT(src);
     93 
     94         accumulator = mad(tmp, tmp, accumulator);
     95     }
     96 
     97     if (lid < WGS2_ALIGNED)
     98         localmem[lid] = accumulator;
     99     barrier(CLK_LOCAL_MEM_FENCE);
    100 
    101     if (lid >= WGS2_ALIGNED && total >= WGS2_ALIGNED)
    102         localmem[lid - WGS2_ALIGNED] += accumulator;
    103     barrier(CLK_LOCAL_MEM_FENCE);
    104 
    105     for (int lsize = WGS2_ALIGNED >> 1; lsize > 0; lsize >>= 1)
    106     {
    107         if (lid < lsize)
    108         {
    109             int lid2 = lsize + lid;
    110             localmem[lid] += localmem[lid2];
    111         }
    112         barrier(CLK_LOCAL_MEM_FENCE);
    113     }
    114 
    115     if (lid == 0)
    116         dst[0] = convertToDT(localmem[0]);
    117 }
    118 
    119 #elif defined FIRST_CHANNEL
    120 
    121 __kernel void extractFirstChannel( const __global uchar* img, int img_step, int img_offset,
    122                                    __global uchar* res, int res_step, int res_offset, int rows, int cols)
    123 {
    124     int x = get_global_id(0);
    125     int y = get_global_id(1)*PIX_PER_WI_Y;
    126 
    127     if(x < cols )
    128     {
    129         #pragma unroll
    130         for (int cy=0; cy < PIX_PER_WI_Y && y < rows; ++cy, ++y)
    131         {
    132             T1 image = *(__global const T1*)(img + mad24(y, img_step, mad24(x, (int)sizeof(T1)*cn, img_offset)));;
    133             int res_idx = mad24(y, res_step, mad24(x, (int)sizeof(float), res_offset));
    134             *(__global float *)(res + res_idx) = image;
    135         }
    136     }
    137 }
    138 
    139 #elif defined CCORR
    140 
    141 #if cn==1 && PIX_PER_WI_X==4
    142 
    143 __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset,
    144                                         __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,
    145                                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
    146 {
    147     int x0 = get_global_id(0)*PIX_PER_WI_X;
    148     int y = get_global_id(1);
    149 
    150     if (y < dst_rows)
    151     {
    152         if (x0 + PIX_PER_WI_X <= dst_cols)
    153         {
    154             WT sum = (WT)(0);
    155 
    156             int ind = mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset));
    157             __global const T1 * template = (__global const T1*)(templateptr + template_offset);
    158 
    159             for (int i = 0; i < template_rows; ++i)
    160             {
    161                 for (int j = 0; j < template_cols; ++j)
    162                 {
    163                     T temp = (T)(template[j]);
    164                     T src = vload4(0, (__global const T1*)(srcptr + ind + j*(int)sizeof(T1)));
    165 
    166                     sum = mad(convertToWT(src), convertToWT(temp), sum);
    167 
    168                 }
    169             ind += src_step;
    170             template = (__global const T1 *)((__global const uchar *)template + template_step);
    171             }
    172 
    173             T temp = (T)(template[0]);
    174             int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset));
    175             *(__global float4 *)(dst + dst_idx) = convert_float4(sum);
    176         }
    177         else
    178         {
    179             WT1 sum [PIX_PER_WI_X];
    180             #pragma unroll
    181             for (int i=0; i < PIX_PER_WI_X; i++) sum[i] = 0;
    182 
    183             __global const T1 * src = (__global const T1 *)(srcptr + mad24(y, src_step, mad24(x0, (int)sizeof(T1), src_offset)));
    184             __global const T1 * template = (__global const T1 *)(templateptr + template_offset);
    185 
    186             for (int i = 0; i < template_rows; ++i)
    187             {
    188                 for (int j = 0; j < template_cols; ++j)
    189                 {
    190                     #pragma unroll
    191                     for (int cx=0, x = x0; cx < PIX_PER_WI_X && x < dst_cols; ++cx, ++x)
    192                     {
    193                         sum[cx] = mad(convertToWT1(src[j+cx]), convertToWT1(template[j]), sum[cx]);
    194                     }
    195                 }
    196 
    197             src = (__global const T1 *)((__global const uchar *)src + src_step);
    198             template = (__global const T1 *)((__global const uchar *)template + template_step);
    199             }
    200 
    201             #pragma unroll
    202             for (int cx=0; cx < PIX_PER_WI_X && x0 < dst_cols; ++cx, ++x0)
    203             {
    204                 int dst_idx = mad24(y, dst_step, mad24(x0, (int)sizeof(float), dst_offset));
    205                 *(__global float *)(dst + dst_idx) = convertToDT(sum[cx]);
    206             }
    207         }
    208     }
    209 }
    210 
    211 #else
    212 
    213 __kernel void matchTemplate_Naive_CCORR(__global const uchar * srcptr, int src_step, int src_offset,
    214                                         __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,
    215                                         __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
    216 {
    217     int x = get_global_id(0);
    218     int y = get_global_id(1);
    219 
    220     if (x < dst_cols && y < dst_rows)
    221     {
    222         WT sum = (WT)(0);
    223 
    224         for (int i = 0; i < template_rows; ++i)
    225         {
    226             for (int j = 0; j < template_cols; ++j)
    227             {
    228                 T src      = loadpix(srcptr      + mad24(y+i, src_step,    mad24(x+j, TSIZE, src_offset)));
    229                 T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset)));
    230 
    231                 sum = mad(convertToWT(src), convertToWT(template), sum);
    232             }
    233         }
    234 
    235         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    236         *(__global float *)(dst + dst_idx) = convertToDT(sum);
    237     }
    238 }
    239 #endif
    240 
    241 #elif defined CCORR_NORMED
    242 
    243 __kernel void matchTemplate_CCORR_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,
    244                                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    245                                          int template_rows, int template_cols, __global const float * template_sqsum)
    246 {
    247     int x = get_global_id(0);
    248     int y = get_global_id(1);
    249 
    250     if (x < dst_cols && y < dst_rows)
    251     {
    252         __global const float * sqsum = (__global const float *)(src_sqsums);
    253 
    254         src_sqsums_step /= sizeof(float);
    255         src_sqsums_offset /= sizeof(float);
    256         float image_sqsum_ = (float)(sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)] -
    257                                      sqsum[SQSUMS_PTR(0, template_rows)] + sqsum[SQSUMS_PTR(0, 0)]);
    258 
    259         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    260         __global float * dstult = (__global float *)(dst + dst_idx);
    261         *dstult = normAcc(*dstult, sqrt(image_sqsum_ * template_sqsum[0]));
    262     }
    263 }
    264 
    265 #elif defined SQDIFF
    266 
    267 __kernel void matchTemplate_Naive_SQDIFF(__global const uchar * srcptr, int src_step, int src_offset,
    268                                          __global const uchar * templateptr, int template_step, int template_offset, int template_rows, int template_cols,
    269                                          __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
    270 {
    271     int x = get_global_id(0);
    272     int y = get_global_id(1);
    273 
    274     if (x < dst_cols && y < dst_rows)
    275     {
    276         WT sum = (WT)(0), value;
    277 
    278         for (int i = 0; i < template_rows; ++i)
    279         {
    280             for (int j = 0; j < template_cols; ++j)
    281             {
    282                 T src      = loadpix(srcptr      + mad24(y+i, src_step,    mad24(x+j, TSIZE, src_offset)));
    283                 T template = loadpix(templateptr + mad24(i, template_step, mad24(j, TSIZE, template_offset)));
    284 
    285                 value = convertToWT(src) - convertToWT(template);
    286 
    287                 sum = mad(value, value, sum);
    288             }
    289         }
    290 
    291         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    292         *(__global float *)(dst + dst_idx) = convertToDT(sum);
    293     }
    294 }
    295 
    296 #elif defined SQDIFF_PREPARED
    297 
    298 __kernel void matchTemplate_Prepared_SQDIFF(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,
    299                                             __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    300                                             int template_rows, int template_cols, __global const float * template_sqsum)
    301 {
    302     int x = get_global_id(0);
    303     int y = get_global_id(1);
    304 
    305     if (x < dst_cols && y < dst_rows)
    306     {
    307         src_sqsums_step /= sizeof(float);
    308         src_sqsums_offset /= sizeof(float);
    309 
    310         __global const float * sqsum = (__global const float *)(src_sqsums);
    311         float image_sqsum_ = (float)(
    312                                  (sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) -
    313                                  (sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
    314         float template_sqsum_value = template_sqsum[0];
    315 
    316         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    317         __global float * dstult = (__global float *)(dst + dst_idx);
    318         *dstult = image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value;
    319     }
    320 }
    321 
    322 #elif defined SQDIFF_NORMED
    323 
    324 __kernel void matchTemplate_SQDIFF_NORMED(__global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,
    325                                           __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    326                                           int template_rows, int template_cols, __global const float * template_sqsum)
    327 {
    328     int x = get_global_id(0);
    329     int y = get_global_id(1);
    330 
    331     if (x < dst_cols && y < dst_rows)
    332     {
    333         src_sqsums_step /= sizeof(float);
    334         src_sqsums_offset /= sizeof(float);
    335 
    336         __global const float * sqsum = (__global const float *)(src_sqsums);
    337         float image_sqsum_ = (float)(
    338                                  (sqsum[SQSUMS_PTR(template_cols, template_rows)] - sqsum[SQSUMS_PTR(template_cols, 0)]) -
    339                                  (sqsum[SQSUMS_PTR(0, template_rows)] - sqsum[SQSUMS_PTR(0, 0)]));
    340         float template_sqsum_value = template_sqsum[0];
    341 
    342         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    343         __global float * dstult = (__global float *)(dst + dst_idx);
    344         *dstult = normAcc_SQDIFF(image_sqsum_ - 2.0f * dstult[0] + template_sqsum_value, sqrt(image_sqsum_ * template_sqsum_value));
    345     }
    346 }
    347 
    348 #elif defined CCOEFF
    349 
    350 #if cn == 1
    351 
    352 __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,
    353                                             __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    354                                             int template_rows, int template_cols, float template_sum)
    355 {
    356     int x = get_global_id(0);
    357     int y = get_global_id(1);
    358 
    359     if (x < dst_cols && y < dst_rows)
    360     {
    361         __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));
    362 
    363         int step = src_sums_step/(int)sizeof(T);
    364 
    365         T image_sum = (T)(0), value;
    366 
    367         value = (T)(sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]);
    368 
    369         image_sum = mad(value, template_sum , image_sum);
    370 
    371         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    372         *(__global float *)(dst + dst_idx) -= convertToDT(image_sum);
    373     }
    374 }
    375 
    376 #elif cn==3
    377 
    378 __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,
    379                                             __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    380                                             int template_rows, int template_cols, float4 template_sum)
    381 {
    382     int x = get_global_id(0);
    383     int y = get_global_id(1);
    384 
    385     if (x < dst_cols && y < dst_rows)
    386     {
    387         T image_sum = (T)(0), value, temp_sum;
    388 
    389         temp_sum.x = template_sum.x;
    390         temp_sum.y = template_sum.y;
    391         temp_sum.z = template_sum.z;
    392 
    393         value  = vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, template_rows)));
    394         value -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, template_rows)));
    395         value -= vload3(0, (__global const T1 *)(src_sums + SUMS(template_cols, 0)));
    396         value += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0)));
    397 
    398         image_sum = mad(value, temp_sum , 0);
    399 
    400         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    401         *(__global float *)(dst + dst_idx) -= convertToDT(image_sum);
    402     }
    403 }
    404 
    405 #elif (cn==2 || cn==4)
    406 
    407 __kernel void matchTemplate_Prepared_CCOEFF(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,
    408                                             __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    409                                             int template_rows, int template_cols, float4 template_sum)
    410 {
    411     int x = get_global_id(0);
    412     int y = get_global_id(1);
    413 
    414     if (x < dst_cols && y < dst_rows)
    415     {
    416         __global const T* sum = (__global const T*)(src_sums + mad24(y, src_sums_step, mad24(x, (int)sizeof(T), src_sums_offset)));
    417 
    418         int step = src_sums_step/(int)sizeof(T);
    419 
    420         T image_sum = (T)(0), value, temp_sum;
    421 
    422 #if cn==2
    423         temp_sum.x = template_sum.x;
    424         temp_sum.y = template_sum.y;
    425 #else
    426         temp_sum = template_sum;
    427 #endif
    428 
    429         value = (sum[mad24(template_rows, step, template_cols)] - sum[mad24(template_rows, step, 0)] - sum[template_cols] + sum[0]);
    430 
    431         image_sum = mad(value, temp_sum , image_sum);
    432 
    433         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    434         *(__global float *)(dst + dst_idx) -= convertToDT(image_sum);
    435     }
    436 }
    437 
    438 #else
    439 #error "cn should be 1-4"
    440 #endif
    441 
    442 #elif defined CCOEFF_NORMED
    443 
    444 #if cn == 1
    445 
    446 __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,
    447                                           __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,
    448                                           __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    449                                           int t_rows, int t_cols, float weight, float template_sum, float template_sqsum)
    450 {
    451     int x = get_global_id(0);
    452     int y = get_global_id(1);
    453 
    454     float sum_[2];
    455     float sqsum_[2];
    456 
    457 
    458     if (x < dst_cols && y < dst_rows)
    459     {
    460         int step = src_sums_step/(int)sizeof(T);
    461 
    462         __global const T* sum   = (__global const T*)(src_sums + mad24(y, src_sums_step,     mad24(x, (int)sizeof(T), src_sums_offset)));
    463         __global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset)));
    464 
    465         T value_sum   = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];
    466         T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];
    467 
    468         float num = convertToDT(mad(value_sum, template_sum, 0));
    469 
    470         value_sqsum -= weight * value_sum * value_sum;
    471         float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));
    472 
    473         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    474         __global float * dstult = (__global float *)(dst+dst_idx);
    475         *dstult = normAcc((*dstult) - num, denum);
    476     }
    477 }
    478 
    479 #elif cn==3
    480 
    481 __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,
    482                                           __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,
    483                                           __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    484                                           int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum)
    485 {
    486     int x = get_global_id(0);
    487     int y = get_global_id(1);
    488 
    489     if (x < dst_cols && y < dst_rows)
    490     {
    491         int step = src_sums_step/(int)sizeof(T);
    492 
    493         T temp_sum, value_sum, value_sqsum;
    494 
    495         temp_sum.x = template_sum.x;
    496         temp_sum.y = template_sum.y;
    497         temp_sum.z = template_sum.z;
    498 
    499         value_sum  = vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, t_rows)));
    500         value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(0, t_rows)));
    501         value_sum -= vload3(0, (__global const T1 *)(src_sums + SUMS(t_cols, 0)));
    502         value_sum += vload3(0, (__global const T1 *)(src_sums + SUMS(0, 0)));
    503 
    504         value_sqsum  = vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, t_rows)));
    505         value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, t_rows)));
    506         value_sqsum -= vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(t_cols, 0)));
    507         value_sqsum += vload3(0, (__global const T1 *)(src_sqsums + SQ_SUMS(0, 0)));
    508 
    509         float num = convertToDT(mad(value_sum, temp_sum, 0));
    510 
    511         value_sqsum -= weight * value_sum * value_sum;
    512         float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));
    513 
    514         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    515         __global float * dstult = (__global float *)(dst+dst_idx);
    516         *dstult = normAcc((*dstult) - num, denum);
    517     }
    518 }
    519 
    520 #elif (cn==2 || cn==4)
    521 
    522 __kernel void matchTemplate_CCOEFF_NORMED(__global const uchar * src_sums, int src_sums_step, int src_sums_offset,
    523                                           __global const uchar * src_sqsums, int src_sqsums_step, int src_sqsums_offset,
    524                                           __global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols,
    525                                           int t_rows, int t_cols, float weight, float4 template_sum, float template_sqsum)
    526 {
    527     int x = get_global_id(0);
    528     int y = get_global_id(1);
    529 
    530     if (x < dst_cols && y < dst_rows)
    531     {
    532         int step = src_sums_step/(int)sizeof(T);
    533 
    534         T temp_sum;
    535 
    536         __global const T* sum   = (__global const T*)(src_sums + mad24(y, src_sums_step,     mad24(x, (int)sizeof(T), src_sums_offset)));
    537         __global const T* sqsum = (__global const T*)(src_sqsums + mad24(y, src_sqsums_step, mad24(x, (int)sizeof(T), src_sqsums_offset)));
    538 
    539         T value_sum   = sum[mad24(t_rows, step, t_cols)] - sum[mad24(t_rows, step, 0)] - sum[t_cols] + sum[0];
    540         T value_sqsum = sqsum[mad24(t_rows, step, t_cols)] - sqsum[mad24(t_rows, step, 0)] - sqsum[t_cols] + sqsum[0];
    541 
    542 #if cn==2
    543         temp_sum.x = template_sum.x;
    544         temp_sum.y = template_sum.y;
    545 #else
    546         temp_sum = template_sum;
    547 #endif
    548 
    549         float num = convertToDT(mad(value_sum, temp_sum, 0));
    550 
    551         value_sqsum -= weight * value_sum * value_sum;
    552         float denum = sqrt(mad(template_sqsum, convertToDT(value_sqsum), 0));
    553 
    554         int dst_idx = mad24(y, dst_step, mad24(x, (int)sizeof(float), dst_offset));
    555         __global float * dstult = (__global float *)(dst+dst_idx);
    556         *dstult = normAcc((*dstult) - num, denum);
    557     }
    558 }
    559 
    560 #else
    561 #error "cn should be 1-4"
    562 #endif
    563 
    564 #endif
    565