Home | History | Annotate | Download | only in opencl
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2010-2012, Multicoreware, Inc., all rights reserved.
     14 // Copyright (C) 2010-2012, Advanced Micro Devices, Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // @Authors
     18 //    Wenju He, wenju (at) multicorewareinc.com
     19 //
     20 // Redistribution and use in source and binary forms, with or without modification,
     21 // are permitted provided that the following conditions are met:
     22 //
     23 //   * Redistribution's of source code must retain the above copyright notice,
     24 //     this list of conditions and the following disclaimer.
     25 //
     26 //   * Redistribution's in binary form must reproduce the above copyright notice,
     27 //     this list of conditions and the following disclaimer in the documentation
     28 //     and/or other materials provided with the distribution.
     29 //
     30 //   * The name of the copyright holders may not be used to endorse or promote products
     31 //     derived from this software without specific prior written permission.
     32 //
     33 // This software is provided by the copyright holders and contributors as is and
     34 // any express or implied warranties, including, but not limited to, the implied
     35 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     36 // In no event shall the Intel Corporation or contributors be liable for any direct,
     37 // indirect, incidental, special, exemplary, or consequential damages
     38 // (including, but not limited to, procurement of substitute goods or services;
     39 // loss of use, data, or profits; or business interruption) however caused
     40 // and on any theory of liability, whether in contract, strict liability,
     41 // or tort (including negligence or otherwise) arising in any way out of
     42 // the use of this software, even if advised of the possibility of such damage.
     43 //
     44 //M*/
     45 
     46 #define CELL_WIDTH 8
     47 #define CELL_HEIGHT 8
     48 #define CELLS_PER_BLOCK_X 2
     49 #define CELLS_PER_BLOCK_Y 2
     50 #define NTHREADS 256
     51 #define CV_PI_F M_PI_F
     52 
     53 #ifdef INTEL_DEVICE
     54 #define QANGLE_TYPE     int
     55 #define QANGLE_TYPE2    int2
     56 #else
     57 #define QANGLE_TYPE     uchar
     58 #define QANGLE_TYPE2    uchar2
     59 #endif
     60 
     61 //----------------------------------------------------------------------------
     62 // Histogram computation
     63 // 12 threads for a cell, 12x4 threads per block
     64 // Use pre-computed gaussian and interp_weight lookup tables
     65 __kernel void compute_hists_lut_kernel(
     66     const int cblock_stride_x, const int cblock_stride_y,
     67     const int cnbins, const int cblock_hist_size, const int img_block_width,
     68     const int blocks_in_group, const int blocks_total,
     69     const int grad_quadstep, const int qangle_step,
     70     __global const float* grad, __global const QANGLE_TYPE* qangle,
     71     __global const float* gauss_w_lut,
     72     __global float* block_hists, __local float* smem)
     73 {
     74     const int lx = get_local_id(0);
     75     const int lp = lx / 24; /* local group id */
     76     const int gid = get_group_id(0) * blocks_in_group + lp;/* global group id */
     77     const int gidY = gid / img_block_width;
     78     const int gidX = gid - gidY * img_block_width;
     79 
     80     const int lidX = lx - lp * 24;
     81     const int lidY = get_local_id(1);
     82 
     83     const int cell_x = lidX / 12;
     84     const int cell_y = lidY;
     85     const int cell_thread_x = lidX - cell_x * 12;
     86 
     87     __local float* hists = smem + lp * cnbins * (CELLS_PER_BLOCK_X *
     88         CELLS_PER_BLOCK_Y * 12 + CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y);
     89     __local float* final_hist = hists + cnbins *
     90         (CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12);
     91 
     92     const int offset_x = gidX * cblock_stride_x + (cell_x << 2) + cell_thread_x;
     93     const int offset_y = gidY * cblock_stride_y + (cell_y << 2);
     94 
     95     __global const float* grad_ptr = (gid < blocks_total) ?
     96         grad + offset_y * grad_quadstep + (offset_x << 1) : grad;
     97     __global const QANGLE_TYPE* qangle_ptr = (gid < blocks_total) ?
     98         qangle + offset_y * qangle_step + (offset_x << 1) : qangle;
     99 
    100     __local float* hist = hists + 12 * (cell_y * CELLS_PER_BLOCK_Y + cell_x) +
    101         cell_thread_x;
    102     for (int bin_id = 0; bin_id < cnbins; ++bin_id)
    103         hist[bin_id * 48] = 0.f;
    104 
    105     const int dist_x = -4 + cell_thread_x - 4 * cell_x;
    106     const int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);
    107 
    108     const int dist_y_begin = -4 - 4 * lidY;
    109     for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
    110     {
    111         float2 vote = (float2) (grad_ptr[0], grad_ptr[1]);
    112         QANGLE_TYPE2 bin = (QANGLE_TYPE2) (qangle_ptr[0], qangle_ptr[1]);
    113 
    114         grad_ptr += grad_quadstep;
    115         qangle_ptr += qangle_step;
    116 
    117         int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
    118 
    119         int idx = (dist_center_y + 8) * 16 + (dist_center_x + 8);
    120         float gaussian = gauss_w_lut[idx];
    121         idx = (dist_y + 8) * 16 + (dist_x + 8);
    122         float interp_weight = gauss_w_lut[256+idx];
    123 
    124         hist[bin.x * 48] += gaussian * interp_weight * vote.x;
    125         hist[bin.y * 48] += gaussian * interp_weight * vote.y;
    126     }
    127     barrier(CLK_LOCAL_MEM_FENCE);
    128 
    129     volatile __local float* hist_ = hist;
    130     for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48)
    131     {
    132         if (cell_thread_x < 6)
    133             hist_[0] += hist_[6];
    134         barrier(CLK_LOCAL_MEM_FENCE);
    135         if (cell_thread_x < 3)
    136             hist_[0] += hist_[3];
    137 #ifdef CPU
    138         barrier(CLK_LOCAL_MEM_FENCE);
    139 #endif
    140         if (cell_thread_x == 0)
    141             final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
    142                 hist_[0] + hist_[1] + hist_[2];
    143     }
    144 
    145     barrier(CLK_LOCAL_MEM_FENCE);
    146 
    147     int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 12 + cell_thread_x;
    148     if ((tid < cblock_hist_size) && (gid < blocks_total))
    149     {
    150         __global float* block_hist = block_hists +
    151             (gidY * img_block_width + gidX) * cblock_hist_size;
    152         block_hist[tid] = final_hist[tid];
    153     }
    154 }
    155 
    156 //-------------------------------------------------------------
    157 //  Normalization of histograms via L2Hys_norm
    158 //  optimized for the case of 9 bins
    159 __kernel void normalize_hists_36_kernel(__global float* block_hists,
    160                                         const float threshold, __local float *squares)
    161 {
    162     const int tid = get_local_id(0);
    163     const int gid = get_global_id(0);
    164     const int bid = tid / 36;      /* block-hist id, (0 - 6) */
    165     const int boffset = bid * 36;  /* block-hist offset in the work-group */
    166     const int hid = tid - boffset; /* histogram bin id, (0 - 35) */
    167 
    168     float elem = block_hists[gid];
    169     squares[tid] = elem * elem;
    170     barrier(CLK_LOCAL_MEM_FENCE);
    171 
    172     __local float* smem = squares + boffset;
    173     float sum = smem[hid];
    174     if (hid < 18)
    175         smem[hid] = sum = sum + smem[hid + 18];
    176     barrier(CLK_LOCAL_MEM_FENCE);
    177     if (hid < 9)
    178         smem[hid] = sum = sum + smem[hid + 9];
    179     barrier(CLK_LOCAL_MEM_FENCE);
    180     if (hid < 4)
    181         smem[hid] = sum + smem[hid + 4];
    182     barrier(CLK_LOCAL_MEM_FENCE);
    183     sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
    184 
    185     elem = elem / (sqrt(sum) + 3.6f);
    186     elem = min(elem, threshold);
    187 
    188     barrier(CLK_LOCAL_MEM_FENCE);
    189     squares[tid] = elem * elem;
    190     barrier(CLK_LOCAL_MEM_FENCE);
    191 
    192     sum = smem[hid];
    193     if (hid < 18)
    194       smem[hid] = sum = sum + smem[hid + 18];
    195     barrier(CLK_LOCAL_MEM_FENCE);
    196     if (hid < 9)
    197         smem[hid] = sum = sum + smem[hid + 9];
    198     barrier(CLK_LOCAL_MEM_FENCE);
    199     if (hid < 4)
    200         smem[hid] = sum + smem[hid + 4];
    201     barrier(CLK_LOCAL_MEM_FENCE);
    202     sum = smem[0] + smem[1] + smem[2] + smem[3] + smem[8];
    203 
    204     block_hists[gid] = elem / (sqrt(sum) + 1e-3f);
    205 }
    206 
    207 //-------------------------------------------------------------
    208 //  Normalization of histograms via L2Hys_norm
    209 //
    210 inline float reduce_smem(volatile __local float* smem, int size)
    211 {
    212     unsigned int tid = get_local_id(0);
    213     float sum = smem[tid];
    214 
    215     if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256];
    216         barrier(CLK_LOCAL_MEM_FENCE); }
    217     if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128];
    218         barrier(CLK_LOCAL_MEM_FENCE); }
    219     if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
    220         barrier(CLK_LOCAL_MEM_FENCE); }
    221 #ifdef CPU
    222     if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
    223         barrier(CLK_LOCAL_MEM_FENCE); }
    224     if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
    225         barrier(CLK_LOCAL_MEM_FENCE); }
    226     if (size >= 16) { if (tid < 8) smem[tid] = sum = sum + smem[tid + 8];
    227         barrier(CLK_LOCAL_MEM_FENCE); }
    228     if (size >= 8) { if (tid < 4) smem[tid] = sum = sum + smem[tid + 4];
    229         barrier(CLK_LOCAL_MEM_FENCE); }
    230     if (size >= 4) { if (tid < 2) smem[tid] = sum = sum + smem[tid + 2];
    231         barrier(CLK_LOCAL_MEM_FENCE); }
    232     if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
    233         barrier(CLK_LOCAL_MEM_FENCE); }
    234 #else
    235     if (tid < 32)
    236     {
    237         if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
    238 #if WAVE_SIZE < 32
    239     } barrier(CLK_LOCAL_MEM_FENCE);
    240     if (tid < 16) {
    241 #endif
    242         if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
    243         if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
    244         if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
    245         if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
    246         if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
    247     }
    248 #endif
    249 
    250     return sum;
    251 }
    252 
    253 __kernel void normalize_hists_kernel(
    254     const int nthreads, const int block_hist_size, const int img_block_width,
    255     __global float* block_hists, const float threshold, __local float *squares)
    256 {
    257     const int tid = get_local_id(0);
    258     const int gidX = get_group_id(0);
    259     const int gidY = get_group_id(1);
    260 
    261     __global float* hist = block_hists + (gidY * img_block_width + gidX) *
    262         block_hist_size + tid;
    263 
    264     float elem = 0.f;
    265     if (tid < block_hist_size)
    266         elem = hist[0];
    267 
    268     squares[tid] = elem * elem;
    269 
    270     barrier(CLK_LOCAL_MEM_FENCE);
    271     float sum = reduce_smem(squares, nthreads);
    272 
    273     float scale = 1.0f / (sqrt(sum) + 0.1f * block_hist_size);
    274     elem = min(elem * scale, threshold);
    275 
    276     barrier(CLK_LOCAL_MEM_FENCE);
    277     squares[tid] = elem * elem;
    278 
    279     barrier(CLK_LOCAL_MEM_FENCE);
    280     sum = reduce_smem(squares, nthreads);
    281     scale = 1.0f / (sqrt(sum) + 1e-3f);
    282 
    283     if (tid < block_hist_size)
    284         hist[0] = elem * scale;
    285 }
    286 
    287 //---------------------------------------------------------------------
    288 //  Linear SVM based classification
    289 //  48x96 window, 9 bins and default parameters
    290 //  180 threads, each thread corresponds to a bin in a row
    291 __kernel void classify_hists_180_kernel(
    292     const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
    293     const int img_win_width, const int img_block_width,
    294     const int win_block_stride_x, const int win_block_stride_y,
    295     __global const float * block_hists, __global const float* coefs,
    296     float free_coef, float threshold, __global uchar* labels)
    297 {
    298     const int tid = get_local_id(0);
    299     const int gidX = get_group_id(0);
    300     const int gidY = get_group_id(1);
    301 
    302     __global const float* hist = block_hists + (gidY * win_block_stride_y *
    303         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
    304 
    305     float product = 0.f;
    306 
    307     for (int i = 0; i < cdescr_height; i++)
    308     {
    309         product += coefs[i * cdescr_width + tid] *
    310             hist[i * img_block_width * cblock_hist_size + tid];
    311     }
    312 
    313     __local float products[180];
    314 
    315     products[tid] = product;
    316 
    317     barrier(CLK_LOCAL_MEM_FENCE);
    318 
    319     if (tid < 90) products[tid] = product = product + products[tid + 90];
    320     barrier(CLK_LOCAL_MEM_FENCE);
    321 
    322     if (tid < 45) products[tid] = product = product + products[tid + 45];
    323     barrier(CLK_LOCAL_MEM_FENCE);
    324 
    325     volatile __local float* smem = products;
    326 #ifdef CPU
    327     if (tid < 13) smem[tid] = product = product + smem[tid + 32];
    328     barrier(CLK_LOCAL_MEM_FENCE);
    329     if (tid < 16) smem[tid] = product = product + smem[tid + 16];
    330     barrier(CLK_LOCAL_MEM_FENCE);
    331     if(tid<8) smem[tid] = product = product + smem[tid + 8];
    332     barrier(CLK_LOCAL_MEM_FENCE);
    333     if(tid<4) smem[tid] = product = product + smem[tid + 4];
    334     barrier(CLK_LOCAL_MEM_FENCE);
    335     if(tid<2) smem[tid] = product = product + smem[tid + 2];
    336     barrier(CLK_LOCAL_MEM_FENCE);
    337 #else
    338     if (tid < 13)
    339     {
    340         smem[tid] = product = product + smem[tid + 32];
    341     }
    342 #if WAVE_SIZE < 32
    343     barrier(CLK_LOCAL_MEM_FENCE);
    344 #endif
    345     if (tid < 16)
    346     {
    347         smem[tid] = product = product + smem[tid + 16];
    348         smem[tid] = product = product + smem[tid + 8];
    349         smem[tid] = product = product + smem[tid + 4];
    350         smem[tid] = product = product + smem[tid + 2];
    351     }
    352 #endif
    353 
    354     if (tid == 0){
    355         product = product + smem[tid + 1];
    356         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
    357     }
    358 }
    359 
    360 //---------------------------------------------------------------------
    361 //  Linear SVM based classification
    362 //  64x128 window, 9 bins and default parameters
    363 //  256 threads, 252 of them are used
    364 __kernel void classify_hists_252_kernel(
    365     const int cdescr_width, const int cdescr_height, const int cblock_hist_size,
    366     const int img_win_width, const int img_block_width,
    367     const int win_block_stride_x, const int win_block_stride_y,
    368     __global const float * block_hists, __global const float* coefs,
    369     float free_coef, float threshold, __global uchar* labels)
    370 {
    371     const int tid = get_local_id(0);
    372     const int gidX = get_group_id(0);
    373     const int gidY = get_group_id(1);
    374 
    375     __global const float* hist = block_hists + (gidY * win_block_stride_y *
    376         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
    377 
    378     float product = 0.f;
    379     if (tid < cdescr_width)
    380     {
    381         for (int i = 0; i < cdescr_height; i++)
    382             product += coefs[i * cdescr_width + tid] *
    383                 hist[i * img_block_width * cblock_hist_size + tid];
    384     }
    385 
    386     __local float products[NTHREADS];
    387 
    388     products[tid] = product;
    389 
    390     barrier(CLK_LOCAL_MEM_FENCE);
    391 
    392     if (tid < 128) products[tid] = product = product + products[tid + 128];
    393     barrier(CLK_LOCAL_MEM_FENCE);
    394 
    395     if (tid < 64) products[tid] = product = product + products[tid + 64];
    396     barrier(CLK_LOCAL_MEM_FENCE);
    397 
    398     volatile __local float* smem = products;
    399 #ifdef CPU
    400     if(tid<32) smem[tid] = product = product + smem[tid + 32];
    401     barrier(CLK_LOCAL_MEM_FENCE);
    402     if(tid<16) smem[tid] = product = product + smem[tid + 16];
    403     barrier(CLK_LOCAL_MEM_FENCE);
    404     if(tid<8) smem[tid] = product = product + smem[tid + 8];
    405     barrier(CLK_LOCAL_MEM_FENCE);
    406     if(tid<4) smem[tid] = product = product + smem[tid + 4];
    407     barrier(CLK_LOCAL_MEM_FENCE);
    408     if(tid<2) smem[tid] = product = product + smem[tid + 2];
    409     barrier(CLK_LOCAL_MEM_FENCE);
    410 #else
    411     if (tid < 32)
    412     {
    413         smem[tid] = product = product + smem[tid + 32];
    414 #if WAVE_SIZE < 32
    415     } barrier(CLK_LOCAL_MEM_FENCE);
    416     if (tid < 16) {
    417 #endif
    418         smem[tid] = product = product + smem[tid + 16];
    419         smem[tid] = product = product + smem[tid + 8];
    420         smem[tid] = product = product + smem[tid + 4];
    421         smem[tid] = product = product + smem[tid + 2];
    422     }
    423 #endif
    424     if (tid == 0){
    425         product = product + smem[tid + 1];
    426         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
    427     }
    428 }
    429 
    430 //---------------------------------------------------------------------
    431 //  Linear SVM based classification
    432 //  256 threads
    433 __kernel void classify_hists_kernel(
    434     const int cdescr_size, const int cdescr_width, const int cblock_hist_size,
    435     const int img_win_width, const int img_block_width,
    436     const int win_block_stride_x, const int win_block_stride_y,
    437     __global const float * block_hists, __global const float* coefs,
    438     float free_coef, float threshold, __global uchar* labels)
    439 {
    440     const int tid = get_local_id(0);
    441     const int gidX = get_group_id(0);
    442     const int gidY = get_group_id(1);
    443 
    444     __global const float* hist = block_hists + (gidY * win_block_stride_y *
    445         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
    446 
    447     float product = 0.f;
    448     for (int i = tid; i < cdescr_size; i += NTHREADS)
    449     {
    450         int offset_y = i / cdescr_width;
    451         int offset_x = i - offset_y * cdescr_width;
    452         product += coefs[i] *
    453             hist[offset_y * img_block_width * cblock_hist_size + offset_x];
    454     }
    455 
    456     __local float products[NTHREADS];
    457 
    458     products[tid] = product;
    459 
    460     barrier(CLK_LOCAL_MEM_FENCE);
    461 
    462     if (tid < 128) products[tid] = product = product + products[tid + 128];
    463     barrier(CLK_LOCAL_MEM_FENCE);
    464 
    465     if (tid < 64) products[tid] = product = product + products[tid + 64];
    466     barrier(CLK_LOCAL_MEM_FENCE);
    467 
    468     volatile __local float* smem = products;
    469 #ifdef CPU
    470     if(tid<32) smem[tid] = product = product + smem[tid + 32];
    471     barrier(CLK_LOCAL_MEM_FENCE);
    472     if(tid<16) smem[tid] = product = product + smem[tid + 16];
    473     barrier(CLK_LOCAL_MEM_FENCE);
    474     if(tid<8) smem[tid] = product = product + smem[tid + 8];
    475     barrier(CLK_LOCAL_MEM_FENCE);
    476     if(tid<4) smem[tid] = product = product + smem[tid + 4];
    477     barrier(CLK_LOCAL_MEM_FENCE);
    478     if(tid<2) smem[tid] = product = product + smem[tid + 2];
    479     barrier(CLK_LOCAL_MEM_FENCE);
    480 #else
    481     if (tid < 32)
    482     {
    483         smem[tid] = product = product + smem[tid + 32];
    484 #if WAVE_SIZE < 32
    485     } barrier(CLK_LOCAL_MEM_FENCE);
    486     if (tid < 16) {
    487 #endif
    488         smem[tid] = product = product + smem[tid + 16];
    489         smem[tid] = product = product + smem[tid + 8];
    490         smem[tid] = product = product + smem[tid + 4];
    491         smem[tid] = product = product + smem[tid + 2];
    492     }
    493 #endif
    494     if (tid == 0){
    495         smem[tid] = product = product + smem[tid + 1];
    496         labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
    497     }
    498 }
    499 
    500 //----------------------------------------------------------------------------
    501 // Extract descriptors
    502 
    503 __kernel void extract_descrs_by_rows_kernel(
    504     const int cblock_hist_size, const int descriptors_quadstep,
    505     const int cdescr_size, const int cdescr_width, const int img_block_width,
    506     const int win_block_stride_x, const int win_block_stride_y,
    507     __global const float* block_hists, __global float* descriptors)
    508 {
    509     int tid = get_local_id(0);
    510     int gidX = get_group_id(0);
    511     int gidY = get_group_id(1);
    512 
    513     // Get left top corner of the window in src
    514     __global const float* hist = block_hists + (gidY * win_block_stride_y *
    515         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
    516 
    517     // Get left top corner of the window in dst
    518     __global float* descriptor = descriptors +
    519         (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
    520 
    521     // Copy elements from src to dst
    522     for (int i = tid; i < cdescr_size; i += NTHREADS)
    523     {
    524         int offset_y = i / cdescr_width;
    525         int offset_x = i - offset_y * cdescr_width;
    526         descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
    527     }
    528 }
    529 
    530 __kernel void extract_descrs_by_cols_kernel(
    531     const int cblock_hist_size, const int descriptors_quadstep, const int cdescr_size,
    532     const int cnblocks_win_x, const int cnblocks_win_y, const int img_block_width,
    533     const int win_block_stride_x, const int win_block_stride_y,
    534     __global const float* block_hists, __global float* descriptors)
    535 {
    536     int tid = get_local_id(0);
    537     int gidX = get_group_id(0);
    538     int gidY = get_group_id(1);
    539 
    540     // Get left top corner of the window in src
    541     __global const float* hist = block_hists +  (gidY * win_block_stride_y *
    542         img_block_width + gidX * win_block_stride_x) * cblock_hist_size;
    543 
    544     // Get left top corner of the window in dst
    545     __global float* descriptor = descriptors +
    546         (gidY * get_num_groups(0) + gidX) * descriptors_quadstep;
    547 
    548     // Copy elements from src to dst
    549     for (int i = tid; i < cdescr_size; i += NTHREADS)
    550     {
    551         int block_idx = i / cblock_hist_size;
    552         int idx_in_block = i - block_idx * cblock_hist_size;
    553 
    554         int y = block_idx / cnblocks_win_x;
    555         int x = block_idx - y * cnblocks_win_x;
    556 
    557         descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] =
    558             hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
    559     }
    560 }
    561 
    562 //----------------------------------------------------------------------------
    563 // Gradients computation
    564 
    565 __kernel void compute_gradients_8UC4_kernel(
    566     const int height, const int width,
    567     const int img_step, const int grad_quadstep, const int qangle_step,
    568     const __global uchar4 * img, __global float * grad, __global QANGLE_TYPE * qangle,
    569     const float angle_scale, const char correct_gamma, const int cnbins)
    570 {
    571     const int x = get_global_id(0);
    572     const int tid = get_local_id(0);
    573     const int gSizeX = get_local_size(0);
    574     const int gidY = get_group_id(1);
    575 
    576     __global const uchar4* row = img + gidY * img_step;
    577 
    578     __local float sh_row[(NTHREADS + 2) * 3];
    579 
    580     uchar4 val;
    581     if (x < width)
    582         val = row[x];
    583     else
    584         val = row[width - 2];
    585 
    586     sh_row[tid + 1] = val.x;
    587     sh_row[tid + 1 + (NTHREADS + 2)] = val.y;
    588     sh_row[tid + 1 + 2 * (NTHREADS + 2)] = val.z;
    589 
    590     if (tid == 0)
    591     {
    592         val = row[max(x - 1, 1)];
    593         sh_row[0] = val.x;
    594         sh_row[(NTHREADS + 2)] = val.y;
    595         sh_row[2 * (NTHREADS + 2)] = val.z;
    596     }
    597 
    598     if (tid == gSizeX - 1)
    599     {
    600         val = row[min(x + 1, width - 2)];
    601         sh_row[gSizeX + 1] = val.x;
    602         sh_row[gSizeX + 1 + (NTHREADS + 2)] = val.y;
    603         sh_row[gSizeX + 1 + 2 * (NTHREADS + 2)] = val.z;
    604     }
    605 
    606     barrier(CLK_LOCAL_MEM_FENCE);
    607     if (x < width)
    608     {
    609         float4 a = (float4) (sh_row[tid], sh_row[tid + (NTHREADS + 2)],
    610             sh_row[tid + 2 * (NTHREADS + 2)], 0);
    611         float4 b = (float4) (sh_row[tid + 2], sh_row[tid + 2 + (NTHREADS + 2)],
    612             sh_row[tid + 2 + 2 * (NTHREADS + 2)], 0);
    613 
    614         float4 dx;
    615         if (correct_gamma == 1)
    616             dx = sqrt(b) - sqrt(a);
    617         else
    618             dx = b - a;
    619 
    620         float4 dy = (float4) 0.f;
    621 
    622         if (gidY > 0 && gidY < height - 1)
    623         {
    624             a = convert_float4(img[(gidY - 1) * img_step + x].xyzw);
    625             b = convert_float4(img[(gidY + 1) * img_step + x].xyzw);
    626 
    627             if (correct_gamma == 1)
    628                 dy = sqrt(b) - sqrt(a);
    629             else
    630                 dy = b - a;
    631         }
    632 
    633         float4 mag = hypot(dx, dy);
    634         float best_dx = dx.x;
    635         float best_dy = dy.x;
    636 
    637         float mag0 = mag.x;
    638         if (mag0 < mag.y)
    639         {
    640             best_dx = dx.y;
    641             best_dy = dy.y;
    642             mag0 = mag.y;
    643         }
    644 
    645         if (mag0 < mag.z)
    646         {
    647             best_dx = dx.z;
    648             best_dy = dy.z;
    649             mag0 = mag.z;
    650         }
    651 
    652         float ang = (atan2(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
    653         int hidx = (int)floor(ang);
    654         ang -= hidx;
    655         hidx = (hidx + cnbins) % cnbins;
    656 
    657         qangle[(gidY * qangle_step + x) << 1] = hidx;
    658         qangle[((gidY * qangle_step + x) << 1) + 1] = (hidx + 1) % cnbins;
    659         grad[(gidY * grad_quadstep + x) << 1] = mag0 * (1.f - ang);
    660         grad[((gidY * grad_quadstep + x) << 1) + 1] = mag0 * ang;
    661     }
    662 }
    663 
    664 __kernel void compute_gradients_8UC1_kernel(
    665     const int height, const int width,
    666     const int img_step, const int grad_quadstep, const int qangle_step,
    667     __global const uchar * img, __global float * grad, __global QANGLE_TYPE * qangle,
    668     const float angle_scale, const char correct_gamma, const int cnbins)
    669 {
    670     const int x = get_global_id(0);
    671     const int tid = get_local_id(0);
    672     const int gSizeX = get_local_size(0);
    673     const int gidY = get_group_id(1);
    674 
    675     __global const uchar* row = img + gidY * img_step;
    676 
    677     __local float sh_row[NTHREADS + 2];
    678 
    679     if (x < width)
    680         sh_row[tid + 1] = row[x];
    681     else
    682         sh_row[tid + 1] = row[width - 2];
    683 
    684     if (tid == 0)
    685         sh_row[0] = row[max(x - 1, 1)];
    686 
    687     if (tid == gSizeX - 1)
    688         sh_row[gSizeX + 1] = row[min(x + 1, width - 2)];
    689 
    690     barrier(CLK_LOCAL_MEM_FENCE);
    691     if (x < width)
    692     {
    693         float dx;
    694 
    695         if (correct_gamma == 1)
    696             dx = sqrt(sh_row[tid + 2]) - sqrt(sh_row[tid]);
    697         else
    698             dx = sh_row[tid + 2] - sh_row[tid];
    699 
    700         float dy = 0.f;
    701         if (gidY > 0 && gidY < height - 1)
    702         {
    703             float a = (float) img[ (gidY + 1) * img_step + x ];
    704             float b = (float) img[ (gidY - 1) * img_step + x ];
    705             if (correct_gamma == 1)
    706                 dy = sqrt(a) - sqrt(b);
    707             else
    708                 dy = a - b;
    709         }
    710         float mag = hypot(dx, dy);
    711 
    712         float ang = (atan2(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
    713         int hidx = (int)floor(ang);
    714         ang -= hidx;
    715         hidx = (hidx + cnbins) % cnbins;
    716 
    717         qangle[ (gidY * qangle_step + x) << 1 ]     = hidx;
    718         qangle[ ((gidY * qangle_step + x) << 1) + 1 ] = (hidx + 1) % cnbins;
    719         grad[ (gidY * grad_quadstep + x) << 1 ]       = mag * (1.f - ang);
    720         grad[ ((gidY * grad_quadstep + x) << 1) + 1 ]   = mag * ang;
    721     }
    722 }
    723