Home | History | Annotate | Download | only in cuda
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Third party copyrights are property of their respective owners.
     16 //
     17 // Redistribution and use in source and binary forms, with or without modification,
     18 // are permitted provided that the following conditions are met:
     19 //
     20 //   * Redistribution's of source code must retain the above copyright notice,
     21 //     this list of conditions and the following disclaimer.
     22 //
     23 //   * Redistribution's in binary form must reproduce the above copyright notice,
     24 //     this list of conditions and the following disclaimer in the documentation
     25 //     and/or other materials provided with the distribution.
     26 //
     27 //   * The name of the copyright holders may not be used to endorse or promote products
     28 //     derived from this software without specific prior written permission.
     29 //
     30 // This software is provided by the copyright holders and contributors "as is" and
     31 // any express or implied warranties, including, but not limited to, the implied
     32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     33 // In no event shall the Intel Corporation or contributors be liable for any direct,
     34 // indirect, incidental, special, exemplary, or consequential damages
     35 // (including, but not limited to, procurement of substitute goods or services;
     36 // loss of use, data, or profits; or business interruption) however caused
     37 // and on any theory of liability, whether in contract, strict liability,
     38 // or tort (including negligence or otherwise) arising in any way out of
     39 // the use of this software, even if advised of the possibility of such damage.
     40 //
     41 //M*/
     42 
     43 #if !defined CUDA_DISABLER
     44 
     45 #include <thrust/device_ptr.h>
     46 #include <thrust/transform.h>
     47 
     48 #include "opencv2/core/cuda/common.hpp"
     49 #include "opencv2/core/cuda/emulation.hpp"
     50 #include "opencv2/core/cuda/vec_math.hpp"
     51 #include "opencv2/core/cuda/functional.hpp"
     52 
     53 #include "opencv2/opencv_modules.hpp"
     54 
     55 #ifdef HAVE_OPENCV_CUDAARITHM
     56 
     57 namespace cv { namespace cuda { namespace device
     58 {
     59     namespace ght
     60     {
     61         __device__ int g_counter;
     62 
     63         template <typename T, int PIXELS_PER_THREAD>
     64         __global__ void buildEdgePointList(const PtrStepSzb edges, const PtrStep<T> dx, const PtrStep<T> dy,
     65                                            unsigned int* coordList, float* thetaList)
     66         {
     67             __shared__ unsigned int s_coordLists[4][32 * PIXELS_PER_THREAD];
     68             __shared__ float s_thetaLists[4][32 * PIXELS_PER_THREAD];
     69             __shared__ int s_sizes[4];
     70             __shared__ int s_globStart[4];
     71 
     72             const int x = blockIdx.x * blockDim.x * PIXELS_PER_THREAD + threadIdx.x;
     73             const int y = blockIdx.y * blockDim.y + threadIdx.y;
     74 
     75             if (threadIdx.x == 0)
     76                 s_sizes[threadIdx.y] = 0;
     77             __syncthreads();
     78 
     79             if (y < edges.rows)
     80             {
     81                 // fill the queue
     82                 const uchar* edgesRow = edges.ptr(y);
     83                 const T* dxRow = dx.ptr(y);
     84                 const T* dyRow = dy.ptr(y);
     85 
     86                 for (int i = 0, xx = x; i < PIXELS_PER_THREAD && xx < edges.cols; ++i, xx += blockDim.x)
     87                 {
     88                     const T dxVal = dxRow[xx];
     89                     const T dyVal = dyRow[xx];
     90 
     91                     if (edgesRow[xx] && (dxVal != 0 || dyVal != 0))
     92                     {
     93                         const unsigned int coord = (y << 16) | xx;
     94 
     95                         float theta = ::atan2f(dyVal, dxVal);
     96                         if (theta < 0)
     97                             theta += 2.0f * CV_PI_F;
     98 
     99                         const int qidx = Emulation::smem::atomicAdd(&s_sizes[threadIdx.y], 1);
    100 
    101                         s_coordLists[threadIdx.y][qidx] = coord;
    102                         s_thetaLists[threadIdx.y][qidx] = theta;
    103                     }
    104                 }
    105             }
    106 
    107             __syncthreads();
    108 
    109             // let one thread reserve the space required in the global list
    110             if (threadIdx.x == 0 && threadIdx.y == 0)
    111             {
    112                 // find how many items are stored in each list
    113                 int totalSize = 0;
    114                 for (int i = 0; i < blockDim.y; ++i)
    115                 {
    116                     s_globStart[i] = totalSize;
    117                     totalSize += s_sizes[i];
    118                 }
    119 
    120                 // calculate the offset in the global list
    121                 const int globalOffset = atomicAdd(&g_counter, totalSize);
    122                 for (int i = 0; i < blockDim.y; ++i)
    123                     s_globStart[i] += globalOffset;
    124             }
    125 
    126             __syncthreads();
    127 
    128             // copy local queues to global queue
    129             const int qsize = s_sizes[threadIdx.y];
    130             int gidx = s_globStart[threadIdx.y] + threadIdx.x;
    131             for(int i = threadIdx.x; i < qsize; i += blockDim.x, gidx += blockDim.x)
    132             {
    133                 coordList[gidx] = s_coordLists[threadIdx.y][i];
    134                 thetaList[gidx] = s_thetaLists[threadIdx.y][i];
    135             }
    136         }
    137 
    138         template <typename T>
    139         int buildEdgePointList_gpu(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList)
    140         {
    141             const int PIXELS_PER_THREAD = 8;
    142 
    143             void* counterPtr;
    144             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
    145 
    146             cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
    147 
    148             const dim3 block(32, 4);
    149             const dim3 grid(divUp(edges.cols, block.x * PIXELS_PER_THREAD), divUp(edges.rows, block.y));
    150 
    151             cudaSafeCall( cudaFuncSetCacheConfig(buildEdgePointList<T, PIXELS_PER_THREAD>, cudaFuncCachePreferShared) );
    152 
    153             buildEdgePointList<T, PIXELS_PER_THREAD><<<grid, block>>>(edges, (PtrStepSz<T>) dx, (PtrStepSz<T>) dy, coordList, thetaList);
    154             cudaSafeCall( cudaGetLastError() );
    155 
    156             cudaSafeCall( cudaDeviceSynchronize() );
    157 
    158             int totalCount;
    159             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
    160 
    161             return totalCount;
    162         }
    163 
    164         template int buildEdgePointList_gpu<short>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
    165         template int buildEdgePointList_gpu<int>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
    166         template int buildEdgePointList_gpu<float>(PtrStepSzb edges, PtrStepSzb dx, PtrStepSzb dy, unsigned int* coordList, float* thetaList);
    167 
    168         __global__ void buildRTable(const unsigned int* coordList, const float* thetaList, const int pointsCount,
    169                                     PtrStep<short2> r_table, int* r_sizes, int maxSize,
    170                                     const short2 templCenter, const float thetaScale)
    171         {
    172             const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    173 
    174             if (tid >= pointsCount)
    175                 return;
    176 
    177             const unsigned int coord = coordList[tid];
    178             short2 p;
    179             p.x = (coord & 0xFFFF);
    180             p.y = (coord >> 16) & 0xFFFF;
    181 
    182             const float theta = thetaList[tid];
    183             const int n = __float2int_rn(theta * thetaScale);
    184 
    185             const int ind = ::atomicAdd(r_sizes + n, 1);
    186             if (ind < maxSize)
    187                 r_table(n, ind) = saturate_cast<short2>(p - templCenter);
    188         }
    189 
    190         void buildRTable_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
    191                              PtrStepSz<short2> r_table, int* r_sizes,
    192                              short2 templCenter, int levels)
    193         {
    194             const dim3 block(256);
    195             const dim3 grid(divUp(pointsCount, block.x));
    196 
    197             const float thetaScale = levels / (2.0f * CV_PI_F);
    198 
    199             buildRTable<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, r_table.cols, templCenter, thetaScale);
    200             cudaSafeCall( cudaGetLastError() );
    201 
    202             cudaSafeCall( cudaDeviceSynchronize() );
    203         }
    204 
    205         ////////////////////////////////////////////////////////////////////////
    206         // Ballard_Pos
    207 
    208         __global__ void Ballard_Pos_calcHist(const unsigned int* coordList, const float* thetaList, const int pointsCount,
    209                                              const PtrStep<short2> r_table, const int* r_sizes,
    210                                              PtrStepSzi hist,
    211                                              const float idp, const float thetaScale)
    212         {
    213             const int tid = blockIdx.x * blockDim.x + threadIdx.x;
    214 
    215             if (tid >= pointsCount)
    216                 return;
    217 
    218             const unsigned int coord = coordList[tid];
    219             short2 p;
    220             p.x = (coord & 0xFFFF);
    221             p.y = (coord >> 16) & 0xFFFF;
    222 
    223             const float theta = thetaList[tid];
    224             const int n = __float2int_rn(theta * thetaScale);
    225 
    226             const short2* r_row = r_table.ptr(n);
    227             const int r_row_size = r_sizes[n];
    228 
    229             for (int j = 0; j < r_row_size; ++j)
    230             {
    231                 short2 c = saturate_cast<short2>(p - r_row[j]);
    232 
    233                 c.x = __float2int_rn(c.x * idp);
    234                 c.y = __float2int_rn(c.y * idp);
    235 
    236                 if (c.x >= 0 && c.x < hist.cols - 2 && c.y >= 0 && c.y < hist.rows - 2)
    237                     ::atomicAdd(hist.ptr(c.y + 1) + c.x + 1, 1);
    238             }
    239         }
    240 
    241         void Ballard_Pos_calcHist_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
    242                                       PtrStepSz<short2> r_table, const int* r_sizes,
    243                                       PtrStepSzi hist,
    244                                       float dp, int levels)
    245         {
    246             const dim3 block(256);
    247             const dim3 grid(divUp(pointsCount, block.x));
    248 
    249             const float idp = 1.0f / dp;
    250             const float thetaScale = levels / (2.0f * CV_PI_F);
    251 
    252             Ballard_Pos_calcHist<<<grid, block>>>(coordList, thetaList, pointsCount, r_table, r_sizes, hist, idp, thetaScale);
    253             cudaSafeCall( cudaGetLastError() );
    254 
    255             cudaSafeCall( cudaDeviceSynchronize() );
    256         }
    257 
    258         __global__ void Ballard_Pos_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes,
    259                                                   const int maxSize, const float dp, const int threshold)
    260         {
    261             const int x = blockIdx.x * blockDim.x + threadIdx.x;
    262             const int y = blockIdx.y * blockDim.y + threadIdx.y;
    263 
    264             if (x >= hist.cols - 2 || y >= hist.rows - 2)
    265                 return;
    266 
    267             const int curVotes = hist(y + 1, x + 1);
    268 
    269             if (curVotes > threshold &&
    270                 curVotes >  hist(y + 1, x) &&
    271                 curVotes >= hist(y + 1, x + 2) &&
    272                 curVotes >  hist(y, x + 1) &&
    273                 curVotes >= hist(y + 2, x + 1))
    274             {
    275                 const int ind = ::atomicAdd(&g_counter, 1);
    276 
    277                 if (ind < maxSize)
    278                 {
    279                     out[ind] = make_float4(x * dp, y * dp, 1.0f, 0.0f);
    280                     votes[ind] = make_int3(curVotes, 0, 0);
    281                 }
    282             }
    283         }
    284 
    285         int Ballard_Pos_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int maxSize, float dp, int threshold)
    286         {
    287             void* counterPtr;
    288             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
    289 
    290             cudaSafeCall( cudaMemset(counterPtr, 0, sizeof(int)) );
    291 
    292             const dim3 block(32, 8);
    293             const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
    294 
    295             cudaSafeCall( cudaFuncSetCacheConfig(Ballard_Pos_findPosInHist, cudaFuncCachePreferL1) );
    296 
    297             Ballard_Pos_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize, dp, threshold);
    298             cudaSafeCall( cudaGetLastError() );
    299 
    300             cudaSafeCall( cudaDeviceSynchronize() );
    301 
    302             int totalCount;
    303             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
    304 
    305             totalCount = ::min(totalCount, maxSize);
    306 
    307             return totalCount;
    308         }
    309 
    310         ////////////////////////////////////////////////////////////////////////
    311         // Guil_Full
    312 
    313         struct FeatureTable
    314         {
    315             uchar* p1_pos_data;
    316             size_t p1_pos_step;
    317 
    318             uchar* p1_theta_data;
    319             size_t p1_theta_step;
    320 
    321             uchar* p2_pos_data;
    322             size_t p2_pos_step;
    323 
    324             uchar* d12_data;
    325             size_t d12_step;
    326 
    327             uchar* r1_data;
    328             size_t r1_step;
    329 
    330             uchar* r2_data;
    331             size_t r2_step;
    332         };
    333 
    334         __constant__ FeatureTable c_templFeatures;
    335         __constant__ FeatureTable c_imageFeatures;
    336 
    337         void Guil_Full_setTemplFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
    338         {
    339             FeatureTable tbl;
    340 
    341             tbl.p1_pos_data = p1_pos.data;
    342             tbl.p1_pos_step = p1_pos.step;
    343 
    344             tbl.p1_theta_data = p1_theta.data;
    345             tbl.p1_theta_step = p1_theta.step;
    346 
    347             tbl.p2_pos_data = p2_pos.data;
    348             tbl.p2_pos_step = p2_pos.step;
    349 
    350             tbl.d12_data = d12.data;
    351             tbl.d12_step = d12.step;
    352 
    353             tbl.r1_data = r1.data;
    354             tbl.r1_step = r1.step;
    355 
    356             tbl.r2_data = r2.data;
    357             tbl.r2_step = r2.step;
    358 
    359             cudaSafeCall( cudaMemcpyToSymbol(c_templFeatures, &tbl, sizeof(FeatureTable)) );
    360         }
    361         void Guil_Full_setImageFeatures(PtrStepb p1_pos, PtrStepb p1_theta, PtrStepb p2_pos, PtrStepb d12, PtrStepb r1, PtrStepb r2)
    362         {
    363             FeatureTable tbl;
    364 
    365             tbl.p1_pos_data = p1_pos.data;
    366             tbl.p1_pos_step = p1_pos.step;
    367 
    368             tbl.p1_theta_data = p1_theta.data;
    369             tbl.p1_theta_step = p1_theta.step;
    370 
    371             tbl.p2_pos_data = p2_pos.data;
    372             tbl.p2_pos_step = p2_pos.step;
    373 
    374             tbl.d12_data = d12.data;
    375             tbl.d12_step = d12.step;
    376 
    377             tbl.r1_data = r1.data;
    378             tbl.r1_step = r1.step;
    379 
    380             tbl.r2_data = r2.data;
    381             tbl.r2_step = r2.step;
    382 
    383             cudaSafeCall( cudaMemcpyToSymbol(c_imageFeatures, &tbl, sizeof(FeatureTable)) );
    384         }
    385 
    386         struct TemplFeatureTable
    387         {
    388             static __device__ float2* p1_pos(int n)
    389             {
    390                 return (float2*)(c_templFeatures.p1_pos_data + n * c_templFeatures.p1_pos_step);
    391             }
    392             static __device__ float* p1_theta(int n)
    393             {
    394                 return (float*)(c_templFeatures.p1_theta_data + n * c_templFeatures.p1_theta_step);
    395             }
    396             static __device__ float2* p2_pos(int n)
    397             {
    398                 return (float2*)(c_templFeatures.p2_pos_data + n * c_templFeatures.p2_pos_step);
    399             }
    400 
    401             static __device__ float* d12(int n)
    402             {
    403                 return (float*)(c_templFeatures.d12_data + n * c_templFeatures.d12_step);
    404             }
    405 
    406             static __device__ float2* r1(int n)
    407             {
    408                 return (float2*)(c_templFeatures.r1_data + n * c_templFeatures.r1_step);
    409             }
    410             static __device__ float2* r2(int n)
    411             {
    412                 return (float2*)(c_templFeatures.r2_data + n * c_templFeatures.r2_step);
    413             }
    414         };
    415         struct ImageFeatureTable
    416         {
    417             static __device__ float2* p1_pos(int n)
    418             {
    419                 return (float2*)(c_imageFeatures.p1_pos_data + n * c_imageFeatures.p1_pos_step);
    420             }
    421             static __device__ float* p1_theta(int n)
    422             {
    423                 return (float*)(c_imageFeatures.p1_theta_data + n * c_imageFeatures.p1_theta_step);
    424             }
    425             static __device__ float2* p2_pos(int n)
    426             {
    427                 return (float2*)(c_imageFeatures.p2_pos_data + n * c_imageFeatures.p2_pos_step);
    428             }
    429 
    430             static __device__ float* d12(int n)
    431             {
    432                 return (float*)(c_imageFeatures.d12_data + n * c_imageFeatures.d12_step);
    433             }
    434 
    435             static __device__ float2* r1(int n)
    436             {
    437                 return (float2*)(c_imageFeatures.r1_data + n * c_imageFeatures.r1_step);
    438             }
    439             static __device__ float2* r2(int n)
    440             {
    441                 return (float2*)(c_imageFeatures.r2_data + n * c_imageFeatures.r2_step);
    442             }
    443         };
    444 
    445         __device__ float clampAngle(float a)
    446         {
    447             float res = a;
    448 
    449             while (res > 2.0f * CV_PI_F)
    450                 res -= 2.0f * CV_PI_F;
    451             while (res < 0.0f)
    452                 res += 2.0f * CV_PI_F;
    453 
    454             return res;
    455         }
    456 
    457         __device__ bool angleEq(float a, float b, float eps)
    458         {
    459             return (::fabs(clampAngle(a - b)) <= eps);
    460         }
    461 
    462         template <class FT, bool isTempl>
    463         __global__ void Guil_Full_buildFeatureList(const unsigned int* coordList, const float* thetaList, const int pointsCount,
    464                                                    int* sizes, const int maxSize,
    465                                                    const float xi, const float angleEpsilon, const float alphaScale,
    466                                                    const float2 center, const float maxDist)
    467         {
    468             const float p1_theta = thetaList[blockIdx.x];
    469             const unsigned int coord1 = coordList[blockIdx.x];
    470             float2 p1_pos;
    471             p1_pos.x = (coord1 & 0xFFFF);
    472             p1_pos.y = (coord1 >> 16) & 0xFFFF;
    473 
    474             for (int i = threadIdx.x; i < pointsCount; i += blockDim.x)
    475             {
    476                 const float p2_theta = thetaList[i];
    477                 const unsigned int coord2 = coordList[i];
    478                 float2 p2_pos;
    479                 p2_pos.x = (coord2 & 0xFFFF);
    480                 p2_pos.y = (coord2 >> 16) & 0xFFFF;
    481 
    482                 if (angleEq(p1_theta - p2_theta, xi, angleEpsilon))
    483                 {
    484                     const float2 d = p1_pos - p2_pos;
    485 
    486                     float alpha12 = clampAngle(::atan2(d.y, d.x) - p1_theta);
    487                     float d12 = ::sqrtf(d.x * d.x + d.y * d.y);
    488 
    489                     if (d12 > maxDist)
    490                         continue;
    491 
    492                     float2 r1 = p1_pos - center;
    493                     float2 r2 = p2_pos - center;
    494 
    495                     const int n = __float2int_rn(alpha12 * alphaScale);
    496 
    497                     const int ind = ::atomicAdd(sizes + n, 1);
    498 
    499                     if (ind < maxSize)
    500                     {
    501                         if (!isTempl)
    502                         {
    503                             FT::p1_pos(n)[ind] = p1_pos;
    504                             FT::p2_pos(n)[ind] = p2_pos;
    505                         }
    506 
    507                         FT::p1_theta(n)[ind] = p1_theta;
    508 
    509                         FT::d12(n)[ind] = d12;
    510 
    511                         if (isTempl)
    512                         {
    513                             FT::r1(n)[ind] = r1;
    514                             FT::r2(n)[ind] = r2;
    515                         }
    516                     }
    517                 }
    518             }
    519         }
    520 
    521         template <class FT, bool isTempl>
    522         void Guil_Full_buildFeatureList_caller(const unsigned int* coordList, const float* thetaList, int pointsCount,
    523                                                int* sizes, int maxSize,
    524                                                float xi, float angleEpsilon, int levels,
    525                                                float2 center, float maxDist)
    526         {
    527             const dim3 block(256);
    528             const dim3 grid(pointsCount);
    529 
    530             const float alphaScale = levels / (2.0f * CV_PI_F);
    531 
    532             Guil_Full_buildFeatureList<FT, isTempl><<<grid, block>>>(coordList, thetaList, pointsCount,
    533                                                                      sizes, maxSize,
    534                                                                      xi * (CV_PI_F / 180.0f), angleEpsilon * (CV_PI_F / 180.0f), alphaScale,
    535                                                                      center, maxDist);
    536             cudaSafeCall( cudaGetLastError() );
    537 
    538             cudaSafeCall( cudaDeviceSynchronize() );
    539 
    540             thrust::device_ptr<int> sizesPtr(sizes);
    541             thrust::transform(sizesPtr, sizesPtr + levels + 1, sizesPtr, device::bind2nd(device::minimum<int>(), maxSize));
    542         }
    543 
    544         void Guil_Full_buildTemplFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
    545                                                  int* sizes, int maxSize,
    546                                                  float xi, float angleEpsilon, int levels,
    547                                                  float2 center, float maxDist)
    548         {
    549             Guil_Full_buildFeatureList_caller<TemplFeatureTable, true>(coordList, thetaList, pointsCount,
    550                                                                        sizes, maxSize,
    551                                                                        xi, angleEpsilon, levels,
    552                                                                        center, maxDist);
    553         }
    554         void Guil_Full_buildImageFeatureList_gpu(const unsigned int* coordList, const float* thetaList, int pointsCount,
    555                                                  int* sizes, int maxSize,
    556                                                  float xi, float angleEpsilon, int levels,
    557                                                  float2 center, float maxDist)
    558         {
    559             Guil_Full_buildFeatureList_caller<ImageFeatureTable, false>(coordList, thetaList, pointsCount,
    560                                                                         sizes, maxSize,
    561                                                                         xi, angleEpsilon, levels,
    562                                                                         center, maxDist);
    563         }
    564 
    565         __global__ void Guil_Full_calcOHist(const int* templSizes, const int* imageSizes, int* OHist,
    566                                             const float minAngle, const float maxAngle, const float iAngleStep, const int angleRange)
    567         {
    568             extern __shared__ int s_OHist[];
    569             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
    570                 s_OHist[i] = 0;
    571             __syncthreads();
    572 
    573             const int tIdx = blockIdx.x;
    574             const int level = blockIdx.y;
    575 
    576             const int tSize = templSizes[level];
    577 
    578             if (tIdx < tSize)
    579             {
    580                 const int imSize = imageSizes[level];
    581 
    582                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx];
    583 
    584                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
    585                 {
    586                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
    587 
    588                     const float angle = clampAngle(im_p1_theta - t_p1_theta);
    589 
    590                     if (angle >= minAngle && angle <= maxAngle)
    591                     {
    592                         const int n = __float2int_rn((angle - minAngle) * iAngleStep);
    593                         Emulation::smem::atomicAdd(&s_OHist[n], 1);
    594                     }
    595                 }
    596             }
    597             __syncthreads();
    598 
    599             for (int i = threadIdx.x; i <= angleRange; i += blockDim.x)
    600                 ::atomicAdd(OHist + i, s_OHist[i]);
    601         }
    602 
    603         void Guil_Full_calcOHist_gpu(const int* templSizes, const int* imageSizes, int* OHist,
    604                                      float minAngle, float maxAngle, float angleStep, int angleRange,
    605                                      int levels, int tMaxSize)
    606         {
    607             const dim3 block(256);
    608             const dim3 grid(tMaxSize, levels + 1);
    609 
    610             minAngle *= (CV_PI_F / 180.0f);
    611             maxAngle *= (CV_PI_F / 180.0f);
    612             angleStep *= (CV_PI_F / 180.0f);
    613 
    614             const size_t smemSize = (angleRange + 1) * sizeof(float);
    615 
    616             Guil_Full_calcOHist<<<grid, block, smemSize>>>(templSizes, imageSizes, OHist,
    617                                                            minAngle, maxAngle, 1.0f / angleStep, angleRange);
    618             cudaSafeCall( cudaGetLastError() );
    619 
    620             cudaSafeCall( cudaDeviceSynchronize() );
    621         }
    622 
    623         __global__ void Guil_Full_calcSHist(const int* templSizes, const int* imageSizes, int* SHist,
    624                                             const float angle, const float angleEpsilon,
    625                                             const float minScale, const float maxScale, const float iScaleStep, const int scaleRange)
    626         {
    627             extern __shared__ int s_SHist[];
    628             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
    629                 s_SHist[i] = 0;
    630             __syncthreads();
    631 
    632             const int tIdx = blockIdx.x;
    633             const int level = blockIdx.y;
    634 
    635             const int tSize = templSizes[level];
    636 
    637             if (tIdx < tSize)
    638             {
    639                 const int imSize = imageSizes[level];
    640 
    641                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
    642                 const float t_d12 = TemplFeatureTable::d12(level)[tIdx] + angle;
    643 
    644                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
    645                 {
    646                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
    647                     const float im_d12 = ImageFeatureTable::d12(level)[i];
    648 
    649                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
    650                     {
    651                         const float scale = im_d12 / t_d12;
    652 
    653                         if (scale >= minScale && scale <= maxScale)
    654                         {
    655                             const int s = __float2int_rn((scale - minScale) * iScaleStep);
    656                             Emulation::smem::atomicAdd(&s_SHist[s], 1);
    657                         }
    658                     }
    659                 }
    660             }
    661             __syncthreads();
    662 
    663             for (int i = threadIdx.x; i <= scaleRange; i += blockDim.x)
    664                 ::atomicAdd(SHist + i, s_SHist[i]);
    665         }
    666 
    667         void Guil_Full_calcSHist_gpu(const int* templSizes, const int* imageSizes, int* SHist,
    668                                      float angle, float angleEpsilon,
    669                                      float minScale, float maxScale, float iScaleStep, int scaleRange,
    670                                      int levels, int tMaxSize)
    671         {
    672             const dim3 block(256);
    673             const dim3 grid(tMaxSize, levels + 1);
    674 
    675             angle *= (CV_PI_F / 180.0f);
    676             angleEpsilon *= (CV_PI_F / 180.0f);
    677 
    678             const size_t smemSize = (scaleRange + 1) * sizeof(float);
    679 
    680             Guil_Full_calcSHist<<<grid, block, smemSize>>>(templSizes, imageSizes, SHist,
    681                                                            angle, angleEpsilon,
    682                                                            minScale, maxScale, iScaleStep, scaleRange);
    683             cudaSafeCall( cudaGetLastError() );
    684 
    685             cudaSafeCall( cudaDeviceSynchronize() );
    686         }
    687 
    688         __global__ void Guil_Full_calcPHist(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
    689                                             const float angle, const float sinVal, const float cosVal, const float angleEpsilon, const float scale,
    690                                             const float idp)
    691         {
    692             const int tIdx = blockIdx.x;
    693             const int level = blockIdx.y;
    694 
    695             const int tSize = templSizes[level];
    696 
    697             if (tIdx < tSize)
    698             {
    699                 const int imSize = imageSizes[level];
    700 
    701                 const float t_p1_theta = TemplFeatureTable::p1_theta(level)[tIdx] + angle;
    702 
    703                 float2 r1 = TemplFeatureTable::r1(level)[tIdx];
    704                 float2 r2 = TemplFeatureTable::r2(level)[tIdx];
    705 
    706                 r1 = r1 * scale;
    707                 r2 = r2 * scale;
    708 
    709                 r1 = make_float2(cosVal * r1.x - sinVal * r1.y, sinVal * r1.x + cosVal * r1.y);
    710                 r2 = make_float2(cosVal * r2.x - sinVal * r2.y, sinVal * r2.x + cosVal * r2.y);
    711 
    712                 for (int i = threadIdx.x; i < imSize; i += blockDim.x)
    713                 {
    714                     const float im_p1_theta = ImageFeatureTable::p1_theta(level)[i];
    715 
    716                     const float2 im_p1_pos = ImageFeatureTable::p1_pos(level)[i];
    717                     const float2 im_p2_pos = ImageFeatureTable::p2_pos(level)[i];
    718 
    719                     if (angleEq(im_p1_theta, t_p1_theta, angleEpsilon))
    720                     {
    721                         float2 c1, c2;
    722 
    723                         c1 = im_p1_pos - r1;
    724                         c1 = c1 * idp;
    725 
    726                         c2 = im_p2_pos - r2;
    727                         c2 = c2 * idp;
    728 
    729                         if (::fabs(c1.x - c2.x) > 1 || ::fabs(c1.y - c2.y) > 1)
    730                             continue;
    731 
    732                         if (c1.y >= 0 && c1.y < PHist.rows - 2 && c1.x >= 0 && c1.x < PHist.cols - 2)
    733                             ::atomicAdd(PHist.ptr(__float2int_rn(c1.y) + 1) + __float2int_rn(c1.x) + 1, 1);
    734                     }
    735                 }
    736             }
    737         }
    738 
    739         void Guil_Full_calcPHist_gpu(const int* templSizes, const int* imageSizes, PtrStepSzi PHist,
    740                                      float angle, float angleEpsilon, float scale,
    741                                      float dp,
    742                                      int levels, int tMaxSize)
    743         {
    744             const dim3 block(256);
    745             const dim3 grid(tMaxSize, levels + 1);
    746 
    747             angle *= (CV_PI_F / 180.0f);
    748             angleEpsilon *= (CV_PI_F / 180.0f);
    749 
    750             const float sinVal = ::sinf(angle);
    751             const float cosVal = ::cosf(angle);
    752 
    753             cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_calcPHist, cudaFuncCachePreferL1) );
    754 
    755             Guil_Full_calcPHist<<<grid, block>>>(templSizes, imageSizes, PHist,
    756                                                  angle, sinVal, cosVal, angleEpsilon, scale,
    757                                                  1.0f / dp);
    758             cudaSafeCall( cudaGetLastError() );
    759 
    760             cudaSafeCall( cudaDeviceSynchronize() );
    761         }
    762 
    763         __global__ void Guil_Full_findPosInHist(const PtrStepSzi hist, float4* out, int3* votes, const int maxSize,
    764                                                 const float angle, const int angleVotes, const float scale, const int scaleVotes,
    765                                                 const float dp, const int threshold)
    766         {
    767             const int x = blockIdx.x * blockDim.x + threadIdx.x;
    768             const int y = blockIdx.y * blockDim.y + threadIdx.y;
    769 
    770             if (x >= hist.cols - 2 || y >= hist.rows - 2)
    771                 return;
    772 
    773             const int curVotes = hist(y + 1, x + 1);
    774 
    775             if (curVotes > threshold &&
    776                 curVotes >  hist(y + 1, x) &&
    777                 curVotes >= hist(y + 1, x + 2) &&
    778                 curVotes >  hist(y, x + 1) &&
    779                 curVotes >= hist(y + 2, x + 1))
    780             {
    781                 const int ind = ::atomicAdd(&g_counter, 1);
    782 
    783                 if (ind < maxSize)
    784                 {
    785                     out[ind] = make_float4(x * dp, y * dp, scale, angle);
    786                     votes[ind] = make_int3(curVotes, scaleVotes, angleVotes);
    787                 }
    788             }
    789         }
    790 
    791         int Guil_Full_findPosInHist_gpu(PtrStepSzi hist, float4* out, int3* votes, int curSize, int maxSize,
    792                                         float angle, int angleVotes, float scale, int scaleVotes,
    793                                         float dp, int threshold)
    794         {
    795             void* counterPtr;
    796             cudaSafeCall( cudaGetSymbolAddress(&counterPtr, g_counter) );
    797 
    798             cudaSafeCall( cudaMemcpy(counterPtr, &curSize, sizeof(int), cudaMemcpyHostToDevice) );
    799 
    800             const dim3 block(32, 8);
    801             const dim3 grid(divUp(hist.cols - 2, block.x), divUp(hist.rows - 2, block.y));
    802 
    803             cudaSafeCall( cudaFuncSetCacheConfig(Guil_Full_findPosInHist, cudaFuncCachePreferL1) );
    804 
    805             Guil_Full_findPosInHist<<<grid, block>>>(hist, out, votes, maxSize,
    806                                                      angle, angleVotes, scale, scaleVotes,
    807                                                      dp, threshold);
    808             cudaSafeCall( cudaGetLastError() );
    809 
    810             cudaSafeCall( cudaDeviceSynchronize() );
    811 
    812             int totalCount;
    813             cudaSafeCall( cudaMemcpy(&totalCount, counterPtr, sizeof(int), cudaMemcpyDeviceToHost) );
    814 
    815             totalCount = ::min(totalCount, maxSize);
    816 
    817             return totalCount;
    818         }
    819     }
    820 }}}
    821 
    822 #endif // HAVE_OPENCV_CUDAARITHM
    823 
    824 #endif /* CUDA_DISABLER */
    825