/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "lbp.hpp"
#include "opencv2/core/cuda/vec_traits.hpp"
#include "opencv2/core/cuda/saturate_cast.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace lbp
    {
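        // Note: the functor below evaluates a multi-block LBP feature on an integral image.
        // It reads a 4x4 grid of integral samples (cells of size fw x fh), reduces them to
        // the nine cell sums s0..s8 of a 3x3 block, and compares each of the eight outer
        // cells against the centre cell s4. The comparison bits are split in two: the 3-bit
        // 'response' return value (s0, s1, s2) indexes into the node's subset table, and the
        // 5-bit 'shift' output (s3, s5, s6, s7, s8) selects a bit inside that 32-bit entry.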
        struct LBP
        {
            __host__ __device__ __forceinline__ LBP() {}

            __device__ __forceinline__ int operator() (const int* integral, int ty, int fh, int fw, int& shift) const
            {
                int anchors[9];

                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                ty += fh;
                anchors[3]  = integral[ty];
                anchors[4]  = integral[ty + fw];
                anchors[3] -= anchors[4];
                anchors[5]  = integral[ty + fw * 2];
                anchors[4] -= anchors[5];
                anchors[5] -= integral[ty + fw * 3];

                anchors[0] -= anchors[3];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[5];
                // 0 - 2 contain s0 - s2

                ty += fh;
                anchors[6]  = integral[ty];
                anchors[7]  = integral[ty + fw];
                anchors[6] -= anchors[7];
                anchors[8]  = integral[ty + fw * 2];
                anchors[7] -= anchors[8];
                anchors[8] -= integral[ty + fw * 3];

                anchors[3] -= anchors[6];
                anchors[4] -= anchors[7];
                anchors[5] -= anchors[8];
                // 3 - 5 contain s3 - s5

                anchors[0] -= anchors[4];
                anchors[1] -= anchors[4];
                anchors[2] -= anchors[4];
                anchors[3] -= anchors[4];
                anchors[5] -= anchors[4];

                // a cell's bit is set when its sum is >= the centre sum s4
                int response = (~(anchors[0] >> 31)) & 4;
                response |= (~(anchors[1] >> 31)) & 2;
                response |= (~(anchors[2] >> 31)) & 1;

                shift = (~(anchors[5] >> 31)) & 16;
                shift |= (~(anchors[3] >> 31)) & 1;

                ty += fh;
                anchors[0]  = integral[ty];
                anchors[1]  = integral[ty + fw];
                anchors[0] -= anchors[1];
                anchors[2]  = integral[ty + fw * 2];
                anchors[1] -= anchors[2];
                anchors[2] -= integral[ty + fw * 3];

                anchors[6] -= anchors[0];
                anchors[7] -= anchors[1];
                anchors[8] -= anchors[2];
                // 6 - 8 contain s6 - s8

                anchors[6] -= anchors[4];
                anchors[7] -= anchors[4];
                anchors[8] -= anchors[4];

                shift |= (~(anchors[6] >> 31)) & 2;
                shift |= (~(anchors[7] >> 31)) & 4;
                shift |= (~(anchors[8] >> 31)) & 8;
                return response;
            }
        };

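        // Note: one thread per candidate rectangle. partition() assigns a class label to
        // every candidate using the Pr similarity predicate; the rectangles of each class
        // are summed and averaged in shared memory, and classes with at least
        // groupThreshold members are written out as final objects.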
        template<typename Pr>
        __global__ void disjoin(int4* candidates, int4* objects, unsigned int n, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            unsigned int tid = threadIdx.x;
            extern __shared__ int sbuff[];

            int* labels = sbuff;
            int* rrects = sbuff + n;

            Pr predicate(grouping_eps);
            partition(candidates, n, labels, predicate);

            rrects[tid * 4 + 0] = 0;
            rrects[tid * 4 + 1] = 0;
            rrects[tid * 4 + 2] = 0;
            rrects[tid * 4 + 3] = 0;
            __syncthreads();

            int cls = labels[tid];
            Emulation::smem::atomicAdd((rrects + cls * 4 + 0), candidates[tid].x);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 1), candidates[tid].y);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 2), candidates[tid].z);
            Emulation::smem::atomicAdd((rrects + cls * 4 + 3), candidates[tid].w);

            __syncthreads();
            labels[tid] = 0;

            __syncthreads();
            Emulation::smem::atomicInc((unsigned int*)labels + cls, n);

            __syncthreads();
            *nclasses = 0;

            int active = labels[tid];
            if (active)
            {
                int* r1 = rrects + tid * 4;
                float s = 1.f / active;
                r1[0] = saturate_cast<int>(r1[0] * s);
                r1[1] = saturate_cast<int>(r1[1] * s);
                r1[2] = saturate_cast<int>(r1[2] * s);
                r1[3] = saturate_cast<int>(r1[3] * s);
            }
            __syncthreads();

            if (active && active >= groupThreshold)
            {
                int* r1 = rrects + tid * 4;
                int4 r_out = make_int4(r1[0], r1[1], r1[2], r1[3]);

                int aidx = Emulation::smem::atomicInc(nclasses, n);
                objects[aidx] = r_out;
            }
        }

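        // Note: disjoin is launched as a single block with one thread per candidate, so
        // ncandidates is limited by the device's maximum block size (typically 1024 threads)
        // and by the shared memory available for the labels/rrects buffers.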
        void connectedConmonents(PtrStepSz<int4> candidates, int ncandidates, PtrStepSz<int4> objects, int groupThreshold, float grouping_eps, unsigned int* nclasses)
        {
            if (!ncandidates) return;
            int block = ncandidates;
            int smem  = block * ( sizeof(int) + sizeof(int4) );
            disjoin<InSameComponint><<<1, block, smem>>>(candidates, objects, ncandidates, groupThreshold, grouping_eps, nclasses);
            cudaSafeCall( cudaGetLastError() );
        }

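        // Note: the cascade is a sequence of boosted stages. Each tree in a stage is a
        // single decision stump: the LBP evaluator yields a 3-bit table index (c) and a
        // 5-bit bit position (shift); the selected bit of the node's subset entry picks one
        // of the two leaf values, which is accumulated into the stage sum. A window is
        // rejected as soon as a stage sum falls below the stage threshold.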
        struct Cascade
        {
            __host__ __device__ __forceinline__ Cascade(const Stage* _stages, int _nstages, const ClNode* _nodes, const float* _leaves,
                const int* _subsets, const uchar4* _features, int _subsetSize)

            : stages(_stages), nstages(_nstages), nodes(_nodes), leaves(_leaves), subsets(_subsets), features(_features), subsetSize(_subsetSize){}

            __device__ __forceinline__ bool operator() (int y, int x, int* integral, const int pitch) const
            {
                int current_node = 0;
                int current_leave = 0;

                for (int s = 0; s < nstages; ++s)
                {
                    float sum = 0;
                    Stage stage = stages[s];
                    for (int t = 0; t < stage.ntrees; t++)
                    {
                        ClNode node = nodes[current_node];
                        uchar4 feature = features[node.featureIdx];

                        int shift;
                        int c = evaluator(integral, (y + feature.y) * pitch + x + feature.x, feature.w * pitch, feature.z, shift);
                        int idx = (subsets[current_node * subsetSize + c] & (1 << shift)) ? current_leave : current_leave + 1;
                        sum += leaves[idx];

                        current_node += 1;
                        current_leave += 2;
                    }

                    if (sum < stage.threshold)
                        return false;
                }

                return true;
            }

            const Stage*  stages;
            const int nstages;

            const ClNode* nodes;
            const float* leaves;
            const int* subsets;
            const uchar4* features;

            const int subsetSize;
            const LBP evaluator;
        };

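        // Note: the whole scale pyramid is processed in one launch. Windows of every scale
        // are packed into a single flat index space; each thread maps its global id back to
        // a scale level and then to an (x, y) window position at that scale. wshift
        // accumulates the horizontal offset of the current scale level inside the packed
        // multi-scale integral image (each level occupies scaledWidth + 1 columns).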
        // within a scale level: y = scaleTid / windowsForLine, x = scaleTid - y * windowsForLine
        __global__ void lbp_cascade(const Cascade cascade, int frameW, int frameH, int windowW, int windowH, float scale, const float factor,
            const int total, int* integral, const int pitch, PtrStepSz<int4> objects, unsigned int* classified)
        {
            int ftid = blockIdx.x * blockDim.x + threadIdx.x;
            if (ftid >= total) return;

            int step = (scale <= 2.f);

            int windowsForLine = (__float2int_rn( __fdividef(frameW, scale)) - windowW) >> step;
            int stotal = windowsForLine * ( (__float2int_rn( __fdividef(frameH, scale)) - windowH) >> step);
            int wshift = 0;

            int scaleTid = ftid;

            while (scaleTid >= stotal)
            {
                scaleTid -= stotal;
                wshift += __float2int_rn(__fdividef(frameW, scale)) + 1;
                scale *= factor;
                step = (scale <= 2.f);
                windowsForLine = ( ((__float2int_rn(__fdividef(frameW, scale)) - windowW) >> step));
                stotal = windowsForLine * ( (__float2int_rn(__fdividef(frameH, scale)) - windowH) >> step);
            }

            int y = __fdividef(scaleTid, windowsForLine);
            int x = scaleTid - y * windowsForLine;

            x <<= step;
            y <<= step;

            if (cascade(y, x + wshift, integral, pitch))
            {
                if (x >= __float2int_rn(__fdividef(frameW, scale)) - windowW) return;

                int4 rect;
                rect.x = __float2int_rn(x * scale);
                rect.y = __float2int_rn(y * scale);
                rect.z = __float2int_rn(windowW * scale);
                rect.w = __float2int_rn(windowH * scale);

                int res = atomicInc(classified, (unsigned int)objects.cols);
                objects(0, res) = rect;
            }
        }

        void classifyPyramid(int frameW, int frameH, int windowW, int windowH, float initialScale, float factor, int workAmount,
            const PtrStepSzb& mstages, const int nstages, const PtrStepSzi& mnodes, const PtrStepSzf& mleaves, const PtrStepSzi& msubsets, const PtrStepSzb& mfeatures,
            const int subsetSize, PtrStepSz<int4> objects, unsigned int* classified, PtrStepSzi integral)
        {
            const int block = 128;
            int grid = divUp(workAmount, block);
            cudaFuncSetCacheConfig(lbp_cascade, cudaFuncCachePreferL1);
            Cascade cascade((Stage*)mstages.ptr(), nstages, (ClNode*)mnodes.ptr(), mleaves.ptr(), msubsets.ptr(), (uchar4*)mfeatures.ptr(), subsetSize);
            lbp_cascade<<<grid, block>>>(cascade, frameW, frameH, windowW, windowH, initialScale, factor, workAmount, integral.ptr(), (int)integral.step / sizeof(int), objects, classified);
            cudaSafeCall( cudaGetLastError() );
        }
    }
}}}

#endif /* CUDA_DISABLER */