/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

////////////////////////////////////////////////////////////////////////////////
//
// NVIDIA CUDA implementation of Viola-Jones Object Detection Framework
//
// The algorithm and code are explained in detail in the upcoming
// GPU Computing Gems chapter:
//
//   Anton Obukhov, "Haar Classifiers for Object Detection with CUDA"
//   PDF URL placeholder
//   email: aobukhov (at) nvidia.com, devsupport (at) nvidia.com
//
// Credits for help with the code to:
// Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov.
//
////////////////////////////////////////////////////////////////////////////////

#include <algorithm>
#include <cstdio>

#include "opencv2/core/cuda/warp.hpp"
#include "opencv2/core/cuda/warp_shuffle.hpp"

#include "opencv2/opencv_modules.hpp"

#ifdef HAVE_OPENCV_OBJDETECT
#  include "opencv2/objdetect.hpp"
#  include "opencv2/objdetect/objdetect_c.h"
#endif

#include "opencv2/cudalegacy/NCV.hpp"
#include "opencv2/cudalegacy/NPP_staging.hpp"
#include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp"

#include "NCVRuntimeTemplates.hpp"
#include "NCVAlg.hpp"


//==============================================================================
//
// BlockScan file
//
//==============================================================================


NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive


//Almost the same as naive scan1Inclusive, but doesn't need __syncthreads()
//assuming size <= WARP_SIZE and size is a power of 2
__device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
#if __CUDA_ARCH__ >= 300
    const unsigned int laneId = cv::cuda::device::Warp::laneId();

    // scan using shuffle functions
    #pragma unroll
    for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2)
    {
        const Ncv32u n = cv::cuda::device::shfl_up(idata, i);
        if (laneId >= i)
            idata += n;
    }

    return idata;
#else
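    //Pre-Kepler fallback: Kogge-Stone scan in shared memory. Each warp owns a
    //2*K_WARP_SIZE segment (pos = 64*warpId + laneId); the lower half is zeroed
    //so the unguarded reads at pos-1 .. pos-16 below never leave the segment.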
    Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1));
    s_Data[pos] = 0;
    pos += K_WARP_SIZE;
    s_Data[pos] = idata;

    s_Data[pos] += s_Data[pos - 1];
    s_Data[pos] += s_Data[pos - 2];
    s_Data[pos] += s_Data[pos - 4];
    s_Data[pos] += s_Data[pos - 8];
    s_Data[pos] += s_Data[pos - 16];

    return s_Data[pos];
#endif
}

__device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    return warpScanInclusive(idata, s_Data) - idata;
}

template <Ncv32u tiNumScanThreads>
__device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data)
{
    if (tiNumScanThreads > K_WARP_SIZE)
    {
        //Bottom-level inclusive warp scan
        Ncv32u warpResult = warpScanInclusive(idata, s_Data);

        //Save top elements of each warp for exclusive warp scan
        //sync to wait for warp scans to complete (because s_Data is being overwritten)
        __syncthreads();
        if ((threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1))
        {
            s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult;
        }

        //wait for warp scans to complete
        __syncthreads();

        if (threadIdx.x < (tiNumScanThreads / K_WARP_SIZE))
        {
            //grab top warp elements
            Ncv32u val = s_Data[threadIdx.x];
            //calculate exclusive scan and write back to shared memory
            s_Data[threadIdx.x] = warpScanExclusive(val, s_Data);
        }

        //return updated warp scans with exclusive scan results
        __syncthreads();
        return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE];
    }
    else
    {
        return warpScanInclusive(idata, s_Data);
    }
}


//==============================================================================
//
// HaarClassifierCascade file
//
//==============================================================================


const Ncv32u MAX_GRID_DIM = 65535;


const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64;


#define NUM_THREADS_CLASSIFIERPARALLEL_LOG2     6
#define NUM_THREADS_CLASSIFIERPARALLEL          (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2)


/** \internal
* Haar features solid array.
*/
texture<uint2, 1, cudaReadModeElementType> texHaarFeatures;


/** \internal
* Haar classifiers flattened trees container.
* Two parts: the first contains the root nodes, the second contains the nodes referred to by the root nodes.
* Drawback: breaks tree locality (might cause more cache misses).
* Advantage: no need to introduce an additional 32-bit field to index root node offsets.
*/
texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes;


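/** \internal
* Integral image cache-through texture (1D fetches of the 32-bit integral sums).
*/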
texture<Ncv32u, 1, cudaReadModeElementType> texIImage;


__device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages)
{
    return d_Stages[iStage];
}


template <NcvBool tbCacheTextureCascade>
__device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes)
{
    HaarClassifierNode128 tmpNode;
    if (tbCacheTextureCascade)
    {
        tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode);
    }
    else
    {
        tmpNode = d_ClassifierNodes[iNode];
    }
    return tmpNode;
}


template <NcvBool tbCacheTextureCascade>
__device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features,
                           Ncv32f *weight,
                           Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight)
{
    HaarFeature64 feature;
    if (tbCacheTextureCascade)
    {
        feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature);
    }
    else
    {
        feature = d_Features[iFeature];
    }
    feature.getRect(rectX, rectY, rectWidth, rectHeight);
    *weight = feature.getWeight();
}


template <NcvBool tbCacheTextureIImg>
__device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg)
{
    if (tbCacheTextureIImg)
    {
        return tex1Dfetch(texIImage, x);
    }
    else
    {
        return d_IImg[x];
    }
}


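//Global cursor into the compacted output mask vector; each compacting block
//reserves its output slots by advancing this counter with atomicAdd.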
__device__ Ncv32u d_outMaskPosition;


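//Compacts the elements of the threads whose pass flag is set into a dense,
//globally contiguous region of vectorOut. Worked example for 4 threads with
//threadPassFlag = [1,0,1,1]: the inclusive scan yields [1,1,2,3], the
//exclusive scan (incScan - flag) yields write slots [0,-,1,2], so the block
//packs the passing elements densely in shmem and then copies them out at the
//offset reserved via atomicAdd on d_outMaskPosition.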
__device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut)
{
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110

    __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2];
    __shared__ Ncv32u numPassed;
    __shared__ Ncv32u outMaskOffset;

    Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem);
    __syncthreads();

    if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1)
    {
        numPassed = incScan;
        outMaskOffset = atomicAdd(&d_outMaskPosition, incScan);
    }

    if (threadPassFlag)
    {
        Ncv32u excScan = incScan - threadPassFlag;
        shmem[excScan] = threadElem;
    }

    __syncthreads();

    if (threadIdx.x < numPassed)
    {
        vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x];
    }
#endif
}


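//Anchor-parallel cascade kernel: each thread evaluates the stage range
//[startStageInc, endStageExc) for a single anchor position, encoded in the
//mask elements as (y << 16) | x. The five template booleans select mask
//initialization, texture caching of the integral image and of the cascade,
//vector-vs-2D mask addressing, and atomic compaction of the output mask.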
template <NcvBool tbInitMaskPositively,
          NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbReadPixelIndexFromVector,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u y_offs;
    Ncv32u x_offs;
    Ncv32u maskOffset;
    Ncv32u outMaskVal;

    NcvBool bInactiveThread = false;

    if (tbReadPixelIndexFromVector)
    {
        maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (maskOffset >= mask1Dlen)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            outMaskVal = d_inMask[maskOffset];
            y_offs = outMaskVal >> 16;
            x_offs = outMaskVal & 0xFFFF;
        }
    }
    else
    {
        y_offs = blockIdx.y;
        x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;

        if (x_offs >= mask2Dstride)
        {
            if (tbDoAtomicCompaction) bInactiveThread = true; else return;
        }

        if (!tbDoAtomicCompaction || (tbDoAtomicCompaction && !bInactiveThread))
        {
            maskOffset = y_offs * mask2Dstride + x_offs;

            if ((x_offs >= anchorsRoi.width) ||
                (!tbInitMaskPositively &&
                 d_inMask != d_outMask &&
                 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U))
            {
                if (tbDoAtomicCompaction)
                {
                    bInactiveThread = true;
                }
                else
                {
                    d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U;
                    return;
                }
            }

            outMaskVal = (y_offs << 16) | x_offs;
        }
    }

    NcvBool bPass = true;

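    //the condition below is always true: every thread, including inactive
    //ones, must run the stage loop so the __syncthreads() calls inside stay
    //uniform across the block; bInactiveThread guards the actual work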
    if (!tbDoAtomicCompaction || tbDoAtomicCompaction)
    {
        Ncv32f pixelStdDev = 0.0f;

        if (!bInactiveThread)
            pixelStdDev = d_weights[y_offs * weightsStride + x_offs];

        for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
        {
            Ncv32f curStageSum = 0.0f;

            HaarStage64 curStage = getStage(iStage, d_Stages);
            Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes();
            Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset();
            Ncv32f stageThreshold = curStage.getStageThreshold();

            while (numRootNodesInStage--)
            {
                NcvBool bMoreNodesToTraverse = true;
                Ncv32u iNode = curRootNodeOffset;

                if (bPass && !bInactiveThread)
                {
                    while (bMoreNodesToTraverse)
                    {
                        HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                        HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                        Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                        Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                        Ncv32f curNodeVal = 0.0f;

                        for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                        {
                            Ncv32f rectWeight;
                            Ncv32u rectX, rectY, rectWidth, rectHeight;
                            getFeature<tbCacheTextureCascade>
                                (iFeature + iRect, d_Features,
                                &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                            Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                            Ncv32u iioffsTR = iioffsTL + rectWidth;
                            Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                            Ncv32u iioffsBR = iioffsBL + rectWidth;

                            Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                             getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                             getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                            curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                            curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                        }

                        HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                        HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                        Ncv32f nodeThreshold = curNode.getThreshold();

                        HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                        NcvBool nextNodeIsLeaf;

                        if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                        {
                            nextNodeDescriptor = nodeLeft;
                            nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                        }
                        else
                        {
                            nextNodeDescriptor = nodeRight;
                            nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                        }

                        if (nextNodeIsLeaf)
                        {
                            Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                            curStageSum += tmpLeafValue;
                            bMoreNodesToTraverse = false;
                        }
                        else
                        {
                            iNode = nextNodeDescriptor.getNextNodeOffset();
                        }
                    }
                }

                __syncthreads();
                curRootNodeOffset++;
            }

            if (curStageSum < stageThreshold)
            {
                bPass = false;
                outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    __syncthreads();

    if (!tbDoAtomicCompaction)
    {
        if (!tbReadPixelIndexFromVector ||
            (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask)))
        {
            d_outMask[maskOffset] = outMaskVal;
        }
    }
    else
    {
        compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread,
                                           outMaskVal,
                                           d_outMask);
    }
}


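//Classifier-parallel cascade kernel: one block per surviving anchor read from
//the input mask vector. The NUM_THREADS_CLASSIFIERPARALLEL threads of a block
//evaluate the classifiers of each stage cooperatively, and the per-thread
//partial sums are combined with subReduce before the stage threshold test.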
template <NcvBool tbCacheTextureIImg,
          NcvBool tbCacheTextureCascade,
          NcvBool tbDoAtomicCompaction>
__global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea)
{
    Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x;

    if (maskOffset >= mask1Dlen)
    {
        return;
    }

    Ncv32u outMaskVal = d_inMask[maskOffset];
    Ncv32u y_offs = outMaskVal >> 16;
    Ncv32u x_offs = outMaskVal & 0xFFFF;

    Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs];
    NcvBool bPass = true;

    for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++)
    {
        //this variable is subject to reduction
        Ncv32f curStageSum = 0.0f;

        HaarStage64 curStage = getStage(iStage, d_Stages);
        Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes();
        Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x;
        Ncv32f stageThreshold = curStage.getStageThreshold();

        Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2;

        for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++)
        {
            NcvBool bMoreNodesToTraverse = true;

            if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage)
            {
                Ncv32u iNode = curRootNodeOffset;

                while (bMoreNodesToTraverse)
                {
                    HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes);
                    HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc();
                    Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures();
                    Ncv32u iFeature = featuresDesc.getFeaturesOffset();

                    Ncv32f curNodeVal = 0.0f;
                    //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce
                    for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
                    {
                        Ncv32f rectWeight;
                        Ncv32u rectX, rectY, rectWidth, rectHeight;
                        getFeature<tbCacheTextureCascade>
                            (iFeature + iRect, d_Features,
                            &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight);

                        Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX);
                        Ncv32u iioffsTR = iioffsTL + rectWidth;
                        Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride;
                        Ncv32u iioffsBR = iioffsBL + rectWidth;

                        Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) +
                                         getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) -
                                         getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg);

#if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY
                        curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight);
#else
                        curNodeVal += (Ncv32f)rectSum * rectWeight;
#endif
                    }

                    HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
                    HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
                    Ncv32f nodeThreshold = curNode.getThreshold();

                    HaarClassifierNodeDescriptor32 nextNodeDescriptor;
                    NcvBool nextNodeIsLeaf;

                    if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold)
                    {
                        nextNodeDescriptor = nodeLeft;
                        nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf();
                    }
                    else
                    {
                        nextNodeDescriptor = nodeRight;
                        nextNodeIsLeaf = featuresDesc.isRightNodeLeaf();
                    }

                    if (nextNodeIsLeaf)
                    {
                        Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue();
                        curStageSum += tmpLeafValue;
                        bMoreNodesToTraverse = false;
                    }
                    else
                    {
                        iNode = nextNodeDescriptor.getNextNodeOffset();
                    }
                }
            }
            __syncthreads();

            curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL;
        }

        Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum);

        if (finalStageSum < stageThreshold)
        {
            bPass = false;
            outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U;
            break;
        }
    }

    if (!tbDoAtomicCompaction)
    {
        if (!bPass || d_inMask != d_outMask)
        {
            if (!threadIdx.x)
            {
                d_outMask[maskOffset] = outMaskVal;
            }
        }
    }
    else
    {
#if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110
        if (bPass && !threadIdx.x)
        {
            Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1);
            d_outMask[outMaskOffset] = outMaskVal;
        }
#endif
    }
}


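//Builds the initial anchor mask vector: every step-th anchor inside anchorsRoi
//is encoded as (y << 16) | x, all other positions are marked invalid.
//Optionally filters by an input mask and optionally compacts the survivors.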
template <NcvBool tbMaskByInmask,
          NcvBool tbDoAtomicCompaction>
__global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    Ncv32u y_offs = blockIdx.y;
    Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x;
    Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs;

    Ncv32u y_offs_upsc = step * y_offs;
    Ncv32u x_offs_upsc = step * x_offs;
    Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc;

    Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U;

    if (x_offs_upsc < anchorsRoi.width &&
        (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U))
    {
        outElem = (y_offs_upsc << 16) | x_offs_upsc;
    }

    if (!tbDoAtomicCompaction)
    {
        d_outMask[outMaskOffset] = outElem;
    }
    else
    {
        compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U,
                                           outElem,
                                           d_outMask);
    }
}


struct applyHaarClassifierAnchorParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members.
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                             Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                             Ncv32f *_d_weights, Ncv32u _weightsStride,
                                             HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                             Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                             Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                             NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                             Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierAnchorParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value,
            Loki::TL::TypeAt<TList, 3>::Result::value,
            Loki::TL::TypeAt<TList, 4>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively,
                                                  NcvBool tbCacheTextureIImg,
                                                  NcvBool tbCacheTextureCascade,
                                                  NcvBool tbReadPixelIndexFromVector,
                                                  NcvBool tbDoAtomicCompaction,

                                                  dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                  Ncv32u *d_IImg, Ncv32u IImgStride,
                                                  Ncv32f *d_weights, Ncv32u weightsStride,
                                                  HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                  Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                  Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                  NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                  Ncv32u endStageExc, Ncv32f scaleArea)
{

    applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream,
                                                     d_IImg, IImgStride,
                                                     d_weights, weightsStride,
                                                     d_Features, d_ClassifierNodes, d_Stages,
                                                     d_inMask, d_outMask,
                                                     mask1Dlen, mask2Dstride,
                                                     anchorsRoi, startStageInc,
                                                     endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor>
        ::call( &functor,
                tbInitMaskPositively,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbReadPixelIndexFromVector,
                tbDoAtomicCompaction);
}
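
//A minimal sketch (illustration only, not part of the build) of what the
//KernelCaller machinery expands to: each run-time flag is peeled off into a
//compile-time branch until a fully specialized kernel instantiation remains.
//With two flags, the generated dispatch is equivalent in effect to:
//
//  if (b0) { b1 ? launch<true,  true>() : launch<true,  false>(); }
//  else    { b1 ? launch<false, true>() : launch<false, false>(); }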


struct applyHaarClassifierClassifierParallelFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members.
    Ncv32u *d_IImg;
    Ncv32u IImgStride;
    Ncv32f *d_weights;
    Ncv32u weightsStride;
    HaarFeature64 *d_Features;
    HaarClassifierNode128 *d_ClassifierNodes;
    HaarStage64 *d_Stages;
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u startStageInc;
    Ncv32u endStageExc;
    Ncv32f scaleArea;

    //Arguments are passed through the constructor
    applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                                 Ncv32u *_d_IImg, Ncv32u _IImgStride,
                                                 Ncv32f *_d_weights, Ncv32u _weightsStride,
                                                 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages,
                                                 Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                                 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                                 NcvSize32u _anchorsRoi, Ncv32u _startStageInc,
                                                 Ncv32u _endStageExc, Ncv32f _scaleArea) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_IImg(_d_IImg),
    IImgStride(_IImgStride),
    d_weights(_d_weights),
    weightsStride(_weightsStride),
    d_Features(_d_Features),
    d_ClassifierNodes(_d_ClassifierNodes),
    d_Stages(_d_Stages),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    startStageInc(_startStageInc),
    endStageExc(_endStageExc),
    scaleArea(_scaleArea)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        applyHaarClassifierClassifierParallel <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value,
            Loki::TL::TypeAt<TList, 2>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_IImg, IImgStride,
            d_weights, weightsStride,
            d_Features, d_ClassifierNodes, d_Stages,
            d_inMask, d_outMask,
            mask1Dlen, mask2Dstride,
            anchorsRoi, startStageInc,
            endStageExc, scaleArea);
    }
};


void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg,
                                                      NcvBool tbCacheTextureCascade,
                                                      NcvBool tbDoAtomicCompaction,

                                                      dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                                      Ncv32u *d_IImg, Ncv32u IImgStride,
                                                      Ncv32f *d_weights, Ncv32u weightsStride,
                                                      HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages,
                                                      Ncv32u *d_inMask, Ncv32u *d_outMask,
                                                      Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                                      NcvSize32u anchorsRoi, Ncv32u startStageInc,
                                                      Ncv32u endStageExc, Ncv32f scaleArea)
{
    applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream,
                                                         d_IImg, IImgStride,
                                                         d_weights, weightsStride,
                                                         d_Features, d_ClassifierNodes, d_Stages,
                                                         d_inMask, d_outMask,
                                                         mask1Dlen, mask2Dstride,
                                                         anchorsRoi, startStageInc,
                                                         endStageExc, scaleArea);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor>
        ::call( &functor,
                tbCacheTextureIImg,
                tbCacheTextureCascade,
                tbDoAtomicCompaction);
}


struct initializeMaskVectorFunctor
{
    dim3 gridConf, blockConf;
    cudaStream_t cuStream;

    //Kernel arguments are stored as members.
    Ncv32u *d_inMask;
    Ncv32u *d_outMask;
    Ncv32u mask1Dlen;
    Ncv32u mask2Dstride;
    NcvSize32u anchorsRoi;
    Ncv32u step;

    //Arguments are passed through the constructor
    initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream,
                                Ncv32u *_d_inMask, Ncv32u *_d_outMask,
                                Ncv32u _mask1Dlen, Ncv32u _mask2Dstride,
                                NcvSize32u _anchorsRoi, Ncv32u _step) :
    gridConf(_gridConf),
    blockConf(_blockConf),
    cuStream(_cuStream),
    d_inMask(_d_inMask),
    d_outMask(_d_outMask),
    mask1Dlen(_mask1Dlen),
    mask2Dstride(_mask2Dstride),
    anchorsRoi(_anchorsRoi),
    step(_step)
    {}

    template<class TList>
    void call(TList tl)
    {
        (void)tl;
        initializeMaskVector <
            Loki::TL::TypeAt<TList, 0>::Result::value,
            Loki::TL::TypeAt<TList, 1>::Result::value >
            <<<gridConf, blockConf, 0, cuStream>>>
            (d_inMask, d_outMask,
             mask1Dlen, mask2Dstride,
             anchorsRoi, step);
    }
};


void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask,
                                     NcvBool tbDoAtomicCompaction,

                                     dim3 gridConf, dim3 blockConf, cudaStream_t cuStream,

                                     Ncv32u *d_inMask, Ncv32u *d_outMask,
                                     Ncv32u mask1Dlen, Ncv32u mask2Dstride,
                                     NcvSize32u anchorsRoi, Ncv32u step)
{
    initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream,
                                        d_inMask, d_outMask,
                                        mask1Dlen, mask2Dstride,
                                        anchorsRoi, step);

    //Second parameter is the number of "dynamic" template parameters
    NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor>
        ::call( &functor,
                tbMaskByInmask,
                tbDoAtomicCompaction);
}


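//Returns the index of the first stage containing at least N classifier root
//nodes (haar.NumStages if there is none); used to pick the stage at which
//processing switches from the anchor-parallel to the classifier-parallel kernel.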
Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar,
                                              NCVVector<HaarStage64> &h_HaarStages)
{
    Ncv32u i = 0;
    for (; i < haar.NumStages; i++)
    {
        if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N)
        {
            break;
        }
    }
    return i;
}


NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral,
                                               NCVMatrix<Ncv32f> &d_weights,
                                               NCVMatrixAlloc<Ncv32u> &d_pixelMask,
                                               Ncv32u &numDetections,
                                               HaarClassifierCascadeDescriptor &haar,
                                               NCVVector<HaarStage64> &h_HaarStages,
                                               NCVVector<HaarStage64> &d_HaarStages,
                                               NCVVector<HaarClassifierNode128> &d_HaarNodes,
                                               NCVVector<HaarFeature64> &d_HaarFeatures,
                                               NcvBool bMaskElements,
                                               NcvSize32u anchorsRoi,
                                               Ncv32u pixelStep,
                                               Ncv32f scaleArea,
                                               INCVMemAllocator &gpuAllocator,
                                               INCVMemAllocator &cpuAllocator,
                                               cudaDeviceProp &devProp,
                                               cudaStream_t cuStream)
{
    ncvAssertReturn(integral.memType() == d_weights.memType() &&
                    integral.memType() == d_pixelMask.memType() &&
                    integral.memType() == gpuAllocator.memType() &&
                   (integral.memType() == NCVMemoryTypeDevice ||
                    integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
                    d_HaarStages.memType() == d_HaarFeatures.memType() &&
                    (d_HaarStages.memType() == NCVMemoryTypeDevice ||
                     d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);

    ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);

    ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL &&
                     h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
                     d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);

    ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
                    d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height &&
                    d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height &&
                    integral.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
                    integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);

    ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
                    d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
                    d_HaarFeatures.length() >= haar.NumFeatures &&
                    d_HaarStages.length() == h_HaarStages.length() &&
                    haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);

    ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES);

    ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);

    NCV_SET_SKIP_COND(gpuAllocator.isCounting());

#if defined _SELF_TEST_

    NCVStatus ncvStat;

    NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width, integral.height, integral.pitch);
    ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width, d_weights.height, d_weights.pitch);
    ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch);
    ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length);
    ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length);
    ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch);
    ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCV_SKIP_COND_BEGIN

    ncvStat = d_pixelMask.copySolid(h_pixelMask, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = integral.copySolid(h_integralImage, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_weights.copySolid(h_weights, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0);
    ncvAssertReturnNcvStat(ncvStat);
    ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR);

    for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++)
    {
        for (Ncv32u j=0; j<d_pixelMask.stride(); j++)
        {
            if ((i % pixelStep == 0) && (j % pixelStep == 0) && (j < (Ncv32u)anchorsRoi.width))
            {
                if (!bMaskElements || h_pixelMask.ptr[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U)
                {
                    h_pixelMask.ptr[i*d_pixelMask.stride()+j] = (i << 16) | j;
                }
            }
            else
            {
                h_pixelMask.ptr[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U;
            }
        }
    }

    NCV_SKIP_COND_END

#endif

    NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride());
    ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE);

    NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length()));
    ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);

    NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2);
    ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
    Ncv32u *hp_zero = &hp_pool32u.ptr()[0];
    Ncv32u *hp_numDet = &hp_pool32u.ptr()[1];

    NCV_SKIP_COND_BEGIN
    *hp_zero = 0;
    *hp_numDet = 0;
    NCV_SKIP_COND_END

    Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
                                          (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));

    NcvBool bTexCacheCascade = devProp.major < 2;
    NcvBool bTexCacheIImg = true; //this works better even on Fermi so far
    NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3);

    NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask;
    NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp;

    Ncv32u szNppCompactTmpBuf;
    nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp);
    if (bDoAtomicCompaction)
    {
        szNppCompactTmpBuf = 0;
    }
    NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf);

    NCV_SKIP_COND_BEGIN

    if (bTexCacheIImg)
    {
        cudaChannelFormatDesc cfdTexIImage;
        cfdTexIImage = cudaCreateChannelDesc<Ncv32u>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage,
            (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    if (bTexCacheCascade)
    {
        cudaChannelFormatDesc cfdTexHaarFeatures;
        cudaChannelFormatDesc cfdTexHaarClassifierNodes;
        cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>();
        cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>();

        size_t alignmentOffset;
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures,
            d_HaarFeatures.ptr(), cfdTexHaarFeatures, sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
        ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes,
            d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR);
        ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR);
    }

    Ncv32u stageStartAnchorParallel = 0;
    Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL,
        haar, h_HaarStages);
    Ncv32u stageEndClassifierParallel = haar.NumStages;
    if (stageMiddleSwitch == 0)
    {
        stageMiddleSwitch = 1;
    }

    //create stages subdivision for pixel-parallel processing
    const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1;
    Ncv32u curStop = stageStartAnchorParallel;
    std::vector<Ncv32u> pixParallelStageStops;
    while (curStop < stageMiddleSwitch)
    {
        pixParallelStageStops.push_back(curStop);
        curStop += compactEveryNstage;
    }
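    //if the chunking overshot stageMiddleSwitch by more than half a chunk,
    //pull the last stop back so the final pixel-parallel ranges are balanced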
   1149     if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2)
   1150     {
   1151         pixParallelStageStops[pixParallelStageStops.size()-1] =
   1152             (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2;
   1153     }
   1154     pixParallelStageStops.push_back(stageMiddleSwitch);
   1155     Ncv32u pixParallelStageStopsIndex = 0;
   1156 
   1157     if (pixelStep != 1 || bMaskElements)
   1158     {
   1159         if (bDoAtomicCompaction)
   1160         {
   1161             ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
   1162                                                         0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
   1163             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1164         }
   1165 
   1166         dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
   1167                         (anchorsRoi.height + pixelStep - 1) / pixelStep);
   1168         dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL);
   1169 
   1170         if (gridInit.x == 0 || gridInit.y == 0)
   1171         {
   1172             numDetections = 0;
   1173             return NCV_SUCCESS;
   1174         }
   1175 
   1176         initializeMaskVectorDynTemplate(bMaskElements,
   1177                                         bDoAtomicCompaction,
   1178                                         gridInit, blockInit, cuStream,
   1179                                         d_ptrNowData->ptr(),
   1180                                         d_ptrNowTmp->ptr(),
   1181                                         static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(),
   1182                                         anchorsRoi, pixelStep);
   1183         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
   1184 
   1185         if (bDoAtomicCompaction)
   1186         {
   1187             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1188             ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
   1189                                                           0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
   1190             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1191             swap(d_ptrNowData, d_ptrNowTmp);
   1192         }
   1193         else
   1194         {
   1195             NCVStatus nppSt;
   1196             nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
   1197                                       d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
   1198                                       d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
   1199             ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR);
   1200         }
   1201         numDetections = *hp_numDet;
   1202     }
   1203     else
   1204     {
   1205         //
    1206         // 1. Run the first pixel-parallel classifier pass over the initial few stages
   1207         //
   1208 
   1209         if (bDoAtomicCompaction)
   1210         {
   1211             ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
   1212                                                         0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
   1213             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1214         }
   1215 
   1216         dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL),
   1217                    anchorsRoi.height);
   1218         dim3 block1(NUM_THREADS_ANCHORSPARALLEL);
   1219         applyHaarClassifierAnchorParallelDynTemplate(
   1220             true,                         //tbInitMaskPositively
   1221             bTexCacheIImg,                //tbCacheTextureIImg
   1222             bTexCacheCascade,             //tbCacheTextureCascade
   1223             pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector
   1224             bDoAtomicCompaction,          //tbDoAtomicCompaction
   1225             grid1,
   1226             block1,
   1227             cuStream,
   1228             integral.ptr(), integral.stride(),
   1229             d_weights.ptr(), d_weights.stride(),
   1230             d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
   1231             d_ptrNowData->ptr(),
   1232             bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
   1233             0,
   1234             d_pixelMask.stride(),
   1235             anchorsRoi,
   1236             pixParallelStageStops[pixParallelStageStopsIndex],
   1237             pixParallelStageStops[pixParallelStageStopsIndex+1],
   1238             scaleAreaPixels);
   1239         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
   1240 
   1241         if (bDoAtomicCompaction)
   1242         {
   1243             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1244             ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
   1245                                                           0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
   1246             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1247         }
   1248         else
   1249         {
   1250             NCVStatus nppSt;
   1251             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()),
   1252                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
   1253                                       d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
   1254             ncvAssertReturnNcvStat(nppSt);
   1255         }
   1256 
   1257         swap(d_ptrNowData, d_ptrNowTmp);
   1258         numDetections = *hp_numDet;
   1259 
   1260         pixParallelStageStopsIndex++;
   1261     }
   1262 
   1263     //
   1264     // 2. Run pixel-parallel stages
   1265     //
   1266 
   1267     for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++)
   1268     {
   1269         if (numDetections == 0)
   1270         {
   1271             break;
   1272         }
   1273 
   1274         if (bDoAtomicCompaction)
   1275         {
   1276             ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
   1277                                                         0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
   1278             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1279         }
   1280 
   1281         dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL);
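                 //fold the anchor range into a 2D grid when it exceeds the per-dimension launch limit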
   1282         if (numDetections > MAX_GRID_DIM)
   1283         {
   1284             grid2.x = MAX_GRID_DIM;
   1285             grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
   1286         }
   1287         dim3 block2(NUM_THREADS_ANCHORSPARALLEL);
   1288 
   1289         applyHaarClassifierAnchorParallelDynTemplate(
   1290             false,                        //tbInitMaskPositively
   1291             bTexCacheIImg,                //tbCacheTextureIImg
   1292             bTexCacheCascade,             //tbCacheTextureCascade
   1293             pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector
   1294             bDoAtomicCompaction,          //tbDoAtomicCompaction
   1295             grid2,
   1296             block2,
   1297             cuStream,
   1298             integral.ptr(), integral.stride(),
   1299             d_weights.ptr(), d_weights.stride(),
   1300             d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
   1301             d_ptrNowData->ptr(),
   1302             bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
   1303             numDetections,
   1304             d_pixelMask.stride(),
   1305             anchorsRoi,
   1306             pixParallelStageStops[pixParallelStageStopsIndex],
   1307             pixParallelStageStops[pixParallelStageStopsIndex+1],
   1308             scaleAreaPixels);
   1309         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
   1310 
   1311         if (bDoAtomicCompaction)
   1312         {
   1313             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1314             ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
   1315                                                           0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
   1316             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1317         }
   1318         else
   1319         {
   1320             NCVStatus nppSt;
   1321             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
   1322                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
   1323                                       d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
   1324             ncvAssertReturnNcvStat(nppSt);
   1325         }
   1326 
   1327         swap(d_ptrNowData, d_ptrNowTmp);
   1328         numDetections = *hp_numDet;
   1329     }
   1330 
   1331     //
    1332     // 3. Run all remaining stages in one stage-parallel kernel
   1333     //
   1334 
   1335     if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel)
   1336     {
   1337         if (bDoAtomicCompaction)
   1338         {
   1339             ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u),
   1340                                                         0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR);
   1341             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1342         }
   1343 
   1344         dim3 grid3(numDetections);
   1345         if (numDetections > MAX_GRID_DIM)
   1346         {
   1347             grid3.x = MAX_GRID_DIM;
   1348             grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM;
   1349         }
   1350         dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL);
   1351 
   1352         applyHaarClassifierClassifierParallelDynTemplate(
   1353             bTexCacheIImg,                //tbCacheTextureIImg
   1354             bTexCacheCascade,             //tbCacheTextureCascade
   1355             bDoAtomicCompaction,          //tbDoAtomicCompaction
   1356             grid3,
   1357             block3,
   1358             cuStream,
   1359             integral.ptr(), integral.stride(),
   1360             d_weights.ptr(), d_weights.stride(),
   1361             d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(),
   1362             d_ptrNowData->ptr(),
   1363             bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(),
   1364             numDetections,
   1365             d_pixelMask.stride(),
   1366             anchorsRoi,
   1367             stageMiddleSwitch,
   1368             stageEndClassifierParallel,
   1369             scaleAreaPixels);
   1370         ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
   1371 
   1372         if (bDoAtomicCompaction)
   1373         {
   1374             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1375             ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u),
   1376                                                           0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR);
   1377             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1378         }
   1379         else
   1380         {
   1381             NCVStatus nppSt;
   1382             nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections,
   1383                                       d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U,
   1384                                       d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp);
   1385             ncvAssertReturnNcvStat(nppSt);
   1386         }
   1387 
   1388         swap(d_ptrNowData, d_ptrNowTmp);
   1389         numDetections = *hp_numDet;
   1390     }
   1391 
   1392     if (d_ptrNowData != &d_vecPixelMask)
   1393     {
   1394         d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream);
   1395         ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1396     }
   1397 
   1398 #if defined _SELF_TEST_
   1399 
   1400     ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0);
   1401     ncvAssertReturnNcvStat(ncvStat);
   1402     ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1403 
   1404     if (bDoAtomicCompaction)
   1405     {
   1406         std::sort(h_pixelMask_d.ptr, h_pixelMask_d.ptr + numDetections);
   1407     }
   1408 
   1409     Ncv32u fpu_oldcw, fpu_cw;
   1410     _controlfp_s(&fpu_cw, 0, 0);
   1411     fpu_oldcw = fpu_cw;
   1412     _controlfp_s(&fpu_cw, _PC_24, _MCW_PC);
   1413     Ncv32u numDetGold;
   1414     ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar,
   1415                                                  h_HaarStages, h_HaarNodes, h_HaarFeatures,
   1416                                                  bMaskElements, anchorsRoi, pixelStep, scaleArea);
   1417     ncvAssertReturnNcvStat(ncvStat);
   1418     _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC);
   1419 
   1420     bool bPass = true;
   1421 
   1422     if (numDetGold != numDetections)
   1423     {
    1424         printf("NCVHaarClassifierCascade::applyHaarClassifierCascade: number of detections doesn't match: cpu=%u, gpu=%u\n", numDetGold, numDetections);
   1425         bPass = false;
   1426     }
   1427     else
   1428     {
   1429         for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++)
   1430         {
   1431             if (h_pixelMask.ptr[i] != h_pixelMask_d.ptr[i])
   1432             {
    1433                 printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%u, cpu=%u, gpu=%u\n", i, h_pixelMask.ptr[i], h_pixelMask_d.ptr[i]);
   1434                 bPass = false;
   1435             }
   1436         }
   1437     }
   1438 
   1439     printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED");
   1440 #endif
   1441 
   1442     NCV_SKIP_COND_END
   1443 
   1444     return NCV_SUCCESS;
   1445 }
   1446 
   1447 
   1448 //==============================================================================
   1449 //
   1450 // HypothesesOperations file
   1451 //
   1452 //==============================================================================
   1453 
   1454 
   1455 const Ncv32u NUM_GROW_THREADS = 128;
   1456 
   1457 
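         //pixel mask entries store packed anchor coordinates: the low 16 bits hold x and the
         //high 16 bits hold y (see the ((i << 16) | j) encoding in the host reference below);
         //e.g. pixelToRect(0x0005000A, 20, 20, 2.0f) yields a 40x40 rect at (x=20, y=10)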
   1458 __device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale)
   1459 {
   1460     NcvRect32u res;
   1461     res.x = (Ncv32u)(scale * (pixel & 0xFFFF));
   1462     res.y = (Ncv32u)(scale * (pixel >> 16));
   1463     res.width = (Ncv32u)(scale * width);
   1464     res.height = (Ncv32u)(scale * height);
   1465     return res;
   1466 }
   1467 
   1468 
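         //a flat 1D range of detections is folded into a 2D grid when it exceeds 65535 blocks
         //per dimension (see the launch below); the kernel linearizes blockIdx back accordingly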
   1469 __global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements,
   1470                                      NcvRect32u *hypotheses,
   1471                                      Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale)
   1472 {
   1473     Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x;
   1474     Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x;
   1475     if (elemAddr >= numElements)
   1476     {
   1477         return;
   1478     }
   1479     hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale);
   1480 }
   1481 
   1482 
   1483 NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask,
   1484                                          Ncv32u numPixelMaskDetections,
   1485                                          NCVVector<NcvRect32u> &hypotheses,
   1486                                          Ncv32u &totalDetections,
   1487                                          Ncv32u totalMaxDetections,
   1488                                          Ncv32u rectWidth,
   1489                                          Ncv32u rectHeight,
   1490                                          Ncv32f curScale,
   1491                                          cudaStream_t cuStream)
   1492 {
   1493     ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
   1494 
   1495     ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
   1496                     pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
   1497 
    1498     ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    1499 
    1500     ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    1501 
    1502     ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
    1503                     numPixelMaskDetections <= pixelMask.length() &&
    1504                     totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);
   1505 
   1506     NCVStatus ncvStat = NCV_SUCCESS;
   1507     Ncv32u numDetsToCopy = numPixelMaskDetections;
   1508 
   1509     if (numDetsToCopy == 0)
   1510     {
   1511         return ncvStat;
   1512     }
   1513 
   1514     if (totalDetections + numPixelMaskDetections > totalMaxDetections)
   1515     {
   1516         ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
   1517         numDetsToCopy = totalMaxDetections - totalDetections;
   1518     }
   1519 
   1520     dim3 block(NUM_GROW_THREADS);
   1521     dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS);
   1522     if (grid.x > 65535)
   1523     {
   1524         grid.y = (grid.x + 65534) / 65535;
   1525         grid.x = 65535;
   1526     }
   1527     growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy,
   1528                                                        hypotheses.ptr() + totalDetections,
   1529                                                        rectWidth, rectHeight, curScale);
   1530     ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR);
   1531 
   1532     totalDetections += numDetsToCopy;
   1533     return ncvStat;
   1534 }
   1535 
   1536 
   1537 //==============================================================================
   1538 //
   1539 // Pipeline file
   1540 //
   1541 //==============================================================================
   1542 
   1543 
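         //A minimal usage sketch for the pipeline entry point below; this is an illustration
         //under assumptions, not canonical API usage: it presumes the cascade was already
         //loaded (e.g. via ncvHaarLoadFromFile_host) and uploaded to the device, and that
         //NCVMemStackAllocator is used for the two-pass (counting, then real) invocation
         //pattern; d_src, d_rects, h_stages etc. are placeholder names:
         //
         //    NCVMemStackAllocator gpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
         //    NCVMemStackAllocator cpuCounter(static_cast<Ncv32u>(devProp.textureAlignment));
         //    //counting pass: no kernels run, the allocators only record peak memory usage
         //    ncvStat = ncvDetectObjectsMultiScale_device(d_src, roi, d_rects, numRects,
         //                                                haar, h_stages, d_stages, d_nodes,
         //                                                d_features, minObjSize, 4, 1.2f, 1,
         //                                                NCVPipeObjDet_Default,
         //                                                gpuCounter, cpuCounter, devProp, stream);
         //    //then build real allocators sized by gpuCounter.maxSize() / cpuCounter.maxSize()
         //    //and call the function again to run the actual detection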
   1544 NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg,
   1545                                             NcvSize32u srcRoi,
   1546                                             NCVVector<NcvRect32u> &d_dstRects,
   1547                                             Ncv32u &dstNumRects,
   1548 
   1549                                             HaarClassifierCascadeDescriptor &haar,
   1550                                             NCVVector<HaarStage64> &h_HaarStages,
   1551                                             NCVVector<HaarStage64> &d_HaarStages,
   1552                                             NCVVector<HaarClassifierNode128> &d_HaarNodes,
   1553                                             NCVVector<HaarFeature64> &d_HaarFeatures,
   1554 
   1555                                             NcvSize32u minObjSize,
   1556                                             Ncv32u minNeighbors,      //default 4
   1557                                             Ncv32f scaleStep,         //default 1.2f
   1558                                             Ncv32u pixelStep,         //default 1
   1559                                             Ncv32u flags,             //default NCVPipeObjDet_Default
   1560 
   1561                                             INCVMemAllocator &gpuAllocator,
   1562                                             INCVMemAllocator &cpuAllocator,
   1563                                             cudaDeviceProp &devProp,
   1564                                             cudaStream_t cuStream)
   1565 {
   1566     ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() &&
   1567                     d_srcImg.memType() == gpuAllocator.memType() &&
   1568                      (d_srcImg.memType() == NCVMemoryTypeDevice ||
   1569                       d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
   1570 
   1571     ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() &&
   1572                     d_HaarStages.memType() == d_HaarFeatures.memType() &&
   1573                      (d_HaarStages.memType() == NCVMemoryTypeDevice ||
   1574                       d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR);
   1575 
   1576     ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
   1577 
   1578     ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED);
   1579 
   1580     ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL &&
   1581                      h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL &&
   1582                      d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR);
   1583     ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 &&
   1584                     d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height &&
   1585                     srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height &&
   1586                     d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID);
   1587 
   1588     ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE);
   1589 
   1590     ncvAssertReturn(d_HaarStages.length() >= haar.NumStages &&
   1591                     d_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
   1592                     d_HaarFeatures.length() >= haar.NumFeatures &&
   1593                     d_HaarStages.length() == h_HaarStages.length() &&
   1594                     haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
   1595 
   1596     ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
   1597 
   1598     ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);
   1599 
   1600     //TODO: set NPP active stream to cuStream
   1601 
   1602     NCVStatus ncvStat;
   1603     NCV_SET_SKIP_COND(gpuAllocator.isCounting());
   1604 
   1605     Ncv32u integralWidth = d_srcImg.width() + 1;
   1606     Ncv32u integralHeight = d_srcImg.height() + 1;
   1607 
   1608     NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight);
   1609     ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1610     NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight);
   1611     ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1612 
   1613     NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height());
   1614     ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1615     NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height());
   1616     ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1617 
   1618     NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight);
   1619     ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1620     NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight);
   1621     ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1622 
   1623     NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height());
   1624     ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1625     NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height());
   1626     ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1627 
   1628     NCVStatus nppStat;
   1629     Ncv32u szTmpBufIntegral, szTmpBufSqIntegral;
   1630     nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp);
   1631     ncvAssertReturnNcvStat(nppStat);
   1632     nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp);
   1633     ncvAssertReturnNcvStat(nppStat);
   1634     NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral));
   1635     ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC);
   1636 
   1637     NCV_SKIP_COND_BEGIN
   1638 
   1639     nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
   1640                                        integral.ptr(), integral.pitch(),
   1641                                        NcvSize32u(d_srcImg.width(), d_srcImg.height()),
   1642                                        d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp);
   1643     ncvAssertReturnNcvStat(nppStat);
   1644 
   1645     nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(),
   1646                                           d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
   1647                                           NcvSize32u(d_srcImg.width(), d_srcImg.height()),
   1648                                           d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp);
   1649     ncvAssertReturnNcvStat(nppStat);
   1650 
   1651     NCV_SKIP_COND_END
   1652 
   1653     dstNumRects = 0;
   1654 
   1655     Ncv32u lastCheckedScale = 0;
   1656     NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0);
   1657     std::vector<Ncv32u> scalesVector;
   1658 
   1659     NcvBool bFoundLargestFace = false;
   1660 
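         //enumerate distinct integer downscale factors: with scaleStep = 1.2f the float
         //iterator visits 1.0, 1.2, 1.44, 1.728, 2.07, ..., but only the distinct truncated
         //scales (1, 2, 3, ...) are processed; duplicates are skipped via lastCheckedScale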
   1661     for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep)
   1662     {
   1663         Ncv32u scale = (Ncv32u)scaleIter;
   1664         if (lastCheckedScale == scale)
   1665         {
   1666             continue;
   1667         }
   1668         lastCheckedScale = scale;
   1669 
   1670         if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width ||
   1671             haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height)
   1672         {
   1673             continue;
   1674         }
   1675 
    1676         NcvSize32s srcRoi_, srcIIRoi, scaledIIRoi, searchRoi;
    1677 
    1678         srcRoi_.width = d_srcImg.width();
    1679         srcRoi_.height = d_srcImg.height();
    1680 
    1681         srcIIRoi.width = srcRoi_.width + 1;
    1682         srcIIRoi.height = srcRoi_.height + 1;
    1683 
    1684         scaledIIRoi.width = srcIIRoi.width / scale;
    1685         scaledIIRoi.height = srcIIRoi.height / scale;
   1686 
   1687         searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
   1688         searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;
   1689 
   1690         if (searchRoi.width <= 0 || searchRoi.height <= 0)
   1691         {
   1692             break;
   1693         }
   1694 
   1695         scalesVector.push_back(scale);
   1696 
   1697         if (gpuAllocator.isCounting())
   1698         {
   1699             break;
   1700         }
   1701     }
   1702 
   1703     if (bReverseTraverseScale)
   1704     {
   1705         std::reverse(scalesVector.begin(), scalesVector.end());
   1706     }
   1707 
   1708     //TODO: handle _fair_scale_ flag
   1709     for (Ncv32u i=0; i<scalesVector.size(); i++)
   1710     {
   1711         Ncv32u scale = scalesVector[i];
   1712 
   1713         NcvSize32u srcRoi_, scaledIIRoi, searchRoi;
   1714         NcvSize32u srcIIRoi;
   1715         srcRoi_.width = d_srcImg.width();
   1716         srcRoi_.height = d_srcImg.height();
   1717         srcIIRoi.width = srcRoi_.width + 1;
   1718         srcIIRoi.height = srcRoi_.height + 1;
   1719         scaledIIRoi.width = srcIIRoi.width / scale;
   1720         scaledIIRoi.height = srcIIRoi.height / scale;
   1721         searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width;
   1722         searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height;
   1723 
   1724         NCV_SKIP_COND_BEGIN
   1725 
   1726         nppStat = nppiStDecimate_32u_C1R(
   1727             integral.ptr(), integral.pitch(),
   1728             d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
   1729             srcIIRoi, scale, true);
   1730         ncvAssertReturnNcvStat(nppStat);
   1731 
   1732         nppStat = nppiStDecimate_64u_C1R(
   1733             d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(),
   1734             d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
   1735             srcIIRoi, scale, true);
   1736         ncvAssertReturnNcvStat(nppStat);
   1737 
   1738         const NcvRect32u rect(
   1739             HAAR_STDDEV_BORDER,
   1740             HAAR_STDDEV_BORDER,
   1741             haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER,
   1742             haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER);
   1743         nppStat = nppiStRectStdDev_32f_C1R(
   1744             d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(),
   1745             d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(),
   1746             d_rectStdDev.ptr(), d_rectStdDev.pitch(),
   1747             NcvSize32u(searchRoi.width, searchRoi.height), rect,
   1748             (Ncv32f)scale*scale, true);
   1749         ncvAssertReturnNcvStat(nppStat);
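             //d_rectStdDev now holds the per-anchor window standard deviation; the cascade
             //kernels scale each node threshold by scaleAreaPixels * stddev (see the host
             //reference implementation below) to normalize against local contrast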
   1750 
   1751         NCV_SKIP_COND_END
   1752 
   1753         Ncv32u detectionsOnThisScale;
   1754         ncvStat = ncvApplyHaarClassifierCascade_device(
   1755             d_scaledIntegralImage, d_rectStdDev, d_pixelMask,
   1756             detectionsOnThisScale,
   1757             haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false,
   1758             searchRoi, pixelStep, (Ncv32f)scale*scale,
   1759             gpuAllocator, cpuAllocator, devProp, cuStream);
    1760         ncvAssertReturnNcvStat(ncvStat);
   1761 
   1762         NCV_SKIP_COND_BEGIN
   1763 
   1764         NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment());
   1765         ncvStat = ncvGrowDetectionsVector_device(
   1766             d_vecPixelMask,
   1767             detectionsOnThisScale,
   1768             d_hypothesesIntermediate,
   1769             dstNumRects,
   1770             static_cast<Ncv32u>(d_hypothesesIntermediate.length()),
   1771             haar.ClassifierSize.width,
   1772             haar.ClassifierSize.height,
   1773             (Ncv32f)scale,
   1774             cuStream);
   1775         ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
   1776 
   1777         if (flags & NCVPipeObjDet_FindLargestObject)
   1778         {
    1779             if (dstNumRects == 0)
    1780             {
    1781                 continue;
    1782             }
    1783 
    1784             //dstNumRects != 0 is guaranteed past the continue above
    1785             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
    1786             ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
    1787                                                          dstNumRects * sizeof(NcvRect32u));
    1788             ncvAssertReturnNcvStat(ncvStat);
    1789             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1792 
   1793             Ncv32u numStrongHypothesesNow = dstNumRects;
   1794             ncvStat = ncvGroupRectangles_host(
   1795                 h_hypothesesIntermediate,
   1796                 numStrongHypothesesNow,
   1797                 minNeighbors,
   1798                 RECT_SIMILARITY_PROPORTION,
   1799                 NULL);
   1800             ncvAssertReturnNcvStat(ncvStat);
   1801 
   1802             if (numStrongHypothesesNow > 0)
   1803             {
   1804                 NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0];
   1805                 for (Ncv32u j=1; j<numStrongHypothesesNow; j++)
   1806                 {
   1807                     if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width)
   1808                     {
   1809                         maxRect = h_hypothesesIntermediate.ptr()[j];
   1810                     }
   1811                 }
   1812 
   1813                 h_hypothesesIntermediate.ptr()[0] = maxRect;
   1814                 dstNumRects = 1;
   1815 
   1816                 ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u));
   1817                 ncvAssertReturnNcvStat(ncvStat);
   1818 
   1819                 bFoundLargestFace = true;
   1820 
   1821                 break;
   1822             }
   1823         }
   1824 
   1825         NCV_SKIP_COND_END
   1826 
   1827         if (gpuAllocator.isCounting())
   1828         {
   1829             break;
   1830         }
   1831     }
   1832 
   1833     NCVStatus ncvRetCode = NCV_SUCCESS;
   1834 
   1835     NCV_SKIP_COND_BEGIN
   1836 
   1837     if (flags & NCVPipeObjDet_FindLargestObject)
   1838     {
   1839         if (!bFoundLargestFace)
   1840         {
   1841             dstNumRects = 0;
   1842         }
   1843     }
   1844     else
   1845     {
   1846         //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left)
   1847         if (dstNumRects != 0)
   1848         {
   1849             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1850             ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream,
   1851                                                          dstNumRects * sizeof(NcvRect32u));
   1852             ncvAssertReturnNcvStat(ncvStat);
   1853             ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1854         }
   1855 
   1856         ncvStat = ncvGroupRectangles_host(
   1857             h_hypothesesIntermediate,
   1858             dstNumRects,
   1859             minNeighbors,
   1860             RECT_SIMILARITY_PROPORTION,
   1861             NULL);
   1862         ncvAssertReturnNcvStat(ncvStat);
   1863 
   1864         if (dstNumRects > d_dstRects.length())
   1865         {
   1866             ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
   1867             dstNumRects = static_cast<Ncv32u>(d_dstRects.length());
   1868         }
   1869 
   1870         if (dstNumRects != 0)
   1871         {
   1872             ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream,
   1873                                                          dstNumRects * sizeof(NcvRect32u));
   1874             ncvAssertReturnNcvStat(ncvStat);
   1875         }
   1876     }
   1877 
   1878     if (flags & NCVPipeObjDet_VisualizeInPlace)
   1879     {
   1880         ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR);
   1881         ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(),
   1882                                d_srcImg.width(), d_srcImg.height(),
   1883                                d_dstRects.ptr(), dstNumRects, 255, cuStream);
   1884     }
   1885 
   1886     NCV_SKIP_COND_END
   1887 
   1888     return ncvRetCode;
   1889 }
   1890 
   1891 
   1892 //==============================================================================
   1893 //
   1894 // Purely Host code: classifier IO, mock-ups
   1895 //
   1896 //==============================================================================
   1897 
   1898 
   1899 #ifdef _SELF_TEST_
   1900 #include <float.h>
   1901 #endif
   1902 
   1903 
   1904 NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage,
   1905                                              NCVMatrix<Ncv32f> &h_weights,
   1906                                              NCVMatrixAlloc<Ncv32u> &h_pixelMask,
   1907                                              Ncv32u &numDetections,
   1908                                              HaarClassifierCascadeDescriptor &haar,
   1909                                              NCVVector<HaarStage64> &h_HaarStages,
   1910                                              NCVVector<HaarClassifierNode128> &h_HaarNodes,
   1911                                              NCVVector<HaarFeature64> &h_HaarFeatures,
   1912                                              NcvBool bMaskElements,
   1913                                              NcvSize32u anchorsRoi,
   1914                                              Ncv32u pixelStep,
   1915                                              Ncv32f scaleArea)
   1916 {
   1917     ncvAssertReturn(h_integralImage.memType() == h_weights.memType() &&
   1918                     h_integralImage.memType() == h_pixelMask.memType() &&
   1919                      (h_integralImage.memType() == NCVMemoryTypeHostPageable ||
   1920                       h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
   1921     ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() &&
   1922                     h_HaarStages.memType() == h_HaarFeatures.memType() &&
   1923                      (h_HaarStages.memType() == NCVMemoryTypeHostPageable ||
   1924                       h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR);
   1925     ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL &&
   1926                     h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR);
   1927     ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 &&
   1928                     h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height &&
   1929                     h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height &&
   1930                     h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width &&
   1931                     h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID);
   1932     ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE);
    1933     ncvAssertReturn(h_HaarStages.length() >= haar.NumStages &&
    1934                     h_HaarNodes.length() >= haar.NumClassifierTotalNodes &&
    1935                     h_HaarFeatures.length() >= haar.NumFeatures &&
    1936                     haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID);
   1938     ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES);
   1939     ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP);
   1940 
   1941     Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) *
   1942                                           (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER));
   1943 
   1944     for (Ncv32u i=0; i<anchorsRoi.height; i++)
   1945     {
   1946         for (Ncv32u j=0; j<h_pixelMask.stride(); j++)
   1947         {
   1948             if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width)
   1949             {
   1950                 h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
   1951             }
   1952             else
   1953             {
   1954                 for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++)
   1955                 {
   1956                     Ncv32f curStageSum = 0.0f;
   1957                     Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes();
   1958                     Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset();
   1959 
   1960                     if (iStage == 0)
   1961                     {
   1962                         if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
   1963                         {
   1964                             break;
   1965                         }
   1966                         else
   1967                         {
   1968                             h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j);
   1969                         }
   1970                     }
   1971                     else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U)
   1972                     {
   1973                         break;
   1974                     }
   1975 
   1976                     while (numRootNodesInStage--)
   1977                     {
   1978                         NcvBool bMoreNodesToTraverse = true;
   1979                         Ncv32u curNodeOffset = curRootNodeOffset;
   1980 
   1981                         while (bMoreNodesToTraverse)
   1982                         {
   1983                             HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset];
   1984                             HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc();
   1985                             Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures();
   1986                             Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset();
   1987 
   1988                             Ncv32f curNodeVal = 0.f;
   1989                             for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++)
   1990                             {
   1991                                 HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect];
   1992                                 Ncv32u rectX, rectY, rectWidth, rectHeight;
   1993                                 feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight);
   1994                                 Ncv32f rectWeight = feature.getWeight();
   1995                                 Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX);
   1996                                 Ncv32u iioffsTR = iioffsTL + rectWidth;
   1997                                 Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride();
   1998                                 Ncv32u iioffsBR = iioffsBL + rectWidth;
   1999 
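                                         //rectangle sum via four integral-image corner lookups: TL + BR - TR - BL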
   2000                                 Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL];
   2001                                 Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR];
   2002                                 Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL];
   2003                                 Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR];
   2004                                 Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR;
   2005                                 curNodeVal += (Ncv32f)rectSum * rectWeight;
   2006                             }
   2007 
   2008                             HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc();
   2009                             HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc();
   2010                             Ncv32f nodeThreshold = curNode.getThreshold();
   2011 
   2012                             HaarClassifierNodeDescriptor32 nextNodeDescriptor;
   2013                             NcvBool nextNodeIsLeaf;
   2014 
   2015                             if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold)
   2016                             {
   2017                                 nextNodeDescriptor = nodeLeft;
   2018                                 nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf();
   2019                             }
   2020                             else
   2021                             {
   2022                                 nextNodeDescriptor = nodeRight;
   2023                                 nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf();
   2024                             }
   2025 
   2026                             if (nextNodeIsLeaf)
   2027                             {
   2028                                 Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost();
   2029                                 curStageSum += tmpLeafValue;
   2030                                 bMoreNodesToTraverse = false;
   2031                             }
   2032                             else
   2033                             {
   2034                                 curNodeOffset = nextNodeDescriptor.getNextNodeOffset();
   2035                             }
   2036                         }
   2037 
   2038                         curRootNodeOffset++;
   2039                     }
   2040 
   2041                     Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold();
   2042                     if (curStageSum < tmpStageThreshold)
   2043                     {
   2044                         //drop
   2045                         h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U;
   2046                         break;
   2047                     }
   2048                 }
   2049             }
   2050         }
   2051     }
   2052 
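         //valid packed (y,x) entries sort below OBJDET_MASK_ELEMENT_INVALID_32U, so after
         //sorting the detections form a contiguous prefix whose length is counted here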
   2053     std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride());
   2054     Ncv32u i = 0;
   2055     for (; i<anchorsRoi.height * h_pixelMask.stride(); i++)
   2056     {
   2057         if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U)
   2058         {
   2059             break;
   2060         }
   2061     }
   2062     numDetections = i;
   2063 
   2064     return NCV_SUCCESS;
   2065 }
   2066 
   2067 
   2068 NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask,
   2069                                        Ncv32u numPixelMaskDetections,
   2070                                        NCVVector<NcvRect32u> &hypotheses,
   2071                                        Ncv32u &totalDetections,
   2072                                        Ncv32u totalMaxDetections,
   2073                                        Ncv32u rectWidth,
   2074                                        Ncv32u rectHeight,
   2075                                        Ncv32f curScale)
   2076 {
   2077     ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR);
   2078     ncvAssertReturn(pixelMask.memType() == hypotheses.memType() &&
   2079                     pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR);
    2080     ncvAssertReturn(rectWidth > 0 && rectHeight > 0, NCV_INVALID_ROI);
    2081     ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE);
    2082     ncvAssertReturn(totalMaxDetections <= hypotheses.length() &&
    2083                     numPixelMaskDetections <= pixelMask.length() &&
    2084                     totalDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT);
   2085 
   2086     NCVStatus ncvStat = NCV_SUCCESS;
   2087     Ncv32u numDetsToCopy = numPixelMaskDetections;
   2088 
   2089     if (numDetsToCopy == 0)
   2090     {
   2091         return ncvStat;
   2092     }
   2093 
   2094     if (totalDetections + numPixelMaskDetections > totalMaxDetections)
   2095     {
   2096         ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW;
   2097         numDetsToCopy = totalMaxDetections - totalDetections;
   2098     }
   2099 
   2100     for (Ncv32u i=0; i<numDetsToCopy; i++)
   2101     {
   2102         hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale);
   2103     }
   2104 
   2105     totalDetections += numDetsToCopy;
   2106     return ncvStat;
   2107 }
   2108 
   2109 static NCVStatus loadFromXML(const cv::String &filename,
   2110                       HaarClassifierCascadeDescriptor &haar,
   2111                       std::vector<HaarStage64> &haarStages,
   2112                       std::vector<HaarClassifierNode128> &haarClassifierNodes,
   2113                       std::vector<HaarFeature64> &haarFeatures)
   2114 {
   2115 #ifndef HAVE_OPENCV_OBJDETECT
   2116     (void) filename;
   2117     (void) haar;
   2118     (void) haarStages;
   2119     (void) haarClassifierNodes;
   2120     (void) haarFeatures;
   2121     CV_Error(cv::Error::StsNotImplemented, "This functionality requires objdetect module");
   2122     return NCV_HAAR_XML_LOADING_EXCEPTION;
   2123 #else
   2124     NCVStatus ncvStat;
   2125 
   2126     haar.NumStages = 0;
   2127     haar.NumClassifierRootNodes = 0;
   2128     haar.NumClassifierTotalNodes = 0;
   2129     haar.NumFeatures = 0;
   2130     haar.ClassifierSize.width = 0;
   2131     haar.ClassifierSize.height = 0;
   2132     haar.bHasStumpsOnly = true;
   2133     haar.bNeedsTiltedII = false;
   2134     Ncv32u curMaxTreeDepth = 0;
   2135 
   2136     std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes;
   2137     haarStages.resize(0);
   2138     haarClassifierNodes.resize(0);
   2139     haarFeatures.resize(0);
   2140 
   2141     cv::Ptr<CvHaarClassifierCascade> oldCascade((CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0));
   2142     if (!oldCascade)
   2143     {
   2144         return NCV_HAAR_XML_LOADING_EXCEPTION;
   2145     }
   2146 
   2147     haar.ClassifierSize.width = oldCascade->orig_window_size.width;
   2148     haar.ClassifierSize.height = oldCascade->orig_window_size.height;
   2149 
    2150     int stagesCount = oldCascade->count;
    2151     for(int s = 0; s < stagesCount; ++s) // by stages
   2152     {
   2153         HaarStage64 curStage;
   2154         curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size()));
   2155 
   2156         curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold);
   2157 
   2158         int treesCount = oldCascade->stage_classifier[s].count;
   2159         for(int t = 0; t < treesCount; ++t) // by trees
   2160         {
   2161             Ncv32u nodeId = 0;
   2162             CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t];
   2163 
   2164             int nodesCount = tree->count;
   2165             for(int n = 0; n < nodesCount; ++n)  //by features
   2166             {
   2167                 CvHaarFeature* feature = &tree->haar_feature[n];
   2168 
   2169                 HaarClassifierNode128 curNode;
   2170                 curNode.setThreshold(tree->threshold[n]);
   2171 
   2172                 NcvBool bIsLeftNodeLeaf = false;
   2173                 NcvBool bIsRightNodeLeaf = false;
   2174 
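                         //legacy CvHaarClassifier encoding: left[n]/right[n] <= 0 marks a leaf whose
                         //value is alpha[-left[n]] (resp. alpha[-right[n]]); positive values are
                         //1-based child-node indices within the current tree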
   2175                 HaarClassifierNodeDescriptor32 nodeLeft;
   2176                 if ( tree->left[n] <= 0 )
   2177                 {
   2178                     Ncv32f leftVal = tree->alpha[-tree->left[n]];
   2179                     ncvStat = nodeLeft.create(leftVal);
   2180                     ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
   2181                     bIsLeftNodeLeaf = true;
   2182                 }
   2183                 else
   2184                 {
   2185                     Ncv32u leftNodeOffset = tree->left[n];
   2186                     nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1));
   2187                     haar.bHasStumpsOnly = false;
   2188                 }
   2189                 curNode.setLeftNodeDesc(nodeLeft);
   2190 
   2191                 HaarClassifierNodeDescriptor32 nodeRight;
   2192                 if ( tree->right[n] <= 0 )
   2193                 {
   2194                     Ncv32f rightVal = tree->alpha[-tree->right[n]];
   2195                     ncvStat = nodeRight.create(rightVal);
   2196                     ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat);
   2197                     bIsRightNodeLeaf = true;
   2198                 }
   2199                 else
   2200                 {
   2201                     Ncv32u rightNodeOffset = tree->right[n];
   2202                     nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1));
   2203                     haar.bHasStumpsOnly = false;
   2204                 }
   2205                 curNode.setRightNodeDesc(nodeRight);
   2206 
    2207                 Ncv32u tiltedVal = feature->tilted;
    2208                 haar.bNeedsTiltedII = haar.bNeedsTiltedII || (tiltedVal != 0);  //sticky: any tilted feature flags the cascade
   2209 
   2210                 Ncv32u featureId = 0;
   2211                 for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects
   2212                 {
   2213                     Ncv32u rectX = feature->rect[l].r.x;
   2214                     Ncv32u rectY = feature->rect[l].r.y;
   2215                     Ncv32u rectWidth = feature->rect[l].r.width;
   2216                     Ncv32u rectHeight = feature->rect[l].r.height;
   2217 
   2218                     Ncv32f rectWeight = feature->rect[l].weight;
   2219 
    2220                     if (rectWeight == 0) //unused rect slots in CvHaarFeature carry zero weight
   2221                         break;
   2222 
   2223                     HaarFeature64 curFeature;
   2224                     ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height);
   2225                     curFeature.setWeight(rectWeight);
   2226                     ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
   2227                     haarFeatures.push_back(curFeature);
   2228 
   2229                     featureId++;
   2230                 }
   2231 
   2232                 HaarFeatureDescriptor32 tmpFeatureDesc;
   2233                 ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf,
   2234                     featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId);
   2235                 ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat);
   2236                 curNode.setFeatureDesc(tmpFeatureDesc);
   2237 
   2238                 if (!nodeId)
   2239                 {
   2240                     //root node
   2241                     haarClassifierNodes.push_back(curNode);
   2242                     curMaxTreeDepth = 1;
   2243                 }
   2244                 else
   2245                 {
   2246                     //other node
   2247                     h_TmpClassifierNotRootNodes.push_back(curNode);
   2248                     curMaxTreeDepth++;
   2249                 }
   2250 
   2251                 nodeId++;
   2252             }
   2253         }
   2254 
   2255         curStage.setNumClassifierRootNodes(treesCount);
   2256         haarStages.push_back(curStage);
   2257     }
   2258 
   2259     //fill in cascade stats
   2260     haar.NumStages = static_cast<Ncv32u>(haarStages.size());
   2261     haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size());
   2262     haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size());
   2263     haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size());
   2264 
    2265     //merge root and non-root nodes into one classifier array
   2266     Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size());
   2267     for (Ncv32u i=0; i<haarClassifierNodes.size(); i++)
   2268     {
   2269         HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc();
   2270 
   2271         HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc();
   2272         if (!featureDesc.isLeftNodeLeaf())
   2273         {
   2274             Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
   2275             nodeLeft.create(newOffset);
   2276         }
   2277         haarClassifierNodes[i].setLeftNodeDesc(nodeLeft);
   2278 
   2279         HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc();
   2280         if (!featureDesc.isRightNodeLeaf())
   2281         {
   2282             Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
   2283             nodeRight.create(newOffset);
   2284         }
   2285         haarClassifierNodes[i].setRightNodeDesc(nodeRight);
   2286     }
   2287 
   2288     for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++)
   2289     {
   2290         HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc();
   2291 
   2292         HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc();
   2293         if (!featureDesc.isLeftNodeLeaf())
   2294         {
   2295             Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot;
   2296             nodeLeft.create(newOffset);
   2297         }
   2298         h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft);
   2299 
   2300         HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc();
   2301         if (!featureDesc.isRightNodeLeaf())
   2302         {
   2303             Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot;
   2304             nodeRight.create(newOffset);
   2305         }
   2306         h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight);
   2307 
   2308         haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]);
   2309     }
   2310 
   2311     return NCV_SUCCESS;
   2312 #endif
   2313 }
   2314 
   2315 
   2316 #define NVBIN_HAAR_SIZERESERVED     16
   2317 #define NVBIN_HAAR_VERSION          0x1
   2318 
   2319 
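         //NVBIN on-disk layout (as parsed below): bytes [0,4) file version, [4,8) total file
         //size, reserved up to byte NVBIN_HAAR_SIZERESERVED (16); then NumStages,
         //NumClassifierRootNodes, NumClassifierTotalNodes, NumFeatures (4 x Ncv32u),
         //ClassifierSize (NcvSize32u), bNeedsTiltedII and bHasStumpsOnly (2 x NcvBool),
         //followed by the HaarStage64, HaarClassifierNode128 and HaarFeature64 arrays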
   2320 static NCVStatus loadFromNVBIN(const cv::String &filename,
   2321                                HaarClassifierCascadeDescriptor &haar,
   2322                                std::vector<HaarStage64> &haarStages,
   2323                                std::vector<HaarClassifierNode128> &haarClassifierNodes,
   2324                                std::vector<HaarFeature64> &haarFeatures)
   2325 {
   2326     size_t readCount;
   2327     FILE *fp = fopen(filename.c_str(), "rb");
   2328     ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
   2329     Ncv32u fileVersion;
   2330     readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
   2331     ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
   2332     ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
   2333     Ncv32u fsize;
   2334     readCount = fread(&fsize, sizeof(Ncv32u), 1, fp);
   2335     ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
   2336     fseek(fp, 0, SEEK_END);
    2337     Ncv32u fsizeActual = static_cast<Ncv32u>(ftell(fp));
   2338     ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR);
   2339 
   2340     std::vector<unsigned char> fdata;
   2341     fdata.resize(fsize);
   2342     Ncv32u dataOffset = 0;
   2343     fseek(fp, 0, SEEK_SET);
   2344     readCount = fread(&fdata[0], fsize, 1, fp);
   2345     ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
   2346     fclose(fp);
   2347 
   2348     //data
   2349     dataOffset = NVBIN_HAAR_SIZERESERVED;
   2350     haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset);
   2351     dataOffset += sizeof(Ncv32u);
   2352     haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
   2353     dataOffset += sizeof(Ncv32u);
   2354     haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset);
   2355     dataOffset += sizeof(Ncv32u);
   2356     haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset);
   2357     dataOffset += sizeof(Ncv32u);
   2358     haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset);
   2359     dataOffset += sizeof(NcvSize32u);
   2360     haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset);
   2361     dataOffset += sizeof(NcvBool);
   2362     haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset);
   2363     dataOffset += sizeof(NcvBool);
   2364 
   2365     haarStages.resize(haar.NumStages);
   2366     haarClassifierNodes.resize(haar.NumClassifierTotalNodes);
   2367     haarFeatures.resize(haar.NumFeatures);
   2368 
   2369     Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
   2370     Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
   2371     Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);
   2372 
   2373     memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages);
   2374     dataOffset += szStages;
   2375     memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers);
   2376     dataOffset += szClassifiers;
   2377     memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures);
   2378     dataOffset += szFeatures;
   2379 
   2380     return NCV_SUCCESS;
   2381 }
   2382 
   2383 
NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages,
                                   Ncv32u &numNodes, Ncv32u &numFeatures)
{
    size_t readCount;
    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    if (fext == "nvbin")
    {
        FILE *fp = fopen(filename.c_str(), "rb");
        ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
        Ncv32u fileVersion;
        readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR);
        fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET);
        Ncv32u tmp;
        readCount = fread(&numStages,   sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&tmp,         sizeof(Ncv32u), 1, fp);   //skip NumClassifierRootNodes
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numNodes,    sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp);
        ncvAssertReturn(1 == readCount, NCV_FILE_ERROR);
        fclose(fp);
    }
    else if (fext == "xml")
    {
        HaarClassifierCascadeDescriptor haar;
        std::vector<HaarStage64> haarStages;
        std::vector<HaarClassifierNode128> haarNodes;
        std::vector<HaarFeature64> haarFeatures;

        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);

        numStages = haar.NumStages;
        numNodes = haar.NumClassifierTotalNodes;
        numFeatures = haar.NumFeatures;
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    return NCV_SUCCESS;
}

NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename,
                                   HaarClassifierCascadeDescriptor &haar,
                                   NCVVector<HaarStage64> &h_HaarStages,
                                   NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                   NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    NCVStatus ncvStat;

    cv::String fext = filename.substr(filename.find_last_of(".") + 1);
    fext = fext.toLowerCase();

    std::vector<HaarStage64> haarStages;
    std::vector<HaarClassifierNode128> haarNodes;
    std::vector<HaarFeature64> haarFeatures;

    if (fext == "nvbin")
    {
        ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else if (fext == "xml")
    {
        ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures);
        ncvAssertReturnNcvStat(ncvStat);
    }
    else
    {
        return NCV_HAAR_XML_LOADING_EXCEPTION;
    }

    ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY);
    ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY);

    memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64));
    memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128));
    memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64));

    return NCV_SUCCESS;
}

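//==============================================================================
//
// Typical host-side usage (illustrative sketch; NCVMemNativeAllocator and
// NCVVectorAlloc are assumed to be available from the NCV headers, and the
// alignment value is arbitrary): query the cascade dimensions first, then
// allocate pinned host vectors of at least that capacity before loading.
//
//     Ncv32u numStages, numNodes, numFeatures;
//     ncvAssertReturnNcvStat(ncvHaarGetClassifierSize(filename, numStages,
//                                                     numNodes, numFeatures));
//     NCVMemNativeAllocator alloc(NCVMemoryTypeHostPinned, 4096);
//     NCVVectorAlloc<HaarStage64>           h_stages(alloc, numStages);
//     NCVVectorAlloc<HaarClassifierNode128> h_nodes(alloc, numNodes);
//     NCVVectorAlloc<HaarFeature64>         h_features(alloc, numFeatures);
//     HaarClassifierCascadeDescriptor haar;
//     ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host(filename, haar,
//                                                     h_stages, h_nodes, h_features));
//
//==============================================================================
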
NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename,
                                 HaarClassifierCascadeDescriptor haar,
                                 NCVVector<HaarStage64> &h_HaarStages,
                                 NCVVector<HaarClassifierNode128> &h_HaarNodes,
                                 NCVVector<HaarFeature64> &h_HaarFeatures)
{
    ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT);
    ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarNodes.memType() == NCVMemoryTypeHostPinned &&
                    h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR);

    Ncv32u szStages = haar.NumStages * sizeof(HaarStage64);
    Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128);
    Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64);

    Ncv32u dataOffset = 0;
    std::vector<unsigned char> fdata;
    fdata.resize(szStages+szClassifiers+szFeatures+1024, 0);

    //header
    *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION;

    //data
    dataOffset = NVBIN_HAAR_SIZERESERVED;
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes;
    dataOffset += sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures;
    dataOffset += sizeof(Ncv32u);
    *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize;
    dataOffset += sizeof(NcvSize32u);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII;
    dataOffset += sizeof(NcvBool);
    *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly;
    dataOffset += sizeof(NcvBool);

    memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages);
    dataOffset += szStages;
    memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers);
    dataOffset += szClassifiers;
    memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures);
    dataOffset += szFeatures;
    Ncv32u fsize = dataOffset;

    //TODO: CRC32 here

    //update header with the final file size
    dataOffset = sizeof(Ncv32u);
    *(Ncv32u *)(&fdata[0]+dataOffset) = fsize;

    FILE *fp = fopen(filename.c_str(), "wb");
    ncvAssertReturn(fp != NULL, NCV_FILE_ERROR);
    size_t writeCount = fwrite(&fdata[0], fsize, 1, fp);
    fclose(fp);
    ncvAssertReturn(1 == writeCount, NCV_FILE_ERROR);
    return NCV_SUCCESS;
}

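//==============================================================================
//
// Converting an XML cascade to NVBIN (illustrative sketch, not part of this
// file's API): load the XML once into pinned host vectors, then serialize them
// with ncvHaarStoreNVBIN_host; later runs can load the .nvbin directly and
// skip XML parsing. The h_stages/h_nodes/h_features names below stand for
// hypothetical pinned NCVVector instances sized via ncvHaarGetClassifierSize.
//
//     HaarClassifierCascadeDescriptor haar;
//     ncvAssertReturnNcvStat(ncvHaarLoadFromFile_host("cascade.xml", haar,
//                                                     h_stages, h_nodes, h_features));
//     ncvAssertReturnNcvStat(ncvHaarStoreNVBIN_host("cascade.nvbin", haar,
//                                                   h_stages, h_nodes, h_features));
//
//==============================================================================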