1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 //////////////////////////////////////////////////////////////////////////////// 44 // 45 // NVIDIA CUDA implementation of Viola-Jones Object Detection Framework 46 // 47 // The algorithm and code are explained in the upcoming GPU Computing Gems 48 // chapter in detail: 49 // 50 // Anton Obukhov, "Haar Classifiers for Object Detection with CUDA" 51 // PDF URL placeholder 52 // email: aobukhov (at) nvidia.com, devsupport (at) nvidia.com 53 // 54 // Credits for help with the code to: 55 // Alexey Mendelenko, Cyril Crassin, and Mikhail Smirnov. 56 // 57 //////////////////////////////////////////////////////////////////////////////// 58 59 #include <algorithm> 60 #include <cstdio> 61 62 #include "opencv2/core/cuda/warp.hpp" 63 #include "opencv2/core/cuda/warp_shuffle.hpp" 64 65 #include "opencv2/opencv_modules.hpp" 66 67 #ifdef HAVE_OPENCV_OBJDETECT 68 # include "opencv2/objdetect.hpp" 69 # include "opencv2/objdetect/objdetect_c.h" 70 #endif 71 72 #include "opencv2/cudalegacy/NCV.hpp" 73 #include "opencv2/cudalegacy/NPP_staging.hpp" 74 #include "opencv2/cudalegacy/NCVHaarObjectDetection.hpp" 75 76 #include "NCVRuntimeTemplates.hpp" 77 #include "NCVAlg.hpp" 78 79 80 //============================================================================== 81 // 82 // BlockScan file 83 // 84 //============================================================================== 85 86 87 NCV_CT_ASSERT(K_WARP_SIZE == 32); //this is required for the manual unroll of the loop in warpScanInclusive 88 89 90 //Almost the same as naive scan1Inclusive, but doesn't need __syncthreads() 91 //assuming size <= WARP_SIZE and size is power of 2 92 __device__ Ncv32u warpScanInclusive(Ncv32u idata, volatile Ncv32u *s_Data) 93 { 94 #if __CUDA_ARCH__ >= 300 95 const unsigned int laneId = cv::cuda::device::Warp::laneId(); 96 97 // scan on shuffl functions 98 #pragma unroll 99 for (int i = 1; i <= (K_WARP_SIZE / 2); i *= 2) 100 { 101 const Ncv32u n = cv::cuda::device::shfl_up(idata, i); 102 if (laneId >= i) 103 idata += n; 104 } 105 106 return idata; 107 #else 108 Ncv32u pos = 2 * threadIdx.x - (threadIdx.x & (K_WARP_SIZE - 1)); 109 s_Data[pos] = 0; 110 pos += K_WARP_SIZE; 111 s_Data[pos] = idata; 112 113 s_Data[pos] += s_Data[pos - 1]; 114 s_Data[pos] += s_Data[pos - 2]; 115 s_Data[pos] += s_Data[pos - 4]; 116 s_Data[pos] += s_Data[pos - 8]; 117 s_Data[pos] += s_Data[pos - 16]; 118 119 return s_Data[pos]; 120 #endif 121 } 122 123 __device__ __forceinline__ Ncv32u warpScanExclusive(Ncv32u idata, volatile Ncv32u *s_Data) 124 { 125 return warpScanInclusive(idata, s_Data) - idata; 126 } 127 128 template <Ncv32u tiNumScanThreads> 129 __device__ Ncv32u scan1Inclusive(Ncv32u idata, volatile Ncv32u *s_Data) 130 { 131 if (tiNumScanThreads > K_WARP_SIZE) 132 { 133 //Bottom-level inclusive warp scan 134 Ncv32u warpResult = warpScanInclusive(idata, s_Data); 135 136 //Save top elements of each warp for exclusive warp scan 137 //sync to wait for warp scans to complete (because s_Data is being overwritten) 138 __syncthreads(); 139 if( (threadIdx.x & (K_WARP_SIZE - 1)) == (K_WARP_SIZE - 1) ) 140 { 141 s_Data[threadIdx.x >> K_LOG2_WARP_SIZE] = warpResult; 142 } 143 144 //wait for warp scans to complete 145 __syncthreads(); 146 147 if( threadIdx.x < (tiNumScanThreads / K_WARP_SIZE) ) 148 { 149 //grab top warp elements 150 Ncv32u val = s_Data[threadIdx.x]; 151 //calculate exclusive scan and write back to shared memory 152 s_Data[threadIdx.x] = warpScanExclusive(val, s_Data); 153 } 154 155 //return updated warp scans with exclusive scan results 156 __syncthreads(); 157 return warpResult + s_Data[threadIdx.x >> K_LOG2_WARP_SIZE]; 158 } 159 else 160 { 161 return warpScanInclusive(idata, s_Data); 162 } 163 } 164 165 166 //============================================================================== 167 // 168 // HaarClassifierCascade file 169 // 170 //============================================================================== 171 172 173 const Ncv32u MAX_GRID_DIM = 65535; 174 175 176 const Ncv32u NUM_THREADS_ANCHORSPARALLEL = 64; 177 178 179 #define NUM_THREADS_CLASSIFIERPARALLEL_LOG2 6 180 #define NUM_THREADS_CLASSIFIERPARALLEL (1 << NUM_THREADS_CLASSIFIERPARALLEL_LOG2) 181 182 183 /** \internal 184 * Haar features solid array. 185 */ 186 texture<uint2, 1, cudaReadModeElementType> texHaarFeatures; 187 188 189 /** \internal 190 * Haar classifiers flattened trees container. 191 * Two parts: first contains root nodes, second - nodes that are referred by root nodes. 192 * Drawback: breaks tree locality (might cause more cache misses 193 * Advantage: No need to introduce additional 32-bit field to index root nodes offsets 194 */ 195 texture<uint4, 1, cudaReadModeElementType> texHaarClassifierNodes; 196 197 198 texture<Ncv32u, 1, cudaReadModeElementType> texIImage; 199 200 201 __device__ HaarStage64 getStage(Ncv32u iStage, HaarStage64 *d_Stages) 202 { 203 return d_Stages[iStage]; 204 } 205 206 207 template <NcvBool tbCacheTextureCascade> 208 __device__ HaarClassifierNode128 getClassifierNode(Ncv32u iNode, HaarClassifierNode128 *d_ClassifierNodes) 209 { 210 HaarClassifierNode128 tmpNode; 211 if (tbCacheTextureCascade) 212 { 213 tmpNode._ui4 = tex1Dfetch(texHaarClassifierNodes, iNode); 214 } 215 else 216 { 217 tmpNode = d_ClassifierNodes[iNode]; 218 } 219 return tmpNode; 220 } 221 222 223 template <NcvBool tbCacheTextureCascade> 224 __device__ void getFeature(Ncv32u iFeature, HaarFeature64 *d_Features, 225 Ncv32f *weight, 226 Ncv32u *rectX, Ncv32u *rectY, Ncv32u *rectWidth, Ncv32u *rectHeight) 227 { 228 HaarFeature64 feature; 229 if (tbCacheTextureCascade) 230 { 231 feature._ui2 = tex1Dfetch(texHaarFeatures, iFeature); 232 } 233 else 234 { 235 feature = d_Features[iFeature]; 236 } 237 feature.getRect(rectX, rectY, rectWidth, rectHeight); 238 *weight = feature.getWeight(); 239 } 240 241 242 template <NcvBool tbCacheTextureIImg> 243 __device__ Ncv32u getElemIImg(Ncv32u x, Ncv32u *d_IImg) 244 { 245 if (tbCacheTextureIImg) 246 { 247 return tex1Dfetch(texIImage, x); 248 } 249 else 250 { 251 return d_IImg[x]; 252 } 253 } 254 255 256 __device__ Ncv32u d_outMaskPosition; 257 258 259 __device__ void compactBlockWriteOutAnchorParallel(Ncv32u threadPassFlag, Ncv32u threadElem, Ncv32u *vectorOut) 260 { 261 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 262 263 __shared__ Ncv32u shmem[NUM_THREADS_ANCHORSPARALLEL * 2]; 264 __shared__ Ncv32u numPassed; 265 __shared__ Ncv32u outMaskOffset; 266 267 Ncv32u incScan = scan1Inclusive<NUM_THREADS_ANCHORSPARALLEL>(threadPassFlag, shmem); 268 __syncthreads(); 269 270 if (threadIdx.x == NUM_THREADS_ANCHORSPARALLEL-1) 271 { 272 numPassed = incScan; 273 outMaskOffset = atomicAdd(&d_outMaskPosition, incScan); 274 } 275 276 if (threadPassFlag) 277 { 278 Ncv32u excScan = incScan - threadPassFlag; 279 shmem[excScan] = threadElem; 280 } 281 282 __syncthreads(); 283 284 if (threadIdx.x < numPassed) 285 { 286 vectorOut[outMaskOffset + threadIdx.x] = shmem[threadIdx.x]; 287 } 288 #endif 289 } 290 291 292 template <NcvBool tbInitMaskPositively, 293 NcvBool tbCacheTextureIImg, 294 NcvBool tbCacheTextureCascade, 295 NcvBool tbReadPixelIndexFromVector, 296 NcvBool tbDoAtomicCompaction> 297 __global__ void applyHaarClassifierAnchorParallel(Ncv32u *d_IImg, Ncv32u IImgStride, 298 Ncv32f *d_weights, Ncv32u weightsStride, 299 HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, 300 Ncv32u *d_inMask, Ncv32u *d_outMask, 301 Ncv32u mask1Dlen, Ncv32u mask2Dstride, 302 NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) 303 { 304 Ncv32u y_offs; 305 Ncv32u x_offs; 306 Ncv32u maskOffset; 307 Ncv32u outMaskVal; 308 309 NcvBool bInactiveThread = false; 310 311 if (tbReadPixelIndexFromVector) 312 { 313 maskOffset = (MAX_GRID_DIM * blockIdx.y + blockIdx.x) * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x; 314 315 if (maskOffset >= mask1Dlen) 316 { 317 if (tbDoAtomicCompaction) bInactiveThread = true; else return; 318 } 319 320 if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) 321 { 322 outMaskVal = d_inMask[maskOffset]; 323 y_offs = outMaskVal >> 16; 324 x_offs = outMaskVal & 0xFFFF; 325 } 326 } 327 else 328 { 329 y_offs = blockIdx.y; 330 x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x; 331 332 if (x_offs >= mask2Dstride) 333 { 334 if (tbDoAtomicCompaction) bInactiveThread = true; else return; 335 } 336 337 if (!tbDoAtomicCompaction || tbDoAtomicCompaction && !bInactiveThread) 338 { 339 maskOffset = y_offs * mask2Dstride + x_offs; 340 341 if ((x_offs >= anchorsRoi.width) || 342 (!tbInitMaskPositively && 343 d_inMask != d_outMask && 344 d_inMask[maskOffset] == OBJDET_MASK_ELEMENT_INVALID_32U)) 345 { 346 if (tbDoAtomicCompaction) 347 { 348 bInactiveThread = true; 349 } 350 else 351 { 352 d_outMask[maskOffset] = OBJDET_MASK_ELEMENT_INVALID_32U; 353 return; 354 } 355 } 356 357 outMaskVal = (y_offs << 16) | x_offs; 358 } 359 } 360 361 NcvBool bPass = true; 362 363 if (!tbDoAtomicCompaction || tbDoAtomicCompaction) 364 { 365 Ncv32f pixelStdDev = 0.0f; 366 367 if (!bInactiveThread) 368 pixelStdDev = d_weights[y_offs * weightsStride + x_offs]; 369 370 for (Ncv32u iStage = startStageInc; iStage < endStageExc; iStage++) 371 { 372 Ncv32f curStageSum = 0.0f; 373 374 HaarStage64 curStage = getStage(iStage, d_Stages); 375 Ncv32u numRootNodesInStage = curStage.getNumClassifierRootNodes(); 376 Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset(); 377 Ncv32f stageThreshold = curStage.getStageThreshold(); 378 379 while (numRootNodesInStage--) 380 { 381 NcvBool bMoreNodesToTraverse = true; 382 Ncv32u iNode = curRootNodeOffset; 383 384 if (bPass && !bInactiveThread) 385 { 386 while (bMoreNodesToTraverse) 387 { 388 HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes); 389 HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); 390 Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); 391 Ncv32u iFeature = featuresDesc.getFeaturesOffset(); 392 393 Ncv32f curNodeVal = 0.0f; 394 395 for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++) 396 { 397 Ncv32f rectWeight; 398 Ncv32u rectX, rectY, rectWidth, rectHeight; 399 getFeature<tbCacheTextureCascade> 400 (iFeature + iRect, d_Features, 401 &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); 402 403 Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); 404 Ncv32u iioffsTR = iioffsTL + rectWidth; 405 Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; 406 Ncv32u iioffsBR = iioffsBL + rectWidth; 407 408 Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) - 409 getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) + 410 getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) - 411 getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg); 412 413 #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY 414 curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); 415 #else 416 curNodeVal += (Ncv32f)rectSum * rectWeight; 417 #endif 418 } 419 420 HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); 421 HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); 422 Ncv32f nodeThreshold = curNode.getThreshold(); 423 424 HaarClassifierNodeDescriptor32 nextNodeDescriptor; 425 NcvBool nextNodeIsLeaf; 426 427 if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) 428 { 429 nextNodeDescriptor = nodeLeft; 430 nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf(); 431 } 432 else 433 { 434 nextNodeDescriptor = nodeRight; 435 nextNodeIsLeaf = featuresDesc.isRightNodeLeaf(); 436 } 437 438 if (nextNodeIsLeaf) 439 { 440 Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue(); 441 curStageSum += tmpLeafValue; 442 bMoreNodesToTraverse = false; 443 } 444 else 445 { 446 iNode = nextNodeDescriptor.getNextNodeOffset(); 447 } 448 } 449 } 450 451 __syncthreads(); 452 curRootNodeOffset++; 453 } 454 455 if (curStageSum < stageThreshold) 456 { 457 bPass = false; 458 outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U; 459 } 460 } 461 } 462 463 __syncthreads(); 464 465 if (!tbDoAtomicCompaction) 466 { 467 if (!tbReadPixelIndexFromVector || 468 (tbReadPixelIndexFromVector && (!bPass || d_inMask != d_outMask))) 469 { 470 d_outMask[maskOffset] = outMaskVal; 471 } 472 } 473 else 474 { 475 compactBlockWriteOutAnchorParallel(bPass && !bInactiveThread, 476 outMaskVal, 477 d_outMask); 478 } 479 } 480 481 482 template <NcvBool tbCacheTextureIImg, 483 NcvBool tbCacheTextureCascade, 484 NcvBool tbDoAtomicCompaction> 485 __global__ void applyHaarClassifierClassifierParallel(Ncv32u *d_IImg, Ncv32u IImgStride, 486 Ncv32f *d_weights, Ncv32u weightsStride, 487 HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, 488 Ncv32u *d_inMask, Ncv32u *d_outMask, 489 Ncv32u mask1Dlen, Ncv32u mask2Dstride, 490 NcvSize32u anchorsRoi, Ncv32u startStageInc, Ncv32u endStageExc, Ncv32f scaleArea) 491 { 492 Ncv32u maskOffset = MAX_GRID_DIM * blockIdx.y + blockIdx.x; 493 494 if (maskOffset >= mask1Dlen) 495 { 496 return; 497 } 498 499 Ncv32u outMaskVal = d_inMask[maskOffset]; 500 Ncv32u y_offs = outMaskVal >> 16; 501 Ncv32u x_offs = outMaskVal & 0xFFFF; 502 503 Ncv32f pixelStdDev = d_weights[y_offs * weightsStride + x_offs]; 504 NcvBool bPass = true; 505 506 for (Ncv32u iStage = startStageInc; iStage<endStageExc; iStage++) 507 { 508 //this variable is subject to reduction 509 Ncv32f curStageSum = 0.0f; 510 511 HaarStage64 curStage = getStage(iStage, d_Stages); 512 Ncv32s numRootNodesInStage = curStage.getNumClassifierRootNodes(); 513 Ncv32u curRootNodeOffset = curStage.getStartClassifierRootNodeOffset() + threadIdx.x; 514 Ncv32f stageThreshold = curStage.getStageThreshold(); 515 516 Ncv32u numRootChunks = (numRootNodesInStage + NUM_THREADS_CLASSIFIERPARALLEL - 1) >> NUM_THREADS_CLASSIFIERPARALLEL_LOG2; 517 518 for (Ncv32u chunkId=0; chunkId<numRootChunks; chunkId++) 519 { 520 NcvBool bMoreNodesToTraverse = true; 521 522 if (chunkId * NUM_THREADS_CLASSIFIERPARALLEL + threadIdx.x < numRootNodesInStage) 523 { 524 Ncv32u iNode = curRootNodeOffset; 525 526 while (bMoreNodesToTraverse) 527 { 528 HaarClassifierNode128 curNode = getClassifierNode<tbCacheTextureCascade>(iNode, d_ClassifierNodes); 529 HaarFeatureDescriptor32 featuresDesc = curNode.getFeatureDesc(); 530 Ncv32u curNodeFeaturesNum = featuresDesc.getNumFeatures(); 531 Ncv32u iFeature = featuresDesc.getFeaturesOffset(); 532 533 Ncv32f curNodeVal = 0.0f; 534 //TODO: fetch into shmem if size suffices. Shmem can be shared with reduce 535 for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++) 536 { 537 Ncv32f rectWeight; 538 Ncv32u rectX, rectY, rectWidth, rectHeight; 539 getFeature<tbCacheTextureCascade> 540 (iFeature + iRect, d_Features, 541 &rectWeight, &rectX, &rectY, &rectWidth, &rectHeight); 542 543 Ncv32u iioffsTL = (y_offs + rectY) * IImgStride + (x_offs + rectX); 544 Ncv32u iioffsTR = iioffsTL + rectWidth; 545 Ncv32u iioffsBL = iioffsTL + rectHeight * IImgStride; 546 Ncv32u iioffsBR = iioffsBL + rectWidth; 547 548 Ncv32u rectSum = getElemIImg<tbCacheTextureIImg>(iioffsBR, d_IImg) - 549 getElemIImg<tbCacheTextureIImg>(iioffsBL, d_IImg) + 550 getElemIImg<tbCacheTextureIImg>(iioffsTL, d_IImg) - 551 getElemIImg<tbCacheTextureIImg>(iioffsTR, d_IImg); 552 553 #if defined CPU_FP_COMPLIANCE || defined DISABLE_MAD_SELECTIVELY 554 curNodeVal += __fmul_rn((Ncv32f)rectSum, rectWeight); 555 #else 556 curNodeVal += (Ncv32f)rectSum * rectWeight; 557 #endif 558 } 559 560 HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); 561 HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); 562 Ncv32f nodeThreshold = curNode.getThreshold(); 563 564 HaarClassifierNodeDescriptor32 nextNodeDescriptor; 565 NcvBool nextNodeIsLeaf; 566 567 if (curNodeVal < scaleArea * pixelStdDev * nodeThreshold) 568 { 569 nextNodeDescriptor = nodeLeft; 570 nextNodeIsLeaf = featuresDesc.isLeftNodeLeaf(); 571 } 572 else 573 { 574 nextNodeDescriptor = nodeRight; 575 nextNodeIsLeaf = featuresDesc.isRightNodeLeaf(); 576 } 577 578 if (nextNodeIsLeaf) 579 { 580 Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValue(); 581 curStageSum += tmpLeafValue; 582 bMoreNodesToTraverse = false; 583 } 584 else 585 { 586 iNode = nextNodeDescriptor.getNextNodeOffset(); 587 } 588 } 589 } 590 __syncthreads(); 591 592 curRootNodeOffset += NUM_THREADS_CLASSIFIERPARALLEL; 593 } 594 595 Ncv32f finalStageSum = subReduce<Ncv32f, functorAddValues<Ncv32f>, NUM_THREADS_CLASSIFIERPARALLEL>(curStageSum); 596 597 if (finalStageSum < stageThreshold) 598 { 599 bPass = false; 600 outMaskVal = OBJDET_MASK_ELEMENT_INVALID_32U; 601 break; 602 } 603 } 604 605 if (!tbDoAtomicCompaction) 606 { 607 if (!bPass || d_inMask != d_outMask) 608 { 609 if (!threadIdx.x) 610 { 611 d_outMask[maskOffset] = outMaskVal; 612 } 613 } 614 } 615 else 616 { 617 #if __CUDA_ARCH__ && __CUDA_ARCH__ >= 110 618 if (bPass && !threadIdx.x) 619 { 620 Ncv32u outMaskOffset = atomicAdd(&d_outMaskPosition, 1); 621 d_outMask[outMaskOffset] = outMaskVal; 622 } 623 #endif 624 } 625 } 626 627 628 template <NcvBool tbMaskByInmask, 629 NcvBool tbDoAtomicCompaction> 630 __global__ void initializeMaskVector(Ncv32u *d_inMask, Ncv32u *d_outMask, 631 Ncv32u mask1Dlen, Ncv32u mask2Dstride, 632 NcvSize32u anchorsRoi, Ncv32u step) 633 { 634 Ncv32u y_offs = blockIdx.y; 635 Ncv32u x_offs = blockIdx.x * NUM_THREADS_ANCHORSPARALLEL + threadIdx.x; 636 Ncv32u outMaskOffset = y_offs * gridDim.x * blockDim.x + x_offs; 637 638 Ncv32u y_offs_upsc = step * y_offs; 639 Ncv32u x_offs_upsc = step * x_offs; 640 Ncv32u inMaskOffset = y_offs_upsc * mask2Dstride + x_offs_upsc; 641 642 Ncv32u outElem = OBJDET_MASK_ELEMENT_INVALID_32U; 643 644 if (x_offs_upsc < anchorsRoi.width && 645 (!tbMaskByInmask || d_inMask[inMaskOffset] != OBJDET_MASK_ELEMENT_INVALID_32U)) 646 { 647 outElem = (y_offs_upsc << 16) | x_offs_upsc; 648 } 649 650 if (!tbDoAtomicCompaction) 651 { 652 d_outMask[outMaskOffset] = outElem; 653 } 654 else 655 { 656 compactBlockWriteOutAnchorParallel(outElem != OBJDET_MASK_ELEMENT_INVALID_32U, 657 outElem, 658 d_outMask); 659 } 660 } 661 662 663 struct applyHaarClassifierAnchorParallelFunctor 664 { 665 dim3 gridConf, blockConf; 666 cudaStream_t cuStream; 667 668 //Kernel arguments are stored as members; 669 Ncv32u *d_IImg; 670 Ncv32u IImgStride; 671 Ncv32f *d_weights; 672 Ncv32u weightsStride; 673 HaarFeature64 *d_Features; 674 HaarClassifierNode128 *d_ClassifierNodes; 675 HaarStage64 *d_Stages; 676 Ncv32u *d_inMask; 677 Ncv32u *d_outMask; 678 Ncv32u mask1Dlen; 679 Ncv32u mask2Dstride; 680 NcvSize32u anchorsRoi; 681 Ncv32u startStageInc; 682 Ncv32u endStageExc; 683 Ncv32f scaleArea; 684 685 //Arguments are passed through the constructor 686 applyHaarClassifierAnchorParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, 687 Ncv32u *_d_IImg, Ncv32u _IImgStride, 688 Ncv32f *_d_weights, Ncv32u _weightsStride, 689 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, 690 Ncv32u *_d_inMask, Ncv32u *_d_outMask, 691 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, 692 NcvSize32u _anchorsRoi, Ncv32u _startStageInc, 693 Ncv32u _endStageExc, Ncv32f _scaleArea) : 694 gridConf(_gridConf), 695 blockConf(_blockConf), 696 cuStream(_cuStream), 697 d_IImg(_d_IImg), 698 IImgStride(_IImgStride), 699 d_weights(_d_weights), 700 weightsStride(_weightsStride), 701 d_Features(_d_Features), 702 d_ClassifierNodes(_d_ClassifierNodes), 703 d_Stages(_d_Stages), 704 d_inMask(_d_inMask), 705 d_outMask(_d_outMask), 706 mask1Dlen(_mask1Dlen), 707 mask2Dstride(_mask2Dstride), 708 anchorsRoi(_anchorsRoi), 709 startStageInc(_startStageInc), 710 endStageExc(_endStageExc), 711 scaleArea(_scaleArea) 712 {} 713 714 template<class TList> 715 void call(TList tl) 716 { 717 (void)tl; 718 applyHaarClassifierAnchorParallel < 719 Loki::TL::TypeAt<TList, 0>::Result::value, 720 Loki::TL::TypeAt<TList, 1>::Result::value, 721 Loki::TL::TypeAt<TList, 2>::Result::value, 722 Loki::TL::TypeAt<TList, 3>::Result::value, 723 Loki::TL::TypeAt<TList, 4>::Result::value > 724 <<<gridConf, blockConf, 0, cuStream>>> 725 (d_IImg, IImgStride, 726 d_weights, weightsStride, 727 d_Features, d_ClassifierNodes, d_Stages, 728 d_inMask, d_outMask, 729 mask1Dlen, mask2Dstride, 730 anchorsRoi, startStageInc, 731 endStageExc, scaleArea); 732 } 733 }; 734 735 736 void applyHaarClassifierAnchorParallelDynTemplate(NcvBool tbInitMaskPositively, 737 NcvBool tbCacheTextureIImg, 738 NcvBool tbCacheTextureCascade, 739 NcvBool tbReadPixelIndexFromVector, 740 NcvBool tbDoAtomicCompaction, 741 742 dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, 743 744 Ncv32u *d_IImg, Ncv32u IImgStride, 745 Ncv32f *d_weights, Ncv32u weightsStride, 746 HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, 747 Ncv32u *d_inMask, Ncv32u *d_outMask, 748 Ncv32u mask1Dlen, Ncv32u mask2Dstride, 749 NcvSize32u anchorsRoi, Ncv32u startStageInc, 750 Ncv32u endStageExc, Ncv32f scaleArea) 751 { 752 753 applyHaarClassifierAnchorParallelFunctor functor(gridConf, blockConf, cuStream, 754 d_IImg, IImgStride, 755 d_weights, weightsStride, 756 d_Features, d_ClassifierNodes, d_Stages, 757 d_inMask, d_outMask, 758 mask1Dlen, mask2Dstride, 759 anchorsRoi, startStageInc, 760 endStageExc, scaleArea); 761 762 //Second parameter is the number of "dynamic" template parameters 763 NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 5, applyHaarClassifierAnchorParallelFunctor> 764 ::call( &functor, 765 tbInitMaskPositively, 766 tbCacheTextureIImg, 767 tbCacheTextureCascade, 768 tbReadPixelIndexFromVector, 769 tbDoAtomicCompaction); 770 } 771 772 773 struct applyHaarClassifierClassifierParallelFunctor 774 { 775 dim3 gridConf, blockConf; 776 cudaStream_t cuStream; 777 778 //Kernel arguments are stored as members; 779 Ncv32u *d_IImg; 780 Ncv32u IImgStride; 781 Ncv32f *d_weights; 782 Ncv32u weightsStride; 783 HaarFeature64 *d_Features; 784 HaarClassifierNode128 *d_ClassifierNodes; 785 HaarStage64 *d_Stages; 786 Ncv32u *d_inMask; 787 Ncv32u *d_outMask; 788 Ncv32u mask1Dlen; 789 Ncv32u mask2Dstride; 790 NcvSize32u anchorsRoi; 791 Ncv32u startStageInc; 792 Ncv32u endStageExc; 793 Ncv32f scaleArea; 794 795 //Arguments are passed through the constructor 796 applyHaarClassifierClassifierParallelFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, 797 Ncv32u *_d_IImg, Ncv32u _IImgStride, 798 Ncv32f *_d_weights, Ncv32u _weightsStride, 799 HaarFeature64 *_d_Features, HaarClassifierNode128 *_d_ClassifierNodes, HaarStage64 *_d_Stages, 800 Ncv32u *_d_inMask, Ncv32u *_d_outMask, 801 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, 802 NcvSize32u _anchorsRoi, Ncv32u _startStageInc, 803 Ncv32u _endStageExc, Ncv32f _scaleArea) : 804 gridConf(_gridConf), 805 blockConf(_blockConf), 806 cuStream(_cuStream), 807 d_IImg(_d_IImg), 808 IImgStride(_IImgStride), 809 d_weights(_d_weights), 810 weightsStride(_weightsStride), 811 d_Features(_d_Features), 812 d_ClassifierNodes(_d_ClassifierNodes), 813 d_Stages(_d_Stages), 814 d_inMask(_d_inMask), 815 d_outMask(_d_outMask), 816 mask1Dlen(_mask1Dlen), 817 mask2Dstride(_mask2Dstride), 818 anchorsRoi(_anchorsRoi), 819 startStageInc(_startStageInc), 820 endStageExc(_endStageExc), 821 scaleArea(_scaleArea) 822 {} 823 824 template<class TList> 825 void call(TList tl) 826 { 827 (void)tl; 828 applyHaarClassifierClassifierParallel < 829 Loki::TL::TypeAt<TList, 0>::Result::value, 830 Loki::TL::TypeAt<TList, 1>::Result::value, 831 Loki::TL::TypeAt<TList, 2>::Result::value > 832 <<<gridConf, blockConf, 0, cuStream>>> 833 (d_IImg, IImgStride, 834 d_weights, weightsStride, 835 d_Features, d_ClassifierNodes, d_Stages, 836 d_inMask, d_outMask, 837 mask1Dlen, mask2Dstride, 838 anchorsRoi, startStageInc, 839 endStageExc, scaleArea); 840 } 841 }; 842 843 844 void applyHaarClassifierClassifierParallelDynTemplate(NcvBool tbCacheTextureIImg, 845 NcvBool tbCacheTextureCascade, 846 NcvBool tbDoAtomicCompaction, 847 848 dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, 849 850 Ncv32u *d_IImg, Ncv32u IImgStride, 851 Ncv32f *d_weights, Ncv32u weightsStride, 852 HaarFeature64 *d_Features, HaarClassifierNode128 *d_ClassifierNodes, HaarStage64 *d_Stages, 853 Ncv32u *d_inMask, Ncv32u *d_outMask, 854 Ncv32u mask1Dlen, Ncv32u mask2Dstride, 855 NcvSize32u anchorsRoi, Ncv32u startStageInc, 856 Ncv32u endStageExc, Ncv32f scaleArea) 857 { 858 applyHaarClassifierClassifierParallelFunctor functor(gridConf, blockConf, cuStream, 859 d_IImg, IImgStride, 860 d_weights, weightsStride, 861 d_Features, d_ClassifierNodes, d_Stages, 862 d_inMask, d_outMask, 863 mask1Dlen, mask2Dstride, 864 anchorsRoi, startStageInc, 865 endStageExc, scaleArea); 866 867 //Second parameter is the number of "dynamic" template parameters 868 NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 3, applyHaarClassifierClassifierParallelFunctor> 869 ::call( &functor, 870 tbCacheTextureIImg, 871 tbCacheTextureCascade, 872 tbDoAtomicCompaction); 873 } 874 875 876 struct initializeMaskVectorFunctor 877 { 878 dim3 gridConf, blockConf; 879 cudaStream_t cuStream; 880 881 //Kernel arguments are stored as members; 882 Ncv32u *d_inMask; 883 Ncv32u *d_outMask; 884 Ncv32u mask1Dlen; 885 Ncv32u mask2Dstride; 886 NcvSize32u anchorsRoi; 887 Ncv32u step; 888 889 //Arguments are passed through the constructor 890 initializeMaskVectorFunctor(dim3 _gridConf, dim3 _blockConf, cudaStream_t _cuStream, 891 Ncv32u *_d_inMask, Ncv32u *_d_outMask, 892 Ncv32u _mask1Dlen, Ncv32u _mask2Dstride, 893 NcvSize32u _anchorsRoi, Ncv32u _step) : 894 gridConf(_gridConf), 895 blockConf(_blockConf), 896 cuStream(_cuStream), 897 d_inMask(_d_inMask), 898 d_outMask(_d_outMask), 899 mask1Dlen(_mask1Dlen), 900 mask2Dstride(_mask2Dstride), 901 anchorsRoi(_anchorsRoi), 902 step(_step) 903 {} 904 905 template<class TList> 906 void call(TList tl) 907 { 908 (void)tl; 909 initializeMaskVector < 910 Loki::TL::TypeAt<TList, 0>::Result::value, 911 Loki::TL::TypeAt<TList, 1>::Result::value > 912 <<<gridConf, blockConf, 0, cuStream>>> 913 (d_inMask, d_outMask, 914 mask1Dlen, mask2Dstride, 915 anchorsRoi, step); 916 } 917 }; 918 919 920 void initializeMaskVectorDynTemplate(NcvBool tbMaskByInmask, 921 NcvBool tbDoAtomicCompaction, 922 923 dim3 gridConf, dim3 blockConf, cudaStream_t cuStream, 924 925 Ncv32u *d_inMask, Ncv32u *d_outMask, 926 Ncv32u mask1Dlen, Ncv32u mask2Dstride, 927 NcvSize32u anchorsRoi, Ncv32u step) 928 { 929 initializeMaskVectorFunctor functor(gridConf, blockConf, cuStream, 930 d_inMask, d_outMask, 931 mask1Dlen, mask2Dstride, 932 anchorsRoi, step); 933 934 //Second parameter is the number of "dynamic" template parameters 935 NCVRuntimeTemplateBool::KernelCaller<Loki::NullType, 2, initializeMaskVectorFunctor> 936 ::call( &functor, 937 tbMaskByInmask, 938 tbDoAtomicCompaction); 939 } 940 941 942 Ncv32u getStageNumWithNotLessThanNclassifiers(Ncv32u N, HaarClassifierCascadeDescriptor &haar, 943 NCVVector<HaarStage64> &h_HaarStages) 944 { 945 Ncv32u i = 0; 946 for (; i<haar.NumStages; i++) 947 { 948 if (h_HaarStages.ptr()[i].getNumClassifierRootNodes() >= N) 949 { 950 break; 951 } 952 } 953 return i; 954 } 955 956 957 NCVStatus ncvApplyHaarClassifierCascade_device(NCVMatrix<Ncv32u> &integral, 958 NCVMatrix<Ncv32f> &d_weights, 959 NCVMatrixAlloc<Ncv32u> &d_pixelMask, 960 Ncv32u &numDetections, 961 HaarClassifierCascadeDescriptor &haar, 962 NCVVector<HaarStage64> &h_HaarStages, 963 NCVVector<HaarStage64> &d_HaarStages, 964 NCVVector<HaarClassifierNode128> &d_HaarNodes, 965 NCVVector<HaarFeature64> &d_HaarFeatures, 966 NcvBool bMaskElements, 967 NcvSize32u anchorsRoi, 968 Ncv32u pixelStep, 969 Ncv32f scaleArea, 970 INCVMemAllocator &gpuAllocator, 971 INCVMemAllocator &cpuAllocator, 972 cudaDeviceProp &devProp, 973 cudaStream_t cuStream) 974 { 975 ncvAssertReturn(integral.memType() == d_weights.memType()&& 976 integral.memType() == d_pixelMask.memType() && 977 integral.memType() == gpuAllocator.memType() && 978 (integral.memType() == NCVMemoryTypeDevice || 979 integral.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); 980 981 ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() && 982 d_HaarStages.memType() == d_HaarFeatures.memType() && 983 (d_HaarStages.memType() == NCVMemoryTypeDevice || 984 d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); 985 986 ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); 987 988 ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); 989 990 ncvAssertReturn((integral.ptr() != NULL && d_weights.ptr() != NULL && d_pixelMask.ptr() != NULL && 991 h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL && 992 d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR); 993 994 ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 && 995 d_pixelMask.width() >= anchorsRoi.width && d_pixelMask.height() >= anchorsRoi.height && 996 d_weights.width() >= anchorsRoi.width && d_weights.height() >= anchorsRoi.height && 997 integral.width() >= anchorsRoi.width + haar.ClassifierSize.width && 998 integral.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID); 999 1000 ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE); 1001 1002 ncvAssertReturn(d_HaarStages.length() >= haar.NumStages && 1003 d_HaarNodes.length() >= haar.NumClassifierTotalNodes && 1004 d_HaarFeatures.length() >= haar.NumFeatures && 1005 d_HaarStages.length() == h_HaarStages.length() && 1006 haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID); 1007 1008 ncvAssertReturn(haar.bNeedsTiltedII == false || gpuAllocator.isCounting(), NCV_NOIMPL_HAAR_TILTED_FEATURES); 1009 1010 ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP); 1011 1012 NCV_SET_SKIP_COND(gpuAllocator.isCounting()); 1013 1014 #if defined _SELF_TEST_ 1015 1016 NCVStatus ncvStat; 1017 1018 NCVMatrixAlloc<Ncv32u> h_integralImage(cpuAllocator, integral.width, integral.height, integral.pitch); 1019 ncvAssertReturn(h_integralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1020 NCVMatrixAlloc<Ncv32f> h_weights(cpuAllocator, d_weights.width, d_weights.height, d_weights.pitch); 1021 ncvAssertReturn(h_weights.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1022 NCVMatrixAlloc<Ncv32u> h_pixelMask(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch); 1023 ncvAssertReturn(h_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1024 NCVVectorAlloc<HaarClassifierNode128> h_HaarNodes(cpuAllocator, d_HaarNodes.length); 1025 ncvAssertReturn(h_HaarNodes.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1026 NCVVectorAlloc<HaarFeature64> h_HaarFeatures(cpuAllocator, d_HaarFeatures.length); 1027 ncvAssertReturn(h_HaarFeatures.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1028 1029 NCVMatrixAlloc<Ncv32u> h_pixelMask_d(cpuAllocator, d_pixelMask.width, d_pixelMask.height, d_pixelMask.pitch); 1030 ncvAssertReturn(h_pixelMask_d.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1031 1032 NCV_SKIP_COND_BEGIN 1033 1034 ncvStat = d_pixelMask.copySolid(h_pixelMask, 0); 1035 ncvAssertReturnNcvStat(ncvStat); 1036 ncvStat = integral.copySolid(h_integralImage, 0); 1037 ncvAssertReturnNcvStat(ncvStat); 1038 ncvStat = d_weights.copySolid(h_weights, 0); 1039 ncvAssertReturnNcvStat(ncvStat); 1040 ncvStat = d_HaarNodes.copySolid(h_HaarNodes, 0); 1041 ncvAssertReturnNcvStat(ncvStat); 1042 ncvStat = d_HaarFeatures.copySolid(h_HaarFeatures, 0); 1043 ncvAssertReturnNcvStat(ncvStat); 1044 ncvAssertCUDAReturn(cudaStreamSynchronize(0), NCV_CUDA_ERROR); 1045 1046 for (Ncv32u i=0; i<(Ncv32u)anchorsRoi.height; i++) 1047 { 1048 for (Ncv32u j=0; j<d_pixelMask.stride(); j++) 1049 { 1050 if ((i%pixelStep==0) && (j%pixelStep==0) && (j<(Ncv32u)anchorsRoi.width)) 1051 { 1052 if (!bMaskElements || h_pixelMask.ptr[i*d_pixelMask.stride()+j] != OBJDET_MASK_ELEMENT_INVALID_32U) 1053 { 1054 h_pixelMask.ptr[i*d_pixelMask.stride()+j] = (i << 16) | j; 1055 } 1056 } 1057 else 1058 { 1059 h_pixelMask.ptr[i*d_pixelMask.stride()+j] = OBJDET_MASK_ELEMENT_INVALID_32U; 1060 } 1061 } 1062 } 1063 1064 NCV_SKIP_COND_END 1065 1066 #endif 1067 1068 NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment(), anchorsRoi.height * d_pixelMask.stride()); 1069 ncvAssertReturn(d_vecPixelMask.isMemReused(), NCV_ALLOCATOR_BAD_REUSE); 1070 1071 NCVVectorAlloc<Ncv32u> d_vecPixelMaskTmp(gpuAllocator, static_cast<Ncv32u>(d_vecPixelMask.length())); 1072 ncvAssertReturn(d_vecPixelMaskTmp.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1073 1074 NCVVectorAlloc<Ncv32u> hp_pool32u(cpuAllocator, 2); 1075 ncvAssertReturn(hp_pool32u.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1076 Ncv32u *hp_zero = &hp_pool32u.ptr()[0]; 1077 Ncv32u *hp_numDet = &hp_pool32u.ptr()[1]; 1078 1079 NCV_SKIP_COND_BEGIN 1080 *hp_zero = 0; 1081 *hp_numDet = 0; 1082 NCV_SKIP_COND_END 1083 1084 Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) * 1085 (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER)); 1086 1087 NcvBool bTexCacheCascade = devProp.major < 2; 1088 NcvBool bTexCacheIImg = true; //this works better even on Fermi so far 1089 NcvBool bDoAtomicCompaction = devProp.major >= 2 || (devProp.major == 1 && devProp.minor >= 3); 1090 1091 NCVVector<Ncv32u> *d_ptrNowData = &d_vecPixelMask; 1092 NCVVector<Ncv32u> *d_ptrNowTmp = &d_vecPixelMaskTmp; 1093 1094 Ncv32u szNppCompactTmpBuf; 1095 nppsStCompactGetSize_32u(static_cast<Ncv32u>(d_vecPixelMask.length()), &szNppCompactTmpBuf, devProp); 1096 if (bDoAtomicCompaction) 1097 { 1098 szNppCompactTmpBuf = 0; 1099 } 1100 NCVVectorAlloc<Ncv8u> d_tmpBufCompact(gpuAllocator, szNppCompactTmpBuf); 1101 1102 NCV_SKIP_COND_BEGIN 1103 1104 if (bTexCacheIImg) 1105 { 1106 cudaChannelFormatDesc cfdTexIImage; 1107 cfdTexIImage = cudaCreateChannelDesc<Ncv32u>(); 1108 1109 size_t alignmentOffset; 1110 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texIImage, integral.ptr(), cfdTexIImage, 1111 (anchorsRoi.height + haar.ClassifierSize.height) * integral.pitch()), NCV_CUDA_ERROR); 1112 ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); 1113 } 1114 1115 if (bTexCacheCascade) 1116 { 1117 cudaChannelFormatDesc cfdTexHaarFeatures; 1118 cudaChannelFormatDesc cfdTexHaarClassifierNodes; 1119 cfdTexHaarFeatures = cudaCreateChannelDesc<uint2>(); 1120 cfdTexHaarClassifierNodes = cudaCreateChannelDesc<uint4>(); 1121 1122 size_t alignmentOffset; 1123 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarFeatures, 1124 d_HaarFeatures.ptr(), cfdTexHaarFeatures,sizeof(HaarFeature64) * haar.NumFeatures), NCV_CUDA_ERROR); 1125 ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); 1126 ncvAssertCUDAReturn(cudaBindTexture(&alignmentOffset, texHaarClassifierNodes, 1127 d_HaarNodes.ptr(), cfdTexHaarClassifierNodes, sizeof(HaarClassifierNode128) * haar.NumClassifierTotalNodes), NCV_CUDA_ERROR); 1128 ncvAssertReturn(alignmentOffset==0, NCV_TEXTURE_BIND_ERROR); 1129 } 1130 1131 Ncv32u stageStartAnchorParallel = 0; 1132 Ncv32u stageMiddleSwitch = getStageNumWithNotLessThanNclassifiers(NUM_THREADS_CLASSIFIERPARALLEL, 1133 haar, h_HaarStages); 1134 Ncv32u stageEndClassifierParallel = haar.NumStages; 1135 if (stageMiddleSwitch == 0) 1136 { 1137 stageMiddleSwitch = 1; 1138 } 1139 1140 //create stages subdivision for pixel-parallel processing 1141 const Ncv32u compactEveryNstage = bDoAtomicCompaction ? 7 : 1; 1142 Ncv32u curStop = stageStartAnchorParallel; 1143 std::vector<Ncv32u> pixParallelStageStops; 1144 while (curStop < stageMiddleSwitch) 1145 { 1146 pixParallelStageStops.push_back(curStop); 1147 curStop += compactEveryNstage; 1148 } 1149 if (curStop > compactEveryNstage && curStop - stageMiddleSwitch > compactEveryNstage / 2) 1150 { 1151 pixParallelStageStops[pixParallelStageStops.size()-1] = 1152 (stageMiddleSwitch - (curStop - 2 * compactEveryNstage)) / 2; 1153 } 1154 pixParallelStageStops.push_back(stageMiddleSwitch); 1155 Ncv32u pixParallelStageStopsIndex = 0; 1156 1157 if (pixelStep != 1 || bMaskElements) 1158 { 1159 if (bDoAtomicCompaction) 1160 { 1161 ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), 1162 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); 1163 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1164 } 1165 1166 dim3 gridInit((((anchorsRoi.width + pixelStep - 1) / pixelStep + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), 1167 (anchorsRoi.height + pixelStep - 1) / pixelStep); 1168 dim3 blockInit(NUM_THREADS_ANCHORSPARALLEL); 1169 1170 if (gridInit.x == 0 || gridInit.y == 0) 1171 { 1172 numDetections = 0; 1173 return NCV_SUCCESS; 1174 } 1175 1176 initializeMaskVectorDynTemplate(bMaskElements, 1177 bDoAtomicCompaction, 1178 gridInit, blockInit, cuStream, 1179 d_ptrNowData->ptr(), 1180 d_ptrNowTmp->ptr(), 1181 static_cast<Ncv32u>(d_vecPixelMask.length()), d_pixelMask.stride(), 1182 anchorsRoi, pixelStep); 1183 ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); 1184 1185 if (bDoAtomicCompaction) 1186 { 1187 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1188 ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), 1189 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); 1190 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1191 swap(d_ptrNowData, d_ptrNowTmp); 1192 } 1193 else 1194 { 1195 NCVStatus nppSt; 1196 nppSt = nppsStCompact_32u(d_ptrNowTmp->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()), 1197 d_ptrNowData->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, 1198 d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); 1199 ncvAssertReturn(nppSt == NPPST_SUCCESS, NCV_NPP_ERROR); 1200 } 1201 numDetections = *hp_numDet; 1202 } 1203 else 1204 { 1205 // 1206 // 1. Run the first pixel-input pixel-parallel classifier for few stages 1207 // 1208 1209 if (bDoAtomicCompaction) 1210 { 1211 ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), 1212 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); 1213 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1214 } 1215 1216 dim3 grid1(((d_pixelMask.stride() + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL), 1217 anchorsRoi.height); 1218 dim3 block1(NUM_THREADS_ANCHORSPARALLEL); 1219 applyHaarClassifierAnchorParallelDynTemplate( 1220 true, //tbInitMaskPositively 1221 bTexCacheIImg, //tbCacheTextureIImg 1222 bTexCacheCascade, //tbCacheTextureCascade 1223 pixParallelStageStops[pixParallelStageStopsIndex] != 0,//tbReadPixelIndexFromVector 1224 bDoAtomicCompaction, //tbDoAtomicCompaction 1225 grid1, 1226 block1, 1227 cuStream, 1228 integral.ptr(), integral.stride(), 1229 d_weights.ptr(), d_weights.stride(), 1230 d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), 1231 d_ptrNowData->ptr(), 1232 bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 1233 0, 1234 d_pixelMask.stride(), 1235 anchorsRoi, 1236 pixParallelStageStops[pixParallelStageStopsIndex], 1237 pixParallelStageStops[pixParallelStageStopsIndex+1], 1238 scaleAreaPixels); 1239 ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); 1240 1241 if (bDoAtomicCompaction) 1242 { 1243 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1244 ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), 1245 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); 1246 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1247 } 1248 else 1249 { 1250 NCVStatus nppSt; 1251 nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), static_cast<Ncv32u>(d_vecPixelMask.length()), 1252 d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, 1253 d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); 1254 ncvAssertReturnNcvStat(nppSt); 1255 } 1256 1257 swap(d_ptrNowData, d_ptrNowTmp); 1258 numDetections = *hp_numDet; 1259 1260 pixParallelStageStopsIndex++; 1261 } 1262 1263 // 1264 // 2. Run pixel-parallel stages 1265 // 1266 1267 for (; pixParallelStageStopsIndex < pixParallelStageStops.size()-1; pixParallelStageStopsIndex++) 1268 { 1269 if (numDetections == 0) 1270 { 1271 break; 1272 } 1273 1274 if (bDoAtomicCompaction) 1275 { 1276 ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), 1277 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); 1278 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1279 } 1280 1281 dim3 grid2((numDetections + NUM_THREADS_ANCHORSPARALLEL - 1) / NUM_THREADS_ANCHORSPARALLEL); 1282 if (numDetections > MAX_GRID_DIM) 1283 { 1284 grid2.x = MAX_GRID_DIM; 1285 grid2.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM; 1286 } 1287 dim3 block2(NUM_THREADS_ANCHORSPARALLEL); 1288 1289 applyHaarClassifierAnchorParallelDynTemplate( 1290 false, //tbInitMaskPositively 1291 bTexCacheIImg, //tbCacheTextureIImg 1292 bTexCacheCascade, //tbCacheTextureCascade 1293 pixParallelStageStops[pixParallelStageStopsIndex] != 0 || pixelStep != 1 || bMaskElements,//tbReadPixelIndexFromVector 1294 bDoAtomicCompaction, //tbDoAtomicCompaction 1295 grid2, 1296 block2, 1297 cuStream, 1298 integral.ptr(), integral.stride(), 1299 d_weights.ptr(), d_weights.stride(), 1300 d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), 1301 d_ptrNowData->ptr(), 1302 bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 1303 numDetections, 1304 d_pixelMask.stride(), 1305 anchorsRoi, 1306 pixParallelStageStops[pixParallelStageStopsIndex], 1307 pixParallelStageStops[pixParallelStageStopsIndex+1], 1308 scaleAreaPixels); 1309 ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); 1310 1311 if (bDoAtomicCompaction) 1312 { 1313 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1314 ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), 1315 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); 1316 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1317 } 1318 else 1319 { 1320 NCVStatus nppSt; 1321 nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections, 1322 d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, 1323 d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); 1324 ncvAssertReturnNcvStat(nppSt); 1325 } 1326 1327 swap(d_ptrNowData, d_ptrNowTmp); 1328 numDetections = *hp_numDet; 1329 } 1330 1331 // 1332 // 3. Run all left stages in one stage-parallel kernel 1333 // 1334 1335 if (numDetections > 0 && stageMiddleSwitch < stageEndClassifierParallel) 1336 { 1337 if (bDoAtomicCompaction) 1338 { 1339 ncvAssertCUDAReturn(cudaMemcpyToSymbolAsync(d_outMaskPosition, hp_zero, sizeof(Ncv32u), 1340 0, cudaMemcpyHostToDevice, cuStream), NCV_CUDA_ERROR); 1341 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1342 } 1343 1344 dim3 grid3(numDetections); 1345 if (numDetections > MAX_GRID_DIM) 1346 { 1347 grid3.x = MAX_GRID_DIM; 1348 grid3.y = (numDetections + MAX_GRID_DIM - 1) / MAX_GRID_DIM; 1349 } 1350 dim3 block3(NUM_THREADS_CLASSIFIERPARALLEL); 1351 1352 applyHaarClassifierClassifierParallelDynTemplate( 1353 bTexCacheIImg, //tbCacheTextureIImg 1354 bTexCacheCascade, //tbCacheTextureCascade 1355 bDoAtomicCompaction, //tbDoAtomicCompaction 1356 grid3, 1357 block3, 1358 cuStream, 1359 integral.ptr(), integral.stride(), 1360 d_weights.ptr(), d_weights.stride(), 1361 d_HaarFeatures.ptr(), d_HaarNodes.ptr(), d_HaarStages.ptr(), 1362 d_ptrNowData->ptr(), 1363 bDoAtomicCompaction ? d_ptrNowTmp->ptr() : d_ptrNowData->ptr(), 1364 numDetections, 1365 d_pixelMask.stride(), 1366 anchorsRoi, 1367 stageMiddleSwitch, 1368 stageEndClassifierParallel, 1369 scaleAreaPixels); 1370 ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); 1371 1372 if (bDoAtomicCompaction) 1373 { 1374 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1375 ncvAssertCUDAReturn(cudaMemcpyFromSymbolAsync(hp_numDet, d_outMaskPosition, sizeof(Ncv32u), 1376 0, cudaMemcpyDeviceToHost, cuStream), NCV_CUDA_ERROR); 1377 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1378 } 1379 else 1380 { 1381 NCVStatus nppSt; 1382 nppSt = nppsStCompact_32u(d_ptrNowData->ptr(), numDetections, 1383 d_ptrNowTmp->ptr(), hp_numDet, OBJDET_MASK_ELEMENT_INVALID_32U, 1384 d_tmpBufCompact.ptr(), szNppCompactTmpBuf, devProp); 1385 ncvAssertReturnNcvStat(nppSt); 1386 } 1387 1388 swap(d_ptrNowData, d_ptrNowTmp); 1389 numDetections = *hp_numDet; 1390 } 1391 1392 if (d_ptrNowData != &d_vecPixelMask) 1393 { 1394 d_vecPixelMaskTmp.copySolid(d_vecPixelMask, cuStream); 1395 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1396 } 1397 1398 #if defined _SELF_TEST_ 1399 1400 ncvStat = d_pixelMask.copySolid(h_pixelMask_d, 0); 1401 ncvAssertReturnNcvStat(ncvStat); 1402 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1403 1404 if (bDoAtomicCompaction) 1405 { 1406 std::sort(h_pixelMask_d.ptr, h_pixelMask_d.ptr + numDetections); 1407 } 1408 1409 Ncv32u fpu_oldcw, fpu_cw; 1410 _controlfp_s(&fpu_cw, 0, 0); 1411 fpu_oldcw = fpu_cw; 1412 _controlfp_s(&fpu_cw, _PC_24, _MCW_PC); 1413 Ncv32u numDetGold; 1414 ncvStat = ncvApplyHaarClassifierCascade_host(h_integralImage, h_weights, h_pixelMask, numDetGold, haar, 1415 h_HaarStages, h_HaarNodes, h_HaarFeatures, 1416 bMaskElements, anchorsRoi, pixelStep, scaleArea); 1417 ncvAssertReturnNcvStat(ncvStat); 1418 _controlfp_s(&fpu_cw, fpu_oldcw, _MCW_PC); 1419 1420 bool bPass = true; 1421 1422 if (numDetGold != numDetections) 1423 { 1424 printf("NCVHaarClassifierCascade::applyHaarClassifierCascade numdetections don't match: cpu=%d, gpu=%d\n", numDetGold, numDetections); 1425 bPass = false; 1426 } 1427 else 1428 { 1429 for (Ncv32u i=0; i<std::max(numDetGold, numDetections) && bPass; i++) 1430 { 1431 if (h_pixelMask.ptr[i] != h_pixelMask_d.ptr[i]) 1432 { 1433 printf("NCVHaarClassifierCascade::applyHaarClassifierCascade self test failed: i=%d, cpu=%d, gpu=%d\n", i, h_pixelMask.ptr[i], h_pixelMask_d.ptr[i]); 1434 bPass = false; 1435 } 1436 } 1437 } 1438 1439 printf("NCVHaarClassifierCascade::applyHaarClassifierCascade %s\n", bPass?"PASSED":"FAILED"); 1440 #endif 1441 1442 NCV_SKIP_COND_END 1443 1444 return NCV_SUCCESS; 1445 } 1446 1447 1448 //============================================================================== 1449 // 1450 // HypothesesOperations file 1451 // 1452 //============================================================================== 1453 1454 1455 const Ncv32u NUM_GROW_THREADS = 128; 1456 1457 1458 __device__ __host__ NcvRect32u pixelToRect(Ncv32u pixel, Ncv32u width, Ncv32u height, Ncv32f scale) 1459 { 1460 NcvRect32u res; 1461 res.x = (Ncv32u)(scale * (pixel & 0xFFFF)); 1462 res.y = (Ncv32u)(scale * (pixel >> 16)); 1463 res.width = (Ncv32u)(scale * width); 1464 res.height = (Ncv32u)(scale * height); 1465 return res; 1466 } 1467 1468 1469 __global__ void growDetectionsKernel(Ncv32u *pixelMask, Ncv32u numElements, 1470 NcvRect32u *hypotheses, 1471 Ncv32u rectWidth, Ncv32u rectHeight, Ncv32f curScale) 1472 { 1473 Ncv32u blockId = blockIdx.y * 65535 + blockIdx.x; 1474 Ncv32u elemAddr = blockId * NUM_GROW_THREADS + threadIdx.x; 1475 if (elemAddr >= numElements) 1476 { 1477 return; 1478 } 1479 hypotheses[elemAddr] = pixelToRect(pixelMask[elemAddr], rectWidth, rectHeight, curScale); 1480 } 1481 1482 1483 NCVStatus ncvGrowDetectionsVector_device(NCVVector<Ncv32u> &pixelMask, 1484 Ncv32u numPixelMaskDetections, 1485 NCVVector<NcvRect32u> &hypotheses, 1486 Ncv32u &totalDetections, 1487 Ncv32u totalMaxDetections, 1488 Ncv32u rectWidth, 1489 Ncv32u rectHeight, 1490 Ncv32f curScale, 1491 cudaStream_t cuStream) 1492 { 1493 ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR); 1494 1495 ncvAssertReturn(pixelMask.memType() == hypotheses.memType() && 1496 pixelMask.memType() == NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); 1497 1498 ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI); 1499 1500 ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE); 1501 1502 ncvAssertReturn(totalMaxDetections <= hypotheses.length() && 1503 numPixelMaskDetections <= pixelMask.length() && 1504 totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT); 1505 1506 NCVStatus ncvStat = NCV_SUCCESS; 1507 Ncv32u numDetsToCopy = numPixelMaskDetections; 1508 1509 if (numDetsToCopy == 0) 1510 { 1511 return ncvStat; 1512 } 1513 1514 if (totalDetections + numPixelMaskDetections > totalMaxDetections) 1515 { 1516 ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW; 1517 numDetsToCopy = totalMaxDetections - totalDetections; 1518 } 1519 1520 dim3 block(NUM_GROW_THREADS); 1521 dim3 grid((numDetsToCopy + NUM_GROW_THREADS - 1) / NUM_GROW_THREADS); 1522 if (grid.x > 65535) 1523 { 1524 grid.y = (grid.x + 65534) / 65535; 1525 grid.x = 65535; 1526 } 1527 growDetectionsKernel<<<grid, block, 0, cuStream>>>(pixelMask.ptr(), numDetsToCopy, 1528 hypotheses.ptr() + totalDetections, 1529 rectWidth, rectHeight, curScale); 1530 ncvAssertCUDAReturn(cudaGetLastError(), NCV_CUDA_ERROR); 1531 1532 totalDetections += numDetsToCopy; 1533 return ncvStat; 1534 } 1535 1536 1537 //============================================================================== 1538 // 1539 // Pipeline file 1540 // 1541 //============================================================================== 1542 1543 1544 NCVStatus ncvDetectObjectsMultiScale_device(NCVMatrix<Ncv8u> &d_srcImg, 1545 NcvSize32u srcRoi, 1546 NCVVector<NcvRect32u> &d_dstRects, 1547 Ncv32u &dstNumRects, 1548 1549 HaarClassifierCascadeDescriptor &haar, 1550 NCVVector<HaarStage64> &h_HaarStages, 1551 NCVVector<HaarStage64> &d_HaarStages, 1552 NCVVector<HaarClassifierNode128> &d_HaarNodes, 1553 NCVVector<HaarFeature64> &d_HaarFeatures, 1554 1555 NcvSize32u minObjSize, 1556 Ncv32u minNeighbors, //default 4 1557 Ncv32f scaleStep, //default 1.2f 1558 Ncv32u pixelStep, //default 1 1559 Ncv32u flags, //default NCVPipeObjDet_Default 1560 1561 INCVMemAllocator &gpuAllocator, 1562 INCVMemAllocator &cpuAllocator, 1563 cudaDeviceProp &devProp, 1564 cudaStream_t cuStream) 1565 { 1566 ncvAssertReturn(d_srcImg.memType() == d_dstRects.memType() && 1567 d_srcImg.memType() == gpuAllocator.memType() && 1568 (d_srcImg.memType() == NCVMemoryTypeDevice || 1569 d_srcImg.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); 1570 1571 ncvAssertReturn(d_HaarStages.memType() == d_HaarNodes.memType() && 1572 d_HaarStages.memType() == d_HaarFeatures.memType() && 1573 (d_HaarStages.memType() == NCVMemoryTypeDevice || 1574 d_HaarStages.memType() == NCVMemoryTypeNone), NCV_MEM_RESIDENCE_ERROR); 1575 1576 ncvAssertReturn(h_HaarStages.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); 1577 1578 ncvAssertReturn(gpuAllocator.isInitialized() && cpuAllocator.isInitialized(), NCV_ALLOCATOR_NOT_INITIALIZED); 1579 1580 ncvAssertReturn((d_srcImg.ptr() != NULL && d_dstRects.ptr() != NULL && 1581 h_HaarStages.ptr() != NULL && d_HaarStages.ptr() != NULL && d_HaarNodes.ptr() != NULL && 1582 d_HaarFeatures.ptr() != NULL) || gpuAllocator.isCounting(), NCV_NULL_PTR); 1583 ncvAssertReturn(srcRoi.width > 0 && srcRoi.height > 0 && 1584 d_srcImg.width() >= srcRoi.width && d_srcImg.height() >= srcRoi.height && 1585 srcRoi.width >= minObjSize.width && srcRoi.height >= minObjSize.height && 1586 d_dstRects.length() >= 1, NCV_DIMENSIONS_INVALID); 1587 1588 ncvAssertReturn(scaleStep > 1.0f, NCV_INVALID_SCALE); 1589 1590 ncvAssertReturn(d_HaarStages.length() >= haar.NumStages && 1591 d_HaarNodes.length() >= haar.NumClassifierTotalNodes && 1592 d_HaarFeatures.length() >= haar.NumFeatures && 1593 d_HaarStages.length() == h_HaarStages.length() && 1594 haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID); 1595 1596 ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES); 1597 1598 ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP); 1599 1600 //TODO: set NPP active stream to cuStream 1601 1602 NCVStatus ncvStat; 1603 NCV_SET_SKIP_COND(gpuAllocator.isCounting()); 1604 1605 Ncv32u integralWidth = d_srcImg.width() + 1; 1606 Ncv32u integralHeight = d_srcImg.height() + 1; 1607 1608 NCVMatrixAlloc<Ncv32u> integral(gpuAllocator, integralWidth, integralHeight); 1609 ncvAssertReturn(integral.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1610 NCVMatrixAlloc<Ncv64u> d_sqIntegralImage(gpuAllocator, integralWidth, integralHeight); 1611 ncvAssertReturn(d_sqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1612 1613 NCVMatrixAlloc<Ncv32f> d_rectStdDev(gpuAllocator, d_srcImg.width(), d_srcImg.height()); 1614 ncvAssertReturn(d_rectStdDev.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1615 NCVMatrixAlloc<Ncv32u> d_pixelMask(gpuAllocator, d_srcImg.width(), d_srcImg.height()); 1616 ncvAssertReturn(d_pixelMask.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1617 1618 NCVMatrixAlloc<Ncv32u> d_scaledIntegralImage(gpuAllocator, integralWidth, integralHeight); 1619 ncvAssertReturn(d_scaledIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1620 NCVMatrixAlloc<Ncv64u> d_scaledSqIntegralImage(gpuAllocator, integralWidth, integralHeight); 1621 ncvAssertReturn(d_scaledSqIntegralImage.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1622 1623 NCVVectorAlloc<NcvRect32u> d_hypothesesIntermediate(gpuAllocator, d_srcImg.width() * d_srcImg.height()); 1624 ncvAssertReturn(d_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1625 NCVVectorAlloc<NcvRect32u> h_hypothesesIntermediate(cpuAllocator, d_srcImg.width() * d_srcImg.height()); 1626 ncvAssertReturn(h_hypothesesIntermediate.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1627 1628 NCVStatus nppStat; 1629 Ncv32u szTmpBufIntegral, szTmpBufSqIntegral; 1630 nppStat = nppiStIntegralGetSize_8u32u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufIntegral, devProp); 1631 ncvAssertReturnNcvStat(nppStat); 1632 nppStat = nppiStSqrIntegralGetSize_8u64u(NcvSize32u(d_srcImg.width(), d_srcImg.height()), &szTmpBufSqIntegral, devProp); 1633 ncvAssertReturnNcvStat(nppStat); 1634 NCVVectorAlloc<Ncv8u> d_tmpIIbuf(gpuAllocator, std::max(szTmpBufIntegral, szTmpBufSqIntegral)); 1635 ncvAssertReturn(d_tmpIIbuf.isMemAllocated(), NCV_ALLOCATOR_BAD_ALLOC); 1636 1637 NCV_SKIP_COND_BEGIN 1638 1639 nppStat = nppiStIntegral_8u32u_C1R(d_srcImg.ptr(), d_srcImg.pitch(), 1640 integral.ptr(), integral.pitch(), 1641 NcvSize32u(d_srcImg.width(), d_srcImg.height()), 1642 d_tmpIIbuf.ptr(), szTmpBufIntegral, devProp); 1643 ncvAssertReturnNcvStat(nppStat); 1644 1645 nppStat = nppiStSqrIntegral_8u64u_C1R(d_srcImg.ptr(), d_srcImg.pitch(), 1646 d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), 1647 NcvSize32u(d_srcImg.width(), d_srcImg.height()), 1648 d_tmpIIbuf.ptr(), szTmpBufSqIntegral, devProp); 1649 ncvAssertReturnNcvStat(nppStat); 1650 1651 NCV_SKIP_COND_END 1652 1653 dstNumRects = 0; 1654 1655 Ncv32u lastCheckedScale = 0; 1656 NcvBool bReverseTraverseScale = ((flags & NCVPipeObjDet_FindLargestObject) != 0); 1657 std::vector<Ncv32u> scalesVector; 1658 1659 NcvBool bFoundLargestFace = false; 1660 1661 for (Ncv32f scaleIter = 1.0f; ; scaleIter *= scaleStep) 1662 { 1663 Ncv32u scale = (Ncv32u)scaleIter; 1664 if (lastCheckedScale == scale) 1665 { 1666 continue; 1667 } 1668 lastCheckedScale = scale; 1669 1670 if (haar.ClassifierSize.width * (Ncv32s)scale < minObjSize.width || 1671 haar.ClassifierSize.height * (Ncv32s)scale < minObjSize.height) 1672 { 1673 continue; 1674 } 1675 1676 NcvSize32s srcRoi_, srcIIRo_i, scaledIIRoi, searchRoi; 1677 1678 srcRoi_.width = d_srcImg.width(); 1679 srcRoi_.height = d_srcImg.height(); 1680 1681 srcIIRo_i.width = srcRoi_.width + 1; 1682 srcIIRo_i.height = srcRoi_.height + 1; 1683 1684 scaledIIRoi.width = srcIIRo_i.width / scale; 1685 scaledIIRoi.height = srcIIRo_i.height / scale; 1686 1687 searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width; 1688 searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height; 1689 1690 if (searchRoi.width <= 0 || searchRoi.height <= 0) 1691 { 1692 break; 1693 } 1694 1695 scalesVector.push_back(scale); 1696 1697 if (gpuAllocator.isCounting()) 1698 { 1699 break; 1700 } 1701 } 1702 1703 if (bReverseTraverseScale) 1704 { 1705 std::reverse(scalesVector.begin(), scalesVector.end()); 1706 } 1707 1708 //TODO: handle _fair_scale_ flag 1709 for (Ncv32u i=0; i<scalesVector.size(); i++) 1710 { 1711 Ncv32u scale = scalesVector[i]; 1712 1713 NcvSize32u srcRoi_, scaledIIRoi, searchRoi; 1714 NcvSize32u srcIIRoi; 1715 srcRoi_.width = d_srcImg.width(); 1716 srcRoi_.height = d_srcImg.height(); 1717 srcIIRoi.width = srcRoi_.width + 1; 1718 srcIIRoi.height = srcRoi_.height + 1; 1719 scaledIIRoi.width = srcIIRoi.width / scale; 1720 scaledIIRoi.height = srcIIRoi.height / scale; 1721 searchRoi.width = scaledIIRoi.width - haar.ClassifierSize.width; 1722 searchRoi.height = scaledIIRoi.height - haar.ClassifierSize.height; 1723 1724 NCV_SKIP_COND_BEGIN 1725 1726 nppStat = nppiStDecimate_32u_C1R( 1727 integral.ptr(), integral.pitch(), 1728 d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(), 1729 srcIIRoi, scale, true); 1730 ncvAssertReturnNcvStat(nppStat); 1731 1732 nppStat = nppiStDecimate_64u_C1R( 1733 d_sqIntegralImage.ptr(), d_sqIntegralImage.pitch(), 1734 d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(), 1735 srcIIRoi, scale, true); 1736 ncvAssertReturnNcvStat(nppStat); 1737 1738 const NcvRect32u rect( 1739 HAAR_STDDEV_BORDER, 1740 HAAR_STDDEV_BORDER, 1741 haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER, 1742 haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER); 1743 nppStat = nppiStRectStdDev_32f_C1R( 1744 d_scaledIntegralImage.ptr(), d_scaledIntegralImage.pitch(), 1745 d_scaledSqIntegralImage.ptr(), d_scaledSqIntegralImage.pitch(), 1746 d_rectStdDev.ptr(), d_rectStdDev.pitch(), 1747 NcvSize32u(searchRoi.width, searchRoi.height), rect, 1748 (Ncv32f)scale*scale, true); 1749 ncvAssertReturnNcvStat(nppStat); 1750 1751 NCV_SKIP_COND_END 1752 1753 Ncv32u detectionsOnThisScale; 1754 ncvStat = ncvApplyHaarClassifierCascade_device( 1755 d_scaledIntegralImage, d_rectStdDev, d_pixelMask, 1756 detectionsOnThisScale, 1757 haar, h_HaarStages, d_HaarStages, d_HaarNodes, d_HaarFeatures, false, 1758 searchRoi, pixelStep, (Ncv32f)scale*scale, 1759 gpuAllocator, cpuAllocator, devProp, cuStream); 1760 ncvAssertReturnNcvStat(nppStat); 1761 1762 NCV_SKIP_COND_BEGIN 1763 1764 NCVVectorReuse<Ncv32u> d_vecPixelMask(d_pixelMask.getSegment()); 1765 ncvStat = ncvGrowDetectionsVector_device( 1766 d_vecPixelMask, 1767 detectionsOnThisScale, 1768 d_hypothesesIntermediate, 1769 dstNumRects, 1770 static_cast<Ncv32u>(d_hypothesesIntermediate.length()), 1771 haar.ClassifierSize.width, 1772 haar.ClassifierSize.height, 1773 (Ncv32f)scale, 1774 cuStream); 1775 ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); 1776 1777 if (flags & NCVPipeObjDet_FindLargestObject) 1778 { 1779 if (dstNumRects == 0) 1780 { 1781 continue; 1782 } 1783 1784 if (dstNumRects != 0) 1785 { 1786 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1787 ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream, 1788 dstNumRects * sizeof(NcvRect32u)); 1789 ncvAssertReturnNcvStat(ncvStat); 1790 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1791 } 1792 1793 Ncv32u numStrongHypothesesNow = dstNumRects; 1794 ncvStat = ncvGroupRectangles_host( 1795 h_hypothesesIntermediate, 1796 numStrongHypothesesNow, 1797 minNeighbors, 1798 RECT_SIMILARITY_PROPORTION, 1799 NULL); 1800 ncvAssertReturnNcvStat(ncvStat); 1801 1802 if (numStrongHypothesesNow > 0) 1803 { 1804 NcvRect32u maxRect = h_hypothesesIntermediate.ptr()[0]; 1805 for (Ncv32u j=1; j<numStrongHypothesesNow; j++) 1806 { 1807 if (maxRect.width < h_hypothesesIntermediate.ptr()[j].width) 1808 { 1809 maxRect = h_hypothesesIntermediate.ptr()[j]; 1810 } 1811 } 1812 1813 h_hypothesesIntermediate.ptr()[0] = maxRect; 1814 dstNumRects = 1; 1815 1816 ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, sizeof(NcvRect32u)); 1817 ncvAssertReturnNcvStat(ncvStat); 1818 1819 bFoundLargestFace = true; 1820 1821 break; 1822 } 1823 } 1824 1825 NCV_SKIP_COND_END 1826 1827 if (gpuAllocator.isCounting()) 1828 { 1829 break; 1830 } 1831 } 1832 1833 NCVStatus ncvRetCode = NCV_SUCCESS; 1834 1835 NCV_SKIP_COND_BEGIN 1836 1837 if (flags & NCVPipeObjDet_FindLargestObject) 1838 { 1839 if (!bFoundLargestFace) 1840 { 1841 dstNumRects = 0; 1842 } 1843 } 1844 else 1845 { 1846 //TODO: move hypotheses filtration to GPU pipeline (the only CPU-resident element of the pipeline left) 1847 if (dstNumRects != 0) 1848 { 1849 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1850 ncvStat = d_hypothesesIntermediate.copySolid(h_hypothesesIntermediate, cuStream, 1851 dstNumRects * sizeof(NcvRect32u)); 1852 ncvAssertReturnNcvStat(ncvStat); 1853 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1854 } 1855 1856 ncvStat = ncvGroupRectangles_host( 1857 h_hypothesesIntermediate, 1858 dstNumRects, 1859 minNeighbors, 1860 RECT_SIMILARITY_PROPORTION, 1861 NULL); 1862 ncvAssertReturnNcvStat(ncvStat); 1863 1864 if (dstNumRects > d_dstRects.length()) 1865 { 1866 ncvRetCode = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW; 1867 dstNumRects = static_cast<Ncv32u>(d_dstRects.length()); 1868 } 1869 1870 if (dstNumRects != 0) 1871 { 1872 ncvStat = h_hypothesesIntermediate.copySolid(d_dstRects, cuStream, 1873 dstNumRects * sizeof(NcvRect32u)); 1874 ncvAssertReturnNcvStat(ncvStat); 1875 } 1876 } 1877 1878 if (flags & NCVPipeObjDet_VisualizeInPlace) 1879 { 1880 ncvAssertCUDAReturn(cudaStreamSynchronize(cuStream), NCV_CUDA_ERROR); 1881 ncvDrawRects_8u_device(d_srcImg.ptr(), d_srcImg.stride(), 1882 d_srcImg.width(), d_srcImg.height(), 1883 d_dstRects.ptr(), dstNumRects, 255, cuStream); 1884 } 1885 1886 NCV_SKIP_COND_END 1887 1888 return ncvRetCode; 1889 } 1890 1891 1892 //============================================================================== 1893 // 1894 // Purely Host code: classifier IO, mock-ups 1895 // 1896 //============================================================================== 1897 1898 1899 #ifdef _SELF_TEST_ 1900 #include <float.h> 1901 #endif 1902 1903 1904 NCVStatus ncvApplyHaarClassifierCascade_host(NCVMatrix<Ncv32u> &h_integralImage, 1905 NCVMatrix<Ncv32f> &h_weights, 1906 NCVMatrixAlloc<Ncv32u> &h_pixelMask, 1907 Ncv32u &numDetections, 1908 HaarClassifierCascadeDescriptor &haar, 1909 NCVVector<HaarStage64> &h_HaarStages, 1910 NCVVector<HaarClassifierNode128> &h_HaarNodes, 1911 NCVVector<HaarFeature64> &h_HaarFeatures, 1912 NcvBool bMaskElements, 1913 NcvSize32u anchorsRoi, 1914 Ncv32u pixelStep, 1915 Ncv32f scaleArea) 1916 { 1917 ncvAssertReturn(h_integralImage.memType() == h_weights.memType() && 1918 h_integralImage.memType() == h_pixelMask.memType() && 1919 (h_integralImage.memType() == NCVMemoryTypeHostPageable || 1920 h_integralImage.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR); 1921 ncvAssertReturn(h_HaarStages.memType() == h_HaarNodes.memType() && 1922 h_HaarStages.memType() == h_HaarFeatures.memType() && 1923 (h_HaarStages.memType() == NCVMemoryTypeHostPageable || 1924 h_HaarStages.memType() == NCVMemoryTypeHostPinned), NCV_MEM_RESIDENCE_ERROR); 1925 ncvAssertReturn(h_integralImage.ptr() != NULL && h_weights.ptr() != NULL && h_pixelMask.ptr() != NULL && 1926 h_HaarStages.ptr() != NULL && h_HaarNodes.ptr() != NULL && h_HaarFeatures.ptr() != NULL, NCV_NULL_PTR); 1927 ncvAssertReturn(anchorsRoi.width > 0 && anchorsRoi.height > 0 && 1928 h_pixelMask.width() >= anchorsRoi.width && h_pixelMask.height() >= anchorsRoi.height && 1929 h_weights.width() >= anchorsRoi.width && h_weights.height() >= anchorsRoi.height && 1930 h_integralImage.width() >= anchorsRoi.width + haar.ClassifierSize.width && 1931 h_integralImage.height() >= anchorsRoi.height + haar.ClassifierSize.height, NCV_DIMENSIONS_INVALID); 1932 ncvAssertReturn(scaleArea > 0, NCV_INVALID_SCALE); 1933 ncvAssertReturn(h_HaarStages.length() >= haar.NumStages && 1934 h_HaarNodes.length() >= haar.NumClassifierTotalNodes && 1935 h_HaarFeatures.length() >= haar.NumFeatures && 1936 h_HaarStages.length() == h_HaarStages.length() && 1937 haar.NumClassifierRootNodes <= haar.NumClassifierTotalNodes, NCV_DIMENSIONS_INVALID); 1938 ncvAssertReturn(haar.bNeedsTiltedII == false, NCV_NOIMPL_HAAR_TILTED_FEATURES); 1939 ncvAssertReturn(pixelStep == 1 || pixelStep == 2, NCV_HAAR_INVALID_PIXEL_STEP); 1940 1941 Ncv32f scaleAreaPixels = scaleArea * ((haar.ClassifierSize.width - 2*HAAR_STDDEV_BORDER) * 1942 (haar.ClassifierSize.height - 2*HAAR_STDDEV_BORDER)); 1943 1944 for (Ncv32u i=0; i<anchorsRoi.height; i++) 1945 { 1946 for (Ncv32u j=0; j<h_pixelMask.stride(); j++) 1947 { 1948 if (i % pixelStep != 0 || j % pixelStep != 0 || j >= anchorsRoi.width) 1949 { 1950 h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U; 1951 } 1952 else 1953 { 1954 for (Ncv32u iStage = 0; iStage < haar.NumStages; iStage++) 1955 { 1956 Ncv32f curStageSum = 0.0f; 1957 Ncv32u numRootNodesInStage = h_HaarStages.ptr()[iStage].getNumClassifierRootNodes(); 1958 Ncv32u curRootNodeOffset = h_HaarStages.ptr()[iStage].getStartClassifierRootNodeOffset(); 1959 1960 if (iStage == 0) 1961 { 1962 if (bMaskElements && h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U) 1963 { 1964 break; 1965 } 1966 else 1967 { 1968 h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = ((i << 16) | j); 1969 } 1970 } 1971 else if (h_pixelMask.ptr()[i * h_pixelMask.stride() + j] == OBJDET_MASK_ELEMENT_INVALID_32U) 1972 { 1973 break; 1974 } 1975 1976 while (numRootNodesInStage--) 1977 { 1978 NcvBool bMoreNodesToTraverse = true; 1979 Ncv32u curNodeOffset = curRootNodeOffset; 1980 1981 while (bMoreNodesToTraverse) 1982 { 1983 HaarClassifierNode128 curNode = h_HaarNodes.ptr()[curNodeOffset]; 1984 HaarFeatureDescriptor32 curFeatDesc = curNode.getFeatureDesc(); 1985 Ncv32u curNodeFeaturesNum = curFeatDesc.getNumFeatures(); 1986 Ncv32u curNodeFeaturesOffs = curFeatDesc.getFeaturesOffset(); 1987 1988 Ncv32f curNodeVal = 0.f; 1989 for (Ncv32u iRect=0; iRect<curNodeFeaturesNum; iRect++) 1990 { 1991 HaarFeature64 feature = h_HaarFeatures.ptr()[curNodeFeaturesOffs + iRect]; 1992 Ncv32u rectX, rectY, rectWidth, rectHeight; 1993 feature.getRect(&rectX, &rectY, &rectWidth, &rectHeight); 1994 Ncv32f rectWeight = feature.getWeight(); 1995 Ncv32u iioffsTL = (i + rectY) * h_integralImage.stride() + (j + rectX); 1996 Ncv32u iioffsTR = iioffsTL + rectWidth; 1997 Ncv32u iioffsBL = iioffsTL + rectHeight * h_integralImage.stride(); 1998 Ncv32u iioffsBR = iioffsBL + rectWidth; 1999 2000 Ncv32u iivalTL = h_integralImage.ptr()[iioffsTL]; 2001 Ncv32u iivalTR = h_integralImage.ptr()[iioffsTR]; 2002 Ncv32u iivalBL = h_integralImage.ptr()[iioffsBL]; 2003 Ncv32u iivalBR = h_integralImage.ptr()[iioffsBR]; 2004 Ncv32u rectSum = iivalBR - iivalBL + iivalTL - iivalTR; 2005 curNodeVal += (Ncv32f)rectSum * rectWeight; 2006 } 2007 2008 HaarClassifierNodeDescriptor32 nodeLeft = curNode.getLeftNodeDesc(); 2009 HaarClassifierNodeDescriptor32 nodeRight = curNode.getRightNodeDesc(); 2010 Ncv32f nodeThreshold = curNode.getThreshold(); 2011 2012 HaarClassifierNodeDescriptor32 nextNodeDescriptor; 2013 NcvBool nextNodeIsLeaf; 2014 2015 if (curNodeVal < scaleAreaPixels * h_weights.ptr()[i * h_weights.stride() + j] * nodeThreshold) 2016 { 2017 nextNodeDescriptor = nodeLeft; 2018 nextNodeIsLeaf = curFeatDesc.isLeftNodeLeaf(); 2019 } 2020 else 2021 { 2022 nextNodeDescriptor = nodeRight; 2023 nextNodeIsLeaf = curFeatDesc.isRightNodeLeaf(); 2024 } 2025 2026 if (nextNodeIsLeaf) 2027 { 2028 Ncv32f tmpLeafValue = nextNodeDescriptor.getLeafValueHost(); 2029 curStageSum += tmpLeafValue; 2030 bMoreNodesToTraverse = false; 2031 } 2032 else 2033 { 2034 curNodeOffset = nextNodeDescriptor.getNextNodeOffset(); 2035 } 2036 } 2037 2038 curRootNodeOffset++; 2039 } 2040 2041 Ncv32f tmpStageThreshold = h_HaarStages.ptr()[iStage].getStageThreshold(); 2042 if (curStageSum < tmpStageThreshold) 2043 { 2044 //drop 2045 h_pixelMask.ptr()[i * h_pixelMask.stride() + j] = OBJDET_MASK_ELEMENT_INVALID_32U; 2046 break; 2047 } 2048 } 2049 } 2050 } 2051 } 2052 2053 std::sort(h_pixelMask.ptr(), h_pixelMask.ptr() + anchorsRoi.height * h_pixelMask.stride()); 2054 Ncv32u i = 0; 2055 for (; i<anchorsRoi.height * h_pixelMask.stride(); i++) 2056 { 2057 if (h_pixelMask.ptr()[i] == OBJDET_MASK_ELEMENT_INVALID_32U) 2058 { 2059 break; 2060 } 2061 } 2062 numDetections = i; 2063 2064 return NCV_SUCCESS; 2065 } 2066 2067 2068 NCVStatus ncvGrowDetectionsVector_host(NCVVector<Ncv32u> &pixelMask, 2069 Ncv32u numPixelMaskDetections, 2070 NCVVector<NcvRect32u> &hypotheses, 2071 Ncv32u &totalDetections, 2072 Ncv32u totalMaxDetections, 2073 Ncv32u rectWidth, 2074 Ncv32u rectHeight, 2075 Ncv32f curScale) 2076 { 2077 ncvAssertReturn(pixelMask.ptr() != NULL && hypotheses.ptr() != NULL, NCV_NULL_PTR); 2078 ncvAssertReturn(pixelMask.memType() == hypotheses.memType() && 2079 pixelMask.memType() != NCVMemoryTypeDevice, NCV_MEM_RESIDENCE_ERROR); 2080 ncvAssertReturn(rectWidth > 0 && rectHeight > 0 && curScale > 0, NCV_INVALID_ROI); 2081 ncvAssertReturn(curScale > 0, NCV_INVALID_SCALE); 2082 ncvAssertReturn(totalMaxDetections <= hypotheses.length() && 2083 numPixelMaskDetections <= pixelMask.length() && 2084 totalMaxDetections <= totalMaxDetections, NCV_INCONSISTENT_INPUT); 2085 2086 NCVStatus ncvStat = NCV_SUCCESS; 2087 Ncv32u numDetsToCopy = numPixelMaskDetections; 2088 2089 if (numDetsToCopy == 0) 2090 { 2091 return ncvStat; 2092 } 2093 2094 if (totalDetections + numPixelMaskDetections > totalMaxDetections) 2095 { 2096 ncvStat = NCV_WARNING_HAAR_DETECTIONS_VECTOR_OVERFLOW; 2097 numDetsToCopy = totalMaxDetections - totalDetections; 2098 } 2099 2100 for (Ncv32u i=0; i<numDetsToCopy; i++) 2101 { 2102 hypotheses.ptr()[totalDetections + i] = pixelToRect(pixelMask.ptr()[i], rectWidth, rectHeight, curScale); 2103 } 2104 2105 totalDetections += numDetsToCopy; 2106 return ncvStat; 2107 } 2108 2109 static NCVStatus loadFromXML(const cv::String &filename, 2110 HaarClassifierCascadeDescriptor &haar, 2111 std::vector<HaarStage64> &haarStages, 2112 std::vector<HaarClassifierNode128> &haarClassifierNodes, 2113 std::vector<HaarFeature64> &haarFeatures) 2114 { 2115 #ifndef HAVE_OPENCV_OBJDETECT 2116 (void) filename; 2117 (void) haar; 2118 (void) haarStages; 2119 (void) haarClassifierNodes; 2120 (void) haarFeatures; 2121 CV_Error(cv::Error::StsNotImplemented, "This functionality requires objdetect module"); 2122 return NCV_HAAR_XML_LOADING_EXCEPTION; 2123 #else 2124 NCVStatus ncvStat; 2125 2126 haar.NumStages = 0; 2127 haar.NumClassifierRootNodes = 0; 2128 haar.NumClassifierTotalNodes = 0; 2129 haar.NumFeatures = 0; 2130 haar.ClassifierSize.width = 0; 2131 haar.ClassifierSize.height = 0; 2132 haar.bHasStumpsOnly = true; 2133 haar.bNeedsTiltedII = false; 2134 Ncv32u curMaxTreeDepth = 0; 2135 2136 std::vector<HaarClassifierNode128> h_TmpClassifierNotRootNodes; 2137 haarStages.resize(0); 2138 haarClassifierNodes.resize(0); 2139 haarFeatures.resize(0); 2140 2141 cv::Ptr<CvHaarClassifierCascade> oldCascade((CvHaarClassifierCascade*)cvLoad(filename.c_str(), 0, 0, 0)); 2142 if (!oldCascade) 2143 { 2144 return NCV_HAAR_XML_LOADING_EXCEPTION; 2145 } 2146 2147 haar.ClassifierSize.width = oldCascade->orig_window_size.width; 2148 haar.ClassifierSize.height = oldCascade->orig_window_size.height; 2149 2150 int stagesCound = oldCascade->count; 2151 for(int s = 0; s < stagesCound; ++s) // by stages 2152 { 2153 HaarStage64 curStage; 2154 curStage.setStartClassifierRootNodeOffset(static_cast<Ncv32u>(haarClassifierNodes.size())); 2155 2156 curStage.setStageThreshold(oldCascade->stage_classifier[s].threshold); 2157 2158 int treesCount = oldCascade->stage_classifier[s].count; 2159 for(int t = 0; t < treesCount; ++t) // by trees 2160 { 2161 Ncv32u nodeId = 0; 2162 CvHaarClassifier* tree = &oldCascade->stage_classifier[s].classifier[t]; 2163 2164 int nodesCount = tree->count; 2165 for(int n = 0; n < nodesCount; ++n) //by features 2166 { 2167 CvHaarFeature* feature = &tree->haar_feature[n]; 2168 2169 HaarClassifierNode128 curNode; 2170 curNode.setThreshold(tree->threshold[n]); 2171 2172 NcvBool bIsLeftNodeLeaf = false; 2173 NcvBool bIsRightNodeLeaf = false; 2174 2175 HaarClassifierNodeDescriptor32 nodeLeft; 2176 if ( tree->left[n] <= 0 ) 2177 { 2178 Ncv32f leftVal = tree->alpha[-tree->left[n]]; 2179 ncvStat = nodeLeft.create(leftVal); 2180 ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); 2181 bIsLeftNodeLeaf = true; 2182 } 2183 else 2184 { 2185 Ncv32u leftNodeOffset = tree->left[n]; 2186 nodeLeft.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + leftNodeOffset - 1)); 2187 haar.bHasStumpsOnly = false; 2188 } 2189 curNode.setLeftNodeDesc(nodeLeft); 2190 2191 HaarClassifierNodeDescriptor32 nodeRight; 2192 if ( tree->right[n] <= 0 ) 2193 { 2194 Ncv32f rightVal = tree->alpha[-tree->right[n]]; 2195 ncvStat = nodeRight.create(rightVal); 2196 ncvAssertReturn(ncvStat == NCV_SUCCESS, ncvStat); 2197 bIsRightNodeLeaf = true; 2198 } 2199 else 2200 { 2201 Ncv32u rightNodeOffset = tree->right[n]; 2202 nodeRight.create((Ncv32u)(h_TmpClassifierNotRootNodes.size() + rightNodeOffset - 1)); 2203 haar.bHasStumpsOnly = false; 2204 } 2205 curNode.setRightNodeDesc(nodeRight); 2206 2207 Ncv32u tiltedVal = feature->tilted; 2208 haar.bNeedsTiltedII = (tiltedVal != 0); 2209 2210 Ncv32u featureId = 0; 2211 for(int l = 0; l < CV_HAAR_FEATURE_MAX; ++l) //by rects 2212 { 2213 Ncv32u rectX = feature->rect[l].r.x; 2214 Ncv32u rectY = feature->rect[l].r.y; 2215 Ncv32u rectWidth = feature->rect[l].r.width; 2216 Ncv32u rectHeight = feature->rect[l].r.height; 2217 2218 Ncv32f rectWeight = feature->rect[l].weight; 2219 2220 if (rectWeight == 0/* && rectX == 0 &&rectY == 0 && rectWidth == 0 && rectHeight == 0*/) 2221 break; 2222 2223 HaarFeature64 curFeature; 2224 ncvStat = curFeature.setRect(rectX, rectY, rectWidth, rectHeight, haar.ClassifierSize.width, haar.ClassifierSize.height); 2225 curFeature.setWeight(rectWeight); 2226 ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); 2227 haarFeatures.push_back(curFeature); 2228 2229 featureId++; 2230 } 2231 2232 HaarFeatureDescriptor32 tmpFeatureDesc; 2233 ncvStat = tmpFeatureDesc.create(haar.bNeedsTiltedII, bIsLeftNodeLeaf, bIsRightNodeLeaf, 2234 featureId, static_cast<Ncv32u>(haarFeatures.size()) - featureId); 2235 ncvAssertReturn(NCV_SUCCESS == ncvStat, ncvStat); 2236 curNode.setFeatureDesc(tmpFeatureDesc); 2237 2238 if (!nodeId) 2239 { 2240 //root node 2241 haarClassifierNodes.push_back(curNode); 2242 curMaxTreeDepth = 1; 2243 } 2244 else 2245 { 2246 //other node 2247 h_TmpClassifierNotRootNodes.push_back(curNode); 2248 curMaxTreeDepth++; 2249 } 2250 2251 nodeId++; 2252 } 2253 } 2254 2255 curStage.setNumClassifierRootNodes(treesCount); 2256 haarStages.push_back(curStage); 2257 } 2258 2259 //fill in cascade stats 2260 haar.NumStages = static_cast<Ncv32u>(haarStages.size()); 2261 haar.NumClassifierRootNodes = static_cast<Ncv32u>(haarClassifierNodes.size()); 2262 haar.NumClassifierTotalNodes = static_cast<Ncv32u>(haar.NumClassifierRootNodes + h_TmpClassifierNotRootNodes.size()); 2263 haar.NumFeatures = static_cast<Ncv32u>(haarFeatures.size()); 2264 2265 //merge root and leaf nodes in one classifiers array 2266 Ncv32u offsetRoot = static_cast<Ncv32u>(haarClassifierNodes.size()); 2267 for (Ncv32u i=0; i<haarClassifierNodes.size(); i++) 2268 { 2269 HaarFeatureDescriptor32 featureDesc = haarClassifierNodes[i].getFeatureDesc(); 2270 2271 HaarClassifierNodeDescriptor32 nodeLeft = haarClassifierNodes[i].getLeftNodeDesc(); 2272 if (!featureDesc.isLeftNodeLeaf()) 2273 { 2274 Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot; 2275 nodeLeft.create(newOffset); 2276 } 2277 haarClassifierNodes[i].setLeftNodeDesc(nodeLeft); 2278 2279 HaarClassifierNodeDescriptor32 nodeRight = haarClassifierNodes[i].getRightNodeDesc(); 2280 if (!featureDesc.isRightNodeLeaf()) 2281 { 2282 Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot; 2283 nodeRight.create(newOffset); 2284 } 2285 haarClassifierNodes[i].setRightNodeDesc(nodeRight); 2286 } 2287 2288 for (Ncv32u i=0; i<h_TmpClassifierNotRootNodes.size(); i++) 2289 { 2290 HaarFeatureDescriptor32 featureDesc = h_TmpClassifierNotRootNodes[i].getFeatureDesc(); 2291 2292 HaarClassifierNodeDescriptor32 nodeLeft = h_TmpClassifierNotRootNodes[i].getLeftNodeDesc(); 2293 if (!featureDesc.isLeftNodeLeaf()) 2294 { 2295 Ncv32u newOffset = nodeLeft.getNextNodeOffset() + offsetRoot; 2296 nodeLeft.create(newOffset); 2297 } 2298 h_TmpClassifierNotRootNodes[i].setLeftNodeDesc(nodeLeft); 2299 2300 HaarClassifierNodeDescriptor32 nodeRight = h_TmpClassifierNotRootNodes[i].getRightNodeDesc(); 2301 if (!featureDesc.isRightNodeLeaf()) 2302 { 2303 Ncv32u newOffset = nodeRight.getNextNodeOffset() + offsetRoot; 2304 nodeRight.create(newOffset); 2305 } 2306 h_TmpClassifierNotRootNodes[i].setRightNodeDesc(nodeRight); 2307 2308 haarClassifierNodes.push_back(h_TmpClassifierNotRootNodes[i]); 2309 } 2310 2311 return NCV_SUCCESS; 2312 #endif 2313 } 2314 2315 2316 #define NVBIN_HAAR_SIZERESERVED 16 2317 #define NVBIN_HAAR_VERSION 0x1 2318 2319 2320 static NCVStatus loadFromNVBIN(const cv::String &filename, 2321 HaarClassifierCascadeDescriptor &haar, 2322 std::vector<HaarStage64> &haarStages, 2323 std::vector<HaarClassifierNode128> &haarClassifierNodes, 2324 std::vector<HaarFeature64> &haarFeatures) 2325 { 2326 size_t readCount; 2327 FILE *fp = fopen(filename.c_str(), "rb"); 2328 ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); 2329 Ncv32u fileVersion; 2330 readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp); 2331 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2332 ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR); 2333 Ncv32u fsize; 2334 readCount = fread(&fsize, sizeof(Ncv32u), 1, fp); 2335 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2336 fseek(fp, 0, SEEK_END); 2337 Ncv32u fsizeActual = ftell(fp); 2338 ncvAssertReturn(fsize == fsizeActual, NCV_FILE_ERROR); 2339 2340 std::vector<unsigned char> fdata; 2341 fdata.resize(fsize); 2342 Ncv32u dataOffset = 0; 2343 fseek(fp, 0, SEEK_SET); 2344 readCount = fread(&fdata[0], fsize, 1, fp); 2345 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2346 fclose(fp); 2347 2348 //data 2349 dataOffset = NVBIN_HAAR_SIZERESERVED; 2350 haar.NumStages = *(Ncv32u *)(&fdata[0]+dataOffset); 2351 dataOffset += sizeof(Ncv32u); 2352 haar.NumClassifierRootNodes = *(Ncv32u *)(&fdata[0]+dataOffset); 2353 dataOffset += sizeof(Ncv32u); 2354 haar.NumClassifierTotalNodes = *(Ncv32u *)(&fdata[0]+dataOffset); 2355 dataOffset += sizeof(Ncv32u); 2356 haar.NumFeatures = *(Ncv32u *)(&fdata[0]+dataOffset); 2357 dataOffset += sizeof(Ncv32u); 2358 haar.ClassifierSize = *(NcvSize32u *)(&fdata[0]+dataOffset); 2359 dataOffset += sizeof(NcvSize32u); 2360 haar.bNeedsTiltedII = *(NcvBool *)(&fdata[0]+dataOffset); 2361 dataOffset += sizeof(NcvBool); 2362 haar.bHasStumpsOnly = *(NcvBool *)(&fdata[0]+dataOffset); 2363 dataOffset += sizeof(NcvBool); 2364 2365 haarStages.resize(haar.NumStages); 2366 haarClassifierNodes.resize(haar.NumClassifierTotalNodes); 2367 haarFeatures.resize(haar.NumFeatures); 2368 2369 Ncv32u szStages = haar.NumStages * sizeof(HaarStage64); 2370 Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128); 2371 Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64); 2372 2373 memcpy(&haarStages[0], &fdata[0]+dataOffset, szStages); 2374 dataOffset += szStages; 2375 memcpy(&haarClassifierNodes[0], &fdata[0]+dataOffset, szClassifiers); 2376 dataOffset += szClassifiers; 2377 memcpy(&haarFeatures[0], &fdata[0]+dataOffset, szFeatures); 2378 dataOffset += szFeatures; 2379 2380 return NCV_SUCCESS; 2381 } 2382 2383 2384 NCVStatus ncvHaarGetClassifierSize(const cv::String &filename, Ncv32u &numStages, 2385 Ncv32u &numNodes, Ncv32u &numFeatures) 2386 { 2387 size_t readCount; 2388 NCVStatus ncvStat; 2389 2390 cv::String fext = filename.substr(filename.find_last_of(".") + 1); 2391 fext = fext.toLowerCase(); 2392 2393 if (fext == "nvbin") 2394 { 2395 FILE *fp = fopen(filename.c_str(), "rb"); 2396 ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); 2397 Ncv32u fileVersion; 2398 readCount = fread(&fileVersion, sizeof(Ncv32u), 1, fp); 2399 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2400 ncvAssertReturn(fileVersion == NVBIN_HAAR_VERSION, NCV_FILE_ERROR); 2401 fseek(fp, NVBIN_HAAR_SIZERESERVED, SEEK_SET); 2402 Ncv32u tmp; 2403 readCount = fread(&numStages, sizeof(Ncv32u), 1, fp); 2404 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2405 readCount = fread(&tmp, sizeof(Ncv32u), 1, fp); 2406 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2407 readCount = fread(&numNodes, sizeof(Ncv32u), 1, fp); 2408 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2409 readCount = fread(&numFeatures, sizeof(Ncv32u), 1, fp); 2410 ncvAssertReturn(1 == readCount, NCV_FILE_ERROR); 2411 fclose(fp); 2412 } 2413 else if (fext == "xml") 2414 { 2415 HaarClassifierCascadeDescriptor haar; 2416 std::vector<HaarStage64> haarStages; 2417 std::vector<HaarClassifierNode128> haarNodes; 2418 std::vector<HaarFeature64> haarFeatures; 2419 2420 ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures); 2421 ncvAssertReturnNcvStat(ncvStat); 2422 2423 numStages = haar.NumStages; 2424 numNodes = haar.NumClassifierTotalNodes; 2425 numFeatures = haar.NumFeatures; 2426 } 2427 else 2428 { 2429 return NCV_HAAR_XML_LOADING_EXCEPTION; 2430 } 2431 2432 return NCV_SUCCESS; 2433 } 2434 2435 2436 NCVStatus ncvHaarLoadFromFile_host(const cv::String &filename, 2437 HaarClassifierCascadeDescriptor &haar, 2438 NCVVector<HaarStage64> &h_HaarStages, 2439 NCVVector<HaarClassifierNode128> &h_HaarNodes, 2440 NCVVector<HaarFeature64> &h_HaarFeatures) 2441 { 2442 ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned && 2443 h_HaarNodes.memType() == NCVMemoryTypeHostPinned && 2444 h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR); 2445 2446 NCVStatus ncvStat; 2447 2448 cv::String fext = filename.substr(filename.find_last_of(".") + 1); 2449 fext = fext.toLowerCase(); 2450 2451 std::vector<HaarStage64> haarStages; 2452 std::vector<HaarClassifierNode128> haarNodes; 2453 std::vector<HaarFeature64> haarFeatures; 2454 2455 if (fext == "nvbin") 2456 { 2457 ncvStat = loadFromNVBIN(filename, haar, haarStages, haarNodes, haarFeatures); 2458 ncvAssertReturnNcvStat(ncvStat); 2459 } 2460 else if (fext == "xml") 2461 { 2462 ncvStat = loadFromXML(filename, haar, haarStages, haarNodes, haarFeatures); 2463 ncvAssertReturnNcvStat(ncvStat); 2464 } 2465 else 2466 { 2467 return NCV_HAAR_XML_LOADING_EXCEPTION; 2468 } 2469 2470 ncvAssertReturn(h_HaarStages.length() >= haarStages.size(), NCV_MEM_INSUFFICIENT_CAPACITY); 2471 ncvAssertReturn(h_HaarNodes.length() >= haarNodes.size(), NCV_MEM_INSUFFICIENT_CAPACITY); 2472 ncvAssertReturn(h_HaarFeatures.length() >= haarFeatures.size(), NCV_MEM_INSUFFICIENT_CAPACITY); 2473 2474 memcpy(h_HaarStages.ptr(), &haarStages[0], haarStages.size()*sizeof(HaarStage64)); 2475 memcpy(h_HaarNodes.ptr(), &haarNodes[0], haarNodes.size()*sizeof(HaarClassifierNode128)); 2476 memcpy(h_HaarFeatures.ptr(), &haarFeatures[0], haarFeatures.size()*sizeof(HaarFeature64)); 2477 2478 return NCV_SUCCESS; 2479 } 2480 2481 2482 NCVStatus ncvHaarStoreNVBIN_host(const cv::String &filename, 2483 HaarClassifierCascadeDescriptor haar, 2484 NCVVector<HaarStage64> &h_HaarStages, 2485 NCVVector<HaarClassifierNode128> &h_HaarNodes, 2486 NCVVector<HaarFeature64> &h_HaarFeatures) 2487 { 2488 ncvAssertReturn(h_HaarStages.length() >= haar.NumStages, NCV_INCONSISTENT_INPUT); 2489 ncvAssertReturn(h_HaarNodes.length() >= haar.NumClassifierTotalNodes, NCV_INCONSISTENT_INPUT); 2490 ncvAssertReturn(h_HaarFeatures.length() >= haar.NumFeatures, NCV_INCONSISTENT_INPUT); 2491 ncvAssertReturn(h_HaarStages.memType() == NCVMemoryTypeHostPinned && 2492 h_HaarNodes.memType() == NCVMemoryTypeHostPinned && 2493 h_HaarFeatures.memType() == NCVMemoryTypeHostPinned, NCV_MEM_RESIDENCE_ERROR); 2494 2495 Ncv32u szStages = haar.NumStages * sizeof(HaarStage64); 2496 Ncv32u szClassifiers = haar.NumClassifierTotalNodes * sizeof(HaarClassifierNode128); 2497 Ncv32u szFeatures = haar.NumFeatures * sizeof(HaarFeature64); 2498 2499 Ncv32u dataOffset = 0; 2500 std::vector<unsigned char> fdata; 2501 fdata.resize(szStages+szClassifiers+szFeatures+1024, 0); 2502 2503 //header 2504 *(Ncv32u *)(&fdata[0]+dataOffset) = NVBIN_HAAR_VERSION; 2505 2506 //data 2507 dataOffset = NVBIN_HAAR_SIZERESERVED; 2508 *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumStages; 2509 dataOffset += sizeof(Ncv32u); 2510 *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierRootNodes; 2511 dataOffset += sizeof(Ncv32u); 2512 *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumClassifierTotalNodes; 2513 dataOffset += sizeof(Ncv32u); 2514 *(Ncv32u *)(&fdata[0]+dataOffset) = haar.NumFeatures; 2515 dataOffset += sizeof(Ncv32u); 2516 *(NcvSize32u *)(&fdata[0]+dataOffset) = haar.ClassifierSize; 2517 dataOffset += sizeof(NcvSize32u); 2518 *(NcvBool *)(&fdata[0]+dataOffset) = haar.bNeedsTiltedII; 2519 dataOffset += sizeof(NcvBool); 2520 *(NcvBool *)(&fdata[0]+dataOffset) = haar.bHasStumpsOnly; 2521 dataOffset += sizeof(NcvBool); 2522 2523 memcpy(&fdata[0]+dataOffset, h_HaarStages.ptr(), szStages); 2524 dataOffset += szStages; 2525 memcpy(&fdata[0]+dataOffset, h_HaarNodes.ptr(), szClassifiers); 2526 dataOffset += szClassifiers; 2527 memcpy(&fdata[0]+dataOffset, h_HaarFeatures.ptr(), szFeatures); 2528 dataOffset += szFeatures; 2529 Ncv32u fsize = dataOffset; 2530 2531 //TODO: CRC32 here 2532 2533 //update header 2534 dataOffset = sizeof(Ncv32u); 2535 *(Ncv32u *)(&fdata[0]+dataOffset) = fsize; 2536 2537 FILE *fp = fopen(filename.c_str(), "wb"); 2538 ncvAssertReturn(fp != NULL, NCV_FILE_ERROR); 2539 fwrite(&fdata[0], fsize, 1, fp); 2540 fclose(fp); 2541 return NCV_SUCCESS; 2542 } 2543