1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Copyright (C) 2014, Itseez Inc., all rights reserved. 16 // Third party copyrights are property of their respective owners. 17 // 18 // Redistribution and use in source and binary forms, with or without modification, 19 // are permitted provided that the following conditions are met: 20 // 21 // * Redistribution's of source code must retain the above copyright notice, 22 // this list of conditions and the following disclaimer. 23 // 24 // * Redistribution's in binary form must reproduce the above copyright notice, 25 // this list of conditions and the following disclaimer in the documentation 26 // and/or other materials provided with the distribution. 27 // 28 // * The name of the copyright holders may not be used to endorse or promote products 29 // derived from this software without specific prior written permission. 30 // 31 // This software is provided by the copyright holders and contributors "as is" and 32 // any express or implied warranties, including, but not limited to, the implied 33 // warranties of merchantability and fitness for a particular purpose are disclaimed. 34 // In no event shall the Intel Corporation or contributors be liable for any direct, 35 // indirect, incidental, special, exemplary, or consequential damages 36 // (including, but not limited to, procurement of substitute goods or services; 37 // loss of use, data, or profits; or business interruption) however caused 38 // and on any theory of liability, whether in contract, strict liability, 39 // or tort (including negligence or otherwise) arising in any way out of 40 // the use of this software, even if advised of the possibility of such damage. 41 // 42 //M*/ 43 44 #include "precomp.hpp" 45 #include "opencl_kernels_imgproc.hpp" 46 47 #if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) 48 static IppStatus sts = ippInit(); 49 #endif 50 51 namespace cv 52 { 53 54 template <typename T, typename ST, typename QT> 55 struct Integral_SIMD 56 { 57 bool operator()(const T *, size_t, 58 ST *, size_t, 59 QT *, size_t, 60 ST *, size_t, 61 Size, int) const 62 { 63 return false; 64 } 65 }; 66 67 #if CV_SSE2 68 69 template <> 70 struct Integral_SIMD<uchar, int, double> 71 { 72 Integral_SIMD() 73 { 74 haveSSE2 = checkHardwareSupport(CV_CPU_SSE2); 75 } 76 77 bool operator()(const uchar * src, size_t _srcstep, 78 int * sum, size_t _sumstep, 79 double * sqsum, size_t, 80 int * tilted, size_t, 81 Size size, int cn) const 82 { 83 if (sqsum || tilted || cn != 1 || !haveSSE2) 84 return false; 85 86 // the first iteration 87 memset(sum, 0, (size.width + 1) * sizeof(int)); 88 89 __m128i v_zero = _mm_setzero_si128(), prev = v_zero; 90 int j = 0; 91 92 // the others 93 for (int i = 0; i < size.height; ++i) 94 { 95 const uchar * src_row = src + _srcstep * i; 96 int * prev_sum_row = (int *)((uchar *)sum + _sumstep * i) + 1; 97 int * sum_row = (int *)((uchar *)sum + _sumstep * (i + 1)) + 1; 98 99 sum_row[-1] = 0; 100 101 prev = v_zero; 102 j = 0; 103 104 for ( ; j + 7 < size.width; j += 8) 105 { 106 __m128i vsuml = _mm_loadu_si128((const __m128i *)(prev_sum_row + j)); 107 __m128i vsumh = _mm_loadu_si128((const __m128i *)(prev_sum_row + j + 4)); 108 109 __m128i el8shr0 = _mm_loadl_epi64((const __m128i *)(src_row + j)); 110 __m128i el8shr1 = _mm_slli_si128(el8shr0, 1); 111 __m128i el8shr2 = _mm_slli_si128(el8shr0, 2); 112 __m128i el8shr3 = _mm_slli_si128(el8shr0, 3); 113 114 vsuml = _mm_add_epi32(vsuml, prev); 115 vsumh = _mm_add_epi32(vsumh, prev); 116 117 __m128i el8shr12 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr1, v_zero), 118 _mm_unpacklo_epi8(el8shr2, v_zero)); 119 __m128i el8shr03 = _mm_add_epi16(_mm_unpacklo_epi8(el8shr0, v_zero), 120 _mm_unpacklo_epi8(el8shr3, v_zero)); 121 __m128i el8 = _mm_add_epi16(el8shr12, el8shr03); 122 123 __m128i el4h = _mm_add_epi16(_mm_unpackhi_epi16(el8, v_zero), 124 _mm_unpacklo_epi16(el8, v_zero)); 125 126 vsuml = _mm_add_epi32(vsuml, _mm_unpacklo_epi16(el8, v_zero)); 127 vsumh = _mm_add_epi32(vsumh, el4h); 128 129 _mm_storeu_si128((__m128i *)(sum_row + j), vsuml); 130 _mm_storeu_si128((__m128i *)(sum_row + j + 4), vsumh); 131 132 prev = _mm_add_epi32(prev, _mm_shuffle_epi32(el4h, _MM_SHUFFLE(3, 3, 3, 3))); 133 } 134 135 for (int v = sum_row[j - 1] - prev_sum_row[j - 1]; j < size.width; ++j) 136 sum_row[j] = (v += src_row[j]) + prev_sum_row[j]; 137 } 138 139 return true; 140 } 141 142 bool haveSSE2; 143 }; 144 145 #endif 146 147 template<typename T, typename ST, typename QT> 148 void integral_( const T* src, size_t _srcstep, ST* sum, size_t _sumstep, 149 QT* sqsum, size_t _sqsumstep, ST* tilted, size_t _tiltedstep, 150 Size size, int cn ) 151 { 152 int x, y, k; 153 154 if (Integral_SIMD<T, ST, QT>()(src, _srcstep, 155 sum, _sumstep, 156 sqsum, _sqsumstep, 157 tilted, _tiltedstep, 158 size, cn)) 159 return; 160 161 int srcstep = (int)(_srcstep/sizeof(T)); 162 int sumstep = (int)(_sumstep/sizeof(ST)); 163 int tiltedstep = (int)(_tiltedstep/sizeof(ST)); 164 int sqsumstep = (int)(_sqsumstep/sizeof(QT)); 165 166 size.width *= cn; 167 168 memset( sum, 0, (size.width+cn)*sizeof(sum[0])); 169 sum += sumstep + cn; 170 171 if( sqsum ) 172 { 173 memset( sqsum, 0, (size.width+cn)*sizeof(sqsum[0])); 174 sqsum += sqsumstep + cn; 175 } 176 177 if( tilted ) 178 { 179 memset( tilted, 0, (size.width+cn)*sizeof(tilted[0])); 180 tilted += tiltedstep + cn; 181 } 182 183 if( sqsum == 0 && tilted == 0 ) 184 { 185 for( y = 0; y < size.height; y++, src += srcstep - cn, sum += sumstep - cn ) 186 { 187 for( k = 0; k < cn; k++, src++, sum++ ) 188 { 189 ST s = sum[-cn] = 0; 190 for( x = 0; x < size.width; x += cn ) 191 { 192 s += src[x]; 193 sum[x] = sum[x - sumstep] + s; 194 } 195 } 196 } 197 } 198 else if( tilted == 0 ) 199 { 200 for( y = 0; y < size.height; y++, src += srcstep - cn, 201 sum += sumstep - cn, sqsum += sqsumstep - cn ) 202 { 203 for( k = 0; k < cn; k++, src++, sum++, sqsum++ ) 204 { 205 ST s = sum[-cn] = 0; 206 QT sq = sqsum[-cn] = 0; 207 for( x = 0; x < size.width; x += cn ) 208 { 209 T it = src[x]; 210 s += it; 211 sq += (QT)it*it; 212 ST t = sum[x - sumstep] + s; 213 QT tq = sqsum[x - sqsumstep] + sq; 214 sum[x] = t; 215 sqsum[x] = tq; 216 } 217 } 218 } 219 } 220 else 221 { 222 AutoBuffer<ST> _buf(size.width+cn); 223 ST* buf = _buf; 224 ST s; 225 QT sq; 226 for( k = 0; k < cn; k++, src++, sum++, tilted++, buf++ ) 227 { 228 sum[-cn] = tilted[-cn] = 0; 229 230 for( x = 0, s = 0, sq = 0; x < size.width; x += cn ) 231 { 232 T it = src[x]; 233 buf[x] = tilted[x] = it; 234 s += it; 235 sq += (QT)it*it; 236 sum[x] = s; 237 if( sqsum ) 238 sqsum[x] = sq; 239 } 240 241 if( size.width == cn ) 242 buf[cn] = 0; 243 244 if( sqsum ) 245 { 246 sqsum[-cn] = 0; 247 sqsum++; 248 } 249 } 250 251 for( y = 1; y < size.height; y++ ) 252 { 253 src += srcstep - cn; 254 sum += sumstep - cn; 255 tilted += tiltedstep - cn; 256 buf += -cn; 257 258 if( sqsum ) 259 sqsum += sqsumstep - cn; 260 261 for( k = 0; k < cn; k++, src++, sum++, tilted++, buf++ ) 262 { 263 T it = src[0]; 264 ST t0 = s = it; 265 QT tq0 = sq = (QT)it*it; 266 267 sum[-cn] = 0; 268 if( sqsum ) 269 sqsum[-cn] = 0; 270 tilted[-cn] = tilted[-tiltedstep]; 271 272 sum[0] = sum[-sumstep] + t0; 273 if( sqsum ) 274 sqsum[0] = sqsum[-sqsumstep] + tq0; 275 tilted[0] = tilted[-tiltedstep] + t0 + buf[cn]; 276 277 for( x = cn; x < size.width - cn; x += cn ) 278 { 279 ST t1 = buf[x]; 280 buf[x - cn] = t1 + t0; 281 t0 = it = src[x]; 282 tq0 = (QT)it*it; 283 s += t0; 284 sq += tq0; 285 sum[x] = sum[x - sumstep] + s; 286 if( sqsum ) 287 sqsum[x] = sqsum[x - sqsumstep] + sq; 288 t1 += buf[x + cn] + t0 + tilted[x - tiltedstep - cn]; 289 tilted[x] = t1; 290 } 291 292 if( size.width > cn ) 293 { 294 ST t1 = buf[x]; 295 buf[x - cn] = t1 + t0; 296 t0 = it = src[x]; 297 tq0 = (QT)it*it; 298 s += t0; 299 sq += tq0; 300 sum[x] = sum[x - sumstep] + s; 301 if( sqsum ) 302 sqsum[x] = sqsum[x - sqsumstep] + sq; 303 tilted[x] = t0 + t1 + tilted[x - tiltedstep - cn]; 304 buf[x] = t0; 305 } 306 307 if( sqsum ) 308 sqsum++; 309 } 310 } 311 } 312 } 313 314 315 #define DEF_INTEGRAL_FUNC(suffix, T, ST, QT) \ 316 static void integral_##suffix( T* src, size_t srcstep, ST* sum, size_t sumstep, QT* sqsum, size_t sqsumstep, \ 317 ST* tilted, size_t tiltedstep, Size size, int cn ) \ 318 { integral_(src, srcstep, sum, sumstep, sqsum, sqsumstep, tilted, tiltedstep, size, cn); } 319 320 DEF_INTEGRAL_FUNC(8u32s, uchar, int, double) 321 DEF_INTEGRAL_FUNC(8u32s32s, uchar, int, int) 322 DEF_INTEGRAL_FUNC(8u32f64f, uchar, float, double) 323 DEF_INTEGRAL_FUNC(8u64f64f, uchar, double, double) 324 DEF_INTEGRAL_FUNC(16u64f64f, ushort, double, double) 325 DEF_INTEGRAL_FUNC(16s64f64f, short, double, double) 326 DEF_INTEGRAL_FUNC(32f32f64f, float, float, double) 327 DEF_INTEGRAL_FUNC(32f64f64f, float, double, double) 328 DEF_INTEGRAL_FUNC(64f64f64f, double, double, double) 329 330 DEF_INTEGRAL_FUNC(8u32s32f, uchar, int, float) 331 DEF_INTEGRAL_FUNC(8u32f32f, uchar, float, float) 332 DEF_INTEGRAL_FUNC(32f32f32f, float, float, float) 333 334 typedef void (*IntegralFunc)(const uchar* src, size_t srcstep, uchar* sum, size_t sumstep, 335 uchar* sqsum, size_t sqsumstep, uchar* tilted, size_t tstep, 336 Size size, int cn ); 337 338 #ifdef HAVE_OPENCL 339 340 static bool ocl_integral( InputArray _src, OutputArray _sum, int sdepth ) 341 { 342 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 343 344 if ( (_src.type() != CV_8UC1) || 345 !(sdepth == CV_32S || sdepth == CV_32F || (doubleSupport && sdepth == CV_64F))) 346 return false; 347 348 static const int tileSize = 16; 349 350 String build_opt = format("-D sumT=%s -D LOCAL_SUM_SIZE=%d%s", 351 ocl::typeToStr(sdepth), tileSize, 352 doubleSupport ? " -D DOUBLE_SUPPORT" : ""); 353 354 ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt); 355 if (kcols.empty()) 356 return false; 357 358 UMat src = _src.getUMat(); 359 Size src_size = src.size(); 360 Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize); 361 UMat buf(bufsize, sdepth); 362 kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf)); 363 size_t gt = src.cols, lt = tileSize; 364 if (!kcols.run(1, >, <, false)) 365 return false; 366 367 ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt); 368 if (krows.empty()) 369 return false; 370 371 Size sumsize(src_size.width + 1, src_size.height + 1); 372 _sum.create(sumsize, sdepth); 373 UMat sum = _sum.getUMat(); 374 375 krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::WriteOnly(sum)); 376 gt = src.rows; 377 return krows.run(1, >, <, false); 378 } 379 380 static bool ocl_integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, int sdepth, int sqdepth ) 381 { 382 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 383 384 if ( _src.type() != CV_8UC1 || (!doubleSupport && (sdepth == CV_64F || sqdepth == CV_64F)) ) 385 return false; 386 387 static const int tileSize = 16; 388 389 String build_opt = format("-D SUM_SQUARE -D sumT=%s -D sumSQT=%s -D LOCAL_SUM_SIZE=%d%s", 390 ocl::typeToStr(sdepth), ocl::typeToStr(sqdepth), 391 tileSize, 392 doubleSupport ? " -D DOUBLE_SUPPORT" : ""); 393 394 ocl::Kernel kcols("integral_sum_cols", ocl::imgproc::integral_sum_oclsrc, build_opt); 395 if (kcols.empty()) 396 return false; 397 398 UMat src = _src.getUMat(); 399 Size src_size = src.size(); 400 Size bufsize(((src_size.height + tileSize - 1) / tileSize) * tileSize, ((src_size.width + tileSize - 1) / tileSize) * tileSize); 401 UMat buf(bufsize, sdepth); 402 UMat buf_sq(bufsize, sqdepth); 403 kcols.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnlyNoSize(buf), ocl::KernelArg::WriteOnlyNoSize(buf_sq)); 404 size_t gt = src.cols, lt = tileSize; 405 if (!kcols.run(1, >, <, false)) 406 return false; 407 408 ocl::Kernel krows("integral_sum_rows", ocl::imgproc::integral_sum_oclsrc, build_opt); 409 if (krows.empty()) 410 return false; 411 412 Size sumsize(src_size.width + 1, src_size.height + 1); 413 _sum.create(sumsize, sdepth); 414 UMat sum = _sum.getUMat(); 415 _sqsum.create(sumsize, sqdepth); 416 UMat sum_sq = _sqsum.getUMat(); 417 418 krows.args(ocl::KernelArg::ReadOnlyNoSize(buf), ocl::KernelArg::ReadOnlyNoSize(buf_sq), ocl::KernelArg::WriteOnly(sum), ocl::KernelArg::WriteOnlyNoSize(sum_sq)); 419 gt = src.rows; 420 return krows.run(1, >, <, false); 421 } 422 423 #endif 424 425 } 426 427 428 void cv::integral( InputArray _src, OutputArray _sum, OutputArray _sqsum, OutputArray _tilted, int sdepth, int sqdepth ) 429 { 430 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 431 if( sdepth <= 0 ) 432 sdepth = depth == CV_8U ? CV_32S : CV_64F; 433 if ( sqdepth <= 0 ) 434 sqdepth = CV_64F; 435 sdepth = CV_MAT_DEPTH(sdepth), sqdepth = CV_MAT_DEPTH(sqdepth); 436 437 #ifdef HAVE_OPENCL 438 if (ocl::useOpenCL() && _sum.isUMat() && !_tilted.needed()) 439 { 440 if (!_sqsum.needed()) 441 { 442 CV_OCL_RUN(ocl::useOpenCL(), ocl_integral(_src, _sum, sdepth)) 443 } 444 else if (_sqsum.isUMat()) 445 CV_OCL_RUN(ocl::useOpenCL(), ocl_integral(_src, _sum, _sqsum, sdepth, sqdepth)) 446 } 447 #endif 448 449 Size ssize = _src.size(), isize(ssize.width + 1, ssize.height + 1); 450 _sum.create( isize, CV_MAKETYPE(sdepth, cn) ); 451 Mat src = _src.getMat(), sum =_sum.getMat(), sqsum, tilted; 452 453 if( _sqsum.needed() ) 454 { 455 _sqsum.create( isize, CV_MAKETYPE(sqdepth, cn) ); 456 sqsum = _sqsum.getMat(); 457 }; 458 459 #if defined(HAVE_IPP) && !defined(HAVE_IPP_ICV_ONLY) // Disabled on ICV due invalid results 460 CV_IPP_CHECK() 461 { 462 if( ( depth == CV_8U ) && ( sdepth == CV_32F || sdepth == CV_32S ) && ( !_tilted.needed() ) && ( !_sqsum.needed() || sqdepth == CV_64F ) && ( cn == 1 ) ) 463 { 464 IppStatus status = ippStsErr; 465 IppiSize srcRoiSize = ippiSize( src.cols, src.rows ); 466 if( sdepth == CV_32F ) 467 { 468 if( _sqsum.needed() ) 469 { 470 status = ippiSqrIntegral_8u32f64f_C1R( (const Ipp8u*)src.data, (int)src.step, (Ipp32f*)sum.data, (int)sum.step, (Ipp64f*)sqsum.data, (int)sqsum.step, srcRoiSize, 0, 0 ); 471 } 472 else 473 { 474 status = ippiIntegral_8u32f_C1R( (const Ipp8u*)src.data, (int)src.step, (Ipp32f*)sum.data, (int)sum.step, srcRoiSize, 0 ); 475 } 476 } 477 else if( sdepth == CV_32S ) 478 { 479 if( _sqsum.needed() ) 480 { 481 status = ippiSqrIntegral_8u32s64f_C1R( (const Ipp8u*)src.data, (int)src.step, (Ipp32s*)sum.data, (int)sum.step, (Ipp64f*)sqsum.data, (int)sqsum.step, srcRoiSize, 0, 0 ); 482 } 483 else 484 { 485 status = ippiIntegral_8u32s_C1R( (const Ipp8u*)src.data, (int)src.step, (Ipp32s*)sum.data, (int)sum.step, srcRoiSize, 0 ); 486 } 487 } 488 if (0 <= status) 489 { 490 CV_IMPL_ADD(CV_IMPL_IPP); 491 return; 492 } 493 setIppErrorStatus(); 494 } 495 } 496 #endif 497 498 if( _tilted.needed() ) 499 { 500 _tilted.create( isize, CV_MAKETYPE(sdepth, cn) ); 501 tilted = _tilted.getMat(); 502 } 503 504 IntegralFunc func = 0; 505 if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_64F ) 506 func = (IntegralFunc)GET_OPTIMIZED(integral_8u32s); 507 else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32F ) 508 func = (IntegralFunc)integral_8u32s32f; 509 else if( depth == CV_8U && sdepth == CV_32S && sqdepth == CV_32S ) 510 func = (IntegralFunc)integral_8u32s32s; 511 else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_64F ) 512 func = (IntegralFunc)integral_8u32f64f; 513 else if( depth == CV_8U && sdepth == CV_32F && sqdepth == CV_32F ) 514 func = (IntegralFunc)integral_8u32f32f; 515 else if( depth == CV_8U && sdepth == CV_64F && sqdepth == CV_64F ) 516 func = (IntegralFunc)integral_8u64f64f; 517 else if( depth == CV_16U && sdepth == CV_64F && sqdepth == CV_64F ) 518 func = (IntegralFunc)integral_16u64f64f; 519 else if( depth == CV_16S && sdepth == CV_64F && sqdepth == CV_64F ) 520 func = (IntegralFunc)integral_16s64f64f; 521 else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_64F ) 522 func = (IntegralFunc)integral_32f32f64f; 523 else if( depth == CV_32F && sdepth == CV_32F && sqdepth == CV_32F ) 524 func = (IntegralFunc)integral_32f32f32f; 525 else if( depth == CV_32F && sdepth == CV_64F && sqdepth == CV_64F ) 526 func = (IntegralFunc)integral_32f64f64f; 527 else if( depth == CV_64F && sdepth == CV_64F && sqdepth == CV_64F ) 528 func = (IntegralFunc)integral_64f64f64f; 529 else 530 CV_Error( CV_StsUnsupportedFormat, "" ); 531 532 func( src.ptr(), src.step, sum.ptr(), sum.step, sqsum.ptr(), sqsum.step, 533 tilted.ptr(), tilted.step, src.size(), cn ); 534 } 535 536 void cv::integral( InputArray src, OutputArray sum, int sdepth ) 537 { 538 integral( src, sum, noArray(), noArray(), sdepth ); 539 } 540 541 void cv::integral( InputArray src, OutputArray sum, OutputArray sqsum, int sdepth, int sqdepth ) 542 { 543 integral( src, sum, sqsum, noArray(), sdepth, sqdepth ); 544 } 545 546 547 CV_IMPL void 548 cvIntegral( const CvArr* image, CvArr* sumImage, 549 CvArr* sumSqImage, CvArr* tiltedSumImage ) 550 { 551 cv::Mat src = cv::cvarrToMat(image), sum = cv::cvarrToMat(sumImage), sum0 = sum; 552 cv::Mat sqsum0, sqsum, tilted0, tilted; 553 cv::Mat *psqsum = 0, *ptilted = 0; 554 555 if( sumSqImage ) 556 { 557 sqsum0 = sqsum = cv::cvarrToMat(sumSqImage); 558 psqsum = &sqsum; 559 } 560 561 if( tiltedSumImage ) 562 { 563 tilted0 = tilted = cv::cvarrToMat(tiltedSumImage); 564 ptilted = &tilted; 565 } 566 cv::integral( src, sum, psqsum ? cv::_OutputArray(*psqsum) : cv::_OutputArray(), 567 ptilted ? cv::_OutputArray(*ptilted) : cv::_OutputArray(), sum.depth() ); 568 569 CV_Assert( sum.data == sum0.data && sqsum.data == sqsum0.data && tilted.data == tilted0.data ); 570 } 571 572 /* End of file. */ 573