1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #include "precomp.hpp" 44 #include "opencl_kernels_imgproc.hpp" 45 46 /****************************************************************************************\ 47 Base Image Filter 48 \****************************************************************************************/ 49 50 #if IPP_VERSION_X100 >= 701 51 #define USE_IPP_SEP_FILTERS 1 52 #else 53 #undef USE_IPP_SEP_FILTERS 54 #endif 55 56 namespace cv 57 { 58 59 BaseRowFilter::BaseRowFilter() { ksize = anchor = -1; } 60 BaseRowFilter::~BaseRowFilter() {} 61 62 BaseColumnFilter::BaseColumnFilter() { ksize = anchor = -1; } 63 BaseColumnFilter::~BaseColumnFilter() {} 64 void BaseColumnFilter::reset() {} 65 66 BaseFilter::BaseFilter() { ksize = Size(-1,-1); anchor = Point(-1,-1); } 67 BaseFilter::~BaseFilter() {} 68 void BaseFilter::reset() {} 69 70 FilterEngine::FilterEngine() 71 { 72 srcType = dstType = bufType = -1; 73 rowBorderType = columnBorderType = BORDER_REPLICATE; 74 bufStep = startY = startY0 = endY = rowCount = dstY = 0; 75 maxWidth = 0; 76 77 wholeSize = Size(-1,-1); 78 } 79 80 81 FilterEngine::FilterEngine( const Ptr<BaseFilter>& _filter2D, 82 const Ptr<BaseRowFilter>& _rowFilter, 83 const Ptr<BaseColumnFilter>& _columnFilter, 84 int _srcType, int _dstType, int _bufType, 85 int _rowBorderType, int _columnBorderType, 86 const Scalar& _borderValue ) 87 { 88 init(_filter2D, _rowFilter, _columnFilter, _srcType, _dstType, _bufType, 89 _rowBorderType, _columnBorderType, _borderValue); 90 } 91 92 FilterEngine::~FilterEngine() 93 { 94 } 95 96 97 void FilterEngine::init( const Ptr<BaseFilter>& _filter2D, 98 const Ptr<BaseRowFilter>& _rowFilter, 99 const Ptr<BaseColumnFilter>& _columnFilter, 100 int _srcType, int _dstType, int _bufType, 101 int _rowBorderType, int _columnBorderType, 102 const Scalar& _borderValue ) 103 { 104 _srcType = CV_MAT_TYPE(_srcType); 105 _bufType = CV_MAT_TYPE(_bufType); 106 _dstType = CV_MAT_TYPE(_dstType); 107 108 srcType = _srcType; 109 int srcElemSize = (int)getElemSize(srcType); 110 dstType = _dstType; 111 bufType = _bufType; 112 113 filter2D = _filter2D; 114 rowFilter = _rowFilter; 115 columnFilter = _columnFilter; 116 117 if( _columnBorderType < 0 ) 118 _columnBorderType = _rowBorderType; 119 120 rowBorderType = _rowBorderType; 121 columnBorderType = _columnBorderType; 122 123 CV_Assert( columnBorderType != BORDER_WRAP ); 124 125 if( isSeparable() ) 126 { 127 CV_Assert( rowFilter && columnFilter ); 128 ksize = Size(rowFilter->ksize, columnFilter->ksize); 129 anchor = Point(rowFilter->anchor, columnFilter->anchor); 130 } 131 else 132 { 133 CV_Assert( bufType == srcType ); 134 ksize = filter2D->ksize; 135 anchor = filter2D->anchor; 136 } 137 138 CV_Assert( 0 <= anchor.x && anchor.x < ksize.width && 139 0 <= anchor.y && anchor.y < ksize.height ); 140 141 borderElemSize = srcElemSize/(CV_MAT_DEPTH(srcType) >= CV_32S ? sizeof(int) : 1); 142 int borderLength = std::max(ksize.width - 1, 1); 143 borderTab.resize(borderLength*borderElemSize); 144 145 maxWidth = bufStep = 0; 146 constBorderRow.clear(); 147 148 if( rowBorderType == BORDER_CONSTANT || columnBorderType == BORDER_CONSTANT ) 149 { 150 constBorderValue.resize(srcElemSize*borderLength); 151 int srcType1 = CV_MAKETYPE(CV_MAT_DEPTH(srcType), MIN(CV_MAT_CN(srcType), 4)); 152 scalarToRawData(_borderValue, &constBorderValue[0], srcType1, 153 borderLength*CV_MAT_CN(srcType)); 154 } 155 156 wholeSize = Size(-1,-1); 157 } 158 159 #define VEC_ALIGN CV_MALLOC_ALIGN 160 161 int FilterEngine::start(Size _wholeSize, Rect _roi, int _maxBufRows) 162 { 163 int i, j; 164 165 wholeSize = _wholeSize; 166 roi = _roi; 167 CV_Assert( roi.x >= 0 && roi.y >= 0 && roi.width >= 0 && roi.height >= 0 && 168 roi.x + roi.width <= wholeSize.width && 169 roi.y + roi.height <= wholeSize.height ); 170 171 int esz = (int)getElemSize(srcType); 172 int bufElemSize = (int)getElemSize(bufType); 173 const uchar* constVal = !constBorderValue.empty() ? &constBorderValue[0] : 0; 174 175 if( _maxBufRows < 0 ) 176 _maxBufRows = ksize.height + 3; 177 _maxBufRows = std::max(_maxBufRows, std::max(anchor.y, ksize.height-anchor.y-1)*2+1); 178 179 if( maxWidth < roi.width || _maxBufRows != (int)rows.size() ) 180 { 181 rows.resize(_maxBufRows); 182 maxWidth = std::max(maxWidth, roi.width); 183 int cn = CV_MAT_CN(srcType); 184 srcRow.resize(esz*(maxWidth + ksize.width - 1)); 185 if( columnBorderType == BORDER_CONSTANT ) 186 { 187 constBorderRow.resize(getElemSize(bufType)*(maxWidth + ksize.width - 1 + VEC_ALIGN)); 188 uchar *dst = alignPtr(&constBorderRow[0], VEC_ALIGN), *tdst; 189 int n = (int)constBorderValue.size(), N; 190 N = (maxWidth + ksize.width - 1)*esz; 191 tdst = isSeparable() ? &srcRow[0] : dst; 192 193 for( i = 0; i < N; i += n ) 194 { 195 n = std::min( n, N - i ); 196 for(j = 0; j < n; j++) 197 tdst[i+j] = constVal[j]; 198 } 199 200 if( isSeparable() ) 201 (*rowFilter)(&srcRow[0], dst, maxWidth, cn); 202 } 203 204 int maxBufStep = bufElemSize*(int)alignSize(maxWidth + 205 (!isSeparable() ? ksize.width - 1 : 0),VEC_ALIGN); 206 ringBuf.resize(maxBufStep*rows.size()+VEC_ALIGN); 207 } 208 209 // adjust bufstep so that the used part of the ring buffer stays compact in memory 210 bufStep = bufElemSize*(int)alignSize(roi.width + (!isSeparable() ? ksize.width - 1 : 0),16); 211 212 dx1 = std::max(anchor.x - roi.x, 0); 213 dx2 = std::max(ksize.width - anchor.x - 1 + roi.x + roi.width - wholeSize.width, 0); 214 215 // recompute border tables 216 if( dx1 > 0 || dx2 > 0 ) 217 { 218 if( rowBorderType == BORDER_CONSTANT ) 219 { 220 int nr = isSeparable() ? 1 : (int)rows.size(); 221 for( i = 0; i < nr; i++ ) 222 { 223 uchar* dst = isSeparable() ? &srcRow[0] : alignPtr(&ringBuf[0],VEC_ALIGN) + bufStep*i; 224 memcpy( dst, constVal, dx1*esz ); 225 memcpy( dst + (roi.width + ksize.width - 1 - dx2)*esz, constVal, dx2*esz ); 226 } 227 } 228 else 229 { 230 int xofs1 = std::min(roi.x, anchor.x) - roi.x; 231 232 int btab_esz = borderElemSize, wholeWidth = wholeSize.width; 233 int* btab = (int*)&borderTab[0]; 234 235 for( i = 0; i < dx1; i++ ) 236 { 237 int p0 = (borderInterpolate(i-dx1, wholeWidth, rowBorderType) + xofs1)*btab_esz; 238 for( j = 0; j < btab_esz; j++ ) 239 btab[i*btab_esz + j] = p0 + j; 240 } 241 242 for( i = 0; i < dx2; i++ ) 243 { 244 int p0 = (borderInterpolate(wholeWidth + i, wholeWidth, rowBorderType) + xofs1)*btab_esz; 245 for( j = 0; j < btab_esz; j++ ) 246 btab[(i + dx1)*btab_esz + j] = p0 + j; 247 } 248 } 249 } 250 251 rowCount = dstY = 0; 252 startY = startY0 = std::max(roi.y - anchor.y, 0); 253 endY = std::min(roi.y + roi.height + ksize.height - anchor.y - 1, wholeSize.height); 254 if( columnFilter ) 255 columnFilter->reset(); 256 if( filter2D ) 257 filter2D->reset(); 258 259 return startY; 260 } 261 262 263 int FilterEngine::start(const Mat& src, const Rect& _srcRoi, 264 bool isolated, int maxBufRows) 265 { 266 Rect srcRoi = _srcRoi; 267 268 if( srcRoi == Rect(0,0,-1,-1) ) 269 srcRoi = Rect(0,0,src.cols,src.rows); 270 271 CV_Assert( srcRoi.x >= 0 && srcRoi.y >= 0 && 272 srcRoi.width >= 0 && srcRoi.height >= 0 && 273 srcRoi.x + srcRoi.width <= src.cols && 274 srcRoi.y + srcRoi.height <= src.rows ); 275 276 Point ofs; 277 Size wsz(src.cols, src.rows); 278 if( !isolated ) 279 src.locateROI( wsz, ofs ); 280 start( wsz, srcRoi + ofs, maxBufRows ); 281 282 return startY - ofs.y; 283 } 284 285 286 int FilterEngine::remainingInputRows() const 287 { 288 return endY - startY - rowCount; 289 } 290 291 int FilterEngine::remainingOutputRows() const 292 { 293 return roi.height - dstY; 294 } 295 296 int FilterEngine::proceed( const uchar* src, int srcstep, int count, 297 uchar* dst, int dststep ) 298 { 299 CV_Assert( wholeSize.width > 0 && wholeSize.height > 0 ); 300 301 const int *btab = &borderTab[0]; 302 int esz = (int)getElemSize(srcType), btab_esz = borderElemSize; 303 uchar** brows = &rows[0]; 304 int bufRows = (int)rows.size(); 305 int cn = CV_MAT_CN(bufType); 306 int width = roi.width, kwidth = ksize.width; 307 int kheight = ksize.height, ay = anchor.y; 308 int _dx1 = dx1, _dx2 = dx2; 309 int width1 = roi.width + kwidth - 1; 310 int xofs1 = std::min(roi.x, anchor.x); 311 bool isSep = isSeparable(); 312 bool makeBorder = (_dx1 > 0 || _dx2 > 0) && rowBorderType != BORDER_CONSTANT; 313 int dy = 0, i = 0; 314 315 src -= xofs1*esz; 316 count = std::min(count, remainingInputRows()); 317 318 CV_Assert( src && dst && count > 0 ); 319 320 for(;; dst += dststep*i, dy += i) 321 { 322 int dcount = bufRows - ay - startY - rowCount + roi.y; 323 dcount = dcount > 0 ? dcount : bufRows - kheight + 1; 324 dcount = std::min(dcount, count); 325 count -= dcount; 326 for( ; dcount-- > 0; src += srcstep ) 327 { 328 int bi = (startY - startY0 + rowCount) % bufRows; 329 uchar* brow = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; 330 uchar* row = isSep ? &srcRow[0] : brow; 331 332 if( ++rowCount > bufRows ) 333 { 334 --rowCount; 335 ++startY; 336 } 337 338 memcpy( row + _dx1*esz, src, (width1 - _dx2 - _dx1)*esz ); 339 340 if( makeBorder ) 341 { 342 if( btab_esz*(int)sizeof(int) == esz ) 343 { 344 const int* isrc = (const int*)src; 345 int* irow = (int*)row; 346 347 for( i = 0; i < _dx1*btab_esz; i++ ) 348 irow[i] = isrc[btab[i]]; 349 for( i = 0; i < _dx2*btab_esz; i++ ) 350 irow[i + (width1 - _dx2)*btab_esz] = isrc[btab[i+_dx1*btab_esz]]; 351 } 352 else 353 { 354 for( i = 0; i < _dx1*esz; i++ ) 355 row[i] = src[btab[i]]; 356 for( i = 0; i < _dx2*esz; i++ ) 357 row[i + (width1 - _dx2)*esz] = src[btab[i+_dx1*esz]]; 358 } 359 } 360 361 if( isSep ) 362 (*rowFilter)(row, brow, width, CV_MAT_CN(srcType)); 363 } 364 365 int max_i = std::min(bufRows, roi.height - (dstY + dy) + (kheight - 1)); 366 for( i = 0; i < max_i; i++ ) 367 { 368 int srcY = borderInterpolate(dstY + dy + i + roi.y - ay, 369 wholeSize.height, columnBorderType); 370 if( srcY < 0 ) // can happen only with constant border type 371 brows[i] = alignPtr(&constBorderRow[0], VEC_ALIGN); 372 else 373 { 374 CV_Assert( srcY >= startY ); 375 if( srcY >= startY + rowCount ) 376 break; 377 int bi = (srcY - startY0) % bufRows; 378 brows[i] = alignPtr(&ringBuf[0], VEC_ALIGN) + bi*bufStep; 379 } 380 } 381 if( i < kheight ) 382 break; 383 i -= kheight - 1; 384 if( isSeparable() ) 385 (*columnFilter)((const uchar**)brows, dst, dststep, i, roi.width*cn); 386 else 387 (*filter2D)((const uchar**)brows, dst, dststep, i, roi.width, cn); 388 } 389 390 dstY += dy; 391 CV_Assert( dstY <= roi.height ); 392 return dy; 393 } 394 395 396 void FilterEngine::apply(const Mat& src, Mat& dst, 397 const Rect& _srcRoi, Point dstOfs, bool isolated) 398 { 399 CV_Assert( src.type() == srcType && dst.type() == dstType ); 400 401 Rect srcRoi = _srcRoi; 402 if( srcRoi == Rect(0,0,-1,-1) ) 403 srcRoi = Rect(0,0,src.cols,src.rows); 404 405 if( srcRoi.area() == 0 ) 406 return; 407 408 CV_Assert( dstOfs.x >= 0 && dstOfs.y >= 0 && 409 dstOfs.x + srcRoi.width <= dst.cols && 410 dstOfs.y + srcRoi.height <= dst.rows ); 411 412 int y = start(src, srcRoi, isolated); 413 proceed( src.ptr() + y*src.step + srcRoi.x*src.elemSize(), 414 (int)src.step, endY - startY, 415 dst.ptr(dstOfs.y) + 416 dstOfs.x*dst.elemSize(), (int)dst.step ); 417 } 418 419 } 420 421 /****************************************************************************************\ 422 * Separable linear filter * 423 \****************************************************************************************/ 424 425 int cv::getKernelType(InputArray filter_kernel, Point anchor) 426 { 427 Mat _kernel = filter_kernel.getMat(); 428 CV_Assert( _kernel.channels() == 1 ); 429 int i, sz = _kernel.rows*_kernel.cols; 430 431 Mat kernel; 432 _kernel.convertTo(kernel, CV_64F); 433 434 const double* coeffs = kernel.ptr<double>(); 435 double sum = 0; 436 int type = KERNEL_SMOOTH + KERNEL_INTEGER; 437 if( (_kernel.rows == 1 || _kernel.cols == 1) && 438 anchor.x*2 + 1 == _kernel.cols && 439 anchor.y*2 + 1 == _kernel.rows ) 440 type |= (KERNEL_SYMMETRICAL + KERNEL_ASYMMETRICAL); 441 442 for( i = 0; i < sz; i++ ) 443 { 444 double a = coeffs[i], b = coeffs[sz - i - 1]; 445 if( a != b ) 446 type &= ~KERNEL_SYMMETRICAL; 447 if( a != -b ) 448 type &= ~KERNEL_ASYMMETRICAL; 449 if( a < 0 ) 450 type &= ~KERNEL_SMOOTH; 451 if( a != saturate_cast<int>(a) ) 452 type &= ~KERNEL_INTEGER; 453 sum += a; 454 } 455 456 if( fabs(sum - 1) > FLT_EPSILON*(fabs(sum) + 1) ) 457 type &= ~KERNEL_SMOOTH; 458 return type; 459 } 460 461 462 namespace cv 463 { 464 465 struct RowNoVec 466 { 467 RowNoVec() {} 468 RowNoVec(const Mat&) {} 469 int operator()(const uchar*, uchar*, int, int) const { return 0; } 470 }; 471 472 struct ColumnNoVec 473 { 474 ColumnNoVec() {} 475 ColumnNoVec(const Mat&, int, int, double) {} 476 int operator()(const uchar**, uchar*, int) const { return 0; } 477 }; 478 479 struct SymmRowSmallNoVec 480 { 481 SymmRowSmallNoVec() {} 482 SymmRowSmallNoVec(const Mat&, int) {} 483 int operator()(const uchar*, uchar*, int, int) const { return 0; } 484 }; 485 486 struct SymmColumnSmallNoVec 487 { 488 SymmColumnSmallNoVec() {} 489 SymmColumnSmallNoVec(const Mat&, int, int, double) {} 490 int operator()(const uchar**, uchar*, int) const { return 0; } 491 }; 492 493 struct FilterNoVec 494 { 495 FilterNoVec() {} 496 FilterNoVec(const Mat&, int, double) {} 497 int operator()(const uchar**, uchar*, int) const { return 0; } 498 }; 499 500 501 #if CV_SSE2 502 503 ///////////////////////////////////// 8u-16s & 8u-8u ////////////////////////////////// 504 505 struct RowVec_8u32s 506 { 507 RowVec_8u32s() { smallValues = false; } 508 RowVec_8u32s( const Mat& _kernel ) 509 { 510 kernel = _kernel; 511 smallValues = true; 512 int k, ksize = kernel.rows + kernel.cols - 1; 513 for( k = 0; k < ksize; k++ ) 514 { 515 int v = kernel.ptr<int>()[k]; 516 if( v < SHRT_MIN || v > SHRT_MAX ) 517 { 518 smallValues = false; 519 break; 520 } 521 } 522 } 523 524 int operator()(const uchar* _src, uchar* _dst, int width, int cn) const 525 { 526 if( !checkHardwareSupport(CV_CPU_SSE2) ) 527 return 0; 528 529 int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; 530 int* dst = (int*)_dst; 531 const int* _kx = kernel.ptr<int>(); 532 width *= cn; 533 534 if( smallValues ) 535 { 536 for( ; i <= width - 16; i += 16 ) 537 { 538 const uchar* src = _src + i; 539 __m128i f, z = _mm_setzero_si128(), s0 = z, s1 = z, s2 = z, s3 = z; 540 __m128i x0, x1, x2, x3; 541 542 for( k = 0; k < _ksize; k++, src += cn ) 543 { 544 f = _mm_cvtsi32_si128(_kx[k]); 545 f = _mm_shuffle_epi32(f, 0); 546 f = _mm_packs_epi32(f, f); 547 548 x0 = _mm_loadu_si128((const __m128i*)src); 549 x2 = _mm_unpackhi_epi8(x0, z); 550 x0 = _mm_unpacklo_epi8(x0, z); 551 x1 = _mm_mulhi_epi16(x0, f); 552 x3 = _mm_mulhi_epi16(x2, f); 553 x0 = _mm_mullo_epi16(x0, f); 554 x2 = _mm_mullo_epi16(x2, f); 555 556 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); 557 s1 = _mm_add_epi32(s1, _mm_unpackhi_epi16(x0, x1)); 558 s2 = _mm_add_epi32(s2, _mm_unpacklo_epi16(x2, x3)); 559 s3 = _mm_add_epi32(s3, _mm_unpackhi_epi16(x2, x3)); 560 } 561 562 _mm_store_si128((__m128i*)(dst + i), s0); 563 _mm_store_si128((__m128i*)(dst + i + 4), s1); 564 _mm_store_si128((__m128i*)(dst + i + 8), s2); 565 _mm_store_si128((__m128i*)(dst + i + 12), s3); 566 } 567 568 for( ; i <= width - 4; i += 4 ) 569 { 570 const uchar* src = _src + i; 571 __m128i f, z = _mm_setzero_si128(), s0 = z, x0, x1; 572 573 for( k = 0; k < _ksize; k++, src += cn ) 574 { 575 f = _mm_cvtsi32_si128(_kx[k]); 576 f = _mm_shuffle_epi32(f, 0); 577 f = _mm_packs_epi32(f, f); 578 579 x0 = _mm_cvtsi32_si128(*(const int*)src); 580 x0 = _mm_unpacklo_epi8(x0, z); 581 x1 = _mm_mulhi_epi16(x0, f); 582 x0 = _mm_mullo_epi16(x0, f); 583 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); 584 } 585 _mm_store_si128((__m128i*)(dst + i), s0); 586 } 587 } 588 return i; 589 } 590 591 Mat kernel; 592 bool smallValues; 593 }; 594 595 596 struct SymmRowSmallVec_8u32s 597 { 598 SymmRowSmallVec_8u32s() { smallValues = false; } 599 SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) 600 { 601 kernel = _kernel; 602 symmetryType = _symmetryType; 603 smallValues = true; 604 int k, ksize = kernel.rows + kernel.cols - 1; 605 for( k = 0; k < ksize; k++ ) 606 { 607 int v = kernel.ptr<int>()[k]; 608 if( v < SHRT_MIN || v > SHRT_MAX ) 609 { 610 smallValues = false; 611 break; 612 } 613 } 614 } 615 616 int operator()(const uchar* src, uchar* _dst, int width, int cn) const 617 { 618 if( !checkHardwareSupport(CV_CPU_SSE2) ) 619 return 0; 620 621 int i = 0, j, k, _ksize = kernel.rows + kernel.cols - 1; 622 int* dst = (int*)_dst; 623 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 624 const int* kx = kernel.ptr<int>() + _ksize/2; 625 if( !smallValues ) 626 return 0; 627 628 src += (_ksize/2)*cn; 629 width *= cn; 630 631 __m128i z = _mm_setzero_si128(); 632 if( symmetrical ) 633 { 634 if( _ksize == 1 ) 635 return 0; 636 if( _ksize == 3 ) 637 { 638 if( kx[0] == 2 && kx[1] == 1 ) 639 for( ; i <= width - 16; i += 16, src += 16 ) 640 { 641 __m128i x0, x1, x2, y0, y1, y2; 642 x0 = _mm_loadu_si128((__m128i*)(src - cn)); 643 x1 = _mm_loadu_si128((__m128i*)src); 644 x2 = _mm_loadu_si128((__m128i*)(src + cn)); 645 y0 = _mm_unpackhi_epi8(x0, z); 646 x0 = _mm_unpacklo_epi8(x0, z); 647 y1 = _mm_unpackhi_epi8(x1, z); 648 x1 = _mm_unpacklo_epi8(x1, z); 649 y2 = _mm_unpackhi_epi8(x2, z); 650 x2 = _mm_unpacklo_epi8(x2, z); 651 x0 = _mm_add_epi16(x0, _mm_add_epi16(_mm_add_epi16(x1, x1), x2)); 652 y0 = _mm_add_epi16(y0, _mm_add_epi16(_mm_add_epi16(y1, y1), y2)); 653 _mm_store_si128((__m128i*)(dst + i), _mm_unpacklo_epi16(x0, z)); 654 _mm_store_si128((__m128i*)(dst + i + 4), _mm_unpackhi_epi16(x0, z)); 655 _mm_store_si128((__m128i*)(dst + i + 8), _mm_unpacklo_epi16(y0, z)); 656 _mm_store_si128((__m128i*)(dst + i + 12), _mm_unpackhi_epi16(y0, z)); 657 } 658 else if( kx[0] == -2 && kx[1] == 1 ) 659 for( ; i <= width - 16; i += 16, src += 16 ) 660 { 661 __m128i x0, x1, x2, y0, y1, y2; 662 x0 = _mm_loadu_si128((__m128i*)(src - cn)); 663 x1 = _mm_loadu_si128((__m128i*)src); 664 x2 = _mm_loadu_si128((__m128i*)(src + cn)); 665 y0 = _mm_unpackhi_epi8(x0, z); 666 x0 = _mm_unpacklo_epi8(x0, z); 667 y1 = _mm_unpackhi_epi8(x1, z); 668 x1 = _mm_unpacklo_epi8(x1, z); 669 y2 = _mm_unpackhi_epi8(x2, z); 670 x2 = _mm_unpacklo_epi8(x2, z); 671 x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1))); 672 y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1))); 673 _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); 674 _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); 675 _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); 676 _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); 677 } 678 else 679 { 680 __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), 681 k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0); 682 k0 = _mm_packs_epi32(k0, k0); 683 k1 = _mm_packs_epi32(k1, k1); 684 685 for( ; i <= width - 16; i += 16, src += 16 ) 686 { 687 __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3; 688 x0 = _mm_loadu_si128((__m128i*)(src - cn)); 689 x1 = _mm_loadu_si128((__m128i*)src); 690 x2 = _mm_loadu_si128((__m128i*)(src + cn)); 691 y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z)); 692 x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z)); 693 y1 = _mm_unpackhi_epi8(x1, z); 694 x1 = _mm_unpacklo_epi8(x1, z); 695 696 t1 = _mm_mulhi_epi16(x1, k0); 697 t0 = _mm_mullo_epi16(x1, k0); 698 x2 = _mm_mulhi_epi16(x0, k1); 699 x0 = _mm_mullo_epi16(x0, k1); 700 z0 = _mm_unpacklo_epi16(t0, t1); 701 z1 = _mm_unpackhi_epi16(t0, t1); 702 z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2)); 703 z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2)); 704 705 t1 = _mm_mulhi_epi16(y1, k0); 706 t0 = _mm_mullo_epi16(y1, k0); 707 y1 = _mm_mulhi_epi16(y0, k1); 708 y0 = _mm_mullo_epi16(y0, k1); 709 z2 = _mm_unpacklo_epi16(t0, t1); 710 z3 = _mm_unpackhi_epi16(t0, t1); 711 z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); 712 z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); 713 _mm_store_si128((__m128i*)(dst + i), z0); 714 _mm_store_si128((__m128i*)(dst + i + 4), z1); 715 _mm_store_si128((__m128i*)(dst + i + 8), z2); 716 _mm_store_si128((__m128i*)(dst + i + 12), z3); 717 } 718 } 719 } 720 else if( _ksize == 5 ) 721 { 722 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) 723 for( ; i <= width - 16; i += 16, src += 16 ) 724 { 725 __m128i x0, x1, x2, y0, y1, y2; 726 x0 = _mm_loadu_si128((__m128i*)(src - cn*2)); 727 x1 = _mm_loadu_si128((__m128i*)src); 728 x2 = _mm_loadu_si128((__m128i*)(src + cn*2)); 729 y0 = _mm_unpackhi_epi8(x0, z); 730 x0 = _mm_unpacklo_epi8(x0, z); 731 y1 = _mm_unpackhi_epi8(x1, z); 732 x1 = _mm_unpacklo_epi8(x1, z); 733 y2 = _mm_unpackhi_epi8(x2, z); 734 x2 = _mm_unpacklo_epi8(x2, z); 735 x0 = _mm_add_epi16(x0, _mm_sub_epi16(x2, _mm_add_epi16(x1, x1))); 736 y0 = _mm_add_epi16(y0, _mm_sub_epi16(y2, _mm_add_epi16(y1, y1))); 737 _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); 738 _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); 739 _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); 740 _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); 741 } 742 else 743 { 744 __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), 745 k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0), 746 k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0); 747 k0 = _mm_packs_epi32(k0, k0); 748 k1 = _mm_packs_epi32(k1, k1); 749 k2 = _mm_packs_epi32(k2, k2); 750 751 for( ; i <= width - 16; i += 16, src += 16 ) 752 { 753 __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3; 754 x0 = _mm_loadu_si128((__m128i*)(src - cn)); 755 x1 = _mm_loadu_si128((__m128i*)src); 756 x2 = _mm_loadu_si128((__m128i*)(src + cn)); 757 y0 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z)); 758 x0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z)); 759 y1 = _mm_unpackhi_epi8(x1, z); 760 x1 = _mm_unpacklo_epi8(x1, z); 761 762 t1 = _mm_mulhi_epi16(x1, k0); 763 t0 = _mm_mullo_epi16(x1, k0); 764 x2 = _mm_mulhi_epi16(x0, k1); 765 x0 = _mm_mullo_epi16(x0, k1); 766 z0 = _mm_unpacklo_epi16(t0, t1); 767 z1 = _mm_unpackhi_epi16(t0, t1); 768 z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(x0, x2)); 769 z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(x0, x2)); 770 771 t1 = _mm_mulhi_epi16(y1, k0); 772 t0 = _mm_mullo_epi16(y1, k0); 773 y1 = _mm_mulhi_epi16(y0, k1); 774 y0 = _mm_mullo_epi16(y0, k1); 775 z2 = _mm_unpacklo_epi16(t0, t1); 776 z3 = _mm_unpackhi_epi16(t0, t1); 777 z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); 778 z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); 779 780 x0 = _mm_loadu_si128((__m128i*)(src - cn*2)); 781 x1 = _mm_loadu_si128((__m128i*)(src + cn*2)); 782 y1 = _mm_add_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); 783 y0 = _mm_add_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); 784 785 t1 = _mm_mulhi_epi16(y0, k2); 786 t0 = _mm_mullo_epi16(y0, k2); 787 y0 = _mm_mullo_epi16(y1, k2); 788 y1 = _mm_mulhi_epi16(y1, k2); 789 z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1)); 790 z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1)); 791 z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); 792 z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); 793 794 _mm_store_si128((__m128i*)(dst + i), z0); 795 _mm_store_si128((__m128i*)(dst + i + 4), z1); 796 _mm_store_si128((__m128i*)(dst + i + 8), z2); 797 _mm_store_si128((__m128i*)(dst + i + 12), z3); 798 } 799 } 800 } 801 } 802 else 803 { 804 if( _ksize == 3 ) 805 { 806 if( kx[0] == 0 && kx[1] == 1 ) 807 for( ; i <= width - 16; i += 16, src += 16 ) 808 { 809 __m128i x0, x1, y0; 810 x0 = _mm_loadu_si128((__m128i*)(src + cn)); 811 x1 = _mm_loadu_si128((__m128i*)(src - cn)); 812 y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); 813 x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); 814 _mm_store_si128((__m128i*)(dst + i), _mm_srai_epi32(_mm_unpacklo_epi16(x0, x0),16)); 815 _mm_store_si128((__m128i*)(dst + i + 4), _mm_srai_epi32(_mm_unpackhi_epi16(x0, x0),16)); 816 _mm_store_si128((__m128i*)(dst + i + 8), _mm_srai_epi32(_mm_unpacklo_epi16(y0, y0),16)); 817 _mm_store_si128((__m128i*)(dst + i + 12), _mm_srai_epi32(_mm_unpackhi_epi16(y0, y0),16)); 818 } 819 else 820 { 821 __m128i k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0); 822 k1 = _mm_packs_epi32(k1, k1); 823 824 for( ; i <= width - 16; i += 16, src += 16 ) 825 { 826 __m128i x0, x1, y0, y1, z0, z1, z2, z3; 827 x0 = _mm_loadu_si128((__m128i*)(src + cn)); 828 x1 = _mm_loadu_si128((__m128i*)(src - cn)); 829 y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); 830 x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); 831 832 x1 = _mm_mulhi_epi16(x0, k1); 833 x0 = _mm_mullo_epi16(x0, k1); 834 z0 = _mm_unpacklo_epi16(x0, x1); 835 z1 = _mm_unpackhi_epi16(x0, x1); 836 837 y1 = _mm_mulhi_epi16(y0, k1); 838 y0 = _mm_mullo_epi16(y0, k1); 839 z2 = _mm_unpacklo_epi16(y0, y1); 840 z3 = _mm_unpackhi_epi16(y0, y1); 841 _mm_store_si128((__m128i*)(dst + i), z0); 842 _mm_store_si128((__m128i*)(dst + i + 4), z1); 843 _mm_store_si128((__m128i*)(dst + i + 8), z2); 844 _mm_store_si128((__m128i*)(dst + i + 12), z3); 845 } 846 } 847 } 848 else if( _ksize == 5 ) 849 { 850 __m128i k0 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[0]), 0), 851 k1 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[1]), 0), 852 k2 = _mm_shuffle_epi32(_mm_cvtsi32_si128(kx[2]), 0); 853 k0 = _mm_packs_epi32(k0, k0); 854 k1 = _mm_packs_epi32(k1, k1); 855 k2 = _mm_packs_epi32(k2, k2); 856 857 for( ; i <= width - 16; i += 16, src += 16 ) 858 { 859 __m128i x0, x1, x2, y0, y1, t0, t1, z0, z1, z2, z3; 860 x0 = _mm_loadu_si128((__m128i*)(src + cn)); 861 x2 = _mm_loadu_si128((__m128i*)(src - cn)); 862 y0 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x2, z)); 863 x0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x2, z)); 864 865 x2 = _mm_mulhi_epi16(x0, k1); 866 x0 = _mm_mullo_epi16(x0, k1); 867 z0 = _mm_unpacklo_epi16(x0, x2); 868 z1 = _mm_unpackhi_epi16(x0, x2); 869 y1 = _mm_mulhi_epi16(y0, k1); 870 y0 = _mm_mullo_epi16(y0, k1); 871 z2 = _mm_unpacklo_epi16(y0, y1); 872 z3 = _mm_unpackhi_epi16(y0, y1); 873 874 x0 = _mm_loadu_si128((__m128i*)(src + cn*2)); 875 x1 = _mm_loadu_si128((__m128i*)(src - cn*2)); 876 y1 = _mm_sub_epi16(_mm_unpackhi_epi8(x0, z), _mm_unpackhi_epi8(x1, z)); 877 y0 = _mm_sub_epi16(_mm_unpacklo_epi8(x0, z), _mm_unpacklo_epi8(x1, z)); 878 879 t1 = _mm_mulhi_epi16(y0, k2); 880 t0 = _mm_mullo_epi16(y0, k2); 881 y0 = _mm_mullo_epi16(y1, k2); 882 y1 = _mm_mulhi_epi16(y1, k2); 883 z0 = _mm_add_epi32(z0, _mm_unpacklo_epi16(t0, t1)); 884 z1 = _mm_add_epi32(z1, _mm_unpackhi_epi16(t0, t1)); 885 z2 = _mm_add_epi32(z2, _mm_unpacklo_epi16(y0, y1)); 886 z3 = _mm_add_epi32(z3, _mm_unpackhi_epi16(y0, y1)); 887 888 _mm_store_si128((__m128i*)(dst + i), z0); 889 _mm_store_si128((__m128i*)(dst + i + 4), z1); 890 _mm_store_si128((__m128i*)(dst + i + 8), z2); 891 _mm_store_si128((__m128i*)(dst + i + 12), z3); 892 } 893 } 894 } 895 896 src -= (_ksize/2)*cn; 897 kx -= _ksize/2; 898 for( ; i <= width - 4; i += 4, src += 4 ) 899 { 900 __m128i f, s0 = z, x0, x1; 901 902 for( k = j = 0; k < _ksize; k++, j += cn ) 903 { 904 f = _mm_cvtsi32_si128(kx[k]); 905 f = _mm_shuffle_epi32(f, 0); 906 f = _mm_packs_epi32(f, f); 907 908 x0 = _mm_cvtsi32_si128(*(const int*)(src + j)); 909 x0 = _mm_unpacklo_epi8(x0, z); 910 x1 = _mm_mulhi_epi16(x0, f); 911 x0 = _mm_mullo_epi16(x0, f); 912 s0 = _mm_add_epi32(s0, _mm_unpacklo_epi16(x0, x1)); 913 } 914 _mm_store_si128((__m128i*)(dst + i), s0); 915 } 916 917 return i; 918 } 919 920 Mat kernel; 921 int symmetryType; 922 bool smallValues; 923 }; 924 925 926 struct SymmColumnVec_32s8u 927 { 928 SymmColumnVec_32s8u() { symmetryType=0; } 929 SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) 930 { 931 symmetryType = _symmetryType; 932 _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); 933 delta = (float)(_delta/(1 << _bits)); 934 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 935 } 936 937 int operator()(const uchar** _src, uchar* dst, int width) const 938 { 939 if( !checkHardwareSupport(CV_CPU_SSE2) ) 940 return 0; 941 942 int ksize2 = (kernel.rows + kernel.cols - 1)/2; 943 const float* ky = kernel.ptr<float>() + ksize2; 944 int i = 0, k; 945 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 946 const int** src = (const int**)_src; 947 const __m128i *S, *S2; 948 __m128 d4 = _mm_set1_ps(delta); 949 950 if( symmetrical ) 951 { 952 for( ; i <= width - 16; i += 16 ) 953 { 954 __m128 f = _mm_load_ss(ky); 955 f = _mm_shuffle_ps(f, f, 0); 956 __m128 s0, s1, s2, s3; 957 __m128i x0, x1; 958 S = (const __m128i*)(src[0] + i); 959 s0 = _mm_cvtepi32_ps(_mm_load_si128(S)); 960 s1 = _mm_cvtepi32_ps(_mm_load_si128(S+1)); 961 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); 962 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); 963 s2 = _mm_cvtepi32_ps(_mm_load_si128(S+2)); 964 s3 = _mm_cvtepi32_ps(_mm_load_si128(S+3)); 965 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); 966 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); 967 968 for( k = 1; k <= ksize2; k++ ) 969 { 970 S = (const __m128i*)(src[k] + i); 971 S2 = (const __m128i*)(src[-k] + i); 972 f = _mm_load_ss(ky+k); 973 f = _mm_shuffle_ps(f, f, 0); 974 x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2)); 975 x1 = _mm_add_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1)); 976 s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); 977 s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); 978 x0 = _mm_add_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2)); 979 x1 = _mm_add_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3)); 980 s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); 981 s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); 982 } 983 984 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); 985 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); 986 x0 = _mm_packus_epi16(x0, x1); 987 _mm_storeu_si128((__m128i*)(dst + i), x0); 988 } 989 990 for( ; i <= width - 4; i += 4 ) 991 { 992 __m128 f = _mm_load_ss(ky); 993 f = _mm_shuffle_ps(f, f, 0); 994 __m128i x0; 995 __m128 s0 = _mm_cvtepi32_ps(_mm_load_si128((const __m128i*)(src[0] + i))); 996 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); 997 998 for( k = 1; k <= ksize2; k++ ) 999 { 1000 S = (const __m128i*)(src[k] + i); 1001 S2 = (const __m128i*)(src[-k] + i); 1002 f = _mm_load_ss(ky+k); 1003 f = _mm_shuffle_ps(f, f, 0); 1004 x0 = _mm_add_epi32(_mm_load_si128(S), _mm_load_si128(S2)); 1005 s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); 1006 } 1007 1008 x0 = _mm_cvtps_epi32(s0); 1009 x0 = _mm_packs_epi32(x0, x0); 1010 x0 = _mm_packus_epi16(x0, x0); 1011 *(int*)(dst + i) = _mm_cvtsi128_si32(x0); 1012 } 1013 } 1014 else 1015 { 1016 for( ; i <= width - 16; i += 16 ) 1017 { 1018 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; 1019 __m128i x0, x1; 1020 1021 for( k = 1; k <= ksize2; k++ ) 1022 { 1023 S = (const __m128i*)(src[k] + i); 1024 S2 = (const __m128i*)(src[-k] + i); 1025 f = _mm_load_ss(ky+k); 1026 f = _mm_shuffle_ps(f, f, 0); 1027 x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2)); 1028 x1 = _mm_sub_epi32(_mm_load_si128(S+1), _mm_load_si128(S2+1)); 1029 s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); 1030 s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); 1031 x0 = _mm_sub_epi32(_mm_load_si128(S+2), _mm_load_si128(S2+2)); 1032 x1 = _mm_sub_epi32(_mm_load_si128(S+3), _mm_load_si128(S2+3)); 1033 s2 = _mm_add_ps(s2, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); 1034 s3 = _mm_add_ps(s3, _mm_mul_ps(_mm_cvtepi32_ps(x1), f)); 1035 } 1036 1037 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); 1038 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); 1039 x0 = _mm_packus_epi16(x0, x1); 1040 _mm_storeu_si128((__m128i*)(dst + i), x0); 1041 } 1042 1043 for( ; i <= width - 4; i += 4 ) 1044 { 1045 __m128 f, s0 = d4; 1046 __m128i x0; 1047 1048 for( k = 1; k <= ksize2; k++ ) 1049 { 1050 S = (const __m128i*)(src[k] + i); 1051 S2 = (const __m128i*)(src[-k] + i); 1052 f = _mm_load_ss(ky+k); 1053 f = _mm_shuffle_ps(f, f, 0); 1054 x0 = _mm_sub_epi32(_mm_load_si128(S), _mm_load_si128(S2)); 1055 s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0), f)); 1056 } 1057 1058 x0 = _mm_cvtps_epi32(s0); 1059 x0 = _mm_packs_epi32(x0, x0); 1060 x0 = _mm_packus_epi16(x0, x0); 1061 *(int*)(dst + i) = _mm_cvtsi128_si32(x0); 1062 } 1063 } 1064 1065 return i; 1066 } 1067 1068 int symmetryType; 1069 float delta; 1070 Mat kernel; 1071 }; 1072 1073 1074 struct SymmColumnSmallVec_32s16s 1075 { 1076 SymmColumnSmallVec_32s16s() { symmetryType=0; } 1077 SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) 1078 { 1079 symmetryType = _symmetryType; 1080 _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); 1081 delta = (float)(_delta/(1 << _bits)); 1082 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 1083 } 1084 1085 int operator()(const uchar** _src, uchar* _dst, int width) const 1086 { 1087 if( !checkHardwareSupport(CV_CPU_SSE2) ) 1088 return 0; 1089 1090 int ksize2 = (kernel.rows + kernel.cols - 1)/2; 1091 const float* ky = kernel.ptr<float>() + ksize2; 1092 int i = 0; 1093 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 1094 const int** src = (const int**)_src; 1095 const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; 1096 short* dst = (short*)_dst; 1097 __m128 df4 = _mm_set1_ps(delta); 1098 __m128i d4 = _mm_cvtps_epi32(df4); 1099 1100 if( symmetrical ) 1101 { 1102 if( ky[0] == 2 && ky[1] == 1 ) 1103 { 1104 for( ; i <= width - 8; i += 8 ) 1105 { 1106 __m128i s0, s1, s2, s3, s4, s5; 1107 s0 = _mm_load_si128((__m128i*)(S0 + i)); 1108 s1 = _mm_load_si128((__m128i*)(S0 + i + 4)); 1109 s2 = _mm_load_si128((__m128i*)(S1 + i)); 1110 s3 = _mm_load_si128((__m128i*)(S1 + i + 4)); 1111 s4 = _mm_load_si128((__m128i*)(S2 + i)); 1112 s5 = _mm_load_si128((__m128i*)(S2 + i + 4)); 1113 s0 = _mm_add_epi32(s0, _mm_add_epi32(s4, _mm_add_epi32(s2, s2))); 1114 s1 = _mm_add_epi32(s1, _mm_add_epi32(s5, _mm_add_epi32(s3, s3))); 1115 s0 = _mm_add_epi32(s0, d4); 1116 s1 = _mm_add_epi32(s1, d4); 1117 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); 1118 } 1119 } 1120 else if( ky[0] == -2 && ky[1] == 1 ) 1121 { 1122 for( ; i <= width - 8; i += 8 ) 1123 { 1124 __m128i s0, s1, s2, s3, s4, s5; 1125 s0 = _mm_load_si128((__m128i*)(S0 + i)); 1126 s1 = _mm_load_si128((__m128i*)(S0 + i + 4)); 1127 s2 = _mm_load_si128((__m128i*)(S1 + i)); 1128 s3 = _mm_load_si128((__m128i*)(S1 + i + 4)); 1129 s4 = _mm_load_si128((__m128i*)(S2 + i)); 1130 s5 = _mm_load_si128((__m128i*)(S2 + i + 4)); 1131 s0 = _mm_add_epi32(s0, _mm_sub_epi32(s4, _mm_add_epi32(s2, s2))); 1132 s1 = _mm_add_epi32(s1, _mm_sub_epi32(s5, _mm_add_epi32(s3, s3))); 1133 s0 = _mm_add_epi32(s0, d4); 1134 s1 = _mm_add_epi32(s1, d4); 1135 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); 1136 } 1137 } 1138 else 1139 { 1140 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]); 1141 for( ; i <= width - 8; i += 8 ) 1142 { 1143 __m128 s0, s1; 1144 s0 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i))); 1145 s1 = _mm_cvtepi32_ps(_mm_load_si128((__m128i*)(S1 + i + 4))); 1146 s0 = _mm_add_ps(_mm_mul_ps(s0, k0), df4); 1147 s1 = _mm_add_ps(_mm_mul_ps(s1, k0), df4); 1148 __m128i x0, x1; 1149 x0 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i)), 1150 _mm_load_si128((__m128i*)(S2 + i))); 1151 x1 = _mm_add_epi32(_mm_load_si128((__m128i*)(S0 + i + 4)), 1152 _mm_load_si128((__m128i*)(S2 + i + 4))); 1153 s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1)); 1154 s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1)); 1155 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); 1156 _mm_storeu_si128((__m128i*)(dst + i), x0); 1157 } 1158 } 1159 } 1160 else 1161 { 1162 if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) 1163 { 1164 if( ky[1] < 0 ) 1165 std::swap(S0, S2); 1166 for( ; i <= width - 8; i += 8 ) 1167 { 1168 __m128i s0, s1, s2, s3; 1169 s0 = _mm_load_si128((__m128i*)(S2 + i)); 1170 s1 = _mm_load_si128((__m128i*)(S2 + i + 4)); 1171 s2 = _mm_load_si128((__m128i*)(S0 + i)); 1172 s3 = _mm_load_si128((__m128i*)(S0 + i + 4)); 1173 s0 = _mm_add_epi32(_mm_sub_epi32(s0, s2), d4); 1174 s1 = _mm_add_epi32(_mm_sub_epi32(s1, s3), d4); 1175 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0, s1)); 1176 } 1177 } 1178 else 1179 { 1180 __m128 k1 = _mm_set1_ps(ky[1]); 1181 for( ; i <= width - 8; i += 8 ) 1182 { 1183 __m128 s0 = df4, s1 = df4; 1184 __m128i x0, x1; 1185 x0 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i)), 1186 _mm_load_si128((__m128i*)(S0 + i))); 1187 x1 = _mm_sub_epi32(_mm_load_si128((__m128i*)(S2 + i + 4)), 1188 _mm_load_si128((__m128i*)(S0 + i + 4))); 1189 s0 = _mm_add_ps(s0, _mm_mul_ps(_mm_cvtepi32_ps(x0),k1)); 1190 s1 = _mm_add_ps(s1, _mm_mul_ps(_mm_cvtepi32_ps(x1),k1)); 1191 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); 1192 _mm_storeu_si128((__m128i*)(dst + i), x0); 1193 } 1194 } 1195 } 1196 1197 return i; 1198 } 1199 1200 int symmetryType; 1201 float delta; 1202 Mat kernel; 1203 }; 1204 1205 1206 /////////////////////////////////////// 16s ////////////////////////////////// 1207 1208 struct RowVec_16s32f 1209 { 1210 RowVec_16s32f() {} 1211 RowVec_16s32f( const Mat& _kernel ) 1212 { 1213 kernel = _kernel; 1214 sse2_supported = checkHardwareSupport(CV_CPU_SSE2); 1215 } 1216 1217 int operator()(const uchar* _src, uchar* _dst, int width, int cn) const 1218 { 1219 if( !sse2_supported ) 1220 return 0; 1221 1222 int i = 0, k, _ksize = kernel.rows + kernel.cols - 1; 1223 float* dst = (float*)_dst; 1224 const float* _kx = kernel.ptr<float>(); 1225 width *= cn; 1226 1227 for( ; i <= width - 8; i += 8 ) 1228 { 1229 const short* src = (const short*)_src + i; 1230 __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1; 1231 for( k = 0; k < _ksize; k++, src += cn ) 1232 { 1233 f = _mm_load_ss(_kx+k); 1234 f = _mm_shuffle_ps(f, f, 0); 1235 1236 __m128i x0i = _mm_loadu_si128((const __m128i*)src); 1237 __m128i x1i = _mm_srai_epi32(_mm_unpackhi_epi16(x0i, x0i), 16); 1238 x0i = _mm_srai_epi32(_mm_unpacklo_epi16(x0i, x0i), 16); 1239 x0 = _mm_cvtepi32_ps(x0i); 1240 x1 = _mm_cvtepi32_ps(x1i); 1241 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1242 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); 1243 } 1244 _mm_store_ps(dst + i, s0); 1245 _mm_store_ps(dst + i + 4, s1); 1246 } 1247 return i; 1248 } 1249 1250 Mat kernel; 1251 bool sse2_supported; 1252 }; 1253 1254 1255 struct SymmColumnVec_32f16s 1256 { 1257 SymmColumnVec_32f16s() { symmetryType=0; } 1258 SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta) 1259 { 1260 symmetryType = _symmetryType; 1261 kernel = _kernel; 1262 delta = (float)_delta; 1263 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 1264 sse2_supported = checkHardwareSupport(CV_CPU_SSE2); 1265 } 1266 1267 int operator()(const uchar** _src, uchar* _dst, int width) const 1268 { 1269 if( !sse2_supported ) 1270 return 0; 1271 1272 int ksize2 = (kernel.rows + kernel.cols - 1)/2; 1273 const float* ky = kernel.ptr<float>() + ksize2; 1274 int i = 0, k; 1275 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 1276 const float** src = (const float**)_src; 1277 const float *S, *S2; 1278 short* dst = (short*)_dst; 1279 __m128 d4 = _mm_set1_ps(delta); 1280 1281 if( symmetrical ) 1282 { 1283 for( ; i <= width - 16; i += 16 ) 1284 { 1285 __m128 f = _mm_load_ss(ky); 1286 f = _mm_shuffle_ps(f, f, 0); 1287 __m128 s0, s1, s2, s3; 1288 __m128 x0, x1; 1289 S = src[0] + i; 1290 s0 = _mm_load_ps(S); 1291 s1 = _mm_load_ps(S+4); 1292 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); 1293 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); 1294 s2 = _mm_load_ps(S+8); 1295 s3 = _mm_load_ps(S+12); 1296 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); 1297 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); 1298 1299 for( k = 1; k <= ksize2; k++ ) 1300 { 1301 S = src[k] + i; 1302 S2 = src[-k] + i; 1303 f = _mm_load_ss(ky+k); 1304 f = _mm_shuffle_ps(f, f, 0); 1305 x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2)); 1306 x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); 1307 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1308 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); 1309 x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); 1310 x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); 1311 s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); 1312 s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); 1313 } 1314 1315 __m128i s0i = _mm_cvtps_epi32(s0); 1316 __m128i s1i = _mm_cvtps_epi32(s1); 1317 __m128i s2i = _mm_cvtps_epi32(s2); 1318 __m128i s3i = _mm_cvtps_epi32(s3); 1319 1320 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i)); 1321 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i)); 1322 } 1323 1324 for( ; i <= width - 4; i += 4 ) 1325 { 1326 __m128 f = _mm_load_ss(ky); 1327 f = _mm_shuffle_ps(f, f, 0); 1328 __m128 x0, s0 = _mm_load_ps(src[0] + i); 1329 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); 1330 1331 for( k = 1; k <= ksize2; k++ ) 1332 { 1333 f = _mm_load_ss(ky+k); 1334 f = _mm_shuffle_ps(f, f, 0); 1335 S = src[k] + i; 1336 S2 = src[-k] + i; 1337 x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); 1338 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1339 } 1340 1341 __m128i s0i = _mm_cvtps_epi32(s0); 1342 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i)); 1343 } 1344 } 1345 else 1346 { 1347 for( ; i <= width - 16; i += 16 ) 1348 { 1349 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; 1350 __m128 x0, x1; 1351 S = src[0] + i; 1352 1353 for( k = 1; k <= ksize2; k++ ) 1354 { 1355 S = src[k] + i; 1356 S2 = src[-k] + i; 1357 f = _mm_load_ss(ky+k); 1358 f = _mm_shuffle_ps(f, f, 0); 1359 x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2)); 1360 x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); 1361 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1362 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); 1363 x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); 1364 x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); 1365 s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); 1366 s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); 1367 } 1368 1369 __m128i s0i = _mm_cvtps_epi32(s0); 1370 __m128i s1i = _mm_cvtps_epi32(s1); 1371 __m128i s2i = _mm_cvtps_epi32(s2); 1372 __m128i s3i = _mm_cvtps_epi32(s3); 1373 1374 _mm_storeu_si128((__m128i*)(dst + i), _mm_packs_epi32(s0i, s1i)); 1375 _mm_storeu_si128((__m128i*)(dst + i + 8), _mm_packs_epi32(s2i, s3i)); 1376 } 1377 1378 for( ; i <= width - 4; i += 4 ) 1379 { 1380 __m128 f, x0, s0 = d4; 1381 1382 for( k = 1; k <= ksize2; k++ ) 1383 { 1384 f = _mm_load_ss(ky+k); 1385 f = _mm_shuffle_ps(f, f, 0); 1386 x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); 1387 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1388 } 1389 1390 __m128i s0i = _mm_cvtps_epi32(s0); 1391 _mm_storel_epi64((__m128i*)(dst + i), _mm_packs_epi32(s0i, s0i)); 1392 } 1393 } 1394 1395 return i; 1396 } 1397 1398 int symmetryType; 1399 float delta; 1400 Mat kernel; 1401 bool sse2_supported; 1402 }; 1403 1404 1405 /////////////////////////////////////// 32f ////////////////////////////////// 1406 1407 struct RowVec_32f 1408 { 1409 RowVec_32f() 1410 { 1411 haveSSE = checkHardwareSupport(CV_CPU_SSE); 1412 } 1413 1414 RowVec_32f( const Mat& _kernel ) 1415 { 1416 kernel = _kernel; 1417 haveSSE = checkHardwareSupport(CV_CPU_SSE); 1418 #if defined USE_IPP_SEP_FILTERS && 0 1419 bufsz = -1; 1420 #endif 1421 } 1422 1423 int operator()(const uchar* _src, uchar* _dst, int width, int cn) const 1424 { 1425 #if defined USE_IPP_SEP_FILTERS && 0 1426 CV_IPP_CHECK() 1427 { 1428 int ret = ippiOperator(_src, _dst, width, cn); 1429 if (ret > 0) 1430 return ret; 1431 } 1432 #endif 1433 int _ksize = kernel.rows + kernel.cols - 1; 1434 const float* src0 = (const float*)_src; 1435 float* dst = (float*)_dst; 1436 const float* _kx = kernel.ptr<float>(); 1437 1438 if( !haveSSE ) 1439 return 0; 1440 1441 int i = 0, k; 1442 width *= cn; 1443 1444 for( ; i <= width - 8; i += 8 ) 1445 { 1446 const float* src = src0 + i; 1447 __m128 f, s0 = _mm_setzero_ps(), s1 = s0, x0, x1; 1448 for( k = 0; k < _ksize; k++, src += cn ) 1449 { 1450 f = _mm_load_ss(_kx+k); 1451 f = _mm_shuffle_ps(f, f, 0); 1452 1453 x0 = _mm_loadu_ps(src); 1454 x1 = _mm_loadu_ps(src + 4); 1455 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1456 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); 1457 } 1458 _mm_store_ps(dst + i, s0); 1459 _mm_store_ps(dst + i + 4, s1); 1460 } 1461 return i; 1462 } 1463 1464 Mat kernel; 1465 bool haveSSE; 1466 #if defined USE_IPP_SEP_FILTERS && 0 1467 private: 1468 mutable int bufsz; 1469 int ippiOperator(const uchar* _src, uchar* _dst, int width, int cn) const 1470 { 1471 int _ksize = kernel.rows + kernel.cols - 1; 1472 if ((1 != cn && 3 != cn) || width < _ksize*8) 1473 return 0; 1474 1475 const float* src = (const float*)_src; 1476 float* dst = (float*)_dst; 1477 const float* _kx = (const float*)kernel.data; 1478 1479 IppiSize roisz = { width, 1 }; 1480 if( bufsz < 0 ) 1481 { 1482 if( (cn == 1 && ippiFilterRowBorderPipelineGetBufferSize_32f_C1R(roisz, _ksize, &bufsz) < 0) || 1483 (cn == 3 && ippiFilterRowBorderPipelineGetBufferSize_32f_C3R(roisz, _ksize, &bufsz) < 0)) 1484 return 0; 1485 } 1486 AutoBuffer<uchar> buf(bufsz + 64); 1487 uchar* bufptr = alignPtr((uchar*)buf, 32); 1488 int step = (int)(width*sizeof(dst[0])*cn); 1489 float borderValue[] = {0.f, 0.f, 0.f}; 1490 // here is the trick. IPP needs border type and extrapolates the row. We did it already. 1491 // So we pass anchor=0 and ignore the right tail of results since they are incorrect there. 1492 if( (cn == 1 && ippiFilterRowBorderPipeline_32f_C1R(src, step, &dst, roisz, _kx, _ksize, 0, 1493 ippBorderRepl, borderValue[0], bufptr) < 0) || 1494 (cn == 3 && ippiFilterRowBorderPipeline_32f_C3R(src, step, &dst, roisz, _kx, _ksize, 0, 1495 ippBorderRepl, borderValue, bufptr) < 0)) 1496 { 1497 setIppErrorStatus(); 1498 return 0; 1499 } 1500 CV_IMPL_ADD(CV_IMPL_IPP); 1501 return width - _ksize + 1; 1502 } 1503 #endif 1504 }; 1505 1506 1507 struct SymmRowSmallVec_32f 1508 { 1509 SymmRowSmallVec_32f() {} 1510 SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) 1511 { 1512 kernel = _kernel; 1513 symmetryType = _symmetryType; 1514 } 1515 1516 int operator()(const uchar* _src, uchar* _dst, int width, int cn) const 1517 { 1518 if( !checkHardwareSupport(CV_CPU_SSE) ) 1519 return 0; 1520 1521 int i = 0, _ksize = kernel.rows + kernel.cols - 1; 1522 float* dst = (float*)_dst; 1523 const float* src = (const float*)_src + (_ksize/2)*cn; 1524 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 1525 const float* kx = kernel.ptr<float>() + _ksize/2; 1526 width *= cn; 1527 1528 if( symmetrical ) 1529 { 1530 if( _ksize == 1 ) 1531 return 0; 1532 if( _ksize == 3 ) 1533 { 1534 if( kx[0] == 2 && kx[1] == 1 ) 1535 for( ; i <= width - 8; i += 8, src += 8 ) 1536 { 1537 __m128 x0, x1, x2, y0, y1, y2; 1538 x0 = _mm_loadu_ps(src - cn); 1539 x1 = _mm_loadu_ps(src); 1540 x2 = _mm_loadu_ps(src + cn); 1541 y0 = _mm_loadu_ps(src - cn + 4); 1542 y1 = _mm_loadu_ps(src + 4); 1543 y2 = _mm_loadu_ps(src + cn + 4); 1544 x0 = _mm_add_ps(x0, _mm_add_ps(_mm_add_ps(x1, x1), x2)); 1545 y0 = _mm_add_ps(y0, _mm_add_ps(_mm_add_ps(y1, y1), y2)); 1546 _mm_store_ps(dst + i, x0); 1547 _mm_store_ps(dst + i + 4, y0); 1548 } 1549 else if( kx[0] == -2 && kx[1] == 1 ) 1550 for( ; i <= width - 8; i += 8, src += 8 ) 1551 { 1552 __m128 x0, x1, x2, y0, y1, y2; 1553 x0 = _mm_loadu_ps(src - cn); 1554 x1 = _mm_loadu_ps(src); 1555 x2 = _mm_loadu_ps(src + cn); 1556 y0 = _mm_loadu_ps(src - cn + 4); 1557 y1 = _mm_loadu_ps(src + 4); 1558 y2 = _mm_loadu_ps(src + cn + 4); 1559 x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1))); 1560 y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1))); 1561 _mm_store_ps(dst + i, x0); 1562 _mm_store_ps(dst + i + 4, y0); 1563 } 1564 else 1565 { 1566 __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]); 1567 for( ; i <= width - 8; i += 8, src += 8 ) 1568 { 1569 __m128 x0, x1, x2, y0, y1, y2; 1570 x0 = _mm_loadu_ps(src - cn); 1571 x1 = _mm_loadu_ps(src); 1572 x2 = _mm_loadu_ps(src + cn); 1573 y0 = _mm_loadu_ps(src - cn + 4); 1574 y1 = _mm_loadu_ps(src + 4); 1575 y2 = _mm_loadu_ps(src + cn + 4); 1576 1577 x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1); 1578 y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1); 1579 x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0)); 1580 y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0)); 1581 _mm_store_ps(dst + i, x0); 1582 _mm_store_ps(dst + i + 4, y0); 1583 } 1584 } 1585 } 1586 else if( _ksize == 5 ) 1587 { 1588 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) 1589 for( ; i <= width - 8; i += 8, src += 8 ) 1590 { 1591 __m128 x0, x1, x2, y0, y1, y2; 1592 x0 = _mm_loadu_ps(src - cn*2); 1593 x1 = _mm_loadu_ps(src); 1594 x2 = _mm_loadu_ps(src + cn*2); 1595 y0 = _mm_loadu_ps(src - cn*2 + 4); 1596 y1 = _mm_loadu_ps(src + 4); 1597 y2 = _mm_loadu_ps(src + cn*2 + 4); 1598 x0 = _mm_add_ps(x0, _mm_sub_ps(x2, _mm_add_ps(x1, x1))); 1599 y0 = _mm_add_ps(y0, _mm_sub_ps(y2, _mm_add_ps(y1, y1))); 1600 _mm_store_ps(dst + i, x0); 1601 _mm_store_ps(dst + i + 4, y0); 1602 } 1603 else 1604 { 1605 __m128 k0 = _mm_set1_ps(kx[0]), k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]); 1606 for( ; i <= width - 8; i += 8, src += 8 ) 1607 { 1608 __m128 x0, x1, x2, y0, y1, y2; 1609 x0 = _mm_loadu_ps(src - cn); 1610 x1 = _mm_loadu_ps(src); 1611 x2 = _mm_loadu_ps(src + cn); 1612 y0 = _mm_loadu_ps(src - cn + 4); 1613 y1 = _mm_loadu_ps(src + 4); 1614 y2 = _mm_loadu_ps(src + cn + 4); 1615 1616 x0 = _mm_mul_ps(_mm_add_ps(x0, x2), k1); 1617 y0 = _mm_mul_ps(_mm_add_ps(y0, y2), k1); 1618 x0 = _mm_add_ps(x0, _mm_mul_ps(x1, k0)); 1619 y0 = _mm_add_ps(y0, _mm_mul_ps(y1, k0)); 1620 1621 x2 = _mm_add_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2)); 1622 y2 = _mm_add_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4)); 1623 x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2)); 1624 y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2)); 1625 1626 _mm_store_ps(dst + i, x0); 1627 _mm_store_ps(dst + i + 4, y0); 1628 } 1629 } 1630 } 1631 } 1632 else 1633 { 1634 if( _ksize == 3 ) 1635 { 1636 if( kx[0] == 0 && kx[1] == 1 ) 1637 for( ; i <= width - 8; i += 8, src += 8 ) 1638 { 1639 __m128 x0, x2, y0, y2; 1640 x0 = _mm_loadu_ps(src + cn); 1641 x2 = _mm_loadu_ps(src - cn); 1642 y0 = _mm_loadu_ps(src + cn + 4); 1643 y2 = _mm_loadu_ps(src - cn + 4); 1644 x0 = _mm_sub_ps(x0, x2); 1645 y0 = _mm_sub_ps(y0, y2); 1646 _mm_store_ps(dst + i, x0); 1647 _mm_store_ps(dst + i + 4, y0); 1648 } 1649 else 1650 { 1651 __m128 k1 = _mm_set1_ps(kx[1]); 1652 for( ; i <= width - 8; i += 8, src += 8 ) 1653 { 1654 __m128 x0, x2, y0, y2; 1655 x0 = _mm_loadu_ps(src + cn); 1656 x2 = _mm_loadu_ps(src - cn); 1657 y0 = _mm_loadu_ps(src + cn + 4); 1658 y2 = _mm_loadu_ps(src - cn + 4); 1659 1660 x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1); 1661 y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1); 1662 _mm_store_ps(dst + i, x0); 1663 _mm_store_ps(dst + i + 4, y0); 1664 } 1665 } 1666 } 1667 else if( _ksize == 5 ) 1668 { 1669 __m128 k1 = _mm_set1_ps(kx[1]), k2 = _mm_set1_ps(kx[2]); 1670 for( ; i <= width - 8; i += 8, src += 8 ) 1671 { 1672 __m128 x0, x2, y0, y2; 1673 x0 = _mm_loadu_ps(src + cn); 1674 x2 = _mm_loadu_ps(src - cn); 1675 y0 = _mm_loadu_ps(src + cn + 4); 1676 y2 = _mm_loadu_ps(src - cn + 4); 1677 1678 x0 = _mm_mul_ps(_mm_sub_ps(x0, x2), k1); 1679 y0 = _mm_mul_ps(_mm_sub_ps(y0, y2), k1); 1680 1681 x2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2), _mm_loadu_ps(src - cn*2)); 1682 y2 = _mm_sub_ps(_mm_loadu_ps(src + cn*2 + 4), _mm_loadu_ps(src - cn*2 + 4)); 1683 x0 = _mm_add_ps(x0, _mm_mul_ps(x2, k2)); 1684 y0 = _mm_add_ps(y0, _mm_mul_ps(y2, k2)); 1685 1686 _mm_store_ps(dst + i, x0); 1687 _mm_store_ps(dst + i + 4, y0); 1688 } 1689 } 1690 } 1691 1692 return i; 1693 } 1694 1695 Mat kernel; 1696 int symmetryType; 1697 }; 1698 1699 1700 struct SymmColumnVec_32f 1701 { 1702 SymmColumnVec_32f() { symmetryType=0; } 1703 SymmColumnVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) 1704 { 1705 symmetryType = _symmetryType; 1706 kernel = _kernel; 1707 delta = (float)_delta; 1708 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 1709 } 1710 1711 int operator()(const uchar** _src, uchar* _dst, int width) const 1712 { 1713 if( !checkHardwareSupport(CV_CPU_SSE) ) 1714 return 0; 1715 1716 int ksize2 = (kernel.rows + kernel.cols - 1)/2; 1717 const float* ky = kernel.ptr<float>() + ksize2; 1718 int i = 0, k; 1719 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 1720 const float** src = (const float**)_src; 1721 const float *S, *S2; 1722 float* dst = (float*)_dst; 1723 __m128 d4 = _mm_set1_ps(delta); 1724 1725 if( symmetrical ) 1726 { 1727 for( ; i <= width - 16; i += 16 ) 1728 { 1729 __m128 f = _mm_load_ss(ky); 1730 f = _mm_shuffle_ps(f, f, 0); 1731 __m128 s0, s1, s2, s3; 1732 __m128 x0, x1; 1733 S = src[0] + i; 1734 s0 = _mm_load_ps(S); 1735 s1 = _mm_load_ps(S+4); 1736 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); 1737 s1 = _mm_add_ps(_mm_mul_ps(s1, f), d4); 1738 s2 = _mm_load_ps(S+8); 1739 s3 = _mm_load_ps(S+12); 1740 s2 = _mm_add_ps(_mm_mul_ps(s2, f), d4); 1741 s3 = _mm_add_ps(_mm_mul_ps(s3, f), d4); 1742 1743 for( k = 1; k <= ksize2; k++ ) 1744 { 1745 S = src[k] + i; 1746 S2 = src[-k] + i; 1747 f = _mm_load_ss(ky+k); 1748 f = _mm_shuffle_ps(f, f, 0); 1749 x0 = _mm_add_ps(_mm_load_ps(S), _mm_load_ps(S2)); 1750 x1 = _mm_add_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); 1751 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1752 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); 1753 x0 = _mm_add_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); 1754 x1 = _mm_add_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); 1755 s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); 1756 s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); 1757 } 1758 1759 _mm_storeu_ps(dst + i, s0); 1760 _mm_storeu_ps(dst + i + 4, s1); 1761 _mm_storeu_ps(dst + i + 8, s2); 1762 _mm_storeu_ps(dst + i + 12, s3); 1763 } 1764 1765 for( ; i <= width - 4; i += 4 ) 1766 { 1767 __m128 f = _mm_load_ss(ky); 1768 f = _mm_shuffle_ps(f, f, 0); 1769 __m128 x0, s0 = _mm_load_ps(src[0] + i); 1770 s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4); 1771 1772 for( k = 1; k <= ksize2; k++ ) 1773 { 1774 f = _mm_load_ss(ky+k); 1775 f = _mm_shuffle_ps(f, f, 0); 1776 S = src[k] + i; 1777 S2 = src[-k] + i; 1778 x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); 1779 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1780 } 1781 1782 _mm_storeu_ps(dst + i, s0); 1783 } 1784 } 1785 else 1786 { 1787 for( ; i <= width - 16; i += 16 ) 1788 { 1789 __m128 f, s0 = d4, s1 = d4, s2 = d4, s3 = d4; 1790 __m128 x0, x1; 1791 S = src[0] + i; 1792 1793 for( k = 1; k <= ksize2; k++ ) 1794 { 1795 S = src[k] + i; 1796 S2 = src[-k] + i; 1797 f = _mm_load_ss(ky+k); 1798 f = _mm_shuffle_ps(f, f, 0); 1799 x0 = _mm_sub_ps(_mm_load_ps(S), _mm_load_ps(S2)); 1800 x1 = _mm_sub_ps(_mm_load_ps(S+4), _mm_load_ps(S2+4)); 1801 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1802 s1 = _mm_add_ps(s1, _mm_mul_ps(x1, f)); 1803 x0 = _mm_sub_ps(_mm_load_ps(S+8), _mm_load_ps(S2+8)); 1804 x1 = _mm_sub_ps(_mm_load_ps(S+12), _mm_load_ps(S2+12)); 1805 s2 = _mm_add_ps(s2, _mm_mul_ps(x0, f)); 1806 s3 = _mm_add_ps(s3, _mm_mul_ps(x1, f)); 1807 } 1808 1809 _mm_storeu_ps(dst + i, s0); 1810 _mm_storeu_ps(dst + i + 4, s1); 1811 _mm_storeu_ps(dst + i + 8, s2); 1812 _mm_storeu_ps(dst + i + 12, s3); 1813 } 1814 1815 for( ; i <= width - 4; i += 4 ) 1816 { 1817 __m128 f, x0, s0 = d4; 1818 1819 for( k = 1; k <= ksize2; k++ ) 1820 { 1821 f = _mm_load_ss(ky+k); 1822 f = _mm_shuffle_ps(f, f, 0); 1823 x0 = _mm_sub_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i)); 1824 s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f)); 1825 } 1826 1827 _mm_storeu_ps(dst + i, s0); 1828 } 1829 } 1830 1831 return i; 1832 } 1833 1834 int symmetryType; 1835 float delta; 1836 Mat kernel; 1837 }; 1838 1839 1840 struct SymmColumnSmallVec_32f 1841 { 1842 SymmColumnSmallVec_32f() { symmetryType=0; } 1843 SymmColumnSmallVec_32f(const Mat& _kernel, int _symmetryType, int, double _delta) 1844 { 1845 symmetryType = _symmetryType; 1846 kernel = _kernel; 1847 delta = (float)_delta; 1848 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 1849 } 1850 1851 int operator()(const uchar** _src, uchar* _dst, int width) const 1852 { 1853 if( !checkHardwareSupport(CV_CPU_SSE) ) 1854 return 0; 1855 1856 int ksize2 = (kernel.rows + kernel.cols - 1)/2; 1857 const float* ky = kernel.ptr<float>() + ksize2; 1858 int i = 0; 1859 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 1860 const float** src = (const float**)_src; 1861 const float *S0 = src[-1], *S1 = src[0], *S2 = src[1]; 1862 float* dst = (float*)_dst; 1863 __m128 d4 = _mm_set1_ps(delta); 1864 1865 if( symmetrical ) 1866 { 1867 if( ky[0] == 2 && ky[1] == 1 ) 1868 { 1869 for( ; i <= width - 8; i += 8 ) 1870 { 1871 __m128 s0, s1, s2, s3, s4, s5; 1872 s0 = _mm_load_ps(S0 + i); 1873 s1 = _mm_load_ps(S0 + i + 4); 1874 s2 = _mm_load_ps(S1 + i); 1875 s3 = _mm_load_ps(S1 + i + 4); 1876 s4 = _mm_load_ps(S2 + i); 1877 s5 = _mm_load_ps(S2 + i + 4); 1878 s0 = _mm_add_ps(s0, _mm_add_ps(s4, _mm_add_ps(s2, s2))); 1879 s1 = _mm_add_ps(s1, _mm_add_ps(s5, _mm_add_ps(s3, s3))); 1880 s0 = _mm_add_ps(s0, d4); 1881 s1 = _mm_add_ps(s1, d4); 1882 _mm_storeu_ps(dst + i, s0); 1883 _mm_storeu_ps(dst + i + 4, s1); 1884 } 1885 } 1886 else if( ky[0] == -2 && ky[1] == 1 ) 1887 { 1888 for( ; i <= width - 8; i += 8 ) 1889 { 1890 __m128 s0, s1, s2, s3, s4, s5; 1891 s0 = _mm_load_ps(S0 + i); 1892 s1 = _mm_load_ps(S0 + i + 4); 1893 s2 = _mm_load_ps(S1 + i); 1894 s3 = _mm_load_ps(S1 + i + 4); 1895 s4 = _mm_load_ps(S2 + i); 1896 s5 = _mm_load_ps(S2 + i + 4); 1897 s0 = _mm_add_ps(s0, _mm_sub_ps(s4, _mm_add_ps(s2, s2))); 1898 s1 = _mm_add_ps(s1, _mm_sub_ps(s5, _mm_add_ps(s3, s3))); 1899 s0 = _mm_add_ps(s0, d4); 1900 s1 = _mm_add_ps(s1, d4); 1901 _mm_storeu_ps(dst + i, s0); 1902 _mm_storeu_ps(dst + i + 4, s1); 1903 } 1904 } 1905 else 1906 { 1907 __m128 k0 = _mm_set1_ps(ky[0]), k1 = _mm_set1_ps(ky[1]); 1908 for( ; i <= width - 8; i += 8 ) 1909 { 1910 __m128 s0, s1, x0, x1; 1911 s0 = _mm_load_ps(S1 + i); 1912 s1 = _mm_load_ps(S1 + i + 4); 1913 s0 = _mm_add_ps(_mm_mul_ps(s0, k0), d4); 1914 s1 = _mm_add_ps(_mm_mul_ps(s1, k0), d4); 1915 x0 = _mm_add_ps(_mm_load_ps(S0 + i), _mm_load_ps(S2 + i)); 1916 x1 = _mm_add_ps(_mm_load_ps(S0 + i + 4), _mm_load_ps(S2 + i + 4)); 1917 s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); 1918 s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); 1919 _mm_storeu_ps(dst + i, s0); 1920 _mm_storeu_ps(dst + i + 4, s1); 1921 } 1922 } 1923 } 1924 else 1925 { 1926 if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) 1927 { 1928 if( ky[1] < 0 ) 1929 std::swap(S0, S2); 1930 for( ; i <= width - 8; i += 8 ) 1931 { 1932 __m128 s0, s1, s2, s3; 1933 s0 = _mm_load_ps(S2 + i); 1934 s1 = _mm_load_ps(S2 + i + 4); 1935 s2 = _mm_load_ps(S0 + i); 1936 s3 = _mm_load_ps(S0 + i + 4); 1937 s0 = _mm_add_ps(_mm_sub_ps(s0, s2), d4); 1938 s1 = _mm_add_ps(_mm_sub_ps(s1, s3), d4); 1939 _mm_storeu_ps(dst + i, s0); 1940 _mm_storeu_ps(dst + i + 4, s1); 1941 } 1942 } 1943 else 1944 { 1945 __m128 k1 = _mm_set1_ps(ky[1]); 1946 for( ; i <= width - 8; i += 8 ) 1947 { 1948 __m128 s0 = d4, s1 = d4, x0, x1; 1949 x0 = _mm_sub_ps(_mm_load_ps(S2 + i), _mm_load_ps(S0 + i)); 1950 x1 = _mm_sub_ps(_mm_load_ps(S2 + i + 4), _mm_load_ps(S0 + i + 4)); 1951 s0 = _mm_add_ps(s0, _mm_mul_ps(x0,k1)); 1952 s1 = _mm_add_ps(s1, _mm_mul_ps(x1,k1)); 1953 _mm_storeu_ps(dst + i, s0); 1954 _mm_storeu_ps(dst + i + 4, s1); 1955 } 1956 } 1957 } 1958 1959 return i; 1960 } 1961 1962 int symmetryType; 1963 float delta; 1964 Mat kernel; 1965 }; 1966 1967 1968 /////////////////////////////// non-separable filters /////////////////////////////// 1969 1970 ///////////////////////////////// 8u<->8u, 8u<->16s ///////////////////////////////// 1971 1972 struct FilterVec_8u 1973 { 1974 FilterVec_8u() {} 1975 FilterVec_8u(const Mat& _kernel, int _bits, double _delta) 1976 { 1977 Mat kernel; 1978 _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); 1979 delta = (float)(_delta/(1 << _bits)); 1980 std::vector<Point> coords; 1981 preprocess2DKernel(kernel, coords, coeffs); 1982 _nz = (int)coords.size(); 1983 } 1984 1985 int operator()(const uchar** src, uchar* dst, int width) const 1986 { 1987 if( !checkHardwareSupport(CV_CPU_SSE2) ) 1988 return 0; 1989 1990 const float* kf = (const float*)&coeffs[0]; 1991 int i = 0, k, nz = _nz; 1992 __m128 d4 = _mm_set1_ps(delta); 1993 1994 for( ; i <= width - 16; i += 16 ) 1995 { 1996 __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4; 1997 __m128i x0, x1, z = _mm_setzero_si128(); 1998 1999 for( k = 0; k < nz; k++ ) 2000 { 2001 __m128 f = _mm_load_ss(kf+k), t0, t1; 2002 f = _mm_shuffle_ps(f, f, 0); 2003 2004 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i)); 2005 x1 = _mm_unpackhi_epi8(x0, z); 2006 x0 = _mm_unpacklo_epi8(x0, z); 2007 2008 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); 2009 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z)); 2010 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); 2011 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); 2012 2013 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z)); 2014 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z)); 2015 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); 2016 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); 2017 } 2018 2019 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); 2020 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); 2021 x0 = _mm_packus_epi16(x0, x1); 2022 _mm_storeu_si128((__m128i*)(dst + i), x0); 2023 } 2024 2025 for( ; i <= width - 4; i += 4 ) 2026 { 2027 __m128 s0 = d4; 2028 __m128i x0, z = _mm_setzero_si128(); 2029 2030 for( k = 0; k < nz; k++ ) 2031 { 2032 __m128 f = _mm_load_ss(kf+k), t0; 2033 f = _mm_shuffle_ps(f, f, 0); 2034 2035 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i)); 2036 x0 = _mm_unpacklo_epi8(x0, z); 2037 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); 2038 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); 2039 } 2040 2041 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z); 2042 x0 = _mm_packus_epi16(x0, x0); 2043 *(int*)(dst + i) = _mm_cvtsi128_si32(x0); 2044 } 2045 2046 return i; 2047 } 2048 2049 int _nz; 2050 std::vector<uchar> coeffs; 2051 float delta; 2052 }; 2053 2054 2055 struct FilterVec_8u16s 2056 { 2057 FilterVec_8u16s() {} 2058 FilterVec_8u16s(const Mat& _kernel, int _bits, double _delta) 2059 { 2060 Mat kernel; 2061 _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); 2062 delta = (float)(_delta/(1 << _bits)); 2063 std::vector<Point> coords; 2064 preprocess2DKernel(kernel, coords, coeffs); 2065 _nz = (int)coords.size(); 2066 } 2067 2068 int operator()(const uchar** src, uchar* _dst, int width) const 2069 { 2070 if( !checkHardwareSupport(CV_CPU_SSE2) ) 2071 return 0; 2072 2073 const float* kf = (const float*)&coeffs[0]; 2074 short* dst = (short*)_dst; 2075 int i = 0, k, nz = _nz; 2076 __m128 d4 = _mm_set1_ps(delta); 2077 2078 for( ; i <= width - 16; i += 16 ) 2079 { 2080 __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4; 2081 __m128i x0, x1, z = _mm_setzero_si128(); 2082 2083 for( k = 0; k < nz; k++ ) 2084 { 2085 __m128 f = _mm_load_ss(kf+k), t0, t1; 2086 f = _mm_shuffle_ps(f, f, 0); 2087 2088 x0 = _mm_loadu_si128((const __m128i*)(src[k] + i)); 2089 x1 = _mm_unpackhi_epi8(x0, z); 2090 x0 = _mm_unpacklo_epi8(x0, z); 2091 2092 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); 2093 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x0, z)); 2094 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); 2095 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); 2096 2097 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x1, z)); 2098 t1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(x1, z)); 2099 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); 2100 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); 2101 } 2102 2103 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), _mm_cvtps_epi32(s1)); 2104 x1 = _mm_packs_epi32(_mm_cvtps_epi32(s2), _mm_cvtps_epi32(s3)); 2105 _mm_storeu_si128((__m128i*)(dst + i), x0); 2106 _mm_storeu_si128((__m128i*)(dst + i + 8), x1); 2107 } 2108 2109 for( ; i <= width - 4; i += 4 ) 2110 { 2111 __m128 s0 = d4; 2112 __m128i x0, z = _mm_setzero_si128(); 2113 2114 for( k = 0; k < nz; k++ ) 2115 { 2116 __m128 f = _mm_load_ss(kf+k), t0; 2117 f = _mm_shuffle_ps(f, f, 0); 2118 2119 x0 = _mm_cvtsi32_si128(*(const int*)(src[k] + i)); 2120 x0 = _mm_unpacklo_epi8(x0, z); 2121 t0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(x0, z)); 2122 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); 2123 } 2124 2125 x0 = _mm_packs_epi32(_mm_cvtps_epi32(s0), z); 2126 _mm_storel_epi64((__m128i*)(dst + i), x0); 2127 } 2128 2129 return i; 2130 } 2131 2132 int _nz; 2133 std::vector<uchar> coeffs; 2134 float delta; 2135 }; 2136 2137 2138 struct FilterVec_32f 2139 { 2140 FilterVec_32f() {} 2141 FilterVec_32f(const Mat& _kernel, int, double _delta) 2142 { 2143 delta = (float)_delta; 2144 std::vector<Point> coords; 2145 preprocess2DKernel(_kernel, coords, coeffs); 2146 _nz = (int)coords.size(); 2147 } 2148 2149 int operator()(const uchar** _src, uchar* _dst, int width) const 2150 { 2151 if( !checkHardwareSupport(CV_CPU_SSE) ) 2152 return 0; 2153 2154 const float* kf = (const float*)&coeffs[0]; 2155 const float** src = (const float**)_src; 2156 float* dst = (float*)_dst; 2157 int i = 0, k, nz = _nz; 2158 __m128 d4 = _mm_set1_ps(delta); 2159 2160 for( ; i <= width - 16; i += 16 ) 2161 { 2162 __m128 s0 = d4, s1 = d4, s2 = d4, s3 = d4; 2163 2164 for( k = 0; k < nz; k++ ) 2165 { 2166 __m128 f = _mm_load_ss(kf+k), t0, t1; 2167 f = _mm_shuffle_ps(f, f, 0); 2168 const float* S = src[k] + i; 2169 2170 t0 = _mm_loadu_ps(S); 2171 t1 = _mm_loadu_ps(S + 4); 2172 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); 2173 s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); 2174 2175 t0 = _mm_loadu_ps(S + 8); 2176 t1 = _mm_loadu_ps(S + 12); 2177 s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); 2178 s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); 2179 } 2180 2181 _mm_storeu_ps(dst + i, s0); 2182 _mm_storeu_ps(dst + i + 4, s1); 2183 _mm_storeu_ps(dst + i + 8, s2); 2184 _mm_storeu_ps(dst + i + 12, s3); 2185 } 2186 2187 for( ; i <= width - 4; i += 4 ) 2188 { 2189 __m128 s0 = d4; 2190 2191 for( k = 0; k < nz; k++ ) 2192 { 2193 __m128 f = _mm_load_ss(kf+k), t0; 2194 f = _mm_shuffle_ps(f, f, 0); 2195 t0 = _mm_loadu_ps(src[k] + i); 2196 s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); 2197 } 2198 _mm_storeu_ps(dst + i, s0); 2199 } 2200 2201 return i; 2202 } 2203 2204 int _nz; 2205 std::vector<uchar> coeffs; 2206 float delta; 2207 }; 2208 2209 2210 #elif CV_NEON 2211 2212 struct SymmRowSmallVec_8u32s 2213 { 2214 SymmRowSmallVec_8u32s() { smallValues = false; } 2215 SymmRowSmallVec_8u32s( const Mat& _kernel, int _symmetryType ) 2216 { 2217 kernel = _kernel; 2218 symmetryType = _symmetryType; 2219 smallValues = true; 2220 int k, ksize = kernel.rows + kernel.cols - 1; 2221 for( k = 0; k < ksize; k++ ) 2222 { 2223 int v = kernel.ptr<int>()[k]; 2224 if( v < SHRT_MIN || v > SHRT_MAX ) 2225 { 2226 smallValues = false; 2227 break; 2228 } 2229 } 2230 } 2231 2232 int operator()(const uchar* src, uchar* _dst, int width, int cn) const 2233 { 2234 if( !checkHardwareSupport(CV_CPU_NEON) ) 2235 return 0; 2236 2237 int i = 0, _ksize = kernel.rows + kernel.cols - 1; 2238 int* dst = (int*)_dst; 2239 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 2240 const int* kx = kernel.ptr<int>() + _ksize/2; 2241 if( !smallValues ) 2242 return 0; 2243 2244 src += (_ksize/2)*cn; 2245 width *= cn; 2246 2247 if( symmetrical ) 2248 { 2249 if( _ksize == 1 ) 2250 return 0; 2251 if( _ksize == 3 ) 2252 { 2253 if( kx[0] == 2 && kx[1] == 1 ) 2254 { 2255 uint16x8_t zq = vdupq_n_u16(0); 2256 2257 for( ; i <= width - 8; i += 8, src += 8 ) 2258 { 2259 uint8x8_t x0, x1, x2; 2260 x0 = vld1_u8( (uint8_t *) (src - cn) ); 2261 x1 = vld1_u8( (uint8_t *) (src) ); 2262 x2 = vld1_u8( (uint8_t *) (src + cn) ); 2263 2264 uint16x8_t y0, y1, y2; 2265 y0 = vaddl_u8(x0, x2); 2266 y1 = vshll_n_u8(x1, 1); 2267 y2 = vaddq_u16(y0, y1); 2268 2269 uint16x8x2_t str; 2270 str.val[0] = y2; str.val[1] = zq; 2271 vst2q_u16( (uint16_t *) (dst + i), str ); 2272 } 2273 } 2274 else if( kx[0] == -2 && kx[1] == 1 ) 2275 return 0; 2276 else 2277 { 2278 int32x4_t k32 = vdupq_n_s32(0); 2279 k32 = vld1q_lane_s32(kx, k32, 0); 2280 k32 = vld1q_lane_s32(kx + 1, k32, 1); 2281 2282 int16x4_t k = vqmovn_s32(k32); 2283 2284 uint8x8_t z = vdup_n_u8(0); 2285 2286 for( ; i <= width - 8; i += 8, src += 8 ) 2287 { 2288 uint8x8_t x0, x1, x2; 2289 x0 = vld1_u8( (uint8_t *) (src - cn) ); 2290 x1 = vld1_u8( (uint8_t *) (src) ); 2291 x2 = vld1_u8( (uint8_t *) (src + cn) ); 2292 2293 int16x8_t y0, y1; 2294 int32x4_t y2, y3; 2295 y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z)); 2296 y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); 2297 y2 = vmull_lane_s16(vget_low_s16(y0), k, 0); 2298 y2 = vmlal_lane_s16(y2, vget_low_s16(y1), k, 1); 2299 y3 = vmull_lane_s16(vget_high_s16(y0), k, 0); 2300 y3 = vmlal_lane_s16(y3, vget_high_s16(y1), k, 1); 2301 2302 vst1q_s32((int32_t *)(dst + i), y2); 2303 vst1q_s32((int32_t *)(dst + i + 4), y3); 2304 } 2305 } 2306 } 2307 else if( _ksize == 5 ) 2308 { 2309 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) 2310 return 0; 2311 else 2312 { 2313 int32x4_t k32 = vdupq_n_s32(0); 2314 k32 = vld1q_lane_s32(kx, k32, 0); 2315 k32 = vld1q_lane_s32(kx + 1, k32, 1); 2316 k32 = vld1q_lane_s32(kx + 2, k32, 2); 2317 2318 int16x4_t k = vqmovn_s32(k32); 2319 2320 uint8x8_t z = vdup_n_u8(0); 2321 2322 for( ; i <= width - 8; i += 8, src += 8 ) 2323 { 2324 uint8x8_t x0, x1, x2, x3, x4; 2325 x0 = vld1_u8( (uint8_t *) (src - cn) ); 2326 x1 = vld1_u8( (uint8_t *) (src) ); 2327 x2 = vld1_u8( (uint8_t *) (src + cn) ); 2328 2329 int16x8_t y0, y1; 2330 int32x4_t accl, acch; 2331 y0 = vreinterpretq_s16_u16(vaddl_u8(x1, z)); 2332 y1 = vreinterpretq_s16_u16(vaddl_u8(x0, x2)); 2333 accl = vmull_lane_s16(vget_low_s16(y0), k, 0); 2334 accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 1); 2335 acch = vmull_lane_s16(vget_high_s16(y0), k, 0); 2336 acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 1); 2337 2338 int16x8_t y2; 2339 x3 = vld1_u8( (uint8_t *) (src - cn*2) ); 2340 x4 = vld1_u8( (uint8_t *) (src + cn*2) ); 2341 y2 = vreinterpretq_s16_u16(vaddl_u8(x3, x4)); 2342 accl = vmlal_lane_s16(accl, vget_low_s16(y2), k, 2); 2343 acch = vmlal_lane_s16(acch, vget_high_s16(y2), k, 2); 2344 2345 vst1q_s32((int32_t *)(dst + i), accl); 2346 vst1q_s32((int32_t *)(dst + i + 4), acch); 2347 } 2348 } 2349 } 2350 } 2351 else 2352 { 2353 if( _ksize == 3 ) 2354 { 2355 if( kx[0] == 0 && kx[1] == 1 ) 2356 { 2357 uint8x8_t z = vdup_n_u8(0); 2358 2359 for( ; i <= width - 8; i += 8, src += 8 ) 2360 { 2361 uint8x8_t x0, x1; 2362 x0 = vld1_u8( (uint8_t *) (src - cn) ); 2363 x1 = vld1_u8( (uint8_t *) (src + cn) ); 2364 2365 int16x8_t y0; 2366 y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)), 2367 vreinterpretq_s16_u16(vaddl_u8(x0, z))); 2368 2369 vst1q_s32((int32_t *)(dst + i), vmovl_s16(vget_low_s16(y0))); 2370 vst1q_s32((int32_t *)(dst + i + 4), vmovl_s16(vget_high_s16(y0))); 2371 } 2372 } 2373 else 2374 { 2375 int32x4_t k32 = vdupq_n_s32(0); 2376 k32 = vld1q_lane_s32(kx + 1, k32, 1); 2377 2378 int16x4_t k = vqmovn_s32(k32); 2379 2380 uint8x8_t z = vdup_n_u8(0); 2381 2382 for( ; i <= width - 8; i += 8, src += 8 ) 2383 { 2384 uint8x8_t x0, x1; 2385 x0 = vld1_u8( (uint8_t *) (src - cn) ); 2386 x1 = vld1_u8( (uint8_t *) (src + cn) ); 2387 2388 int16x8_t y0; 2389 int32x4_t y1, y2; 2390 y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)), 2391 vreinterpretq_s16_u16(vaddl_u8(x0, z))); 2392 y1 = vmull_lane_s16(vget_low_s16(y0), k, 1); 2393 y2 = vmull_lane_s16(vget_high_s16(y0), k, 1); 2394 2395 vst1q_s32((int32_t *)(dst + i), y1); 2396 vst1q_s32((int32_t *)(dst + i + 4), y2); 2397 } 2398 } 2399 } 2400 else if( _ksize == 5 ) 2401 { 2402 int32x4_t k32 = vdupq_n_s32(0); 2403 k32 = vld1q_lane_s32(kx + 1, k32, 1); 2404 k32 = vld1q_lane_s32(kx + 2, k32, 2); 2405 2406 int16x4_t k = vqmovn_s32(k32); 2407 2408 uint8x8_t z = vdup_n_u8(0); 2409 2410 for( ; i <= width - 8; i += 8, src += 8 ) 2411 { 2412 uint8x8_t x0, x1; 2413 x0 = vld1_u8( (uint8_t *) (src - cn) ); 2414 x1 = vld1_u8( (uint8_t *) (src + cn) ); 2415 2416 int32x4_t accl, acch; 2417 int16x8_t y0; 2418 y0 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x1, z)), 2419 vreinterpretq_s16_u16(vaddl_u8(x0, z))); 2420 accl = vmull_lane_s16(vget_low_s16(y0), k, 1); 2421 acch = vmull_lane_s16(vget_high_s16(y0), k, 1); 2422 2423 uint8x8_t x2, x3; 2424 x2 = vld1_u8( (uint8_t *) (src - cn*2) ); 2425 x3 = vld1_u8( (uint8_t *) (src + cn*2) ); 2426 2427 int16x8_t y1; 2428 y1 = vsubq_s16(vreinterpretq_s16_u16(vaddl_u8(x3, z)), 2429 vreinterpretq_s16_u16(vaddl_u8(x2, z))); 2430 accl = vmlal_lane_s16(accl, vget_low_s16(y1), k, 2); 2431 acch = vmlal_lane_s16(acch, vget_high_s16(y1), k, 2); 2432 2433 vst1q_s32((int32_t *)(dst + i), accl); 2434 vst1q_s32((int32_t *)(dst + i + 4), acch); 2435 } 2436 } 2437 } 2438 2439 return i; 2440 } 2441 2442 Mat kernel; 2443 int symmetryType; 2444 bool smallValues; 2445 }; 2446 2447 2448 struct SymmColumnVec_32s8u 2449 { 2450 SymmColumnVec_32s8u() { symmetryType=0; } 2451 SymmColumnVec_32s8u(const Mat& _kernel, int _symmetryType, int _bits, double _delta) 2452 { 2453 symmetryType = _symmetryType; 2454 _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); 2455 delta = (float)(_delta/(1 << _bits)); 2456 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 2457 } 2458 2459 int operator()(const uchar** _src, uchar* dst, int width) const 2460 { 2461 if( !checkHardwareSupport(CV_CPU_NEON) ) 2462 return 0; 2463 2464 int _ksize = kernel.rows + kernel.cols - 1; 2465 int ksize2 = _ksize / 2; 2466 const float* ky = kernel.ptr<float>() + ksize2; 2467 int i = 0, k; 2468 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 2469 const int** src = (const int**)_src; 2470 const int *S, *S2; 2471 2472 float32x4_t d4 = vdupq_n_f32(delta); 2473 2474 if( symmetrical ) 2475 { 2476 if( _ksize == 1 ) 2477 return 0; 2478 2479 2480 float32x2_t k32; 2481 k32 = vdup_n_f32(0); 2482 k32 = vld1_lane_f32(ky, k32, 0); 2483 k32 = vld1_lane_f32(ky + 1, k32, 1); 2484 2485 for( ; i <= width - 8; i += 8 ) 2486 { 2487 float32x4_t accl, acch; 2488 float32x4_t f0l, f0h, f1l, f1h, f2l, f2h; 2489 2490 S = src[0] + i; 2491 2492 f0l = vcvtq_f32_s32( vld1q_s32(S) ); 2493 f0h = vcvtq_f32_s32( vld1q_s32(S + 4) ); 2494 2495 S = src[1] + i; 2496 S2 = src[-1] + i; 2497 2498 f1l = vcvtq_f32_s32( vld1q_s32(S) ); 2499 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) ); 2500 f2l = vcvtq_f32_s32( vld1q_s32(S2) ); 2501 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); 2502 2503 accl = acch = d4; 2504 accl = vmlaq_lane_f32(accl, f0l, k32, 0); 2505 acch = vmlaq_lane_f32(acch, f0h, k32, 0); 2506 accl = vmlaq_lane_f32(accl, vaddq_f32(f1l, f2l), k32, 1); 2507 acch = vmlaq_lane_f32(acch, vaddq_f32(f1h, f2h), k32, 1); 2508 2509 for( k = 2; k <= ksize2; k++ ) 2510 { 2511 S = src[k] + i; 2512 S2 = src[-k] + i; 2513 2514 float32x4_t f3l, f3h, f4l, f4h; 2515 f3l = vcvtq_f32_s32( vld1q_s32(S) ); 2516 f3h = vcvtq_f32_s32( vld1q_s32(S + 4) ); 2517 f4l = vcvtq_f32_s32( vld1q_s32(S2) ); 2518 f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); 2519 2520 accl = vmlaq_n_f32(accl, vaddq_f32(f3l, f4l), ky[k]); 2521 acch = vmlaq_n_f32(acch, vaddq_f32(f3h, f4h), ky[k]); 2522 } 2523 2524 int32x4_t s32l, s32h; 2525 s32l = vcvtq_s32_f32(accl); 2526 s32h = vcvtq_s32_f32(acch); 2527 2528 int16x4_t s16l, s16h; 2529 s16l = vqmovn_s32(s32l); 2530 s16h = vqmovn_s32(s32h); 2531 2532 uint8x8_t u8; 2533 u8 = vqmovun_s16(vcombine_s16(s16l, s16h)); 2534 2535 vst1_u8((uint8_t *)(dst + i), u8); 2536 } 2537 } 2538 else 2539 { 2540 float32x2_t k32; 2541 k32 = vdup_n_f32(0); 2542 k32 = vld1_lane_f32(ky + 1, k32, 1); 2543 2544 for( ; i <= width - 8; i += 8 ) 2545 { 2546 float32x4_t accl, acch; 2547 float32x4_t f1l, f1h, f2l, f2h; 2548 2549 S = src[1] + i; 2550 S2 = src[-1] + i; 2551 2552 f1l = vcvtq_f32_s32( vld1q_s32(S) ); 2553 f1h = vcvtq_f32_s32( vld1q_s32(S + 4) ); 2554 f2l = vcvtq_f32_s32( vld1q_s32(S2) ); 2555 f2h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); 2556 2557 accl = acch = d4; 2558 accl = vmlaq_lane_f32(accl, vsubq_f32(f1l, f2l), k32, 1); 2559 acch = vmlaq_lane_f32(acch, vsubq_f32(f1h, f2h), k32, 1); 2560 2561 for( k = 2; k <= ksize2; k++ ) 2562 { 2563 S = src[k] + i; 2564 S2 = src[-k] + i; 2565 2566 float32x4_t f3l, f3h, f4l, f4h; 2567 f3l = vcvtq_f32_s32( vld1q_s32(S) ); 2568 f3h = vcvtq_f32_s32( vld1q_s32(S + 4) ); 2569 f4l = vcvtq_f32_s32( vld1q_s32(S2) ); 2570 f4h = vcvtq_f32_s32( vld1q_s32(S2 + 4) ); 2571 2572 accl = vmlaq_n_f32(accl, vsubq_f32(f3l, f4l), ky[k]); 2573 acch = vmlaq_n_f32(acch, vsubq_f32(f3h, f4h), ky[k]); 2574 } 2575 2576 int32x4_t s32l, s32h; 2577 s32l = vcvtq_s32_f32(accl); 2578 s32h = vcvtq_s32_f32(acch); 2579 2580 int16x4_t s16l, s16h; 2581 s16l = vqmovn_s32(s32l); 2582 s16h = vqmovn_s32(s32h); 2583 2584 uint8x8_t u8; 2585 u8 = vqmovun_s16(vcombine_s16(s16l, s16h)); 2586 2587 vst1_u8((uint8_t *)(dst + i), u8); 2588 } 2589 } 2590 2591 return i; 2592 } 2593 2594 int symmetryType; 2595 float delta; 2596 Mat kernel; 2597 }; 2598 2599 2600 struct SymmColumnSmallVec_32s16s 2601 { 2602 SymmColumnSmallVec_32s16s() { symmetryType=0; } 2603 SymmColumnSmallVec_32s16s(const Mat& _kernel, int _symmetryType, int _bits, double _delta) 2604 { 2605 symmetryType = _symmetryType; 2606 _kernel.convertTo(kernel, CV_32F, 1./(1 << _bits), 0); 2607 delta = (float)(_delta/(1 << _bits)); 2608 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 2609 } 2610 2611 int operator()(const uchar** _src, uchar* _dst, int width) const 2612 { 2613 if( !checkHardwareSupport(CV_CPU_NEON) ) 2614 return 0; 2615 2616 int ksize2 = (kernel.rows + kernel.cols - 1)/2; 2617 const float* ky = kernel.ptr<float>() + ksize2; 2618 int i = 0; 2619 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 2620 const int** src = (const int**)_src; 2621 const int *S0 = src[-1], *S1 = src[0], *S2 = src[1]; 2622 short* dst = (short*)_dst; 2623 float32x4_t df4 = vdupq_n_f32(delta); 2624 int32x4_t d4 = vcvtq_s32_f32(df4); 2625 2626 if( symmetrical ) 2627 { 2628 if( ky[0] == 2 && ky[1] == 1 ) 2629 { 2630 for( ; i <= width - 4; i += 4 ) 2631 { 2632 int32x4_t x0, x1, x2; 2633 x0 = vld1q_s32((int32_t const *)(S0 + i)); 2634 x1 = vld1q_s32((int32_t const *)(S1 + i)); 2635 x2 = vld1q_s32((int32_t const *)(S2 + i)); 2636 2637 int32x4_t y0, y1, y2, y3; 2638 y0 = vaddq_s32(x0, x2); 2639 y1 = vqshlq_n_s32(x1, 1); 2640 y2 = vaddq_s32(y0, y1); 2641 y3 = vaddq_s32(y2, d4); 2642 2643 int16x4_t t; 2644 t = vqmovn_s32(y3); 2645 2646 vst1_s16((int16_t *)(dst + i), t); 2647 } 2648 } 2649 else if( ky[0] == -2 && ky[1] == 1 ) 2650 { 2651 for( ; i <= width - 4; i += 4 ) 2652 { 2653 int32x4_t x0, x1, x2; 2654 x0 = vld1q_s32((int32_t const *)(S0 + i)); 2655 x1 = vld1q_s32((int32_t const *)(S1 + i)); 2656 x2 = vld1q_s32((int32_t const *)(S2 + i)); 2657 2658 int32x4_t y0, y1, y2, y3; 2659 y0 = vaddq_s32(x0, x2); 2660 y1 = vqshlq_n_s32(x1, 1); 2661 y2 = vsubq_s32(y0, y1); 2662 y3 = vaddq_s32(y2, d4); 2663 2664 int16x4_t t; 2665 t = vqmovn_s32(y3); 2666 2667 vst1_s16((int16_t *)(dst + i), t); 2668 } 2669 } 2670 else if( ky[0] == 10 && ky[1] == 3 ) 2671 { 2672 for( ; i <= width - 4; i += 4 ) 2673 { 2674 int32x4_t x0, x1, x2, x3; 2675 x0 = vld1q_s32((int32_t const *)(S0 + i)); 2676 x1 = vld1q_s32((int32_t const *)(S1 + i)); 2677 x2 = vld1q_s32((int32_t const *)(S2 + i)); 2678 2679 x3 = vaddq_s32(x0, x2); 2680 2681 int32x4_t y0; 2682 y0 = vmlaq_n_s32(d4, x1, 10); 2683 y0 = vmlaq_n_s32(y0, x3, 3); 2684 2685 int16x4_t t; 2686 t = vqmovn_s32(y0); 2687 2688 vst1_s16((int16_t *)(dst + i), t); 2689 } 2690 } 2691 else 2692 { 2693 float32x2_t k32 = vdup_n_f32(0); 2694 k32 = vld1_lane_f32(ky, k32, 0); 2695 k32 = vld1_lane_f32(ky + 1, k32, 1); 2696 2697 for( ; i <= width - 4; i += 4 ) 2698 { 2699 int32x4_t x0, x1, x2, x3, x4; 2700 x0 = vld1q_s32((int32_t const *)(S0 + i)); 2701 x1 = vld1q_s32((int32_t const *)(S1 + i)); 2702 x2 = vld1q_s32((int32_t const *)(S2 + i)); 2703 2704 x3 = vaddq_s32(x0, x2); 2705 2706 float32x4_t s0, s1, s2; 2707 s0 = vcvtq_f32_s32(x1); 2708 s1 = vcvtq_f32_s32(x3); 2709 s2 = vmlaq_lane_f32(df4, s0, k32, 0); 2710 s2 = vmlaq_lane_f32(s2, s1, k32, 1); 2711 2712 x4 = vcvtq_s32_f32(s2); 2713 2714 int16x4_t x5; 2715 x5 = vqmovn_s32(x4); 2716 2717 vst1_s16((int16_t *)(dst + i), x5); 2718 } 2719 } 2720 } 2721 else 2722 { 2723 if( fabs(ky[1]) == 1 && ky[1] == -ky[-1] ) 2724 { 2725 if( ky[1] < 0 ) 2726 std::swap(S0, S2); 2727 for( ; i <= width - 4; i += 4 ) 2728 { 2729 int32x4_t x0, x1; 2730 x0 = vld1q_s32((int32_t const *)(S0 + i)); 2731 x1 = vld1q_s32((int32_t const *)(S2 + i)); 2732 2733 int32x4_t y0, y1; 2734 y0 = vsubq_s32(x1, x0); 2735 y1 = vqaddq_s32(y0, d4); 2736 2737 int16x4_t t; 2738 t = vqmovn_s32(y1); 2739 2740 vst1_s16((int16_t *)(dst + i), t); 2741 } 2742 } 2743 else 2744 { 2745 float32x2_t k32 = vdup_n_f32(0); 2746 k32 = vld1_lane_f32(ky + 1, k32, 1); 2747 2748 for( ; i <= width - 4; i += 4 ) 2749 { 2750 int32x4_t x0, x1, x2, x3; 2751 x0 = vld1q_s32((int32_t const *)(S0 + i)); 2752 x1 = vld1q_s32((int32_t const *)(S2 + i)); 2753 2754 x2 = vsubq_s32(x1, x0); 2755 2756 float32x4_t s0, s1; 2757 s0 = vcvtq_f32_s32(x2); 2758 s1 = vmlaq_lane_f32(df4, s0, k32, 1); 2759 2760 x3 = vcvtq_s32_f32(s1); 2761 2762 int16x4_t x4; 2763 x4 = vqmovn_s32(x3); 2764 2765 vst1_s16((int16_t *)(dst + i), x4); 2766 } 2767 } 2768 } 2769 2770 return i; 2771 } 2772 2773 int symmetryType; 2774 float delta; 2775 Mat kernel; 2776 }; 2777 2778 2779 struct SymmColumnVec_32f16s 2780 { 2781 SymmColumnVec_32f16s() { symmetryType=0; } 2782 SymmColumnVec_32f16s(const Mat& _kernel, int _symmetryType, int, double _delta) 2783 { 2784 symmetryType = _symmetryType; 2785 kernel = _kernel; 2786 delta = (float)_delta; 2787 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 2788 neon_supported = checkHardwareSupport(CV_CPU_NEON); 2789 } 2790 2791 int operator()(const uchar** _src, uchar* _dst, int width) const 2792 { 2793 if( !neon_supported ) 2794 return 0; 2795 2796 int _ksize = kernel.rows + kernel.cols - 1; 2797 int ksize2 = _ksize / 2; 2798 const float* ky = kernel.ptr<float>() + ksize2; 2799 int i = 0, k; 2800 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 2801 const float** src = (const float**)_src; 2802 const float *S, *S2; 2803 short* dst = (short*)_dst; 2804 2805 float32x4_t d4 = vdupq_n_f32(delta); 2806 2807 if( symmetrical ) 2808 { 2809 if( _ksize == 1 ) 2810 return 0; 2811 2812 2813 float32x2_t k32; 2814 k32 = vdup_n_f32(0); 2815 k32 = vld1_lane_f32(ky, k32, 0); 2816 k32 = vld1_lane_f32(ky + 1, k32, 1); 2817 2818 for( ; i <= width - 8; i += 8 ) 2819 { 2820 float32x4_t x0l, x0h, x1l, x1h, x2l, x2h; 2821 float32x4_t accl, acch; 2822 2823 S = src[0] + i; 2824 2825 x0l = vld1q_f32(S); 2826 x0h = vld1q_f32(S + 4); 2827 2828 S = src[1] + i; 2829 S2 = src[-1] + i; 2830 2831 x1l = vld1q_f32(S); 2832 x1h = vld1q_f32(S + 4); 2833 x2l = vld1q_f32(S2); 2834 x2h = vld1q_f32(S2 + 4); 2835 2836 accl = acch = d4; 2837 accl = vmlaq_lane_f32(accl, x0l, k32, 0); 2838 acch = vmlaq_lane_f32(acch, x0h, k32, 0); 2839 accl = vmlaq_lane_f32(accl, vaddq_f32(x1l, x2l), k32, 1); 2840 acch = vmlaq_lane_f32(acch, vaddq_f32(x1h, x2h), k32, 1); 2841 2842 for( k = 2; k <= ksize2; k++ ) 2843 { 2844 S = src[k] + i; 2845 S2 = src[-k] + i; 2846 2847 float32x4_t x3l, x3h, x4l, x4h; 2848 x3l = vld1q_f32(S); 2849 x3h = vld1q_f32(S + 4); 2850 x4l = vld1q_f32(S2); 2851 x4h = vld1q_f32(S2 + 4); 2852 2853 accl = vmlaq_n_f32(accl, vaddq_f32(x3l, x4l), ky[k]); 2854 acch = vmlaq_n_f32(acch, vaddq_f32(x3h, x4h), ky[k]); 2855 } 2856 2857 int32x4_t s32l, s32h; 2858 s32l = vcvtq_s32_f32(accl); 2859 s32h = vcvtq_s32_f32(acch); 2860 2861 int16x4_t s16l, s16h; 2862 s16l = vqmovn_s32(s32l); 2863 s16h = vqmovn_s32(s32h); 2864 2865 vst1_s16((int16_t *)(dst + i), s16l); 2866 vst1_s16((int16_t *)(dst + i + 4), s16h); 2867 } 2868 } 2869 else 2870 { 2871 float32x2_t k32; 2872 k32 = vdup_n_f32(0); 2873 k32 = vld1_lane_f32(ky + 1, k32, 1); 2874 2875 for( ; i <= width - 8; i += 8 ) 2876 { 2877 float32x4_t x1l, x1h, x2l, x2h; 2878 float32x4_t accl, acch; 2879 2880 S = src[1] + i; 2881 S2 = src[-1] + i; 2882 2883 x1l = vld1q_f32(S); 2884 x1h = vld1q_f32(S + 4); 2885 x2l = vld1q_f32(S2); 2886 x2h = vld1q_f32(S2 + 4); 2887 2888 accl = acch = d4; 2889 accl = vmlaq_lane_f32(accl, vsubq_f32(x1l, x2l), k32, 1); 2890 acch = vmlaq_lane_f32(acch, vsubq_f32(x1h, x2h), k32, 1); 2891 2892 for( k = 2; k <= ksize2; k++ ) 2893 { 2894 S = src[k] + i; 2895 S2 = src[-k] + i; 2896 2897 float32x4_t x3l, x3h, x4l, x4h; 2898 x3l = vld1q_f32(S); 2899 x3h = vld1q_f32(S + 4); 2900 x4l = vld1q_f32(S2); 2901 x4h = vld1q_f32(S2 + 4); 2902 2903 accl = vmlaq_n_f32(accl, vsubq_f32(x3l, x4l), ky[k]); 2904 acch = vmlaq_n_f32(acch, vsubq_f32(x3h, x4h), ky[k]); 2905 } 2906 2907 int32x4_t s32l, s32h; 2908 s32l = vcvtq_s32_f32(accl); 2909 s32h = vcvtq_s32_f32(acch); 2910 2911 int16x4_t s16l, s16h; 2912 s16l = vqmovn_s32(s32l); 2913 s16h = vqmovn_s32(s32h); 2914 2915 vst1_s16((int16_t *)(dst + i), s16l); 2916 vst1_s16((int16_t *)(dst + i + 4), s16h); 2917 } 2918 } 2919 2920 return i; 2921 } 2922 2923 int symmetryType; 2924 float delta; 2925 Mat kernel; 2926 bool neon_supported; 2927 }; 2928 2929 2930 struct SymmRowSmallVec_32f 2931 { 2932 SymmRowSmallVec_32f() {} 2933 SymmRowSmallVec_32f( const Mat& _kernel, int _symmetryType ) 2934 { 2935 kernel = _kernel; 2936 symmetryType = _symmetryType; 2937 } 2938 2939 int operator()(const uchar* _src, uchar* _dst, int width, int cn) const 2940 { 2941 if( !checkHardwareSupport(CV_CPU_NEON) ) 2942 return 0; 2943 2944 int i = 0, _ksize = kernel.rows + kernel.cols - 1; 2945 float* dst = (float*)_dst; 2946 const float* src = (const float*)_src + (_ksize/2)*cn; 2947 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 2948 const float* kx = kernel.ptr<float>() + _ksize/2; 2949 width *= cn; 2950 2951 if( symmetrical ) 2952 { 2953 if( _ksize == 1 ) 2954 return 0; 2955 if( _ksize == 3 ) 2956 { 2957 if( kx[0] == 2 && kx[1] == 1 ) 2958 return 0; 2959 else if( kx[0] == -2 && kx[1] == 1 ) 2960 return 0; 2961 else 2962 { 2963 return 0; 2964 } 2965 } 2966 else if( _ksize == 5 ) 2967 { 2968 if( kx[0] == -2 && kx[1] == 0 && kx[2] == 1 ) 2969 return 0; 2970 else 2971 { 2972 float32x2_t k0, k1; 2973 k0 = k1 = vdup_n_f32(0); 2974 k0 = vld1_lane_f32(kx + 0, k0, 0); 2975 k0 = vld1_lane_f32(kx + 1, k0, 1); 2976 k1 = vld1_lane_f32(kx + 2, k1, 0); 2977 2978 for( ; i <= width - 4; i += 4, src += 4 ) 2979 { 2980 float32x4_t x0, x1, x2, x3, x4; 2981 x0 = vld1q_f32(src); 2982 x1 = vld1q_f32(src - cn); 2983 x2 = vld1q_f32(src + cn); 2984 x3 = vld1q_f32(src - cn*2); 2985 x4 = vld1q_f32(src + cn*2); 2986 2987 float32x4_t y0; 2988 y0 = vmulq_lane_f32(x0, k0, 0); 2989 y0 = vmlaq_lane_f32(y0, vaddq_f32(x1, x2), k0, 1); 2990 y0 = vmlaq_lane_f32(y0, vaddq_f32(x3, x4), k1, 0); 2991 2992 vst1q_f32(dst + i, y0); 2993 } 2994 } 2995 } 2996 } 2997 else 2998 { 2999 if( _ksize == 3 ) 3000 { 3001 if( kx[0] == 0 && kx[1] == 1 ) 3002 return 0; 3003 else 3004 { 3005 return 0; 3006 } 3007 } 3008 else if( _ksize == 5 ) 3009 { 3010 float32x2_t k; 3011 k = vdup_n_f32(0); 3012 k = vld1_lane_f32(kx + 1, k, 0); 3013 k = vld1_lane_f32(kx + 2, k, 1); 3014 3015 for( ; i <= width - 4; i += 4, src += 4 ) 3016 { 3017 float32x4_t x0, x1, x2, x3; 3018 x0 = vld1q_f32(src - cn); 3019 x1 = vld1q_f32(src + cn); 3020 x2 = vld1q_f32(src - cn*2); 3021 x3 = vld1q_f32(src + cn*2); 3022 3023 float32x4_t y0; 3024 y0 = vmulq_lane_f32(vsubq_f32(x1, x0), k, 0); 3025 y0 = vmlaq_lane_f32(y0, vsubq_f32(x3, x2), k, 1); 3026 3027 vst1q_f32(dst + i, y0); 3028 } 3029 } 3030 } 3031 3032 return i; 3033 } 3034 3035 Mat kernel; 3036 int symmetryType; 3037 }; 3038 3039 3040 typedef RowNoVec RowVec_8u32s; 3041 typedef RowNoVec RowVec_16s32f; 3042 typedef RowNoVec RowVec_32f; 3043 typedef ColumnNoVec SymmColumnVec_32f; 3044 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f; 3045 typedef FilterNoVec FilterVec_8u; 3046 typedef FilterNoVec FilterVec_8u16s; 3047 typedef FilterNoVec FilterVec_32f; 3048 3049 3050 #else 3051 3052 typedef RowNoVec RowVec_8u32s; 3053 typedef RowNoVec RowVec_16s32f; 3054 typedef RowNoVec RowVec_32f; 3055 typedef SymmRowSmallNoVec SymmRowSmallVec_8u32s; 3056 typedef SymmRowSmallNoVec SymmRowSmallVec_32f; 3057 typedef ColumnNoVec SymmColumnVec_32s8u; 3058 typedef ColumnNoVec SymmColumnVec_32f16s; 3059 typedef ColumnNoVec SymmColumnVec_32f; 3060 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32s16s; 3061 typedef SymmColumnSmallNoVec SymmColumnSmallVec_32f; 3062 typedef FilterNoVec FilterVec_8u; 3063 typedef FilterNoVec FilterVec_8u16s; 3064 typedef FilterNoVec FilterVec_32f; 3065 3066 #endif 3067 3068 3069 template<typename ST, typename DT, class VecOp> struct RowFilter : public BaseRowFilter 3070 { 3071 RowFilter( const Mat& _kernel, int _anchor, const VecOp& _vecOp=VecOp() ) 3072 { 3073 if( _kernel.isContinuous() ) 3074 kernel = _kernel; 3075 else 3076 _kernel.copyTo(kernel); 3077 anchor = _anchor; 3078 ksize = kernel.rows + kernel.cols - 1; 3079 CV_Assert( kernel.type() == DataType<DT>::type && 3080 (kernel.rows == 1 || kernel.cols == 1)); 3081 vecOp = _vecOp; 3082 } 3083 3084 void operator()(const uchar* src, uchar* dst, int width, int cn) 3085 { 3086 int _ksize = ksize; 3087 const DT* kx = kernel.ptr<DT>(); 3088 const ST* S; 3089 DT* D = (DT*)dst; 3090 int i, k; 3091 3092 i = vecOp(src, dst, width, cn); 3093 width *= cn; 3094 #if CV_ENABLE_UNROLLED 3095 for( ; i <= width - 4; i += 4 ) 3096 { 3097 S = (const ST*)src + i; 3098 DT f = kx[0]; 3099 DT s0 = f*S[0], s1 = f*S[1], s2 = f*S[2], s3 = f*S[3]; 3100 3101 for( k = 1; k < _ksize; k++ ) 3102 { 3103 S += cn; 3104 f = kx[k]; 3105 s0 += f*S[0]; s1 += f*S[1]; 3106 s2 += f*S[2]; s3 += f*S[3]; 3107 } 3108 3109 D[i] = s0; D[i+1] = s1; 3110 D[i+2] = s2; D[i+3] = s3; 3111 } 3112 #endif 3113 for( ; i < width; i++ ) 3114 { 3115 S = (const ST*)src + i; 3116 DT s0 = kx[0]*S[0]; 3117 for( k = 1; k < _ksize; k++ ) 3118 { 3119 S += cn; 3120 s0 += kx[k]*S[0]; 3121 } 3122 D[i] = s0; 3123 } 3124 } 3125 3126 Mat kernel; 3127 VecOp vecOp; 3128 }; 3129 3130 3131 template<typename ST, typename DT, class VecOp> struct SymmRowSmallFilter : 3132 public RowFilter<ST, DT, VecOp> 3133 { 3134 SymmRowSmallFilter( const Mat& _kernel, int _anchor, int _symmetryType, 3135 const VecOp& _vecOp = VecOp()) 3136 : RowFilter<ST, DT, VecOp>( _kernel, _anchor, _vecOp ) 3137 { 3138 symmetryType = _symmetryType; 3139 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 && this->ksize <= 5 ); 3140 } 3141 3142 void operator()(const uchar* src, uchar* dst, int width, int cn) 3143 { 3144 int ksize2 = this->ksize/2, ksize2n = ksize2*cn; 3145 const DT* kx = this->kernel.template ptr<DT>() + ksize2; 3146 bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; 3147 DT* D = (DT*)dst; 3148 int i = this->vecOp(src, dst, width, cn), j, k; 3149 const ST* S = (const ST*)src + i + ksize2n; 3150 width *= cn; 3151 3152 if( symmetrical ) 3153 { 3154 if( this->ksize == 1 && kx[0] == 1 ) 3155 { 3156 for( ; i <= width - 2; i += 2 ) 3157 { 3158 DT s0 = S[i], s1 = S[i+1]; 3159 D[i] = s0; D[i+1] = s1; 3160 } 3161 S += i; 3162 } 3163 else if( this->ksize == 3 ) 3164 { 3165 if( kx[0] == 2 && kx[1] == 1 ) 3166 for( ; i <= width - 2; i += 2, S += 2 ) 3167 { 3168 DT s0 = S[-cn] + S[0]*2 + S[cn], s1 = S[1-cn] + S[1]*2 + S[1+cn]; 3169 D[i] = s0; D[i+1] = s1; 3170 } 3171 else if( kx[0] == -2 && kx[1] == 1 ) 3172 for( ; i <= width - 2; i += 2, S += 2 ) 3173 { 3174 DT s0 = S[-cn] - S[0]*2 + S[cn], s1 = S[1-cn] - S[1]*2 + S[1+cn]; 3175 D[i] = s0; D[i+1] = s1; 3176 } 3177 else 3178 { 3179 DT k0 = kx[0], k1 = kx[1]; 3180 for( ; i <= width - 2; i += 2, S += 2 ) 3181 { 3182 DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1, s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1; 3183 D[i] = s0; D[i+1] = s1; 3184 } 3185 } 3186 } 3187 else if( this->ksize == 5 ) 3188 { 3189 DT k0 = kx[0], k1 = kx[1], k2 = kx[2]; 3190 if( k0 == -2 && k1 == 0 && k2 == 1 ) 3191 for( ; i <= width - 2; i += 2, S += 2 ) 3192 { 3193 DT s0 = -2*S[0] + S[-cn*2] + S[cn*2]; 3194 DT s1 = -2*S[1] + S[1-cn*2] + S[1+cn*2]; 3195 D[i] = s0; D[i+1] = s1; 3196 } 3197 else 3198 for( ; i <= width - 2; i += 2, S += 2 ) 3199 { 3200 DT s0 = S[0]*k0 + (S[-cn] + S[cn])*k1 + (S[-cn*2] + S[cn*2])*k2; 3201 DT s1 = S[1]*k0 + (S[1-cn] + S[1+cn])*k1 + (S[1-cn*2] + S[1+cn*2])*k2; 3202 D[i] = s0; D[i+1] = s1; 3203 } 3204 } 3205 3206 for( ; i < width; i++, S++ ) 3207 { 3208 DT s0 = kx[0]*S[0]; 3209 for( k = 1, j = cn; k <= ksize2; k++, j += cn ) 3210 s0 += kx[k]*(S[j] + S[-j]); 3211 D[i] = s0; 3212 } 3213 } 3214 else 3215 { 3216 if( this->ksize == 3 ) 3217 { 3218 if( kx[0] == 0 && kx[1] == 1 ) 3219 for( ; i <= width - 2; i += 2, S += 2 ) 3220 { 3221 DT s0 = S[cn] - S[-cn], s1 = S[1+cn] - S[1-cn]; 3222 D[i] = s0; D[i+1] = s1; 3223 } 3224 else 3225 { 3226 DT k1 = kx[1]; 3227 for( ; i <= width - 2; i += 2, S += 2 ) 3228 { 3229 DT s0 = (S[cn] - S[-cn])*k1, s1 = (S[1+cn] - S[1-cn])*k1; 3230 D[i] = s0; D[i+1] = s1; 3231 } 3232 } 3233 } 3234 else if( this->ksize == 5 ) 3235 { 3236 DT k1 = kx[1], k2 = kx[2]; 3237 for( ; i <= width - 2; i += 2, S += 2 ) 3238 { 3239 DT s0 = (S[cn] - S[-cn])*k1 + (S[cn*2] - S[-cn*2])*k2; 3240 DT s1 = (S[1+cn] - S[1-cn])*k1 + (S[1+cn*2] - S[1-cn*2])*k2; 3241 D[i] = s0; D[i+1] = s1; 3242 } 3243 } 3244 3245 for( ; i < width; i++, S++ ) 3246 { 3247 DT s0 = kx[0]*S[0]; 3248 for( k = 1, j = cn; k <= ksize2; k++, j += cn ) 3249 s0 += kx[k]*(S[j] - S[-j]); 3250 D[i] = s0; 3251 } 3252 } 3253 } 3254 3255 int symmetryType; 3256 }; 3257 3258 3259 template<class CastOp, class VecOp> struct ColumnFilter : public BaseColumnFilter 3260 { 3261 typedef typename CastOp::type1 ST; 3262 typedef typename CastOp::rtype DT; 3263 3264 ColumnFilter( const Mat& _kernel, int _anchor, 3265 double _delta, const CastOp& _castOp=CastOp(), 3266 const VecOp& _vecOp=VecOp() ) 3267 { 3268 if( _kernel.isContinuous() ) 3269 kernel = _kernel; 3270 else 3271 _kernel.copyTo(kernel); 3272 anchor = _anchor; 3273 ksize = kernel.rows + kernel.cols - 1; 3274 delta = saturate_cast<ST>(_delta); 3275 castOp0 = _castOp; 3276 vecOp = _vecOp; 3277 CV_Assert( kernel.type() == DataType<ST>::type && 3278 (kernel.rows == 1 || kernel.cols == 1)); 3279 } 3280 3281 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) 3282 { 3283 const ST* ky = kernel.template ptr<ST>(); 3284 ST _delta = delta; 3285 int _ksize = ksize; 3286 int i, k; 3287 CastOp castOp = castOp0; 3288 3289 for( ; count--; dst += dststep, src++ ) 3290 { 3291 DT* D = (DT*)dst; 3292 i = vecOp(src, dst, width); 3293 #if CV_ENABLE_UNROLLED 3294 for( ; i <= width - 4; i += 4 ) 3295 { 3296 ST f = ky[0]; 3297 const ST* S = (const ST*)src[0] + i; 3298 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta, 3299 s2 = f*S[2] + _delta, s3 = f*S[3] + _delta; 3300 3301 for( k = 1; k < _ksize; k++ ) 3302 { 3303 S = (const ST*)src[k] + i; f = ky[k]; 3304 s0 += f*S[0]; s1 += f*S[1]; 3305 s2 += f*S[2]; s3 += f*S[3]; 3306 } 3307 3308 D[i] = castOp(s0); D[i+1] = castOp(s1); 3309 D[i+2] = castOp(s2); D[i+3] = castOp(s3); 3310 } 3311 #endif 3312 for( ; i < width; i++ ) 3313 { 3314 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta; 3315 for( k = 1; k < _ksize; k++ ) 3316 s0 += ky[k]*((const ST*)src[k])[i]; 3317 D[i] = castOp(s0); 3318 } 3319 } 3320 } 3321 3322 Mat kernel; 3323 CastOp castOp0; 3324 VecOp vecOp; 3325 ST delta; 3326 }; 3327 3328 3329 template<class CastOp, class VecOp> struct SymmColumnFilter : public ColumnFilter<CastOp, VecOp> 3330 { 3331 typedef typename CastOp::type1 ST; 3332 typedef typename CastOp::rtype DT; 3333 3334 SymmColumnFilter( const Mat& _kernel, int _anchor, 3335 double _delta, int _symmetryType, 3336 const CastOp& _castOp=CastOp(), 3337 const VecOp& _vecOp=VecOp()) 3338 : ColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _castOp, _vecOp ) 3339 { 3340 symmetryType = _symmetryType; 3341 CV_Assert( (symmetryType & (KERNEL_SYMMETRICAL | KERNEL_ASYMMETRICAL)) != 0 ); 3342 } 3343 3344 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) 3345 { 3346 int ksize2 = this->ksize/2; 3347 const ST* ky = this->kernel.template ptr<ST>() + ksize2; 3348 int i, k; 3349 bool symmetrical = (symmetryType & KERNEL_SYMMETRICAL) != 0; 3350 ST _delta = this->delta; 3351 CastOp castOp = this->castOp0; 3352 src += ksize2; 3353 3354 if( symmetrical ) 3355 { 3356 for( ; count--; dst += dststep, src++ ) 3357 { 3358 DT* D = (DT*)dst; 3359 i = (this->vecOp)(src, dst, width); 3360 #if CV_ENABLE_UNROLLED 3361 for( ; i <= width - 4; i += 4 ) 3362 { 3363 ST f = ky[0]; 3364 const ST* S = (const ST*)src[0] + i, *S2; 3365 ST s0 = f*S[0] + _delta, s1 = f*S[1] + _delta, 3366 s2 = f*S[2] + _delta, s3 = f*S[3] + _delta; 3367 3368 for( k = 1; k <= ksize2; k++ ) 3369 { 3370 S = (const ST*)src[k] + i; 3371 S2 = (const ST*)src[-k] + i; 3372 f = ky[k]; 3373 s0 += f*(S[0] + S2[0]); 3374 s1 += f*(S[1] + S2[1]); 3375 s2 += f*(S[2] + S2[2]); 3376 s3 += f*(S[3] + S2[3]); 3377 } 3378 3379 D[i] = castOp(s0); D[i+1] = castOp(s1); 3380 D[i+2] = castOp(s2); D[i+3] = castOp(s3); 3381 } 3382 #endif 3383 for( ; i < width; i++ ) 3384 { 3385 ST s0 = ky[0]*((const ST*)src[0])[i] + _delta; 3386 for( k = 1; k <= ksize2; k++ ) 3387 s0 += ky[k]*(((const ST*)src[k])[i] + ((const ST*)src[-k])[i]); 3388 D[i] = castOp(s0); 3389 } 3390 } 3391 } 3392 else 3393 { 3394 for( ; count--; dst += dststep, src++ ) 3395 { 3396 DT* D = (DT*)dst; 3397 i = this->vecOp(src, dst, width); 3398 #if CV_ENABLE_UNROLLED 3399 for( ; i <= width - 4; i += 4 ) 3400 { 3401 ST f = ky[0]; 3402 const ST *S, *S2; 3403 ST s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta; 3404 3405 for( k = 1; k <= ksize2; k++ ) 3406 { 3407 S = (const ST*)src[k] + i; 3408 S2 = (const ST*)src[-k] + i; 3409 f = ky[k]; 3410 s0 += f*(S[0] - S2[0]); 3411 s1 += f*(S[1] - S2[1]); 3412 s2 += f*(S[2] - S2[2]); 3413 s3 += f*(S[3] - S2[3]); 3414 } 3415 3416 D[i] = castOp(s0); D[i+1] = castOp(s1); 3417 D[i+2] = castOp(s2); D[i+3] = castOp(s3); 3418 } 3419 #endif 3420 for( ; i < width; i++ ) 3421 { 3422 ST s0 = _delta; 3423 for( k = 1; k <= ksize2; k++ ) 3424 s0 += ky[k]*(((const ST*)src[k])[i] - ((const ST*)src[-k])[i]); 3425 D[i] = castOp(s0); 3426 } 3427 } 3428 } 3429 } 3430 3431 int symmetryType; 3432 }; 3433 3434 3435 template<class CastOp, class VecOp> 3436 struct SymmColumnSmallFilter : public SymmColumnFilter<CastOp, VecOp> 3437 { 3438 typedef typename CastOp::type1 ST; 3439 typedef typename CastOp::rtype DT; 3440 3441 SymmColumnSmallFilter( const Mat& _kernel, int _anchor, 3442 double _delta, int _symmetryType, 3443 const CastOp& _castOp=CastOp(), 3444 const VecOp& _vecOp=VecOp()) 3445 : SymmColumnFilter<CastOp, VecOp>( _kernel, _anchor, _delta, _symmetryType, _castOp, _vecOp ) 3446 { 3447 CV_Assert( this->ksize == 3 ); 3448 } 3449 3450 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width) 3451 { 3452 int ksize2 = this->ksize/2; 3453 const ST* ky = this->kernel.template ptr<ST>() + ksize2; 3454 int i; 3455 bool symmetrical = (this->symmetryType & KERNEL_SYMMETRICAL) != 0; 3456 bool is_1_2_1 = ky[0] == 2 && ky[1] == 1; 3457 bool is_1_m2_1 = ky[0] == -2 && ky[1] == 1; 3458 bool is_m1_0_1 = ky[0] == 0 && (ky[1] == 1 || ky[1] == -1); 3459 ST f0 = ky[0], f1 = ky[1]; 3460 ST _delta = this->delta; 3461 CastOp castOp = this->castOp0; 3462 src += ksize2; 3463 3464 for( ; count--; dst += dststep, src++ ) 3465 { 3466 DT* D = (DT*)dst; 3467 i = (this->vecOp)(src, dst, width); 3468 const ST* S0 = (const ST*)src[-1]; 3469 const ST* S1 = (const ST*)src[0]; 3470 const ST* S2 = (const ST*)src[1]; 3471 3472 if( symmetrical ) 3473 { 3474 if( is_1_2_1 ) 3475 { 3476 #if CV_ENABLE_UNROLLED 3477 for( ; i <= width - 4; i += 4 ) 3478 { 3479 ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta; 3480 ST s1 = S0[i+1] + S1[i+1]*2 + S2[i+1] + _delta; 3481 D[i] = castOp(s0); 3482 D[i+1] = castOp(s1); 3483 3484 s0 = S0[i+2] + S1[i+2]*2 + S2[i+2] + _delta; 3485 s1 = S0[i+3] + S1[i+3]*2 + S2[i+3] + _delta; 3486 D[i+2] = castOp(s0); 3487 D[i+3] = castOp(s1); 3488 } 3489 #endif 3490 for( ; i < width; i ++ ) 3491 { 3492 ST s0 = S0[i] + S1[i]*2 + S2[i] + _delta; 3493 D[i] = castOp(s0); 3494 } 3495 } 3496 else if( is_1_m2_1 ) 3497 { 3498 #if CV_ENABLE_UNROLLED 3499 for( ; i <= width - 4; i += 4 ) 3500 { 3501 ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta; 3502 ST s1 = S0[i+1] - S1[i+1]*2 + S2[i+1] + _delta; 3503 D[i] = castOp(s0); 3504 D[i+1] = castOp(s1); 3505 3506 s0 = S0[i+2] - S1[i+2]*2 + S2[i+2] + _delta; 3507 s1 = S0[i+3] - S1[i+3]*2 + S2[i+3] + _delta; 3508 D[i+2] = castOp(s0); 3509 D[i+3] = castOp(s1); 3510 } 3511 #endif 3512 for( ; i < width; i ++ ) 3513 { 3514 ST s0 = S0[i] - S1[i]*2 + S2[i] + _delta; 3515 D[i] = castOp(s0); 3516 } 3517 } 3518 else 3519 { 3520 #if CV_ENABLE_UNROLLED 3521 for( ; i <= width - 4; i += 4 ) 3522 { 3523 ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta; 3524 ST s1 = (S0[i+1] + S2[i+1])*f1 + S1[i+1]*f0 + _delta; 3525 D[i] = castOp(s0); 3526 D[i+1] = castOp(s1); 3527 3528 s0 = (S0[i+2] + S2[i+2])*f1 + S1[i+2]*f0 + _delta; 3529 s1 = (S0[i+3] + S2[i+3])*f1 + S1[i+3]*f0 + _delta; 3530 D[i+2] = castOp(s0); 3531 D[i+3] = castOp(s1); 3532 } 3533 #endif 3534 for( ; i < width; i ++ ) 3535 { 3536 ST s0 = (S0[i] + S2[i])*f1 + S1[i]*f0 + _delta; 3537 D[i] = castOp(s0); 3538 } 3539 } 3540 } 3541 else 3542 { 3543 if( is_m1_0_1 ) 3544 { 3545 if( f1 < 0 ) 3546 std::swap(S0, S2); 3547 #if CV_ENABLE_UNROLLED 3548 for( ; i <= width - 4; i += 4 ) 3549 { 3550 ST s0 = S2[i] - S0[i] + _delta; 3551 ST s1 = S2[i+1] - S0[i+1] + _delta; 3552 D[i] = castOp(s0); 3553 D[i+1] = castOp(s1); 3554 3555 s0 = S2[i+2] - S0[i+2] + _delta; 3556 s1 = S2[i+3] - S0[i+3] + _delta; 3557 D[i+2] = castOp(s0); 3558 D[i+3] = castOp(s1); 3559 } 3560 #endif 3561 for( ; i < width; i ++ ) 3562 { 3563 ST s0 = S2[i] - S0[i] + _delta; 3564 D[i] = castOp(s0); 3565 } 3566 if( f1 < 0 ) 3567 std::swap(S0, S2); 3568 } 3569 else 3570 { 3571 #if CV_ENABLE_UNROLLED 3572 for( ; i <= width - 4; i += 4 ) 3573 { 3574 ST s0 = (S2[i] - S0[i])*f1 + _delta; 3575 ST s1 = (S2[i+1] - S0[i+1])*f1 + _delta; 3576 D[i] = castOp(s0); 3577 D[i+1] = castOp(s1); 3578 3579 s0 = (S2[i+2] - S0[i+2])*f1 + _delta; 3580 s1 = (S2[i+3] - S0[i+3])*f1 + _delta; 3581 D[i+2] = castOp(s0); 3582 D[i+3] = castOp(s1); 3583 } 3584 #endif 3585 for( ; i < width; i++ ) 3586 D[i] = castOp((S2[i] - S0[i])*f1 + _delta); 3587 } 3588 } 3589 } 3590 } 3591 }; 3592 3593 template<typename ST, typename DT> struct Cast 3594 { 3595 typedef ST type1; 3596 typedef DT rtype; 3597 3598 DT operator()(ST val) const { return saturate_cast<DT>(val); } 3599 }; 3600 3601 template<typename ST, typename DT, int bits> struct FixedPtCast 3602 { 3603 typedef ST type1; 3604 typedef DT rtype; 3605 enum { SHIFT = bits, DELTA = 1 << (bits-1) }; 3606 3607 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); } 3608 }; 3609 3610 template<typename ST, typename DT> struct FixedPtCastEx 3611 { 3612 typedef ST type1; 3613 typedef DT rtype; 3614 3615 FixedPtCastEx() : SHIFT(0), DELTA(0) {} 3616 FixedPtCastEx(int bits) : SHIFT(bits), DELTA(bits ? 1 << (bits-1) : 0) {} 3617 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); } 3618 int SHIFT, DELTA; 3619 }; 3620 3621 } 3622 3623 cv::Ptr<cv::BaseRowFilter> cv::getLinearRowFilter( int srcType, int bufType, 3624 InputArray _kernel, int anchor, 3625 int symmetryType ) 3626 { 3627 Mat kernel = _kernel.getMat(); 3628 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(bufType); 3629 int cn = CV_MAT_CN(srcType); 3630 CV_Assert( cn == CV_MAT_CN(bufType) && 3631 ddepth >= std::max(sdepth, CV_32S) && 3632 kernel.type() == ddepth ); 3633 int ksize = kernel.rows + kernel.cols - 1; 3634 3635 if( (symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) != 0 && ksize <= 5 ) 3636 { 3637 if( sdepth == CV_8U && ddepth == CV_32S ) 3638 return makePtr<SymmRowSmallFilter<uchar, int, SymmRowSmallVec_8u32s> > 3639 (kernel, anchor, symmetryType, SymmRowSmallVec_8u32s(kernel, symmetryType)); 3640 if( sdepth == CV_32F && ddepth == CV_32F ) 3641 return makePtr<SymmRowSmallFilter<float, float, SymmRowSmallVec_32f> > 3642 (kernel, anchor, symmetryType, SymmRowSmallVec_32f(kernel, symmetryType)); 3643 } 3644 3645 if( sdepth == CV_8U && ddepth == CV_32S ) 3646 return makePtr<RowFilter<uchar, int, RowVec_8u32s> > 3647 (kernel, anchor, RowVec_8u32s(kernel)); 3648 if( sdepth == CV_8U && ddepth == CV_32F ) 3649 return makePtr<RowFilter<uchar, float, RowNoVec> >(kernel, anchor); 3650 if( sdepth == CV_8U && ddepth == CV_64F ) 3651 return makePtr<RowFilter<uchar, double, RowNoVec> >(kernel, anchor); 3652 if( sdepth == CV_16U && ddepth == CV_32F ) 3653 return makePtr<RowFilter<ushort, float, RowNoVec> >(kernel, anchor); 3654 if( sdepth == CV_16U && ddepth == CV_64F ) 3655 return makePtr<RowFilter<ushort, double, RowNoVec> >(kernel, anchor); 3656 if( sdepth == CV_16S && ddepth == CV_32F ) 3657 return makePtr<RowFilter<short, float, RowVec_16s32f> > 3658 (kernel, anchor, RowVec_16s32f(kernel)); 3659 if( sdepth == CV_16S && ddepth == CV_64F ) 3660 return makePtr<RowFilter<short, double, RowNoVec> >(kernel, anchor); 3661 if( sdepth == CV_32F && ddepth == CV_32F ) 3662 return makePtr<RowFilter<float, float, RowVec_32f> > 3663 (kernel, anchor, RowVec_32f(kernel)); 3664 if( sdepth == CV_32F && ddepth == CV_64F ) 3665 return makePtr<RowFilter<float, double, RowNoVec> >(kernel, anchor); 3666 if( sdepth == CV_64F && ddepth == CV_64F ) 3667 return makePtr<RowFilter<double, double, RowNoVec> >(kernel, anchor); 3668 3669 CV_Error_( CV_StsNotImplemented, 3670 ("Unsupported combination of source format (=%d), and buffer format (=%d)", 3671 srcType, bufType)); 3672 3673 return Ptr<BaseRowFilter>(); 3674 } 3675 3676 3677 cv::Ptr<cv::BaseColumnFilter> cv::getLinearColumnFilter( int bufType, int dstType, 3678 InputArray _kernel, int anchor, 3679 int symmetryType, double delta, 3680 int bits ) 3681 { 3682 Mat kernel = _kernel.getMat(); 3683 int sdepth = CV_MAT_DEPTH(bufType), ddepth = CV_MAT_DEPTH(dstType); 3684 int cn = CV_MAT_CN(dstType); 3685 CV_Assert( cn == CV_MAT_CN(bufType) && 3686 sdepth >= std::max(ddepth, CV_32S) && 3687 kernel.type() == sdepth ); 3688 3689 if( !(symmetryType & (KERNEL_SYMMETRICAL|KERNEL_ASYMMETRICAL)) ) 3690 { 3691 if( ddepth == CV_8U && sdepth == CV_32S ) 3692 return makePtr<ColumnFilter<FixedPtCastEx<int, uchar>, ColumnNoVec> > 3693 (kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits)); 3694 if( ddepth == CV_8U && sdepth == CV_32F ) 3695 return makePtr<ColumnFilter<Cast<float, uchar>, ColumnNoVec> >(kernel, anchor, delta); 3696 if( ddepth == CV_8U && sdepth == CV_64F ) 3697 return makePtr<ColumnFilter<Cast<double, uchar>, ColumnNoVec> >(kernel, anchor, delta); 3698 if( ddepth == CV_16U && sdepth == CV_32F ) 3699 return makePtr<ColumnFilter<Cast<float, ushort>, ColumnNoVec> >(kernel, anchor, delta); 3700 if( ddepth == CV_16U && sdepth == CV_64F ) 3701 return makePtr<ColumnFilter<Cast<double, ushort>, ColumnNoVec> >(kernel, anchor, delta); 3702 if( ddepth == CV_16S && sdepth == CV_32F ) 3703 return makePtr<ColumnFilter<Cast<float, short>, ColumnNoVec> >(kernel, anchor, delta); 3704 if( ddepth == CV_16S && sdepth == CV_64F ) 3705 return makePtr<ColumnFilter<Cast<double, short>, ColumnNoVec> >(kernel, anchor, delta); 3706 if( ddepth == CV_32F && sdepth == CV_32F ) 3707 return makePtr<ColumnFilter<Cast<float, float>, ColumnNoVec> >(kernel, anchor, delta); 3708 if( ddepth == CV_64F && sdepth == CV_64F ) 3709 return makePtr<ColumnFilter<Cast<double, double>, ColumnNoVec> >(kernel, anchor, delta); 3710 } 3711 else 3712 { 3713 int ksize = kernel.rows + kernel.cols - 1; 3714 if( ksize == 3 ) 3715 { 3716 if( ddepth == CV_8U && sdepth == CV_32S ) 3717 return makePtr<SymmColumnSmallFilter< 3718 FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> > 3719 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits), 3720 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)); 3721 if( ddepth == CV_16S && sdepth == CV_32S && bits == 0 ) 3722 return makePtr<SymmColumnSmallFilter<Cast<int, short>, 3723 SymmColumnSmallVec_32s16s> >(kernel, anchor, delta, symmetryType, 3724 Cast<int, short>(), SymmColumnSmallVec_32s16s(kernel, symmetryType, bits, delta)); 3725 if( ddepth == CV_32F && sdepth == CV_32F ) 3726 return makePtr<SymmColumnSmallFilter< 3727 Cast<float, float>,SymmColumnSmallVec_32f> > 3728 (kernel, anchor, delta, symmetryType, Cast<float, float>(), 3729 SymmColumnSmallVec_32f(kernel, symmetryType, 0, delta)); 3730 } 3731 if( ddepth == CV_8U && sdepth == CV_32S ) 3732 return makePtr<SymmColumnFilter<FixedPtCastEx<int, uchar>, SymmColumnVec_32s8u> > 3733 (kernel, anchor, delta, symmetryType, FixedPtCastEx<int, uchar>(bits), 3734 SymmColumnVec_32s8u(kernel, symmetryType, bits, delta)); 3735 if( ddepth == CV_8U && sdepth == CV_32F ) 3736 return makePtr<SymmColumnFilter<Cast<float, uchar>, ColumnNoVec> > 3737 (kernel, anchor, delta, symmetryType); 3738 if( ddepth == CV_8U && sdepth == CV_64F ) 3739 return makePtr<SymmColumnFilter<Cast<double, uchar>, ColumnNoVec> > 3740 (kernel, anchor, delta, symmetryType); 3741 if( ddepth == CV_16U && sdepth == CV_32F ) 3742 return makePtr<SymmColumnFilter<Cast<float, ushort>, ColumnNoVec> > 3743 (kernel, anchor, delta, symmetryType); 3744 if( ddepth == CV_16U && sdepth == CV_64F ) 3745 return makePtr<SymmColumnFilter<Cast<double, ushort>, ColumnNoVec> > 3746 (kernel, anchor, delta, symmetryType); 3747 if( ddepth == CV_16S && sdepth == CV_32S ) 3748 return makePtr<SymmColumnFilter<Cast<int, short>, ColumnNoVec> > 3749 (kernel, anchor, delta, symmetryType); 3750 if( ddepth == CV_16S && sdepth == CV_32F ) 3751 return makePtr<SymmColumnFilter<Cast<float, short>, SymmColumnVec_32f16s> > 3752 (kernel, anchor, delta, symmetryType, Cast<float, short>(), 3753 SymmColumnVec_32f16s(kernel, symmetryType, 0, delta)); 3754 if( ddepth == CV_16S && sdepth == CV_64F ) 3755 return makePtr<SymmColumnFilter<Cast<double, short>, ColumnNoVec> > 3756 (kernel, anchor, delta, symmetryType); 3757 if( ddepth == CV_32F && sdepth == CV_32F ) 3758 return makePtr<SymmColumnFilter<Cast<float, float>, SymmColumnVec_32f> > 3759 (kernel, anchor, delta, symmetryType, Cast<float, float>(), 3760 SymmColumnVec_32f(kernel, symmetryType, 0, delta)); 3761 if( ddepth == CV_64F && sdepth == CV_64F ) 3762 return makePtr<SymmColumnFilter<Cast<double, double>, ColumnNoVec> > 3763 (kernel, anchor, delta, symmetryType); 3764 } 3765 3766 CV_Error_( CV_StsNotImplemented, 3767 ("Unsupported combination of buffer format (=%d), and destination format (=%d)", 3768 bufType, dstType)); 3769 3770 return Ptr<BaseColumnFilter>(); 3771 } 3772 3773 3774 cv::Ptr<cv::FilterEngine> cv::createSeparableLinearFilter( 3775 int _srcType, int _dstType, 3776 InputArray __rowKernel, InputArray __columnKernel, 3777 Point _anchor, double _delta, 3778 int _rowBorderType, int _columnBorderType, 3779 const Scalar& _borderValue ) 3780 { 3781 Mat _rowKernel = __rowKernel.getMat(), _columnKernel = __columnKernel.getMat(); 3782 _srcType = CV_MAT_TYPE(_srcType); 3783 _dstType = CV_MAT_TYPE(_dstType); 3784 int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); 3785 int cn = CV_MAT_CN(_srcType); 3786 CV_Assert( cn == CV_MAT_CN(_dstType) ); 3787 int rsize = _rowKernel.rows + _rowKernel.cols - 1; 3788 int csize = _columnKernel.rows + _columnKernel.cols - 1; 3789 if( _anchor.x < 0 ) 3790 _anchor.x = rsize/2; 3791 if( _anchor.y < 0 ) 3792 _anchor.y = csize/2; 3793 int rtype = getKernelType(_rowKernel, 3794 _rowKernel.rows == 1 ? Point(_anchor.x, 0) : Point(0, _anchor.x)); 3795 int ctype = getKernelType(_columnKernel, 3796 _columnKernel.rows == 1 ? Point(_anchor.y, 0) : Point(0, _anchor.y)); 3797 Mat rowKernel, columnKernel; 3798 3799 int bdepth = std::max(CV_32F,std::max(sdepth, ddepth)); 3800 int bits = 0; 3801 3802 if( sdepth == CV_8U && 3803 ((rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && 3804 ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && 3805 ddepth == CV_8U) || 3806 ((rtype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && 3807 (ctype & (KERNEL_SYMMETRICAL+KERNEL_ASYMMETRICAL)) && 3808 (rtype & ctype & KERNEL_INTEGER) && 3809 ddepth == CV_16S)) ) 3810 { 3811 bdepth = CV_32S; 3812 bits = ddepth == CV_8U ? 8 : 0; 3813 _rowKernel.convertTo( rowKernel, CV_32S, 1 << bits ); 3814 _columnKernel.convertTo( columnKernel, CV_32S, 1 << bits ); 3815 bits *= 2; 3816 _delta *= (1 << bits); 3817 } 3818 else 3819 { 3820 if( _rowKernel.type() != bdepth ) 3821 _rowKernel.convertTo( rowKernel, bdepth ); 3822 else 3823 rowKernel = _rowKernel; 3824 if( _columnKernel.type() != bdepth ) 3825 _columnKernel.convertTo( columnKernel, bdepth ); 3826 else 3827 columnKernel = _columnKernel; 3828 } 3829 3830 int _bufType = CV_MAKETYPE(bdepth, cn); 3831 Ptr<BaseRowFilter> _rowFilter = getLinearRowFilter( 3832 _srcType, _bufType, rowKernel, _anchor.x, rtype); 3833 Ptr<BaseColumnFilter> _columnFilter = getLinearColumnFilter( 3834 _bufType, _dstType, columnKernel, _anchor.y, ctype, _delta, bits ); 3835 3836 return Ptr<FilterEngine>( new FilterEngine(Ptr<BaseFilter>(), _rowFilter, _columnFilter, 3837 _srcType, _dstType, _bufType, _rowBorderType, _columnBorderType, _borderValue )); 3838 } 3839 3840 3841 /****************************************************************************************\ 3842 * Non-separable linear filter * 3843 \****************************************************************************************/ 3844 3845 namespace cv 3846 { 3847 3848 void preprocess2DKernel( const Mat& kernel, std::vector<Point>& coords, std::vector<uchar>& coeffs ) 3849 { 3850 int i, j, k, nz = countNonZero(kernel), ktype = kernel.type(); 3851 if(nz == 0) 3852 nz = 1; 3853 CV_Assert( ktype == CV_8U || ktype == CV_32S || ktype == CV_32F || ktype == CV_64F ); 3854 coords.resize(nz); 3855 coeffs.resize(nz*getElemSize(ktype)); 3856 uchar* _coeffs = &coeffs[0]; 3857 3858 for( i = k = 0; i < kernel.rows; i++ ) 3859 { 3860 const uchar* krow = kernel.ptr(i); 3861 for( j = 0; j < kernel.cols; j++ ) 3862 { 3863 if( ktype == CV_8U ) 3864 { 3865 uchar val = krow[j]; 3866 if( val == 0 ) 3867 continue; 3868 coords[k] = Point(j,i); 3869 _coeffs[k++] = val; 3870 } 3871 else if( ktype == CV_32S ) 3872 { 3873 int val = ((const int*)krow)[j]; 3874 if( val == 0 ) 3875 continue; 3876 coords[k] = Point(j,i); 3877 ((int*)_coeffs)[k++] = val; 3878 } 3879 else if( ktype == CV_32F ) 3880 { 3881 float val = ((const float*)krow)[j]; 3882 if( val == 0 ) 3883 continue; 3884 coords[k] = Point(j,i); 3885 ((float*)_coeffs)[k++] = val; 3886 } 3887 else 3888 { 3889 double val = ((const double*)krow)[j]; 3890 if( val == 0 ) 3891 continue; 3892 coords[k] = Point(j,i); 3893 ((double*)_coeffs)[k++] = val; 3894 } 3895 } 3896 } 3897 } 3898 3899 3900 template<typename ST, class CastOp, class VecOp> struct Filter2D : public BaseFilter 3901 { 3902 typedef typename CastOp::type1 KT; 3903 typedef typename CastOp::rtype DT; 3904 3905 Filter2D( const Mat& _kernel, Point _anchor, 3906 double _delta, const CastOp& _castOp=CastOp(), 3907 const VecOp& _vecOp=VecOp() ) 3908 { 3909 anchor = _anchor; 3910 ksize = _kernel.size(); 3911 delta = saturate_cast<KT>(_delta); 3912 castOp0 = _castOp; 3913 vecOp = _vecOp; 3914 CV_Assert( _kernel.type() == DataType<KT>::type ); 3915 preprocess2DKernel( _kernel, coords, coeffs ); 3916 ptrs.resize( coords.size() ); 3917 } 3918 3919 void operator()(const uchar** src, uchar* dst, int dststep, int count, int width, int cn) 3920 { 3921 KT _delta = delta; 3922 const Point* pt = &coords[0]; 3923 const KT* kf = (const KT*)&coeffs[0]; 3924 const ST** kp = (const ST**)&ptrs[0]; 3925 int i, k, nz = (int)coords.size(); 3926 CastOp castOp = castOp0; 3927 3928 width *= cn; 3929 for( ; count > 0; count--, dst += dststep, src++ ) 3930 { 3931 DT* D = (DT*)dst; 3932 3933 for( k = 0; k < nz; k++ ) 3934 kp[k] = (const ST*)src[pt[k].y] + pt[k].x*cn; 3935 3936 i = vecOp((const uchar**)kp, dst, width); 3937 #if CV_ENABLE_UNROLLED 3938 for( ; i <= width - 4; i += 4 ) 3939 { 3940 KT s0 = _delta, s1 = _delta, s2 = _delta, s3 = _delta; 3941 3942 for( k = 0; k < nz; k++ ) 3943 { 3944 const ST* sptr = kp[k] + i; 3945 KT f = kf[k]; 3946 s0 += f*sptr[0]; 3947 s1 += f*sptr[1]; 3948 s2 += f*sptr[2]; 3949 s3 += f*sptr[3]; 3950 } 3951 3952 D[i] = castOp(s0); D[i+1] = castOp(s1); 3953 D[i+2] = castOp(s2); D[i+3] = castOp(s3); 3954 } 3955 #endif 3956 for( ; i < width; i++ ) 3957 { 3958 KT s0 = _delta; 3959 for( k = 0; k < nz; k++ ) 3960 s0 += kf[k]*kp[k][i]; 3961 D[i] = castOp(s0); 3962 } 3963 } 3964 } 3965 3966 std::vector<Point> coords; 3967 std::vector<uchar> coeffs; 3968 std::vector<uchar*> ptrs; 3969 KT delta; 3970 CastOp castOp0; 3971 VecOp vecOp; 3972 }; 3973 3974 #ifdef HAVE_OPENCL 3975 3976 #define DIVUP(total, grain) (((total) + (grain) - 1) / (grain)) 3977 #define ROUNDUP(sz, n) ((sz) + (n) - 1 - (((sz) + (n) - 1) % (n))) 3978 3979 // prepare kernel: transpose and make double rows (+align). Returns size of aligned row 3980 // Samples: 3981 // a b c 3982 // Input: d e f 3983 // g h i 3984 // Output, last two zeros is the alignment: 3985 // a d g a d g 0 0 3986 // b e h b e h 0 0 3987 // c f i c f i 0 0 3988 template <typename T> 3989 static int _prepareKernelFilter2D(std::vector<T> & data, const Mat & kernel) 3990 { 3991 Mat _kernel; kernel.convertTo(_kernel, DataDepth<T>::value); 3992 int size_y_aligned = ROUNDUP(kernel.rows * 2, 4); 3993 data.clear(); data.resize(size_y_aligned * kernel.cols, 0); 3994 for (int x = 0; x < kernel.cols; x++) 3995 { 3996 for (int y = 0; y < kernel.rows; y++) 3997 { 3998 data[x * size_y_aligned + y] = _kernel.at<T>(y, x); 3999 data[x * size_y_aligned + y + kernel.rows] = _kernel.at<T>(y, x); 4000 } 4001 } 4002 return size_y_aligned; 4003 } 4004 4005 static bool ocl_filter2D( InputArray _src, OutputArray _dst, int ddepth, 4006 InputArray _kernel, Point anchor, 4007 double delta, int borderType ) 4008 { 4009 int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 4010 ddepth = ddepth < 0 ? sdepth : ddepth; 4011 int dtype = CV_MAKE_TYPE(ddepth, cn), wdepth = std::max(std::max(sdepth, ddepth), CV_32F), 4012 wtype = CV_MAKE_TYPE(wdepth, cn); 4013 if (cn > 4) 4014 return false; 4015 4016 Size ksize = _kernel.size(); 4017 if (anchor.x < 0) 4018 anchor.x = ksize.width / 2; 4019 if (anchor.y < 0) 4020 anchor.y = ksize.height / 2; 4021 4022 bool isolated = (borderType & BORDER_ISOLATED) != 0; 4023 borderType &= ~BORDER_ISOLATED; 4024 const cv::ocl::Device &device = cv::ocl::Device::getDefault(); 4025 bool doubleSupport = device.doubleFPConfig() > 0; 4026 if (wdepth == CV_64F && !doubleSupport) 4027 return false; 4028 4029 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", 4030 "BORDER_WRAP", "BORDER_REFLECT_101" }; 4031 4032 cv::Mat kernelMat = _kernel.getMat(); 4033 cv::Size sz = _src.size(), wholeSize; 4034 size_t globalsize[2] = { sz.width, sz.height }; 4035 size_t localsize_general[2] = {0, 1}; 4036 size_t* localsize = NULL; 4037 4038 ocl::Kernel k; 4039 UMat src = _src.getUMat(); 4040 if (!isolated) 4041 { 4042 Point ofs; 4043 src.locateROI(wholeSize, ofs); 4044 } 4045 4046 size_t tryWorkItems = device.maxWorkGroupSize(); 4047 if (device.isIntel() && 128 < tryWorkItems) 4048 tryWorkItems = 128; 4049 char cvt[2][40]; 4050 4051 // For smaller filter kernels, there is a special kernel that is more 4052 // efficient than the general one. 4053 UMat kernalDataUMat; 4054 if (device.isIntel() && (device.type() & ocl::Device::TYPE_GPU) && 4055 ((ksize.width < 5 && ksize.height < 5) || 4056 (ksize.width == 5 && ksize.height == 5 && cn == 1))) 4057 { 4058 kernelMat = kernelMat.reshape(0, 1); 4059 String kerStr = ocl::kernelToStr(kernelMat, CV_32F); 4060 int h = isolated ? sz.height : wholeSize.height; 4061 int w = isolated ? sz.width : wholeSize.width; 4062 4063 if (w < ksize.width || h < ksize.height) 4064 return false; 4065 4066 // Figure out what vector size to use for loading the pixels. 4067 int pxLoadNumPixels = cn != 1 || sz.width % 4 ? 1 : 4; 4068 int pxLoadVecSize = cn * pxLoadNumPixels; 4069 4070 // Figure out how many pixels per work item to compute in X and Y 4071 // directions. Too many and we run out of registers. 4072 int pxPerWorkItemX = 1; 4073 int pxPerWorkItemY = 1; 4074 if (cn <= 2 && ksize.width <= 4 && ksize.height <= 4) 4075 { 4076 pxPerWorkItemX = sz.width % 8 ? sz.width % 4 ? sz.width % 2 ? 1 : 2 : 4 : 8; 4077 pxPerWorkItemY = sz.height % 2 ? 1 : 2; 4078 } 4079 else if (cn < 4 || (ksize.width <= 4 && ksize.height <= 4)) 4080 { 4081 pxPerWorkItemX = sz.width % 2 ? 1 : 2; 4082 pxPerWorkItemY = sz.height % 2 ? 1 : 2; 4083 } 4084 globalsize[0] = sz.width / pxPerWorkItemX; 4085 globalsize[1] = sz.height / pxPerWorkItemY; 4086 4087 // Need some padding in the private array for pixels 4088 int privDataWidth = ROUNDUP(pxPerWorkItemX + ksize.width - 1, pxLoadNumPixels); 4089 4090 // Make the global size a nice round number so the runtime can pick 4091 // from reasonable choices for the workgroup size 4092 const int wgRound = 256; 4093 globalsize[0] = ROUNDUP(globalsize[0], wgRound); 4094 4095 char build_options[1024]; 4096 sprintf(build_options, "-D cn=%d " 4097 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " 4098 "-D PX_LOAD_VEC_SIZE=%d -D PX_LOAD_NUM_PX=%d " 4099 "-D PX_PER_WI_X=%d -D PX_PER_WI_Y=%d -D PRIV_DATA_WIDTH=%d -D %s -D %s " 4100 "-D PX_LOAD_X_ITERATIONS=%d -D PX_LOAD_Y_ITERATIONS=%d " 4101 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " 4102 "-D convertToWT=%s -D convertToDstT=%s %s", 4103 cn, anchor.x, anchor.y, ksize.width, ksize.height, 4104 pxLoadVecSize, pxLoadNumPixels, 4105 pxPerWorkItemX, pxPerWorkItemY, privDataWidth, borderMap[borderType], 4106 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", 4107 privDataWidth / pxLoadNumPixels, pxPerWorkItemY + ksize.height - 1, 4108 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), 4109 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), 4110 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), 4111 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), kerStr.c_str()); 4112 4113 if (!k.create("filter2DSmall", cv::ocl::imgproc::filter2DSmall_oclsrc, build_options)) 4114 return false; 4115 } 4116 else 4117 { 4118 localsize = localsize_general; 4119 std::vector<float> kernelMatDataFloat; 4120 int kernel_size_y2_aligned = _prepareKernelFilter2D<float>(kernelMatDataFloat, kernelMat); 4121 String kerStr = ocl::kernelToStr(kernelMatDataFloat, CV_32F); 4122 4123 for ( ; ; ) 4124 { 4125 size_t BLOCK_SIZE = tryWorkItems; 4126 while (BLOCK_SIZE > 32 && BLOCK_SIZE >= (size_t)ksize.width * 2 && BLOCK_SIZE > (size_t)sz.width * 2) 4127 BLOCK_SIZE /= 2; 4128 4129 if ((size_t)ksize.width > BLOCK_SIZE) 4130 return false; 4131 4132 int requiredTop = anchor.y; 4133 int requiredLeft = (int)BLOCK_SIZE; // not this: anchor.x; 4134 int requiredBottom = ksize.height - 1 - anchor.y; 4135 int requiredRight = (int)BLOCK_SIZE; // not this: ksize.width - 1 - anchor.x; 4136 int h = isolated ? sz.height : wholeSize.height; 4137 int w = isolated ? sz.width : wholeSize.width; 4138 bool extra_extrapolation = h < requiredTop || h < requiredBottom || w < requiredLeft || w < requiredRight; 4139 4140 if ((w < ksize.width) || (h < ksize.height)) 4141 return false; 4142 4143 String opts = format("-D LOCAL_SIZE=%d -D cn=%d " 4144 "-D ANCHOR_X=%d -D ANCHOR_Y=%d -D KERNEL_SIZE_X=%d -D KERNEL_SIZE_Y=%d " 4145 "-D KERNEL_SIZE_Y2_ALIGNED=%d -D %s -D %s -D %s%s%s " 4146 "-D srcT=%s -D srcT1=%s -D dstT=%s -D dstT1=%s -D WT=%s -D WT1=%s " 4147 "-D convertToWT=%s -D convertToDstT=%s", 4148 (int)BLOCK_SIZE, cn, anchor.x, anchor.y, 4149 ksize.width, ksize.height, kernel_size_y2_aligned, borderMap[borderType], 4150 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", 4151 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", 4152 doubleSupport ? " -D DOUBLE_SUPPORT" : "", kerStr.c_str(), 4153 ocl::typeToStr(type), ocl::typeToStr(sdepth), ocl::typeToStr(dtype), 4154 ocl::typeToStr(ddepth), ocl::typeToStr(wtype), ocl::typeToStr(wdepth), 4155 ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), 4156 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1])); 4157 4158 localsize[0] = BLOCK_SIZE; 4159 globalsize[0] = DIVUP(sz.width, BLOCK_SIZE - (ksize.width - 1)) * BLOCK_SIZE; 4160 globalsize[1] = sz.height; 4161 4162 if (!k.create("filter2D", cv::ocl::imgproc::filter2D_oclsrc, opts)) 4163 return false; 4164 4165 size_t kernelWorkGroupSize = k.workGroupSize(); 4166 if (localsize[0] <= kernelWorkGroupSize) 4167 break; 4168 if (BLOCK_SIZE < kernelWorkGroupSize) 4169 return false; 4170 tryWorkItems = kernelWorkGroupSize; 4171 } 4172 } 4173 4174 _dst.create(sz, dtype); 4175 UMat dst = _dst.getUMat(); 4176 4177 int srcOffsetX = (int)((src.offset % src.step) / src.elemSize()); 4178 int srcOffsetY = (int)(src.offset / src.step); 4179 int srcEndX = (isolated ? (srcOffsetX + sz.width) : wholeSize.width); 4180 int srcEndY = (isolated ? (srcOffsetY + sz.height) : wholeSize.height); 4181 4182 k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffsetX, srcOffsetY, 4183 srcEndX, srcEndY, ocl::KernelArg::WriteOnly(dst), (float)delta); 4184 4185 return k.run(2, globalsize, localsize, false); 4186 } 4187 4188 const int shift_bits = 8; 4189 4190 static bool ocl_sepRowFilter2D(const UMat & src, UMat & buf, const Mat & kernelX, int anchor, 4191 int borderType, int ddepth, bool fast8uc1, bool int_arithm) 4192 { 4193 int type = src.type(), cn = CV_MAT_CN(type), sdepth = CV_MAT_DEPTH(type); 4194 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 4195 Size bufSize = buf.size(); 4196 int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); 4197 4198 if (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) 4199 return false; 4200 4201 #ifdef ANDROID 4202 size_t localsize[2] = {16, 10}; 4203 #else 4204 size_t localsize[2] = {16, 16}; 4205 #endif 4206 4207 size_t globalsize[2] = {DIVUP(bufSize.width, localsize[0]) * localsize[0], DIVUP(bufSize.height, localsize[1]) * localsize[1]}; 4208 if (fast8uc1) 4209 globalsize[0] = DIVUP((bufSize.width + 3) >> 2, localsize[0]) * localsize[0]; 4210 4211 int radiusX = anchor, radiusY = (buf.rows - src.rows) >> 1; 4212 4213 bool isolated = (borderType & BORDER_ISOLATED) != 0; 4214 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", "BORDER_REFLECT_101" }, 4215 * const btype = borderMap[borderType & ~BORDER_ISOLATED]; 4216 4217 bool extra_extrapolation = src.rows < (int)((-radiusY + globalsize[1]) >> 1) + 1; 4218 extra_extrapolation |= src.rows < radiusY; 4219 extra_extrapolation |= src.cols < (int)((-radiusX + globalsize[0] + 8 * localsize[0] + 3) >> 1) + 1; 4220 extra_extrapolation |= src.cols < radiusX; 4221 4222 char cvt[40]; 4223 cv::String build_options = cv::format("-D RADIUSX=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d -D %s -D %s -D %s" 4224 " -D srcT=%s -D dstT=%s -D convertToDstT=%s -D srcT1=%s -D dstT1=%s%s%s", 4225 radiusX, (int)localsize[0], (int)localsize[1], cn, btype, 4226 extra_extrapolation ? "EXTRA_EXTRAPOLATION" : "NO_EXTRA_EXTRAPOLATION", 4227 isolated ? "BORDER_ISOLATED" : "NO_BORDER_ISOLATED", 4228 ocl::typeToStr(type), ocl::typeToStr(buf_type), 4229 ocl::convertTypeStr(sdepth, bdepth, cn, cvt), 4230 ocl::typeToStr(sdepth), ocl::typeToStr(bdepth), 4231 doubleSupport ? " -D DOUBLE_SUPPORT" : "", 4232 int_arithm ? " -D INTEGER_ARITHMETIC" : ""); 4233 build_options += ocl::kernelToStr(kernelX, bdepth); 4234 4235 Size srcWholeSize; Point srcOffset; 4236 src.locateROI(srcWholeSize, srcOffset); 4237 4238 String kernelName("row_filter"); 4239 if (fast8uc1) 4240 kernelName += "_C1_D0"; 4241 4242 ocl::Kernel k(kernelName.c_str(), cv::ocl::imgproc::filterSepRow_oclsrc, 4243 build_options); 4244 if (k.empty()) 4245 return false; 4246 4247 if (fast8uc1) 4248 k.args(ocl::KernelArg::PtrReadOnly(src), (int)(src.step / src.elemSize()), srcOffset.x, 4249 srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, 4250 ocl::KernelArg::PtrWriteOnly(buf), (int)(buf.step / buf.elemSize()), 4251 buf.cols, buf.rows, radiusY); 4252 else 4253 k.args(ocl::KernelArg::PtrReadOnly(src), (int)src.step, srcOffset.x, 4254 srcOffset.y, src.cols, src.rows, srcWholeSize.width, srcWholeSize.height, 4255 ocl::KernelArg::PtrWriteOnly(buf), (int)buf.step, buf.cols, buf.rows, radiusY); 4256 4257 return k.run(2, globalsize, localsize, false); 4258 } 4259 4260 static bool ocl_sepColFilter2D(const UMat & buf, UMat & dst, const Mat & kernelY, double delta, int anchor, bool int_arithm) 4261 { 4262 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 4263 if (dst.depth() == CV_64F && !doubleSupport) 4264 return false; 4265 4266 #ifdef ANDROID 4267 size_t localsize[2] = { 16, 10 }; 4268 #else 4269 size_t localsize[2] = { 16, 16 }; 4270 #endif 4271 size_t globalsize[2] = { 0, 0 }; 4272 4273 int dtype = dst.type(), cn = CV_MAT_CN(dtype), ddepth = CV_MAT_DEPTH(dtype); 4274 Size sz = dst.size(); 4275 int buf_type = buf.type(), bdepth = CV_MAT_DEPTH(buf_type); 4276 4277 globalsize[1] = DIVUP(sz.height, localsize[1]) * localsize[1]; 4278 globalsize[0] = DIVUP(sz.width, localsize[0]) * localsize[0]; 4279 4280 char cvt[40]; 4281 cv::String build_options = cv::format("-D RADIUSY=%d -D LSIZE0=%d -D LSIZE1=%d -D CN=%d" 4282 " -D srcT=%s -D dstT=%s -D convertToDstT=%s" 4283 " -D srcT1=%s -D dstT1=%s -D SHIFT_BITS=%d%s%s", 4284 anchor, (int)localsize[0], (int)localsize[1], cn, 4285 ocl::typeToStr(buf_type), ocl::typeToStr(dtype), 4286 ocl::convertTypeStr(bdepth, ddepth, cn, cvt), 4287 ocl::typeToStr(bdepth), ocl::typeToStr(ddepth), 4288 2*shift_bits, doubleSupport ? " -D DOUBLE_SUPPORT" : "", 4289 int_arithm ? " -D INTEGER_ARITHMETIC" : ""); 4290 build_options += ocl::kernelToStr(kernelY, bdepth); 4291 4292 ocl::Kernel k("col_filter", cv::ocl::imgproc::filterSepCol_oclsrc, 4293 build_options); 4294 if (k.empty()) 4295 return false; 4296 4297 k.args(ocl::KernelArg::ReadOnly(buf), ocl::KernelArg::WriteOnly(dst), 4298 static_cast<float>(delta)); 4299 4300 return k.run(2, globalsize, localsize, false); 4301 } 4302 4303 const int optimizedSepFilterLocalWidth = 16; 4304 const int optimizedSepFilterLocalHeight = 8; 4305 4306 static bool ocl_sepFilter2D_SinglePass(InputArray _src, OutputArray _dst, 4307 Mat row_kernel, Mat col_kernel, 4308 double delta, int borderType, int ddepth, int bdepth, bool int_arithm) 4309 { 4310 Size size = _src.size(), wholeSize; 4311 Point origin; 4312 int stype = _src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), 4313 esz = CV_ELEM_SIZE(stype), wdepth = std::max(std::max(sdepth, ddepth), bdepth), 4314 dtype = CV_MAKE_TYPE(ddepth, cn); 4315 size_t src_step = _src.step(), src_offset = _src.offset(); 4316 bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0; 4317 4318 if ((src_offset % src_step) % esz != 0 || (!doubleSupport && (sdepth == CV_64F || ddepth == CV_64F)) || 4319 !(borderType == BORDER_CONSTANT || borderType == BORDER_REPLICATE || 4320 borderType == BORDER_REFLECT || borderType == BORDER_WRAP || 4321 borderType == BORDER_REFLECT_101)) 4322 return false; 4323 4324 size_t lt2[2] = { optimizedSepFilterLocalWidth, optimizedSepFilterLocalHeight }; 4325 size_t gt2[2] = { lt2[0] * (1 + (size.width - 1) / lt2[0]), lt2[1]}; 4326 4327 char cvt[2][40]; 4328 const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", 4329 "BORDER_REFLECT_101" }; 4330 4331 String opts = cv::format("-D BLK_X=%d -D BLK_Y=%d -D RADIUSX=%d -D RADIUSY=%d%s%s" 4332 " -D srcT=%s -D convertToWT=%s -D WT=%s -D dstT=%s -D convertToDstT=%s" 4333 " -D %s -D srcT1=%s -D dstT1=%s -D WT1=%s -D CN=%d -D SHIFT_BITS=%d%s", 4334 (int)lt2[0], (int)lt2[1], row_kernel.cols / 2, col_kernel.cols / 2, 4335 ocl::kernelToStr(row_kernel, wdepth, "KERNEL_MATRIX_X").c_str(), 4336 ocl::kernelToStr(col_kernel, wdepth, "KERNEL_MATRIX_Y").c_str(), 4337 ocl::typeToStr(stype), ocl::convertTypeStr(sdepth, wdepth, cn, cvt[0]), 4338 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), ocl::typeToStr(dtype), 4339 ocl::convertTypeStr(wdepth, ddepth, cn, cvt[1]), borderMap[borderType], 4340 ocl::typeToStr(sdepth), ocl::typeToStr(ddepth), ocl::typeToStr(wdepth), 4341 cn, 2*shift_bits, int_arithm ? " -D INTEGER_ARITHMETIC" : ""); 4342 4343 ocl::Kernel k("sep_filter", ocl::imgproc::filterSep_singlePass_oclsrc, opts); 4344 if (k.empty()) 4345 return false; 4346 4347 UMat src = _src.getUMat(); 4348 _dst.create(size, dtype); 4349 UMat dst = _dst.getUMat(); 4350 4351 int src_offset_x = static_cast<int>((src_offset % src_step) / esz); 4352 int src_offset_y = static_cast<int>(src_offset / src_step); 4353 4354 src.locateROI(wholeSize, origin); 4355 4356 k.args(ocl::KernelArg::PtrReadOnly(src), (int)src_step, src_offset_x, src_offset_y, 4357 wholeSize.height, wholeSize.width, ocl::KernelArg::WriteOnly(dst), 4358 static_cast<float>(delta)); 4359 4360 return k.run(2, gt2, lt2, false); 4361 } 4362 4363 static bool ocl_sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, 4364 InputArray _kernelX, InputArray _kernelY, Point anchor, 4365 double delta, int borderType ) 4366 { 4367 const ocl::Device & d = ocl::Device::getDefault(); 4368 Size imgSize = _src.size(); 4369 4370 int type = _src.type(), sdepth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 4371 if (cn > 4) 4372 return false; 4373 4374 Mat kernelX = _kernelX.getMat().reshape(1, 1); 4375 if (kernelX.cols % 2 != 1) 4376 return false; 4377 Mat kernelY = _kernelY.getMat().reshape(1, 1); 4378 if (kernelY.cols % 2 != 1) 4379 return false; 4380 4381 if (ddepth < 0) 4382 ddepth = sdepth; 4383 4384 if (anchor.x < 0) 4385 anchor.x = kernelX.cols >> 1; 4386 if (anchor.y < 0) 4387 anchor.y = kernelY.cols >> 1; 4388 4389 int rtype = getKernelType(kernelX, 4390 kernelX.rows == 1 ? Point(anchor.x, 0) : Point(0, anchor.x)); 4391 int ctype = getKernelType(kernelY, 4392 kernelY.rows == 1 ? Point(anchor.y, 0) : Point(0, anchor.y)); 4393 4394 int bdepth = CV_32F; 4395 bool int_arithm = false; 4396 if( sdepth == CV_8U && ddepth == CV_8U && 4397 rtype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL && 4398 ctype == KERNEL_SMOOTH+KERNEL_SYMMETRICAL) 4399 { 4400 if (ocl::Device::getDefault().isIntel()) 4401 { 4402 for (int i=0; i<kernelX.cols; i++) 4403 kernelX.at<float>(0, i) = (float) cvRound(kernelX.at<float>(0, i) * (1 << shift_bits)); 4404 if (kernelX.data != kernelY.data) 4405 for (int i=0; i<kernelX.cols; i++) 4406 kernelY.at<float>(0, i) = (float) cvRound(kernelY.at<float>(0, i) * (1 << shift_bits)); 4407 } else 4408 { 4409 bdepth = CV_32S; 4410 kernelX.convertTo( kernelX, bdepth, 1 << shift_bits ); 4411 kernelY.convertTo( kernelY, bdepth, 1 << shift_bits ); 4412 } 4413 int_arithm = true; 4414 } 4415 4416 CV_OCL_RUN_(kernelY.cols <= 21 && kernelX.cols <= 21 && 4417 imgSize.width > optimizedSepFilterLocalWidth + anchor.x && 4418 imgSize.height > optimizedSepFilterLocalHeight + anchor.y && 4419 (!(borderType & BORDER_ISOLATED) || _src.offset() == 0) && 4420 anchor == Point(kernelX.cols >> 1, kernelY.cols >> 1) && 4421 (d.isIntel() || (d.isAMD() && !d.hostUnifiedMemory())), 4422 ocl_sepFilter2D_SinglePass(_src, _dst, kernelX, kernelY, delta, 4423 borderType & ~BORDER_ISOLATED, ddepth, bdepth, int_arithm), true) 4424 4425 UMat src = _src.getUMat(); 4426 Size srcWholeSize; Point srcOffset; 4427 src.locateROI(srcWholeSize, srcOffset); 4428 4429 bool fast8uc1 = type == CV_8UC1 && srcOffset.x % 4 == 0 && 4430 src.cols % 4 == 0 && src.step % 4 == 0; 4431 4432 Size srcSize = src.size(); 4433 Size bufSize(srcSize.width, srcSize.height + kernelY.cols - 1); 4434 UMat buf(bufSize, CV_MAKETYPE(bdepth, cn)); 4435 if (!ocl_sepRowFilter2D(src, buf, kernelX, anchor.x, borderType, ddepth, fast8uc1, int_arithm)) 4436 return false; 4437 4438 _dst.create(srcSize, CV_MAKETYPE(ddepth, cn)); 4439 UMat dst = _dst.getUMat(); 4440 4441 return ocl_sepColFilter2D(buf, dst, kernelY, delta, anchor.y, int_arithm); 4442 } 4443 4444 #endif 4445 4446 } 4447 4448 cv::Ptr<cv::BaseFilter> cv::getLinearFilter(int srcType, int dstType, 4449 InputArray filter_kernel, Point anchor, 4450 double delta, int bits) 4451 { 4452 Mat _kernel = filter_kernel.getMat(); 4453 int sdepth = CV_MAT_DEPTH(srcType), ddepth = CV_MAT_DEPTH(dstType); 4454 int cn = CV_MAT_CN(srcType), kdepth = _kernel.depth(); 4455 CV_Assert( cn == CV_MAT_CN(dstType) && ddepth >= sdepth ); 4456 4457 anchor = normalizeAnchor(anchor, _kernel.size()); 4458 4459 /*if( sdepth == CV_8U && ddepth == CV_8U && kdepth == CV_32S ) 4460 return makePtr<Filter2D<uchar, FixedPtCastEx<int, uchar>, FilterVec_8u> > 4461 (_kernel, anchor, delta, FixedPtCastEx<int, uchar>(bits), 4462 FilterVec_8u(_kernel, bits, delta)); 4463 if( sdepth == CV_8U && ddepth == CV_16S && kdepth == CV_32S ) 4464 return makePtr<Filter2D<uchar, FixedPtCastEx<int, short>, FilterVec_8u16s> > 4465 (_kernel, anchor, delta, FixedPtCastEx<int, short>(bits), 4466 FilterVec_8u16s(_kernel, bits, delta));*/ 4467 4468 kdepth = sdepth == CV_64F || ddepth == CV_64F ? CV_64F : CV_32F; 4469 Mat kernel; 4470 if( _kernel.type() == kdepth ) 4471 kernel = _kernel; 4472 else 4473 _kernel.convertTo(kernel, kdepth, _kernel.type() == CV_32S ? 1./(1 << bits) : 1.); 4474 4475 if( sdepth == CV_8U && ddepth == CV_8U ) 4476 return makePtr<Filter2D<uchar, Cast<float, uchar>, FilterVec_8u> > 4477 (kernel, anchor, delta, Cast<float, uchar>(), FilterVec_8u(kernel, 0, delta)); 4478 if( sdepth == CV_8U && ddepth == CV_16U ) 4479 return makePtr<Filter2D<uchar, 4480 Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta); 4481 if( sdepth == CV_8U && ddepth == CV_16S ) 4482 return makePtr<Filter2D<uchar, Cast<float, short>, FilterVec_8u16s> > 4483 (kernel, anchor, delta, Cast<float, short>(), FilterVec_8u16s(kernel, 0, delta)); 4484 if( sdepth == CV_8U && ddepth == CV_32F ) 4485 return makePtr<Filter2D<uchar, 4486 Cast<float, float>, FilterNoVec> >(kernel, anchor, delta); 4487 if( sdepth == CV_8U && ddepth == CV_64F ) 4488 return makePtr<Filter2D<uchar, 4489 Cast<double, double>, FilterNoVec> >(kernel, anchor, delta); 4490 4491 if( sdepth == CV_16U && ddepth == CV_16U ) 4492 return makePtr<Filter2D<ushort, 4493 Cast<float, ushort>, FilterNoVec> >(kernel, anchor, delta); 4494 if( sdepth == CV_16U && ddepth == CV_32F ) 4495 return makePtr<Filter2D<ushort, 4496 Cast<float, float>, FilterNoVec> >(kernel, anchor, delta); 4497 if( sdepth == CV_16U && ddepth == CV_64F ) 4498 return makePtr<Filter2D<ushort, 4499 Cast<double, double>, FilterNoVec> >(kernel, anchor, delta); 4500 4501 if( sdepth == CV_16S && ddepth == CV_16S ) 4502 return makePtr<Filter2D<short, 4503 Cast<float, short>, FilterNoVec> >(kernel, anchor, delta); 4504 if( sdepth == CV_16S && ddepth == CV_32F ) 4505 return makePtr<Filter2D<short, 4506 Cast<float, float>, FilterNoVec> >(kernel, anchor, delta); 4507 if( sdepth == CV_16S && ddepth == CV_64F ) 4508 return makePtr<Filter2D<short, 4509 Cast<double, double>, FilterNoVec> >(kernel, anchor, delta); 4510 4511 if( sdepth == CV_32F && ddepth == CV_32F ) 4512 return makePtr<Filter2D<float, Cast<float, float>, FilterVec_32f> > 4513 (kernel, anchor, delta, Cast<float, float>(), FilterVec_32f(kernel, 0, delta)); 4514 if( sdepth == CV_64F && ddepth == CV_64F ) 4515 return makePtr<Filter2D<double, 4516 Cast<double, double>, FilterNoVec> >(kernel, anchor, delta); 4517 4518 CV_Error_( CV_StsNotImplemented, 4519 ("Unsupported combination of source format (=%d), and destination format (=%d)", 4520 srcType, dstType)); 4521 4522 return Ptr<BaseFilter>(); 4523 } 4524 4525 4526 cv::Ptr<cv::FilterEngine> cv::createLinearFilter( int _srcType, int _dstType, 4527 InputArray filter_kernel, 4528 Point _anchor, double _delta, 4529 int _rowBorderType, int _columnBorderType, 4530 const Scalar& _borderValue ) 4531 { 4532 Mat _kernel = filter_kernel.getMat(); 4533 _srcType = CV_MAT_TYPE(_srcType); 4534 _dstType = CV_MAT_TYPE(_dstType); 4535 int cn = CV_MAT_CN(_srcType); 4536 CV_Assert( cn == CV_MAT_CN(_dstType) ); 4537 4538 Mat kernel = _kernel; 4539 int bits = 0; 4540 4541 /*int sdepth = CV_MAT_DEPTH(_srcType), ddepth = CV_MAT_DEPTH(_dstType); 4542 int ktype = _kernel.depth() == CV_32S ? KERNEL_INTEGER : getKernelType(_kernel, _anchor); 4543 if( sdepth == CV_8U && (ddepth == CV_8U || ddepth == CV_16S) && 4544 _kernel.rows*_kernel.cols <= (1 << 10) ) 4545 { 4546 bits = (ktype & KERNEL_INTEGER) ? 0 : 11; 4547 _kernel.convertTo(kernel, CV_32S, 1 << bits); 4548 }*/ 4549 4550 Ptr<BaseFilter> _filter2D = getLinearFilter(_srcType, _dstType, 4551 kernel, _anchor, _delta, bits); 4552 4553 return makePtr<FilterEngine>(_filter2D, Ptr<BaseRowFilter>(), 4554 Ptr<BaseColumnFilter>(), _srcType, _dstType, _srcType, 4555 _rowBorderType, _columnBorderType, _borderValue ); 4556 } 4557 4558 4559 void cv::filter2D( InputArray _src, OutputArray _dst, int ddepth, 4560 InputArray _kernel, Point anchor0, 4561 double delta, int borderType ) 4562 { 4563 CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, 4564 ocl_filter2D(_src, _dst, ddepth, _kernel, anchor0, delta, borderType)) 4565 4566 Mat src = _src.getMat(), kernel = _kernel.getMat(); 4567 4568 if( ddepth < 0 ) 4569 ddepth = src.depth(); 4570 4571 #if CV_SSE2 4572 int dft_filter_size = ((src.depth() == CV_8U && (ddepth == CV_8U || ddepth == CV_16S)) || 4573 (src.depth() == CV_32F && ddepth == CV_32F)) && checkHardwareSupport(CV_CPU_SSE3)? 130 : 50; 4574 #else 4575 int dft_filter_size = 50; 4576 #endif 4577 4578 _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); 4579 Mat dst = _dst.getMat(); 4580 Point anchor = normalizeAnchor(anchor0, kernel.size()); 4581 4582 #if IPP_VERSION_X100 > 0 && !defined HAVE_IPP_ICV_ONLY 4583 CV_IPP_CHECK() 4584 { 4585 typedef IppStatus (CV_STDCALL * ippiFilterBorder)(const void * pSrc, int srcStep, void * pDst, int dstStep, IppiSize dstRoiSize, 4586 IppiBorderType border, const void * borderValue, 4587 const IppiFilterBorderSpec* pSpec, Ipp8u* pBuffer); 4588 4589 int stype = src.type(), sdepth = CV_MAT_DEPTH(stype), cn = CV_MAT_CN(stype), 4590 ktype = kernel.type(), kdepth = CV_MAT_DEPTH(ktype); 4591 bool isolated = (borderType & BORDER_ISOLATED) != 0; 4592 Point ippAnchor(kernel.cols >> 1, kernel.rows >> 1); 4593 int borderTypeNI = borderType & ~BORDER_ISOLATED; 4594 IppiBorderType ippBorderType = ippiGetBorderType(borderTypeNI); 4595 4596 if (borderTypeNI == BORDER_CONSTANT || borderTypeNI == BORDER_REPLICATE) 4597 { 4598 ippiFilterBorder ippFunc = 4599 stype == CV_8UC1 ? (ippiFilterBorder)ippiFilterBorder_8u_C1R : 4600 stype == CV_8UC3 ? (ippiFilterBorder)ippiFilterBorder_8u_C3R : 4601 stype == CV_8UC4 ? (ippiFilterBorder)ippiFilterBorder_8u_C4R : 4602 stype == CV_16UC1 ? (ippiFilterBorder)ippiFilterBorder_16u_C1R : 4603 stype == CV_16UC3 ? (ippiFilterBorder)ippiFilterBorder_16u_C3R : 4604 stype == CV_16UC4 ? (ippiFilterBorder)ippiFilterBorder_16u_C4R : 4605 stype == CV_16SC1 ? (ippiFilterBorder)ippiFilterBorder_16s_C1R : 4606 stype == CV_16SC3 ? (ippiFilterBorder)ippiFilterBorder_16s_C3R : 4607 stype == CV_16SC4 ? (ippiFilterBorder)ippiFilterBorder_16s_C4R : 4608 stype == CV_32FC1 ? (ippiFilterBorder)ippiFilterBorder_32f_C1R : 4609 stype == CV_32FC3 ? (ippiFilterBorder)ippiFilterBorder_32f_C3R : 4610 stype == CV_32FC4 ? (ippiFilterBorder)ippiFilterBorder_32f_C4R : 0; 4611 4612 if (sdepth == ddepth && (ktype == CV_16SC1 || ktype == CV_32FC1) && 4613 ippFunc && (int)ippBorderType >= 0 && (!src.isSubmatrix() || isolated) && 4614 std::fabs(delta - 0) < DBL_EPSILON && ippAnchor == anchor && dst.data != src.data) 4615 { 4616 IppiSize kernelSize = { kernel.cols, kernel.rows }, dstRoiSize = { dst.cols, dst.rows }; 4617 IppDataType dataType = ippiGetDataType(ddepth), kernelType = ippiGetDataType(kdepth); 4618 Ipp32s specSize = 0, bufsize = 0; 4619 IppStatus status = (IppStatus)-1; 4620 4621 if ((status = ippiFilterBorderGetSize(kernelSize, dstRoiSize, dataType, kernelType, cn, &specSize, &bufsize)) >= 0) 4622 { 4623 IppiFilterBorderSpec * spec = (IppiFilterBorderSpec *)ippMalloc(specSize); 4624 Ipp8u * buffer = ippsMalloc_8u(bufsize); 4625 Ipp32f borderValue[4] = { 0, 0, 0, 0 }; 4626 4627 Mat reversedKernel; 4628 flip(kernel, reversedKernel, -1); 4629 4630 if ((kdepth == CV_32F && (status = ippiFilterBorderInit_32f((const Ipp32f *)reversedKernel.data, kernelSize, 4631 dataType, cn, ippRndFinancial, spec)) >= 0 ) || 4632 (kdepth == CV_16S && (status = ippiFilterBorderInit_16s((const Ipp16s *)reversedKernel.data, 4633 kernelSize, 0, dataType, cn, ippRndFinancial, spec)) >= 0)) 4634 { 4635 status = ippFunc(src.data, (int)src.step, dst.data, (int)dst.step, dstRoiSize, 4636 ippBorderType, borderValue, spec, buffer); 4637 } 4638 4639 ippsFree(buffer); 4640 ippsFree(spec); 4641 } 4642 4643 if (status >= 0) 4644 { 4645 CV_IMPL_ADD(CV_IMPL_IPP); 4646 return; 4647 } 4648 setIppErrorStatus(); 4649 } 4650 } 4651 } 4652 #endif 4653 4654 #ifdef HAVE_TEGRA_OPTIMIZATION 4655 if( tegra::useTegra() && tegra::filter2D(src, dst, kernel, anchor, delta, borderType) ) 4656 return; 4657 #endif 4658 4659 if( kernel.cols*kernel.rows >= dft_filter_size ) 4660 { 4661 Mat temp; 4662 // crossCorr doesn't accept non-zero delta with multiple channels 4663 if( src.channels() != 1 && delta != 0 ) 4664 { 4665 // The semantics of filter2D require that the delta be applied 4666 // as floating-point math. So wee need an intermediate Mat 4667 // with a float datatype. If the dest is already floats, 4668 // we just use that. 4669 int corrDepth = dst.depth(); 4670 if( (dst.depth() == CV_32F || dst.depth() == CV_64F) && 4671 src.data != dst.data ) 4672 { 4673 temp = dst; 4674 } 4675 else 4676 { 4677 corrDepth = dst.depth() == CV_64F ? CV_64F : CV_32F; 4678 temp.create( dst.size(), CV_MAKETYPE(corrDepth, dst.channels()) ); 4679 } 4680 crossCorr( src, kernel, temp, src.size(), 4681 CV_MAKETYPE(corrDepth, src.channels()), 4682 anchor, 0, borderType ); 4683 add( temp, delta, temp ); 4684 if ( temp.data != dst.data ) 4685 { 4686 temp.convertTo( dst, dst.type() ); 4687 } 4688 } 4689 else 4690 { 4691 if( src.data != dst.data ) 4692 temp = dst; 4693 else 4694 temp.create(dst.size(), dst.type()); 4695 crossCorr( src, kernel, temp, src.size(), 4696 CV_MAKETYPE(ddepth, src.channels()), 4697 anchor, delta, borderType ); 4698 if( temp.data != dst.data ) 4699 temp.copyTo(dst); 4700 } 4701 return; 4702 } 4703 4704 Ptr<FilterEngine> f = createLinearFilter(src.type(), dst.type(), kernel, 4705 anchor, delta, borderType & ~BORDER_ISOLATED ); 4706 f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 ); 4707 } 4708 4709 4710 void cv::sepFilter2D( InputArray _src, OutputArray _dst, int ddepth, 4711 InputArray _kernelX, InputArray _kernelY, Point anchor, 4712 double delta, int borderType ) 4713 { 4714 CV_OCL_RUN(_dst.isUMat() && _src.dims() <= 2, 4715 ocl_sepFilter2D(_src, _dst, ddepth, _kernelX, _kernelY, anchor, delta, borderType)) 4716 4717 Mat src = _src.getMat(), kernelX = _kernelX.getMat(), kernelY = _kernelY.getMat(); 4718 4719 if( ddepth < 0 ) 4720 ddepth = src.depth(); 4721 4722 _dst.create( src.size(), CV_MAKETYPE(ddepth, src.channels()) ); 4723 Mat dst = _dst.getMat(); 4724 4725 Ptr<FilterEngine> f = createSeparableLinearFilter(src.type(), 4726 dst.type(), kernelX, kernelY, anchor, delta, borderType & ~BORDER_ISOLATED ); 4727 f->apply(src, dst, Rect(0,0,-1,-1), Point(), (borderType & BORDER_ISOLATED) != 0 ); 4728 } 4729 4730 4731 CV_IMPL void 4732 cvFilter2D( const CvArr* srcarr, CvArr* dstarr, const CvMat* _kernel, CvPoint anchor ) 4733 { 4734 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 4735 cv::Mat kernel = cv::cvarrToMat(_kernel); 4736 4737 CV_Assert( src.size() == dst.size() && src.channels() == dst.channels() ); 4738 4739 cv::filter2D( src, dst, dst.depth(), kernel, anchor, 0, cv::BORDER_REPLICATE ); 4740 } 4741 4742 /* End of file. */ 4743