/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
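//
//  This part of the file implements image resizing: the interpolation
//  coefficient tables (linear, cubic, Lanczos4), nearest-neighbour resize,
//  the separable horizontal/vertical resize kernels, and the fast 2x2
//  area-averaging path, with optional SSE2/SSE4.1/NEON vector code and
//  IPP-based helpers.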
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
static IppStatus sts = ippInit();
#endif

namespace cv
{
#if IPP_VERSION_X100 >= 701
typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) && 0
typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

template <int channels, typename Type>
bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
{
    Type values[channels];
    for( int i = 0; i < channels; i++ )
        values[i] = saturate_cast<Type>(value[i]);
    return func(values, dataPointer, step, size) >= 0;
}

static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
{
    if( channels == 1 )
    {
        switch( depth )
        {
        case CV_8U:
            return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
        case CV_16U:
            return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
        case CV_32F:
            return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
        }
    }
    else
    {
        if( channels == 3 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
            case CV_16U:
                return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
            case CV_32F:
                return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
            }
        }
        else if( channels == 4 )
        {
            switch( depth )
            {
            case CV_8U:
                return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
            case CV_16U:
                return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
            case CV_32F:
                return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
            }
        }
    }
    return false;
}
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float
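// Worked example (added for illustration): interpolateCubic() below evaluates the
// cubic-convolution kernel with A = -0.75. For a fractional offset x = 0.5 it
// yields coeffs = { -0.09375, 0.59375, 0.59375, -0.09375 }, which sum to 1; the
// slightly negative outer taps are what give cubic resampling its sharpening.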
Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];

static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}

static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}

static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}


static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] >
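// Worked example (added for illustration, assuming INTER_TAB_SIZE == 32): the
// fixed-point tables store each float weight scaled by INTER_REMAP_COEF_SCALE
// (1 << 15 = 32768). For a fractional offset of 5/32 in x and 0 in y,
// interpolateLinear() gives 1-D weights (27/32, 5/32) and (1, 0), so the 2x2
// bilinear entry becomes { 27648, 5120, 0, 0 } (27/32*32768 = 27648,
// 5/32*32768 = 5120); the adjustment block here nudges the central entries only
// when rounding makes the integer weights not sum to exactly 32768.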
itab[Mk1*ksize+Mk2] ) 263 Mk1 = k1, Mk2 = k2; 264 } 265 if( diff < 0 ) 266 itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff); 267 else 268 itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff); 269 } 270 } 271 tab -= INTER_TAB_SIZE2*ksize*ksize; 272 itab -= INTER_TAB_SIZE2*ksize*ksize; 273 #if CV_SSE2 || CV_NEON 274 if( method == INTER_LINEAR ) 275 { 276 for( i = 0; i < INTER_TAB_SIZE2; i++ ) 277 for( j = 0; j < 4; j++ ) 278 { 279 BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0]; 280 BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1]; 281 BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0]; 282 BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1]; 283 } 284 } 285 #endif 286 inittab[method] = true; 287 } 288 return fixpt ? (const void*)itab : (const void*)tab; 289 } 290 291 #ifndef __MINGW32__ 292 static bool initAllInterTab2D() 293 { 294 return initInterTab2D( INTER_LINEAR, false ) && 295 initInterTab2D( INTER_LINEAR, true ) && 296 initInterTab2D( INTER_CUBIC, false ) && 297 initInterTab2D( INTER_CUBIC, true ) && 298 initInterTab2D( INTER_LANCZOS4, false ) && 299 initInterTab2D( INTER_LANCZOS4, true ); 300 } 301 302 static volatile bool doInitAllInterTab2D = initAllInterTab2D(); 303 #endif 304 305 template<typename ST, typename DT> struct Cast 306 { 307 typedef ST type1; 308 typedef DT rtype; 309 310 DT operator()(ST val) const { return saturate_cast<DT>(val); } 311 }; 312 313 template<typename ST, typename DT, int bits> struct FixedPtCast 314 { 315 typedef ST type1; 316 typedef DT rtype; 317 enum { SHIFT = bits, DELTA = 1 << (bits-1) }; 318 319 DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); } 320 }; 321 322 /****************************************************************************************\ 323 * Resize * 324 \****************************************************************************************/ 325 326 class resizeNNInvoker : 327 public ParallelLoopBody 328 { 329 public: 330 resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) : 331 ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4), 332 ify(_ify) 333 { 334 } 335 336 virtual void operator() (const Range& range) const 337 { 338 Size ssize = src.size(), dsize = dst.size(); 339 int y, x, pix_size = (int)src.elemSize(); 340 341 for( y = range.start; y < range.end; y++ ) 342 { 343 uchar* D = dst.data + dst.step*y; 344 int sy = std::min(cvFloor(y*ify), ssize.height-1); 345 const uchar* S = src.ptr(sy); 346 347 switch( pix_size ) 348 { 349 case 1: 350 for( x = 0; x <= dsize.width - 2; x += 2 ) 351 { 352 uchar t0 = S[x_ofs[x]]; 353 uchar t1 = S[x_ofs[x+1]]; 354 D[x] = t0; 355 D[x+1] = t1; 356 } 357 358 for( ; x < dsize.width; x++ ) 359 D[x] = S[x_ofs[x]]; 360 break; 361 case 2: 362 for( x = 0; x < dsize.width; x++ ) 363 *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]); 364 break; 365 case 3: 366 for( x = 0; x < dsize.width; x++, D += 3 ) 367 { 368 const uchar* _tS = S + x_ofs[x]; 369 D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2]; 370 } 371 break; 372 case 4: 373 for( x = 0; x < dsize.width; x++ ) 374 *(int*)(D + x*4) = *(int*)(S + x_ofs[x]); 375 break; 376 case 6: 377 for( x = 0; x < dsize.width; x++, D += 6 ) 378 { 379 const ushort* _tS = (const ushort*)(S + x_ofs[x]); 380 ushort* _tD = (ushort*)D; 381 _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; 382 } 383 break; 384 case 8: 385 for( x = 0; x < dsize.width; x++, D += 8 ) 386 { 387 const int* _tS = (const int*)(S + x_ofs[x]); 388 int* _tD = (int*)D; 389 _tD[0] = 
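// Added note: x_ofs[] holds precomputed source byte offsets,
// x_ofs[x] = min(cvFloor(x*ifx), ssize.width-1) * pix_size, so every destination
// pixel is copied from its nearest source pixel without any per-pixel arithmetic.
// For example, a 2x horizontal upscale (fx = 2, ifx = 0.5) of a 1-byte/pixel image
// gives x_ofs = { 0, 0, 1, 1, 2, 2, ... }.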
_tS[0]; _tD[1] = _tS[1]; 390 } 391 break; 392 case 12: 393 for( x = 0; x < dsize.width; x++, D += 12 ) 394 { 395 const int* _tS = (const int*)(S + x_ofs[x]); 396 int* _tD = (int*)D; 397 _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2]; 398 } 399 break; 400 default: 401 for( x = 0; x < dsize.width; x++, D += pix_size ) 402 { 403 const int* _tS = (const int*)(S + x_ofs[x]); 404 int* _tD = (int*)D; 405 for( int k = 0; k < pix_size4; k++ ) 406 _tD[k] = _tS[k]; 407 } 408 } 409 } 410 } 411 412 private: 413 const Mat src; 414 Mat dst; 415 int* x_ofs, pix_size4; 416 double ify; 417 418 resizeNNInvoker(const resizeNNInvoker&); 419 resizeNNInvoker& operator=(const resizeNNInvoker&); 420 }; 421 422 static void 423 resizeNN( const Mat& src, Mat& dst, double fx, double fy ) 424 { 425 Size ssize = src.size(), dsize = dst.size(); 426 AutoBuffer<int> _x_ofs(dsize.width); 427 int* x_ofs = _x_ofs; 428 int pix_size = (int)src.elemSize(); 429 int pix_size4 = (int)(pix_size / sizeof(int)); 430 double ifx = 1./fx, ify = 1./fy; 431 int x; 432 433 for( x = 0; x < dsize.width; x++ ) 434 { 435 int sx = cvFloor(x*ifx); 436 x_ofs[x] = std::min(sx, ssize.width-1)*pix_size; 437 } 438 439 Range range(0, dsize.height); 440 resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify); 441 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 442 } 443 444 445 struct VResizeNoVec 446 { 447 int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; } 448 }; 449 450 struct HResizeNoVec 451 { 452 int operator()(const uchar**, uchar**, int, const int*, 453 const uchar*, int, int, int, int, int) const { return 0; } 454 }; 455 456 #if CV_SSE2 457 458 struct VResizeLinearVec_32s8u 459 { 460 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const 461 { 462 if( !checkHardwareSupport(CV_CPU_SSE2) ) 463 return 0; 464 465 const int** src = (const int**)_src; 466 const short* beta = (const short*)_beta; 467 const int *S0 = src[0], *S1 = src[1]; 468 int x = 0; 469 __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]); 470 __m128i delta = _mm_set1_epi16(2); 471 472 if( (((size_t)S0|(size_t)S1)&15) == 0 ) 473 for( ; x <= width - 16; x += 16 ) 474 { 475 __m128i x0, x1, x2, y0, y1, y2; 476 x0 = _mm_load_si128((const __m128i*)(S0 + x)); 477 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); 478 y0 = _mm_load_si128((const __m128i*)(S1 + x)); 479 y1 = _mm_load_si128((const __m128i*)(S1 + x + 4)); 480 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); 481 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); 482 483 x1 = _mm_load_si128((const __m128i*)(S0 + x + 8)); 484 x2 = _mm_load_si128((const __m128i*)(S0 + x + 12)); 485 y1 = _mm_load_si128((const __m128i*)(S1 + x + 8)); 486 y2 = _mm_load_si128((const __m128i*)(S1 + x + 12)); 487 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); 488 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); 489 490 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); 491 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); 492 493 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); 494 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); 495 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); 496 } 497 else 498 for( ; x <= width - 16; x += 16 ) 499 { 500 __m128i x0, x1, x2, y0, y1, y2; 501 x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); 502 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); 503 y0 = _mm_loadu_si128((const __m128i*)(S1 
+ x)); 504 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); 505 x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4)); 506 y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4)); 507 508 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8)); 509 x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12)); 510 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8)); 511 y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12)); 512 x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4)); 513 y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4)); 514 515 x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 )); 516 x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 )); 517 518 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); 519 x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2); 520 _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1)); 521 } 522 523 for( ; x < width - 4; x += 4 ) 524 { 525 __m128i x0, y0; 526 x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4); 527 y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4); 528 x0 = _mm_packs_epi32(x0, x0); 529 y0 = _mm_packs_epi32(y0, y0); 530 x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1)); 531 x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2); 532 x0 = _mm_packus_epi16(x0, x0); 533 *(int*)(dst + x) = _mm_cvtsi128_si32(x0); 534 } 535 536 return x; 537 } 538 }; 539 540 541 template<int shiftval> struct VResizeLinearVec_32f16 542 { 543 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 544 { 545 if( !checkHardwareSupport(CV_CPU_SSE2) ) 546 return 0; 547 548 const float** src = (const float**)_src; 549 const float* beta = (const float*)_beta; 550 const float *S0 = src[0], *S1 = src[1]; 551 ushort* dst = (ushort*)_dst; 552 int x = 0; 553 554 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); 555 __m128i preshift = _mm_set1_epi32(shiftval); 556 __m128i postshift = _mm_set1_epi16((short)shiftval); 557 558 if( (((size_t)S0|(size_t)S1)&15) == 0 ) 559 for( ; x <= width - 16; x += 16 ) 560 { 561 __m128 x0, x1, y0, y1; 562 __m128i t0, t1, t2; 563 x0 = _mm_load_ps(S0 + x); 564 x1 = _mm_load_ps(S0 + x + 4); 565 y0 = _mm_load_ps(S1 + x); 566 y1 = _mm_load_ps(S1 + x + 4); 567 568 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 569 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 570 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 571 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 572 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); 573 574 x0 = _mm_load_ps(S0 + x + 8); 575 x1 = _mm_load_ps(S0 + x + 12); 576 y0 = _mm_load_ps(S1 + x + 8); 577 y1 = _mm_load_ps(S1 + x + 12); 578 579 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 580 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 581 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 582 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 583 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); 584 585 _mm_storeu_si128( (__m128i*)(dst + x), t0); 586 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); 587 } 588 else 589 for( ; x <= width - 16; x += 16 ) 590 { 591 __m128 x0, x1, y0, y1; 592 __m128i t0, t1, t2; 593 x0 = _mm_loadu_ps(S0 + x); 594 x1 = _mm_loadu_ps(S0 + x + 4); 595 y0 = _mm_loadu_ps(S1 + x); 596 y1 = _mm_loadu_ps(S1 + x + 4); 597 598 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 599 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 600 t0 = 
_mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 601 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 602 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift); 603 604 x0 = _mm_loadu_ps(S0 + x + 8); 605 x1 = _mm_loadu_ps(S0 + x + 12); 606 y0 = _mm_loadu_ps(S1 + x + 8); 607 y1 = _mm_loadu_ps(S1 + x + 12); 608 609 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 610 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 611 t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 612 t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift); 613 t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift); 614 615 _mm_storeu_si128( (__m128i*)(dst + x), t0); 616 _mm_storeu_si128( (__m128i*)(dst + x + 8), t1); 617 } 618 619 for( ; x < width - 4; x += 4 ) 620 { 621 __m128 x0, y0; 622 __m128i t0; 623 x0 = _mm_loadu_ps(S0 + x); 624 y0 = _mm_loadu_ps(S1 + x); 625 626 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 627 t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift); 628 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift); 629 _mm_storel_epi64( (__m128i*)(dst + x), t0); 630 } 631 632 return x; 633 } 634 }; 635 636 typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u; 637 typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s; 638 639 struct VResizeLinearVec_32f 640 { 641 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 642 { 643 if( !checkHardwareSupport(CV_CPU_SSE) ) 644 return 0; 645 646 const float** src = (const float**)_src; 647 const float* beta = (const float*)_beta; 648 const float *S0 = src[0], *S1 = src[1]; 649 float* dst = (float*)_dst; 650 int x = 0; 651 652 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]); 653 654 if( (((size_t)S0|(size_t)S1)&15) == 0 ) 655 for( ; x <= width - 8; x += 8 ) 656 { 657 __m128 x0, x1, y0, y1; 658 x0 = _mm_load_ps(S0 + x); 659 x1 = _mm_load_ps(S0 + x + 4); 660 y0 = _mm_load_ps(S1 + x); 661 y1 = _mm_load_ps(S1 + x + 4); 662 663 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 664 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 665 666 _mm_storeu_ps( dst + x, x0); 667 _mm_storeu_ps( dst + x + 4, x1); 668 } 669 else 670 for( ; x <= width - 8; x += 8 ) 671 { 672 __m128 x0, x1, y0, y1; 673 x0 = _mm_loadu_ps(S0 + x); 674 x1 = _mm_loadu_ps(S0 + x + 4); 675 y0 = _mm_loadu_ps(S1 + x); 676 y1 = _mm_loadu_ps(S1 + x + 4); 677 678 x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1)); 679 x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1)); 680 681 _mm_storeu_ps( dst + x, x0); 682 _mm_storeu_ps( dst + x + 4, x1); 683 } 684 685 return x; 686 } 687 }; 688 689 690 struct VResizeCubicVec_32s8u 691 { 692 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const 693 { 694 if( !checkHardwareSupport(CV_CPU_SSE2) ) 695 return 0; 696 697 const int** src = (const int**)_src; 698 const short* beta = (const short*)_beta; 699 const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 700 int x = 0; 701 float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE); 702 __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale), 703 b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale); 704 705 if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 ) 706 for( ; x <= width - 8; x += 8 ) 707 { 708 __m128i x0, x1, y0, y1; 709 __m128 s0, s1, f0, f1; 710 x0 = _mm_load_si128((const __m128i*)(S0 + x)); 711 x1 = _mm_load_si128((const __m128i*)(S0 + x + 4)); 712 y0 = _mm_load_si128((const __m128i*)(S1 + x)); 713 y1 = _mm_load_si128((const 
__m128i*)(S1 + x + 4)); 714 715 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); 716 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); 717 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); 718 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); 719 s0 = _mm_add_ps(s0, f0); 720 s1 = _mm_add_ps(s1, f1); 721 722 x0 = _mm_load_si128((const __m128i*)(S2 + x)); 723 x1 = _mm_load_si128((const __m128i*)(S2 + x + 4)); 724 y0 = _mm_load_si128((const __m128i*)(S3 + x)); 725 y1 = _mm_load_si128((const __m128i*)(S3 + x + 4)); 726 727 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); 728 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); 729 s0 = _mm_add_ps(s0, f0); 730 s1 = _mm_add_ps(s1, f1); 731 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); 732 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); 733 s0 = _mm_add_ps(s0, f0); 734 s1 = _mm_add_ps(s1, f1); 735 736 x0 = _mm_cvtps_epi32(s0); 737 x1 = _mm_cvtps_epi32(s1); 738 739 x0 = _mm_packs_epi32(x0, x1); 740 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); 741 } 742 else 743 for( ; x <= width - 8; x += 8 ) 744 { 745 __m128i x0, x1, y0, y1; 746 __m128 s0, s1, f0, f1; 747 x0 = _mm_loadu_si128((const __m128i*)(S0 + x)); 748 x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4)); 749 y0 = _mm_loadu_si128((const __m128i*)(S1 + x)); 750 y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4)); 751 752 s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0); 753 s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0); 754 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1); 755 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1); 756 s0 = _mm_add_ps(s0, f0); 757 s1 = _mm_add_ps(s1, f1); 758 759 x0 = _mm_loadu_si128((const __m128i*)(S2 + x)); 760 x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4)); 761 y0 = _mm_loadu_si128((const __m128i*)(S3 + x)); 762 y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4)); 763 764 f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2); 765 f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2); 766 s0 = _mm_add_ps(s0, f0); 767 s1 = _mm_add_ps(s1, f1); 768 f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3); 769 f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3); 770 s0 = _mm_add_ps(s0, f0); 771 s1 = _mm_add_ps(s1, f1); 772 773 x0 = _mm_cvtps_epi32(s0); 774 x1 = _mm_cvtps_epi32(s1); 775 776 x0 = _mm_packs_epi32(x0, x1); 777 _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0)); 778 } 779 780 return x; 781 } 782 }; 783 784 785 template<int shiftval> struct VResizeCubicVec_32f16 786 { 787 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 788 { 789 if( !checkHardwareSupport(CV_CPU_SSE2) ) 790 return 0; 791 792 const float** src = (const float**)_src; 793 const float* beta = (const float*)_beta; 794 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 795 ushort* dst = (ushort*)_dst; 796 int x = 0; 797 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), 798 b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); 799 __m128i preshift = _mm_set1_epi32(shiftval); 800 __m128i postshift = _mm_set1_epi16((short)shiftval); 801 802 for( ; x <= width - 8; x += 8 ) 803 { 804 __m128 x0, x1, y0, y1, s0, s1; 805 __m128i t0, t1; 806 x0 = _mm_loadu_ps(S0 + x); 807 x1 = _mm_loadu_ps(S0 + x + 4); 808 y0 = _mm_loadu_ps(S1 + x); 809 y1 = _mm_loadu_ps(S1 + x + 4); 810 811 s0 = _mm_mul_ps(x0, b0); 812 s1 = _mm_mul_ps(x1, b0); 813 y0 = _mm_mul_ps(y0, b1); 814 y1 = _mm_mul_ps(y1, b1); 815 s0 = _mm_add_ps(s0, y0); 816 s1 = _mm_add_ps(s1, y1); 817 818 x0 = _mm_loadu_ps(S2 + x); 819 x1 = _mm_loadu_ps(S2 + x + 4); 820 y0 = _mm_loadu_ps(S3 + x); 821 y1 = _mm_loadu_ps(S3 + x + 4); 822 823 x0 = _mm_mul_ps(x0, b2); 824 x1 = _mm_mul_ps(x1, b2); 825 y0 = 
_mm_mul_ps(y0, b3); 826 y1 = _mm_mul_ps(y1, b3); 827 s0 = _mm_add_ps(s0, x0); 828 s1 = _mm_add_ps(s1, x1); 829 s0 = _mm_add_ps(s0, y0); 830 s1 = _mm_add_ps(s1, y1); 831 832 t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift); 833 t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift); 834 835 t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift); 836 _mm_storeu_si128( (__m128i*)(dst + x), t0); 837 } 838 839 return x; 840 } 841 }; 842 843 typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u; 844 typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s; 845 846 struct VResizeCubicVec_32f 847 { 848 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 849 { 850 if( !checkHardwareSupport(CV_CPU_SSE) ) 851 return 0; 852 853 const float** src = (const float**)_src; 854 const float* beta = (const float*)_beta; 855 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 856 float* dst = (float*)_dst; 857 int x = 0; 858 __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]), 859 b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]); 860 861 for( ; x <= width - 8; x += 8 ) 862 { 863 __m128 x0, x1, y0, y1, s0, s1; 864 x0 = _mm_loadu_ps(S0 + x); 865 x1 = _mm_loadu_ps(S0 + x + 4); 866 y0 = _mm_loadu_ps(S1 + x); 867 y1 = _mm_loadu_ps(S1 + x + 4); 868 869 s0 = _mm_mul_ps(x0, b0); 870 s1 = _mm_mul_ps(x1, b0); 871 y0 = _mm_mul_ps(y0, b1); 872 y1 = _mm_mul_ps(y1, b1); 873 s0 = _mm_add_ps(s0, y0); 874 s1 = _mm_add_ps(s1, y1); 875 876 x0 = _mm_loadu_ps(S2 + x); 877 x1 = _mm_loadu_ps(S2 + x + 4); 878 y0 = _mm_loadu_ps(S3 + x); 879 y1 = _mm_loadu_ps(S3 + x + 4); 880 881 x0 = _mm_mul_ps(x0, b2); 882 x1 = _mm_mul_ps(x1, b2); 883 y0 = _mm_mul_ps(y0, b3); 884 y1 = _mm_mul_ps(y1, b3); 885 s0 = _mm_add_ps(s0, x0); 886 s1 = _mm_add_ps(s1, x1); 887 s0 = _mm_add_ps(s0, y0); 888 s1 = _mm_add_ps(s1, y1); 889 890 _mm_storeu_ps( dst + x, s0); 891 _mm_storeu_ps( dst + x + 4, s1); 892 } 893 894 return x; 895 } 896 }; 897 898 #if CV_SSE4_1 899 900 struct VResizeLanczos4Vec_32f16u 901 { 902 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 903 { 904 const float** src = (const float**)_src; 905 const float* beta = (const float*)_beta; 906 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 907 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 908 short * dst = (short*)_dst; 909 int x = 0; 910 __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), 911 v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), 912 v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), 913 v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); 914 915 for( ; x <= width - 8; x += 8 ) 916 { 917 __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); 918 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); 919 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); 920 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); 921 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); 922 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); 923 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); 924 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); 925 926 __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); 927 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); 928 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); 929 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, 
_mm_loadu_ps(S3 + x + 4))); 930 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); 931 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); 932 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); 933 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); 934 935 __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); 936 __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); 937 938 _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1)); 939 } 940 941 return x; 942 } 943 }; 944 945 #else 946 947 typedef VResizeNoVec VResizeLanczos4Vec_32f16u; 948 949 #endif 950 951 struct VResizeLanczos4Vec_32f16s 952 { 953 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 954 { 955 const float** src = (const float**)_src; 956 const float* beta = (const float*)_beta; 957 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 958 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 959 short * dst = (short*)_dst; 960 int x = 0; 961 __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), 962 v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), 963 v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), 964 v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); 965 966 for( ; x <= width - 8; x += 8 ) 967 { 968 __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); 969 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); 970 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); 971 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); 972 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); 973 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); 974 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); 975 v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); 976 977 __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4)); 978 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4))); 979 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4))); 980 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4))); 981 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4))); 982 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4))); 983 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4))); 984 v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4))); 985 986 __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0); 987 __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1); 988 989 _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1)); 990 } 991 992 return x; 993 } 994 }; 995 996 997 struct VResizeLanczos4Vec_32f 998 { 999 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1000 { 1001 const float** src = (const float**)_src; 1002 const float* beta = (const float*)_beta; 1003 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 1004 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 1005 float* dst = (float*)_dst; 1006 int x = 0; 1007 1008 __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]), 1009 v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]), 1010 v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]), 1011 v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]); 1012 1013 for( ; x <= width - 4; x += 4 ) 1014 { 1015 __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x)); 1016 v_dst = 
_mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x))); 1017 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x))); 1018 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x))); 1019 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x))); 1020 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x))); 1021 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x))); 1022 v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x))); 1023 1024 _mm_storeu_ps(dst + x, v_dst); 1025 } 1026 1027 return x; 1028 } 1029 }; 1030 1031 1032 #elif CV_NEON 1033 1034 struct VResizeLinearVec_32s8u 1035 { 1036 int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const 1037 { 1038 const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1]; 1039 const short* beta = (const short*)_beta; 1040 int x = 0; 1041 int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2); 1042 1043 for( ; x <= width - 16; x += 16) 1044 { 1045 int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4); 1046 int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4); 1047 1048 int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); 1049 int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); 1050 1051 int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), 1052 vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); 1053 v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2); 1054 1055 v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4); 1056 v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4); 1057 v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4); 1058 v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4); 1059 1060 v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01)); 1061 v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11)); 1062 1063 int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1), 1064 vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1)); 1065 v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2); 1066 1067 vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1))); 1068 } 1069 1070 return x; 1071 } 1072 }; 1073 1074 struct VResizeLinearVec_32f16u 1075 { 1076 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1077 { 1078 const float** src = (const float**)_src; 1079 const float* beta = (const float*)_beta; 1080 const float *S0 = src[0], *S1 = src[1]; 1081 ushort* dst = (ushort*)_dst; 1082 int x = 0; 1083 1084 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); 1085 1086 for( ; x <= width - 8; x += 8 ) 1087 { 1088 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); 1089 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); 1090 1091 float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); 1092 float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); 1093 1094 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), 1095 vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); 1096 } 1097 1098 return x; 1099 } 1100 }; 1101 1102 struct VResizeLinearVec_32f16s 1103 { 1104 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1105 { 1106 const float** src = (const float**)_src; 1107 const float* beta = (const float*)_beta; 1108 const float *S0 = src[0], *S1 = src[1]; 1109 short* dst = (short*)_dst; 1110 int x = 
0; 1111 1112 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); 1113 1114 for( ; x <= width - 8; x += 8 ) 1115 { 1116 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); 1117 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); 1118 1119 float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1); 1120 float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1); 1121 1122 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), 1123 vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); 1124 } 1125 1126 return x; 1127 } 1128 }; 1129 1130 struct VResizeLinearVec_32f 1131 { 1132 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1133 { 1134 const float** src = (const float**)_src; 1135 const float* beta = (const float*)_beta; 1136 const float *S0 = src[0], *S1 = src[1]; 1137 float* dst = (float*)_dst; 1138 int x = 0; 1139 1140 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]); 1141 1142 for( ; x <= width - 8; x += 8 ) 1143 { 1144 float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4); 1145 float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4); 1146 1147 vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1)); 1148 vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1)); 1149 } 1150 1151 return x; 1152 } 1153 }; 1154 1155 typedef VResizeNoVec VResizeCubicVec_32s8u; 1156 1157 struct VResizeCubicVec_32f16u 1158 { 1159 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1160 { 1161 const float** src = (const float**)_src; 1162 const float* beta = (const float*)_beta; 1163 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 1164 ushort* dst = (ushort*)_dst; 1165 int x = 0; 1166 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 1167 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); 1168 1169 for( ; x <= width - 8; x += 8 ) 1170 { 1171 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 1172 v_b1, vld1q_f32(S1 + x)), 1173 v_b2, vld1q_f32(S2 + x)), 1174 v_b3, vld1q_f32(S3 + x)); 1175 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 1176 v_b1, vld1q_f32(S1 + x + 4)), 1177 v_b2, vld1q_f32(S2 + x + 4)), 1178 v_b3, vld1q_f32(S3 + x + 4)); 1179 1180 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)), 1181 vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); 1182 } 1183 1184 return x; 1185 } 1186 }; 1187 1188 struct VResizeCubicVec_32f16s 1189 { 1190 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1191 { 1192 const float** src = (const float**)_src; 1193 const float* beta = (const float*)_beta; 1194 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 1195 short* dst = (short*)_dst; 1196 int x = 0; 1197 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 1198 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); 1199 1200 for( ; x <= width - 8; x += 8 ) 1201 { 1202 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 1203 v_b1, vld1q_f32(S1 + x)), 1204 v_b2, vld1q_f32(S2 + x)), 1205 v_b3, vld1q_f32(S3 + x)); 1206 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 1207 v_b1, vld1q_f32(S1 + x + 4)), 1208 v_b2, vld1q_f32(S2 + x + 4)), 1209 v_b3, vld1q_f32(S3 + x + 4)); 1210 1211 vst1q_s16(dst + x, 
vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)), 1212 vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); 1213 } 1214 1215 return x; 1216 } 1217 }; 1218 1219 struct VResizeCubicVec_32f 1220 { 1221 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1222 { 1223 const float** src = (const float**)_src; 1224 const float* beta = (const float*)_beta; 1225 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 1226 float* dst = (float*)_dst; 1227 int x = 0; 1228 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 1229 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]); 1230 1231 for( ; x <= width - 8; x += 8 ) 1232 { 1233 vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 1234 v_b1, vld1q_f32(S1 + x)), 1235 v_b2, vld1q_f32(S2 + x)), 1236 v_b3, vld1q_f32(S3 + x))); 1237 vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 1238 v_b1, vld1q_f32(S1 + x + 4)), 1239 v_b2, vld1q_f32(S2 + x + 4)), 1240 v_b3, vld1q_f32(S3 + x + 4))); 1241 } 1242 1243 return x; 1244 } 1245 }; 1246 1247 struct VResizeLanczos4Vec_32f16u 1248 { 1249 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1250 { 1251 const float** src = (const float**)_src; 1252 const float* beta = (const float*)_beta; 1253 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 1254 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 1255 ushort * dst = (ushort*)_dst; 1256 int x = 0; 1257 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 1258 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), 1259 v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), 1260 v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); 1261 1262 for( ; x <= width - 8; x += 8 ) 1263 { 1264 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 1265 v_b1, vld1q_f32(S1 + x)), 1266 v_b2, vld1q_f32(S2 + x)), 1267 v_b3, vld1q_f32(S3 + x)); 1268 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), 1269 v_b5, vld1q_f32(S5 + x)), 1270 v_b6, vld1q_f32(S6 + x)), 1271 v_b7, vld1q_f32(S7 + x)); 1272 float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); 1273 1274 v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 1275 v_b1, vld1q_f32(S1 + x + 4)), 1276 v_b2, vld1q_f32(S2 + x + 4)), 1277 v_b3, vld1q_f32(S3 + x + 4)); 1278 v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), 1279 v_b5, vld1q_f32(S5 + x + 4)), 1280 v_b6, vld1q_f32(S6 + x + 4)), 1281 v_b7, vld1q_f32(S7 + x + 4)); 1282 v_dst1 = vaddq_f32(v_dst0, v_dst1); 1283 1284 vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)), 1285 vqmovn_u32(cv_vrndq_u32_f32(v_dst1)))); 1286 } 1287 1288 return x; 1289 } 1290 }; 1291 1292 struct VResizeLanczos4Vec_32f16s 1293 { 1294 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1295 { 1296 const float** src = (const float**)_src; 1297 const float* beta = (const float*)_beta; 1298 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 1299 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 1300 short * dst = (short*)_dst; 1301 int x = 0; 1302 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 1303 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), 1304 v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), 1305 v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); 1306 
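// Added explanatory comment: the loop below is the vertical Lanczos4 pass. For each
// output column x it computes sum_{k=0..7} beta[k]*S_k[x] over the eight
// horizontally-resized buffer rows, four columns at a time with vmlaq_f32, then
// rounds and saturates the result to int16.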
1307 for( ; x <= width - 8; x += 8 ) 1308 { 1309 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 1310 v_b1, vld1q_f32(S1 + x)), 1311 v_b2, vld1q_f32(S2 + x)), 1312 v_b3, vld1q_f32(S3 + x)); 1313 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), 1314 v_b5, vld1q_f32(S5 + x)), 1315 v_b6, vld1q_f32(S6 + x)), 1316 v_b7, vld1q_f32(S7 + x)); 1317 float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1); 1318 1319 v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)), 1320 v_b1, vld1q_f32(S1 + x + 4)), 1321 v_b2, vld1q_f32(S2 + x + 4)), 1322 v_b3, vld1q_f32(S3 + x + 4)); 1323 v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)), 1324 v_b5, vld1q_f32(S5 + x + 4)), 1325 v_b6, vld1q_f32(S6 + x + 4)), 1326 v_b7, vld1q_f32(S7 + x + 4)); 1327 v_dst1 = vaddq_f32(v_dst0, v_dst1); 1328 1329 vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)), 1330 vqmovn_s32(cv_vrndq_s32_f32(v_dst1)))); 1331 } 1332 1333 return x; 1334 } 1335 }; 1336 1337 struct VResizeLanczos4Vec_32f 1338 { 1339 int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const 1340 { 1341 const float** src = (const float**)_src; 1342 const float* beta = (const float*)_beta; 1343 const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3], 1344 *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7]; 1345 float* dst = (float*)_dst; 1346 int x = 0; 1347 float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]), 1348 v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]), 1349 v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]), 1350 v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]); 1351 1352 for( ; x <= width - 4; x += 4 ) 1353 { 1354 float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)), 1355 v_b1, vld1q_f32(S1 + x)), 1356 v_b2, vld1q_f32(S2 + x)), 1357 v_b3, vld1q_f32(S3 + x)); 1358 float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)), 1359 v_b5, vld1q_f32(S5 + x)), 1360 v_b6, vld1q_f32(S6 + x)), 1361 v_b7, vld1q_f32(S7 + x)); 1362 vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1)); 1363 } 1364 1365 return x; 1366 } 1367 }; 1368 1369 #else 1370 1371 typedef VResizeNoVec VResizeLinearVec_32s8u; 1372 typedef VResizeNoVec VResizeLinearVec_32f16u; 1373 typedef VResizeNoVec VResizeLinearVec_32f16s; 1374 typedef VResizeNoVec VResizeLinearVec_32f; 1375 1376 typedef VResizeNoVec VResizeCubicVec_32s8u; 1377 typedef VResizeNoVec VResizeCubicVec_32f16u; 1378 typedef VResizeNoVec VResizeCubicVec_32f16s; 1379 typedef VResizeNoVec VResizeCubicVec_32f; 1380 1381 typedef VResizeNoVec VResizeLanczos4Vec_32f16u; 1382 typedef VResizeNoVec VResizeLanczos4Vec_32f16s; 1383 typedef VResizeNoVec VResizeLanczos4Vec_32f; 1384 1385 #endif 1386 1387 typedef HResizeNoVec HResizeLinearVec_8u32s; 1388 typedef HResizeNoVec HResizeLinearVec_16u32f; 1389 typedef HResizeNoVec HResizeLinearVec_16s32f; 1390 typedef HResizeNoVec HResizeLinearVec_32f; 1391 typedef HResizeNoVec HResizeLinearVec_64f; 1392 1393 1394 template<typename T, typename WT, typename AT, int ONE, class VecOp> 1395 struct HResizeLinear 1396 { 1397 typedef T value_type; 1398 typedef WT buf_type; 1399 typedef AT alpha_type; 1400 1401 void operator()(const T** src, WT** dst, int count, 1402 const int* xofs, const AT* alpha, 1403 int swidth, int dwidth, int cn, int xmin, int xmax ) const 1404 { 1405 int dx, k; 1406 VecOp vecOp; 1407 1408 int dx0 = vecOp((const uchar**)src, 
(uchar**)dst, count, 1409 xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax ); 1410 1411 for( k = 0; k <= count - 2; k++ ) 1412 { 1413 const T *S0 = src[k], *S1 = src[k+1]; 1414 WT *D0 = dst[k], *D1 = dst[k+1]; 1415 for( dx = dx0; dx < xmax; dx++ ) 1416 { 1417 int sx = xofs[dx]; 1418 WT a0 = alpha[dx*2], a1 = alpha[dx*2+1]; 1419 WT t0 = S0[sx]*a0 + S0[sx + cn]*a1; 1420 WT t1 = S1[sx]*a0 + S1[sx + cn]*a1; 1421 D0[dx] = t0; D1[dx] = t1; 1422 } 1423 1424 for( ; dx < dwidth; dx++ ) 1425 { 1426 int sx = xofs[dx]; 1427 D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE); 1428 } 1429 } 1430 1431 for( ; k < count; k++ ) 1432 { 1433 const T *S = src[k]; 1434 WT *D = dst[k]; 1435 for( dx = 0; dx < xmax; dx++ ) 1436 { 1437 int sx = xofs[dx]; 1438 D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1]; 1439 } 1440 1441 for( ; dx < dwidth; dx++ ) 1442 D[dx] = WT(S[xofs[dx]]*ONE); 1443 } 1444 } 1445 }; 1446 1447 1448 template<typename T, typename WT, typename AT, class CastOp, class VecOp> 1449 struct VResizeLinear 1450 { 1451 typedef T value_type; 1452 typedef WT buf_type; 1453 typedef AT alpha_type; 1454 1455 void operator()(const WT** src, T* dst, const AT* beta, int width ) const 1456 { 1457 WT b0 = beta[0], b1 = beta[1]; 1458 const WT *S0 = src[0], *S1 = src[1]; 1459 CastOp castOp; 1460 VecOp vecOp; 1461 1462 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 1463 #if CV_ENABLE_UNROLLED 1464 for( ; x <= width - 4; x += 4 ) 1465 { 1466 WT t0, t1; 1467 t0 = S0[x]*b0 + S1[x]*b1; 1468 t1 = S0[x+1]*b0 + S1[x+1]*b1; 1469 dst[x] = castOp(t0); dst[x+1] = castOp(t1); 1470 t0 = S0[x+2]*b0 + S1[x+2]*b1; 1471 t1 = S0[x+3]*b0 + S1[x+3]*b1; 1472 dst[x+2] = castOp(t0); dst[x+3] = castOp(t1); 1473 } 1474 #endif 1475 for( ; x < width; x++ ) 1476 dst[x] = castOp(S0[x]*b0 + S1[x]*b1); 1477 } 1478 }; 1479 1480 template<> 1481 struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u> 1482 { 1483 typedef uchar value_type; 1484 typedef int buf_type; 1485 typedef short alpha_type; 1486 1487 void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const 1488 { 1489 alpha_type b0 = beta[0], b1 = beta[1]; 1490 const buf_type *S0 = src[0], *S1 = src[1]; 1491 VResizeLinearVec_32s8u vecOp; 1492 1493 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 1494 #if CV_ENABLE_UNROLLED 1495 for( ; x <= width - 4; x += 4 ) 1496 { 1497 dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2); 1498 dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2); 1499 dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2); 1500 dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2); 1501 } 1502 #endif 1503 for( ; x < width; x++ ) 1504 dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2); 1505 } 1506 }; 1507 1508 1509 template<typename T, typename WT, typename AT> 1510 struct HResizeCubic 1511 { 1512 typedef T value_type; 1513 typedef WT buf_type; 1514 typedef AT alpha_type; 1515 1516 void operator()(const T** src, WT** dst, int count, 1517 const int* xofs, const AT* alpha, 1518 int swidth, int dwidth, int cn, int xmin, int xmax ) const 1519 { 1520 for( int k = 0; k < count; k++ ) 1521 { 1522 const T *S = src[k]; 1523 WT *D = dst[k]; 1524 int dx = 0, limit = xmin; 1525 for(;;) 1526 { 1527 for( ; dx < limit; dx++, alpha += 4 ) 1528 { 1529 int j, sx 
= xofs[dx] - cn; 1530 WT v = 0; 1531 for( j = 0; j < 4; j++ ) 1532 { 1533 int sxj = sx + j*cn; 1534 if( (unsigned)sxj >= (unsigned)swidth ) 1535 { 1536 while( sxj < 0 ) 1537 sxj += cn; 1538 while( sxj >= swidth ) 1539 sxj -= cn; 1540 } 1541 v += S[sxj]*alpha[j]; 1542 } 1543 D[dx] = v; 1544 } 1545 if( limit == dwidth ) 1546 break; 1547 for( ; dx < xmax; dx++, alpha += 4 ) 1548 { 1549 int sx = xofs[dx]; 1550 D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] + 1551 S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3]; 1552 } 1553 limit = dwidth; 1554 } 1555 alpha -= dwidth*4; 1556 } 1557 } 1558 }; 1559 1560 1561 template<typename T, typename WT, typename AT, class CastOp, class VecOp> 1562 struct VResizeCubic 1563 { 1564 typedef T value_type; 1565 typedef WT buf_type; 1566 typedef AT alpha_type; 1567 1568 void operator()(const WT** src, T* dst, const AT* beta, int width ) const 1569 { 1570 WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3]; 1571 const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3]; 1572 CastOp castOp; 1573 VecOp vecOp; 1574 1575 int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 1576 for( ; x < width; x++ ) 1577 dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3); 1578 } 1579 }; 1580 1581 1582 template<typename T, typename WT, typename AT> 1583 struct HResizeLanczos4 1584 { 1585 typedef T value_type; 1586 typedef WT buf_type; 1587 typedef AT alpha_type; 1588 1589 void operator()(const T** src, WT** dst, int count, 1590 const int* xofs, const AT* alpha, 1591 int swidth, int dwidth, int cn, int xmin, int xmax ) const 1592 { 1593 for( int k = 0; k < count; k++ ) 1594 { 1595 const T *S = src[k]; 1596 WT *D = dst[k]; 1597 int dx = 0, limit = xmin; 1598 for(;;) 1599 { 1600 for( ; dx < limit; dx++, alpha += 8 ) 1601 { 1602 int j, sx = xofs[dx] - cn*3; 1603 WT v = 0; 1604 for( j = 0; j < 8; j++ ) 1605 { 1606 int sxj = sx + j*cn; 1607 if( (unsigned)sxj >= (unsigned)swidth ) 1608 { 1609 while( sxj < 0 ) 1610 sxj += cn; 1611 while( sxj >= swidth ) 1612 sxj -= cn; 1613 } 1614 v += S[sxj]*alpha[j]; 1615 } 1616 D[dx] = v; 1617 } 1618 if( limit == dwidth ) 1619 break; 1620 for( ; dx < xmax; dx++, alpha += 8 ) 1621 { 1622 int sx = xofs[dx]; 1623 D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] + 1624 S[sx-cn]*alpha[2] + S[sx]*alpha[3] + 1625 S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] + 1626 S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7]; 1627 } 1628 limit = dwidth; 1629 } 1630 alpha -= dwidth*8; 1631 } 1632 } 1633 }; 1634 1635 1636 template<typename T, typename WT, typename AT, class CastOp, class VecOp> 1637 struct VResizeLanczos4 1638 { 1639 typedef T value_type; 1640 typedef WT buf_type; 1641 typedef AT alpha_type; 1642 1643 void operator()(const WT** src, T* dst, const AT* beta, int width ) const 1644 { 1645 CastOp castOp; 1646 VecOp vecOp; 1647 int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width); 1648 #if CV_ENABLE_UNROLLED 1649 for( ; x <= width - 4; x += 4 ) 1650 { 1651 WT b = beta[0]; 1652 const WT* S = src[0]; 1653 WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b; 1654 1655 for( k = 1; k < 8; k++ ) 1656 { 1657 b = beta[k]; S = src[k]; 1658 s0 += S[x]*b; s1 += S[x+1]*b; 1659 s2 += S[x+2]*b; s3 += S[x+3]*b; 1660 } 1661 1662 dst[x] = castOp(s0); dst[x+1] = castOp(s1); 1663 dst[x+2] = castOp(s2); dst[x+3] = castOp(s3); 1664 } 1665 #endif 1666 for( ; x < width; x++ ) 1667 { 1668 dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] + 1669 src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] + 1670 src[5][x]*beta[5] + 
src[6][x]*beta[6] + src[7][x]*beta[7]); 1671 } 1672 } 1673 }; 1674 1675 1676 static inline int clip(int x, int a, int b) 1677 { 1678 return x >= a ? (x < b ? x : b-1) : a; 1679 } 1680 1681 static const int MAX_ESIZE=16; 1682 1683 template <typename HResize, typename VResize> 1684 class resizeGeneric_Invoker : 1685 public ParallelLoopBody 1686 { 1687 public: 1688 typedef typename HResize::value_type T; 1689 typedef typename HResize::buf_type WT; 1690 typedef typename HResize::alpha_type AT; 1691 1692 resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs, 1693 const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize, 1694 int _ksize, int _xmin, int _xmax) : 1695 ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs), 1696 alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize), 1697 ksize(_ksize), xmin(_xmin), xmax(_xmax) 1698 { 1699 CV_Assert(ksize <= MAX_ESIZE); 1700 } 1701 1702 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) 1703 # pragma GCC diagnostic push 1704 # pragma GCC diagnostic ignored "-Warray-bounds" 1705 #endif 1706 virtual void operator() (const Range& range) const 1707 { 1708 int dy, cn = src.channels(); 1709 HResize hresize; 1710 VResize vresize; 1711 1712 int bufstep = (int)alignSize(dsize.width, 16); 1713 AutoBuffer<WT> _buffer(bufstep*ksize); 1714 const T* srows[MAX_ESIZE]={0}; 1715 WT* rows[MAX_ESIZE]={0}; 1716 int prev_sy[MAX_ESIZE]; 1717 1718 for(int k = 0; k < ksize; k++ ) 1719 { 1720 prev_sy[k] = -1; 1721 rows[k] = (WT*)_buffer + bufstep*k; 1722 } 1723 1724 const AT* beta = _beta + ksize * range.start; 1725 1726 for( dy = range.start; dy < range.end; dy++, beta += ksize ) 1727 { 1728 int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2; 1729 1730 for(int k = 0; k < ksize; k++ ) 1731 { 1732 int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height); 1733 for( k1 = std::max(k1, k); k1 < ksize; k1++ ) 1734 { 1735 if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it. 1736 { 1737 if( k1 > k ) 1738 memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) ); 1739 break; 1740 } 1741 } 1742 if( k1 == ksize ) 1743 k0 = std::min(k0, k); // remember the first row that needs to be computed 1744 srows[k] = src.template ptr<T>(sy); 1745 prev_sy[k] = sy; 1746 } 1747 1748 if( k0 < ksize ) 1749 hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha), 1750 ssize.width, dsize.width, cn, xmin, xmax ); 1751 vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width ); 1752 } 1753 } 1754 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8) 1755 # pragma GCC diagnostic pop 1756 #endif 1757 1758 private: 1759 Mat src; 1760 Mat dst; 1761 const int* xofs, *yofs; 1762 const AT* alpha, *_beta; 1763 Size ssize, dsize; 1764 const int ksize, xmin, xmax; 1765 1766 resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&); 1767 }; 1768 1769 template<class HResize, class VResize> 1770 static void resizeGeneric_( const Mat& src, Mat& dst, 1771 const int* xofs, const void* _alpha, 1772 const int* yofs, const void* _beta, 1773 int xmin, int xmax, int ksize ) 1774 { 1775 typedef typename HResize::alpha_type AT; 1776 1777 const AT* beta = (const AT*)_beta; 1778 Size ssize = src.size(), dsize = dst.size(); 1779 int cn = src.channels(); 1780 ssize.width *= cn; 1781 dsize.width *= cn; 1782 xmin *= cn; 1783 xmax *= cn; 1784 // image resize is a separable operation. 
In case of not too strong 1785 1786 Range range(0, dsize.height); 1787 resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta, 1788 ssize, dsize, ksize, xmin, xmax); 1789 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 1790 } 1791 1792 template <typename T, typename WT> 1793 struct ResizeAreaFastNoVec 1794 { 1795 ResizeAreaFastNoVec(int, int) { } 1796 ResizeAreaFastNoVec(int, int, int, int) { } 1797 int operator() (const T*, T*, int) const 1798 { return 0; } 1799 }; 1800 1801 #if CV_NEON 1802 1803 class ResizeAreaFastVec_SIMD_8u 1804 { 1805 public: 1806 ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : 1807 cn(_cn), step(_step) 1808 { 1809 } 1810 1811 int operator() (const uchar* S, uchar* D, int w) const 1812 { 1813 int dx = 0; 1814 const uchar* S0 = S, * S1 = S0 + step; 1815 1816 uint16x8_t v_2 = vdupq_n_u16(2); 1817 1818 if (cn == 1) 1819 { 1820 for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16) 1821 { 1822 uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1); 1823 1824 uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1])); 1825 v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1]))); 1826 v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2); 1827 1828 uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1])); 1829 v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1]))); 1830 v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2); 1831 1832 vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1))); 1833 } 1834 } 1835 else if (cn == 4) 1836 { 1837 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 1838 { 1839 uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1); 1840 1841 uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0)); 1842 uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0)); 1843 uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1)); 1844 uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1)); 1845 1846 uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)), 1847 vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10))); 1848 uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)), 1849 vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11))); 1850 uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2); 1851 1852 vst1_u8(D, vmovn_u16(v_dst)); 1853 } 1854 } 1855 1856 return dx; 1857 } 1858 1859 private: 1860 int cn, step; 1861 }; 1862 1863 class ResizeAreaFastVec_SIMD_16u 1864 { 1865 public: 1866 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : 1867 cn(_cn), step(_step) 1868 { 1869 } 1870 1871 int operator() (const ushort * S, ushort * D, int w) const 1872 { 1873 int dx = 0; 1874 const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step); 1875 1876 uint32x4_t v_2 = vdupq_n_u32(2); 1877 1878 if (cn == 1) 1879 { 1880 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 1881 { 1882 uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1); 1883 1884 uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1])); 1885 v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1]))); 1886 v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2); 1887 1888 uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1])); 1889 v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1]))); 1890 
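// Both halves of this cn == 1 loop follow the same 2x2 box-average recipe:
// vld2q_u16 de-interleaves even/odd source columns, val[0] + val[1] is the
// horizontal pair sum, the same sum from the next row is added, and the
// (x + 2) >> 2 below rounds the result. Per output pixel this is, roughly,
//     D[i] = (S0[2*i] + S0[2*i+1] + S1[2*i] + S1[2*i+1] + 2) >> 2;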
v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2); 1891 1892 vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1))); 1893 } 1894 } 1895 else if (cn == 4) 1896 { 1897 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 1898 { 1899 uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1); 1900 uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)), 1901 vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1))); 1902 vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2))); 1903 } 1904 } 1905 1906 return dx; 1907 } 1908 1909 private: 1910 int cn, step; 1911 }; 1912 1913 class ResizeAreaFastVec_SIMD_16s 1914 { 1915 public: 1916 ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : 1917 cn(_cn), step(_step) 1918 { 1919 } 1920 1921 int operator() (const short * S, short * D, int w) const 1922 { 1923 int dx = 0; 1924 const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step); 1925 1926 int32x4_t v_2 = vdupq_n_s32(2); 1927 1928 if (cn == 1) 1929 { 1930 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 1931 { 1932 int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1); 1933 1934 int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1])); 1935 v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1]))); 1936 v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2); 1937 1938 int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1])); 1939 v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1]))); 1940 v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2); 1941 1942 vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1))); 1943 } 1944 } 1945 else if (cn == 4) 1946 { 1947 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 1948 { 1949 int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1); 1950 int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)), 1951 vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1))); 1952 vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2))); 1953 } 1954 } 1955 1956 return dx; 1957 } 1958 1959 private: 1960 int cn, step; 1961 }; 1962 1963 struct ResizeAreaFastVec_SIMD_32f 1964 { 1965 ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : 1966 cn(_cn), step(_step) 1967 { 1968 fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); 1969 } 1970 1971 int operator() (const float * S, float * D, int w) const 1972 { 1973 if (!fast_mode) 1974 return 0; 1975 1976 const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); 1977 int dx = 0; 1978 1979 float32x4_t v_025 = vdupq_n_f32(0.25f); 1980 1981 if (cn == 1) 1982 { 1983 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 1984 { 1985 float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1); 1986 1987 float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]); 1988 float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]); 1989 1990 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); 1991 } 1992 } 1993 else if (cn == 4) 1994 { 1995 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 1996 { 1997 float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4)); 1998 float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4)); 1999 2000 vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025)); 2001 } 2002 } 2003 2004 return dx; 2005 } 2006 2007 private: 2008 int cn; 2009 bool fast_mode; 2010 int step; 2011 }; 2012 2013 #elif 
CV_SSE2 2014 2015 class ResizeAreaFastVec_SIMD_8u 2016 { 2017 public: 2018 ResizeAreaFastVec_SIMD_8u(int _cn, int _step) : 2019 cn(_cn), step(_step) 2020 { 2021 use_simd = checkHardwareSupport(CV_CPU_SSE2); 2022 } 2023 2024 int operator() (const uchar* S, uchar* D, int w) const 2025 { 2026 if (!use_simd) 2027 return 0; 2028 2029 int dx = 0; 2030 const uchar* S0 = S; 2031 const uchar* S1 = S0 + step; 2032 __m128i zero = _mm_setzero_si128(); 2033 __m128i delta2 = _mm_set1_epi16(2); 2034 2035 if (cn == 1) 2036 { 2037 __m128i masklow = _mm_set1_epi16(0x00ff); 2038 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 2039 { 2040 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2041 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2042 2043 __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow)); 2044 __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow)); 2045 s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2); 2046 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); 2047 2048 _mm_storel_epi64((__m128i*)D, s0); 2049 } 2050 } 2051 else if (cn == 3) 2052 for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6) 2053 { 2054 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2055 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2056 2057 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); 2058 __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero); 2059 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); 2060 __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero); 2061 2062 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6)); 2063 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6)); 2064 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 2065 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); 2066 _mm_storel_epi64((__m128i*)D, s0); 2067 2068 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6)); 2069 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6)); 2070 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 2071 s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero); 2072 _mm_storel_epi64((__m128i*)(D+3), s0); 2073 } 2074 else 2075 { 2076 CV_Assert(cn == 4); 2077 int v[] = { 0, 0, -1, -1 }; 2078 __m128i mask = _mm_loadu_si128((const __m128i*)v); 2079 2080 for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8) 2081 { 2082 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2083 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2084 2085 __m128i r0_16l = _mm_unpacklo_epi8(r0, zero); 2086 __m128i r0_16h = _mm_unpackhi_epi8(r0, zero); 2087 __m128i r1_16l = _mm_unpacklo_epi8(r1, zero); 2088 __m128i r1_16h = _mm_unpackhi_epi8(r1, zero); 2089 2090 __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8)); 2091 __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8)); 2092 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 2093 __m128i res0 = _mm_srli_epi16(s0, 2); 2094 2095 s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8)); 2096 s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8)); 2097 s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2)); 2098 __m128i res1 = _mm_srli_epi16(s0, 2); 2099 s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0), 2100 _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero); 2101 _mm_storel_epi64((__m128i*)(D), s0); 2102 } 2103 } 2104 2105 return dx; 2106 } 2107 2108 private: 2109 int cn; 2110 bool use_simd; 2111 int step; 2112 }; 2113 2114 class ResizeAreaFastVec_SIMD_16u 2115 { 2116 public: 2117 ResizeAreaFastVec_SIMD_16u(int _cn, int _step) : 2118 cn(_cn), step(_step) 2119 { 2120 
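// Like the other fixed-point area functors here, the SSE2 body only runs when
// the CPU reports SSE2 at runtime; otherwise operator() returns 0 and the
// scalar loop in ResizeAreaFastVec<> processes the whole row, roughly:
//     int dx = vecOp(S, D, w);   // 0 when SIMD is unavailable
//     for( ; dx < w; ... )       // scalar (or remaining) pixels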
use_simd = checkHardwareSupport(CV_CPU_SSE2); 2121 } 2122 2123 int operator() (const ushort* S, ushort* D, int w) const 2124 { 2125 if (!use_simd) 2126 return 0; 2127 2128 int dx = 0; 2129 const ushort* S0 = (const ushort*)S; 2130 const ushort* S1 = (const ushort*)((const uchar*)(S) + step); 2131 __m128i masklow = _mm_set1_epi32(0x0000ffff); 2132 __m128i zero = _mm_setzero_si128(); 2133 __m128i delta2 = _mm_set1_epi32(2); 2134 2135 #define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero) 2136 2137 if (cn == 1) 2138 { 2139 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 2140 { 2141 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2142 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2143 2144 __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow)); 2145 __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow)); 2146 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); 2147 s0 = _mm_srli_epi32(s0, 2); 2148 s0 = _mm_packus_epi32(s0, zero); 2149 2150 _mm_storel_epi64((__m128i*)D, s0); 2151 } 2152 } 2153 else if (cn == 3) 2154 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) 2155 { 2156 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2157 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2158 2159 __m128i r0_16l = _mm_unpacklo_epi16(r0, zero); 2160 __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero); 2161 __m128i r1_16l = _mm_unpacklo_epi16(r1, zero); 2162 __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero); 2163 2164 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); 2165 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); 2166 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); 2167 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); 2168 _mm_storel_epi64((__m128i*)D, s0); 2169 } 2170 else 2171 { 2172 CV_Assert(cn == 4); 2173 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 2174 { 2175 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2176 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2177 2178 __m128i r0_32l = _mm_unpacklo_epi16(r0, zero); 2179 __m128i r0_32h = _mm_unpackhi_epi16(r0, zero); 2180 __m128i r1_32l = _mm_unpacklo_epi16(r1, zero); 2181 __m128i r1_32h = _mm_unpackhi_epi16(r1, zero); 2182 2183 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); 2184 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); 2185 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); 2186 s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero); 2187 _mm_storel_epi64((__m128i*)D, s0); 2188 } 2189 } 2190 2191 #undef _mm_packus_epi32 2192 2193 return dx; 2194 } 2195 2196 private: 2197 int cn; 2198 int step; 2199 bool use_simd; 2200 }; 2201 2202 class ResizeAreaFastVec_SIMD_16s 2203 { 2204 public: 2205 ResizeAreaFastVec_SIMD_16s(int _cn, int _step) : 2206 cn(_cn), step(_step) 2207 { 2208 use_simd = checkHardwareSupport(CV_CPU_SSE2); 2209 } 2210 2211 int operator() (const short* S, short* D, int w) const 2212 { 2213 if (!use_simd) 2214 return 0; 2215 2216 int dx = 0; 2217 const short* S0 = (const short*)S; 2218 const short* S1 = (const short*)((const uchar*)(S) + step); 2219 __m128i masklow = _mm_set1_epi32(0x0000ffff); 2220 __m128i zero = _mm_setzero_si128(); 2221 __m128i delta2 = _mm_set1_epi32(2); 2222 2223 if (cn == 1) 2224 { 2225 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 2226 { 2227 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2228 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2229 2230 __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16), 2231 
_mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16)); 2232 __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16), 2233 _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16)); 2234 s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2); 2235 s0 = _mm_srai_epi32(s0, 2); 2236 s0 = _mm_packs_epi32(s0, zero); 2237 2238 _mm_storel_epi64((__m128i*)D, s0); 2239 } 2240 } 2241 else if (cn == 3) 2242 for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3) 2243 { 2244 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2245 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2246 2247 __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); 2248 __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16); 2249 __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); 2250 __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16); 2251 2252 __m128i s0 = _mm_add_epi32(r0_16l, r0_16h); 2253 __m128i s1 = _mm_add_epi32(r1_16l, r1_16h); 2254 s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1)); 2255 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); 2256 _mm_storel_epi64((__m128i*)D, s0); 2257 } 2258 else 2259 { 2260 CV_Assert(cn == 4); 2261 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 2262 { 2263 __m128i r0 = _mm_loadu_si128((const __m128i*)S0); 2264 __m128i r1 = _mm_loadu_si128((const __m128i*)S1); 2265 2266 __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16); 2267 __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16); 2268 __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16); 2269 __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16); 2270 2271 __m128i s0 = _mm_add_epi32(r0_32l, r0_32h); 2272 __m128i s1 = _mm_add_epi32(r1_32l, r1_32h); 2273 s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2)); 2274 s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero); 2275 _mm_storel_epi64((__m128i*)D, s0); 2276 } 2277 } 2278 2279 return dx; 2280 } 2281 2282 private: 2283 int cn; 2284 int step; 2285 bool use_simd; 2286 }; 2287 2288 struct ResizeAreaFastVec_SIMD_32f 2289 { 2290 ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) : 2291 cn(_cn), step(_step) 2292 { 2293 fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4); 2294 fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2); 2295 } 2296 2297 int operator() (const float * S, float * D, int w) const 2298 { 2299 if (!fast_mode) 2300 return 0; 2301 2302 const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step); 2303 int dx = 0; 2304 2305 __m128 v_025 = _mm_set1_ps(0.25f); 2306 2307 if (cn == 1) 2308 { 2309 const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1); 2310 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 2311 { 2312 __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4), 2313 v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4); 2314 2315 __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo), 2316 _mm_shuffle_ps(v_row00, v_row01, shuffle_hi)); 2317 __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo), 2318 _mm_shuffle_ps(v_row10, v_row11, shuffle_hi)); 2319 2320 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); 2321 } 2322 } 2323 else if (cn == 4) 2324 { 2325 for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4) 2326 { 2327 __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4)); 2328 __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4)); 
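// With cn == 4 each 128-bit register holds one full 4-channel pixel, so the
// two adds above accumulate the horizontally adjacent pixels of each source
// row; the multiply by 0.25f below turns the 2x2 block sum into its average:
//     D[c] = (S0[c] + S0[c+4] + S1[c] + S1[c+4]) * 0.25f,   c = 0..3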
2329 2330 _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025)); 2331 } 2332 } 2333 2334 return dx; 2335 } 2336 2337 private: 2338 int cn; 2339 bool fast_mode; 2340 int step; 2341 }; 2342 2343 #else 2344 2345 typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u; 2346 typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u; 2347 typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s; 2348 typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f; 2349 2350 #endif 2351 2352 template<typename T, typename SIMDVecOp> 2353 struct ResizeAreaFastVec 2354 { 2355 ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) : 2356 scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step) 2357 { 2358 fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4); 2359 } 2360 2361 int operator() (const T* S, T* D, int w) const 2362 { 2363 if (!fast_mode) 2364 return 0; 2365 2366 const T* nextS = (const T*)((const uchar*)S + step); 2367 int dx = vecOp(S, D, w); 2368 2369 if (cn == 1) 2370 for( ; dx < w; ++dx ) 2371 { 2372 int index = dx*2; 2373 D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2); 2374 } 2375 else if (cn == 3) 2376 for( ; dx < w; dx += 3 ) 2377 { 2378 int index = dx*2; 2379 D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2); 2380 D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2); 2381 D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2); 2382 } 2383 else 2384 { 2385 CV_Assert(cn == 4); 2386 for( ; dx < w; dx += 4 ) 2387 { 2388 int index = dx*2; 2389 D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2); 2390 D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2); 2391 D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2); 2392 D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2); 2393 } 2394 } 2395 2396 return dx; 2397 } 2398 2399 private: 2400 int scale_x, scale_y; 2401 int cn; 2402 bool fast_mode; 2403 int step; 2404 SIMDVecOp vecOp; 2405 }; 2406 2407 template <typename T, typename WT, typename VecOp> 2408 class resizeAreaFast_Invoker : 2409 public ParallelLoopBody 2410 { 2411 public: 2412 resizeAreaFast_Invoker(const Mat &_src, Mat &_dst, 2413 int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) : 2414 ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x), 2415 scale_y(_scale_y), ofs(_ofs), xofs(_xofs) 2416 { 2417 } 2418 2419 virtual void operator() (const Range& range) const 2420 { 2421 Size ssize = src.size(), dsize = dst.size(); 2422 int cn = src.channels(); 2423 int area = scale_x*scale_y; 2424 float scale = 1.f/(area); 2425 int dwidth1 = (ssize.width/scale_x)*cn; 2426 dsize.width *= cn; 2427 ssize.width *= cn; 2428 int dy, dx, k = 0; 2429 2430 VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/); 2431 2432 for( dy = range.start; dy < range.end; dy++ ) 2433 { 2434 T* D = (T*)(dst.data + dst.step*dy); 2435 int sy0 = dy*scale_y; 2436 int w = sy0 + scale_y <= ssize.height ? 
dwidth1 : 0; 2437 2438 if( sy0 >= ssize.height ) 2439 { 2440 for( dx = 0; dx < dsize.width; dx++ ) 2441 D[dx] = 0; 2442 continue; 2443 } 2444 2445 dx = vop(src.template ptr<T>(sy0), D, w); 2446 for( ; dx < w; dx++ ) 2447 { 2448 const T* S = src.template ptr<T>(sy0) + xofs[dx]; 2449 WT sum = 0; 2450 k = 0; 2451 #if CV_ENABLE_UNROLLED 2452 for( ; k <= area - 4; k += 4 ) 2453 sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]]; 2454 #endif 2455 for( ; k < area; k++ ) 2456 sum += S[ofs[k]]; 2457 2458 D[dx] = saturate_cast<T>(sum * scale); 2459 } 2460 2461 for( ; dx < dsize.width; dx++ ) 2462 { 2463 WT sum = 0; 2464 int count = 0, sx0 = xofs[dx]; 2465 if( sx0 >= ssize.width ) 2466 D[dx] = 0; 2467 2468 for( int sy = 0; sy < scale_y; sy++ ) 2469 { 2470 if( sy0 + sy >= ssize.height ) 2471 break; 2472 const T* S = src.template ptr<T>(sy0 + sy) + sx0; 2473 for( int sx = 0; sx < scale_x*cn; sx += cn ) 2474 { 2475 if( sx0 + sx >= ssize.width ) 2476 break; 2477 sum += S[sx]; 2478 count++; 2479 } 2480 } 2481 2482 D[dx] = saturate_cast<T>((float)sum/count); 2483 } 2484 } 2485 } 2486 2487 private: 2488 Mat src; 2489 Mat dst; 2490 int scale_x, scale_y; 2491 const int *ofs, *xofs; 2492 }; 2493 2494 template<typename T, typename WT, typename VecOp> 2495 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs, 2496 int scale_x, int scale_y ) 2497 { 2498 Range range(0, dst.rows); 2499 resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x, 2500 scale_y, ofs, xofs); 2501 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 2502 } 2503 2504 struct DecimateAlpha 2505 { 2506 int si, di; 2507 float alpha; 2508 }; 2509 2510 2511 template<typename T, typename WT> class ResizeArea_Invoker : 2512 public ParallelLoopBody 2513 { 2514 public: 2515 ResizeArea_Invoker( const Mat& _src, Mat& _dst, 2516 const DecimateAlpha* _xtab, int _xtab_size, 2517 const DecimateAlpha* _ytab, int _ytab_size, 2518 const int* _tabofs ) 2519 { 2520 src = &_src; 2521 dst = &_dst; 2522 xtab0 = _xtab; 2523 xtab_size0 = _xtab_size; 2524 ytab = _ytab; 2525 ytab_size = _ytab_size; 2526 tabofs = _tabofs; 2527 } 2528 2529 virtual void operator() (const Range& range) const 2530 { 2531 Size dsize = dst->size(); 2532 int cn = dst->channels(); 2533 dsize.width *= cn; 2534 AutoBuffer<WT> _buffer(dsize.width*2); 2535 const DecimateAlpha* xtab = xtab0; 2536 int xtab_size = xtab_size0; 2537 WT *buf = _buffer, *sum = buf + dsize.width; 2538 int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di; 2539 2540 for( dx = 0; dx < dsize.width; dx++ ) 2541 sum[dx] = (WT)0; 2542 2543 for( j = j_start; j < j_end; j++ ) 2544 { 2545 WT beta = ytab[j].alpha; 2546 int dy = ytab[j].di; 2547 int sy = ytab[j].si; 2548 2549 { 2550 const T* S = src->template ptr<T>(sy); 2551 for( dx = 0; dx < dsize.width; dx++ ) 2552 buf[dx] = (WT)0; 2553 2554 if( cn == 1 ) 2555 for( k = 0; k < xtab_size; k++ ) 2556 { 2557 int dxn = xtab[k].di; 2558 WT alpha = xtab[k].alpha; 2559 buf[dxn] += S[xtab[k].si]*alpha; 2560 } 2561 else if( cn == 2 ) 2562 for( k = 0; k < xtab_size; k++ ) 2563 { 2564 int sxn = xtab[k].si; 2565 int dxn = xtab[k].di; 2566 WT alpha = xtab[k].alpha; 2567 WT t0 = buf[dxn] + S[sxn]*alpha; 2568 WT t1 = buf[dxn+1] + S[sxn+1]*alpha; 2569 buf[dxn] = t0; buf[dxn+1] = t1; 2570 } 2571 else if( cn == 3 ) 2572 for( k = 0; k < xtab_size; k++ ) 2573 { 2574 int sxn = xtab[k].si; 2575 int dxn = xtab[k].di; 2576 WT alpha = xtab[k].alpha; 2577 WT t0 = buf[dxn] + S[sxn]*alpha; 2578 WT t1 = 
buf[dxn+1] + S[sxn+1]*alpha; 2579 WT t2 = buf[dxn+2] + S[sxn+2]*alpha; 2580 buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2; 2581 } 2582 else if( cn == 4 ) 2583 { 2584 for( k = 0; k < xtab_size; k++ ) 2585 { 2586 int sxn = xtab[k].si; 2587 int dxn = xtab[k].di; 2588 WT alpha = xtab[k].alpha; 2589 WT t0 = buf[dxn] + S[sxn]*alpha; 2590 WT t1 = buf[dxn+1] + S[sxn+1]*alpha; 2591 buf[dxn] = t0; buf[dxn+1] = t1; 2592 t0 = buf[dxn+2] + S[sxn+2]*alpha; 2593 t1 = buf[dxn+3] + S[sxn+3]*alpha; 2594 buf[dxn+2] = t0; buf[dxn+3] = t1; 2595 } 2596 } 2597 else 2598 { 2599 for( k = 0; k < xtab_size; k++ ) 2600 { 2601 int sxn = xtab[k].si; 2602 int dxn = xtab[k].di; 2603 WT alpha = xtab[k].alpha; 2604 for( int c = 0; c < cn; c++ ) 2605 buf[dxn + c] += S[sxn + c]*alpha; 2606 } 2607 } 2608 } 2609 2610 if( dy != prev_dy ) 2611 { 2612 T* D = dst->template ptr<T>(prev_dy); 2613 2614 for( dx = 0; dx < dsize.width; dx++ ) 2615 { 2616 D[dx] = saturate_cast<T>(sum[dx]); 2617 sum[dx] = beta*buf[dx]; 2618 } 2619 prev_dy = dy; 2620 } 2621 else 2622 { 2623 for( dx = 0; dx < dsize.width; dx++ ) 2624 sum[dx] += beta*buf[dx]; 2625 } 2626 } 2627 2628 { 2629 T* D = dst->template ptr<T>(prev_dy); 2630 for( dx = 0; dx < dsize.width; dx++ ) 2631 D[dx] = saturate_cast<T>(sum[dx]); 2632 } 2633 } 2634 2635 private: 2636 const Mat* src; 2637 Mat* dst; 2638 const DecimateAlpha* xtab0; 2639 const DecimateAlpha* ytab; 2640 int xtab_size0, ytab_size; 2641 const int* tabofs; 2642 }; 2643 2644 2645 template <typename T, typename WT> 2646 static void resizeArea_( const Mat& src, Mat& dst, 2647 const DecimateAlpha* xtab, int xtab_size, 2648 const DecimateAlpha* ytab, int ytab_size, 2649 const int* tabofs ) 2650 { 2651 parallel_for_(Range(0, dst.rows), 2652 ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs), 2653 dst.total()/((double)(1 << 16))); 2654 } 2655 2656 2657 typedef void (*ResizeFunc)( const Mat& src, Mat& dst, 2658 const int* xofs, const void* alpha, 2659 const int* yofs, const void* beta, 2660 int xmin, int xmax, int ksize ); 2661 2662 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst, 2663 const int* ofs, const int *xofs, 2664 int scale_x, int scale_y ); 2665 2666 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst, 2667 const DecimateAlpha* xtab, int xtab_size, 2668 const DecimateAlpha* ytab, int ytab_size, 2669 const int* yofs); 2670 2671 2672 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab ) 2673 { 2674 int k = 0; 2675 for(int dx = 0; dx < dsize; dx++ ) 2676 { 2677 double fsx1 = dx * scale; 2678 double fsx2 = fsx1 + scale; 2679 double cellWidth = std::min(scale, ssize - fsx1); 2680 2681 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); 2682 2683 sx2 = std::min(sx2, ssize - 1); 2684 sx1 = std::min(sx1, sx2); 2685 2686 if( sx1 - fsx1 > 1e-3 ) 2687 { 2688 assert( k < ssize*2 ); 2689 tab[k].di = dx * cn; 2690 tab[k].si = (sx1 - 1) * cn; 2691 tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth); 2692 } 2693 2694 for(int sx = sx1; sx < sx2; sx++ ) 2695 { 2696 assert( k < ssize*2 ); 2697 tab[k].di = dx * cn; 2698 tab[k].si = sx * cn; 2699 tab[k++].alpha = float(1.0 / cellWidth); 2700 } 2701 2702 if( fsx2 - sx2 > 1e-3 ) 2703 { 2704 assert( k < ssize*2 ); 2705 tab[k].di = dx * cn; 2706 tab[k].si = sx2 * cn; 2707 tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); 2708 } 2709 } 2710 return k; 2711 } 2712 2713 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; } 2714 2715 #define 
SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \ 2716 func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \ 2717 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\ 2718 specBuf.allocate(specSize);\ 2719 pSpec = (uchar*)specBuf;\ 2720 CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec)); 2721 2722 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \ 2723 if (mode == (int)ippCubic) { *ok = false; return; } \ 2724 func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \ 2725 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\ 2726 specBuf.allocate(specSize);\ 2727 pSpec = (uchar*)specBuf;\ 2728 CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\ 2729 getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\ 2730 getSrcOffsetFunc = (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE; 2731 2732 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \ 2733 func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \ 2734 CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\ 2735 specBuf.allocate(specSize);\ 2736 pSpec = (uchar*)specBuf;\ 2737 AutoBuffer<uchar> buf(initSize);\ 2738 uchar* pInit = (uchar*)buf;\ 2739 CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit)); 2740 2741 #define SET_IPP_RESIZE_PTR(TYPE, CN) \ 2742 if (mode == (int)ippLinear) { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \ 2743 else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \ 2744 else { *ok = false; return; } \ 2745 getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \ 2746 getSrcOffsetFunc = (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE; 2747 2748 #if IPP_VERSION_X100 >= 701 2749 class IPPresizeInvoker : 2750 public ParallelLoopBody 2751 { 2752 public: 2753 IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) : 2754 ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x), 2755 inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode), 2756 func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok) 2757 { 2758 *ok = true; 2759 IppiSize srcSize, dstSize; 2760 int type = src.type(), specSize = 0, initSize = 0; 2761 srcSize.width = src.cols; 2762 srcSize.height = src.rows; 2763 dstSize.width = dst.cols; 2764 dstSize.height = dst.rows; 2765 2766 switch (type) 2767 { 2768 #if 0 // disabled since it breaks tests for CascadeClassifier 2769 case CV_8UC1: SET_IPP_RESIZE_PTR(8u,C1); break; 2770 case CV_8UC3: SET_IPP_RESIZE_PTR(8u,C3); break; 2771 case CV_8UC4: SET_IPP_RESIZE_PTR(8u,C4); break; 2772 #endif 2773 case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break; 2774 case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break; 2775 case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break; 2776 case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break; 2777 case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break; 2778 case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break; 2779 case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break; 2780 case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break; 2781 case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break; 2782 case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break; 2783 case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break; 2784 case CV_64FC4: 
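// Double-precision images only get the linear IPP path: the *_64_PTR variant
// of the macro bails out for ippCubic and initializes a 64-bit resize spec.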
SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break; 2785 default: { *ok = false; return; } break; 2786 } 2787 } 2788 2789 ~IPPresizeInvoker() 2790 { 2791 } 2792 2793 virtual void operator() (const Range& range) const 2794 { 2795 if (*ok == false) 2796 return; 2797 2798 int cn = src.channels(); 2799 int dsty = min(cvRound(range.start * inv_scale_y), dst.rows); 2800 int dstwidth = min(cvRound(src.cols * inv_scale_x), dst.cols); 2801 int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows); 2802 2803 IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0}; 2804 IppiSize dstSize = { dstwidth, dstheight - dsty }; 2805 int bufsize = 0, itemSize = (int)src.elemSize1(); 2806 2807 CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize)); 2808 CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset)); 2809 2810 const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize; 2811 Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize; 2812 2813 AutoBuffer<uchar> buf(bufsize + 64); 2814 uchar* bufptr = alignPtr((uchar*)buf, 32); 2815 2816 if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 ) 2817 *ok = false; 2818 else 2819 { 2820 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 2821 } 2822 } 2823 private: 2824 const Mat & src; 2825 Mat & dst; 2826 double inv_scale_x; 2827 double inv_scale_y; 2828 void *pSpec; 2829 AutoBuffer<uchar> specBuf; 2830 int mode; 2831 ippiResizeFunc func; 2832 ippiResizeGetBufferSize getBufferSizeFunc; 2833 ippiResizeGetSrcOffset getSrcOffsetFunc; 2834 bool *ok; 2835 const IPPresizeInvoker& operator= (const IPPresizeInvoker&); 2836 }; 2837 2838 #endif 2839 2840 #ifdef HAVE_OPENCL 2841 2842 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab, 2843 float * const alpha_tab, int * const ofs_tab) 2844 { 2845 int k = 0, dx = 0; 2846 for ( ; dx < dsize; dx++) 2847 { 2848 ofs_tab[dx] = k; 2849 2850 double fsx1 = dx * scale; 2851 double fsx2 = fsx1 + scale; 2852 double cellWidth = std::min(scale, ssize - fsx1); 2853 2854 int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2); 2855 2856 sx2 = std::min(sx2, ssize - 1); 2857 sx1 = std::min(sx1, sx2); 2858 2859 if (sx1 - fsx1 > 1e-3) 2860 { 2861 map_tab[k] = sx1 - 1; 2862 alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth); 2863 } 2864 2865 for (int sx = sx1; sx < sx2; sx++) 2866 { 2867 map_tab[k] = sx; 2868 alpha_tab[k++] = float(1.0 / cellWidth); 2869 } 2870 2871 if (fsx2 - sx2 > 1e-3) 2872 { 2873 map_tab[k] = sx2; 2874 alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth); 2875 } 2876 } 2877 ofs_tab[dx] = k; 2878 } 2879 2880 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize, 2881 double fx, double fy, int interpolation) 2882 { 2883 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 2884 2885 double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy; 2886 float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy; 2887 int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fx); 2888 bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON && 2889 std::abs(inv_fy - iscale_y) < DBL_EPSILON; 2890 2891 // in case of scale_x && scale_y is equal to 2 2892 // INTER_AREA (fast) also is equal to INTER_LINEAR 2893 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) 2894 /*interpolation = INTER_AREA*/(void)0; // INTER_AREA is slower 2895 2896 if( !(cn <= 4 && 2897 (interpolation == INTER_NEAREST || 
interpolation == INTER_LINEAR || 2898 (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) ) 2899 return false; 2900 2901 UMat src = _src.getUMat(); 2902 _dst.create(dsize, type); 2903 UMat dst = _dst.getUMat(); 2904 2905 Size ssize = src.size(); 2906 ocl::Kernel k; 2907 size_t globalsize[] = { dst.cols, dst.rows }; 2908 2909 ocl::Image2D srcImage; 2910 2911 // See if this could be done with a sampler. We stick with integer 2912 // datatypes because the observed error is low. 2913 bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() && 2914 ocl::Image2D::canCreateAlias(src) && depth <= 4 && 2915 ocl::Image2D::isFormatSupported(depth, cn, true) && 2916 src.offset==0); 2917 if (useSampler) 2918 { 2919 int wdepth = std::max(depth, CV_32S); 2920 char buf[2][32]; 2921 cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s " 2922 "-D convertToDT=%s -D cn=%d", 2923 depth, ocl::typeToStr(type), ocl::typeToStr(depth), 2924 ocl::convertTypeStr(wdepth, depth, cn, buf[1]), 2925 cn); 2926 k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts); 2927 2928 if (k.empty()) 2929 useSampler = false; 2930 else 2931 { 2932 // Convert the input into an OpenCL image type, using normalized channel data types 2933 // and aliasing the UMat. 2934 srcImage = ocl::Image2D(src, true, true); 2935 k.args(srcImage, ocl::KernelArg::WriteOnly(dst), 2936 (float)inv_fx, (float)inv_fy); 2937 } 2938 } 2939 2940 if (interpolation == INTER_LINEAR && !useSampler) 2941 { 2942 char buf[2][32]; 2943 2944 // integer path is slower because of CPU part, so it's disabled 2945 if (depth == CV_8U && ((void)0, 0)) 2946 { 2947 AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2)); 2948 int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width; 2949 short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2; 2950 float fxx, fyy; 2951 int sx, sy; 2952 2953 for (int dx = 0; dx < dsize.width; dx++) 2954 { 2955 fxx = (float)((dx+0.5)*inv_fx - 0.5); 2956 sx = cvFloor(fxx); 2957 fxx -= sx; 2958 2959 if (sx < 0) 2960 fxx = 0, sx = 0; 2961 2962 if (sx >= ssize.width-1) 2963 fxx = 0, sx = ssize.width-1; 2964 2965 xofs[dx] = sx; 2966 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE); 2967 ialpha[dx*2 + 1] = saturate_cast<short>(fxx * INTER_RESIZE_COEF_SCALE); 2968 } 2969 2970 for (int dy = 0; dy < dsize.height; dy++) 2971 { 2972 fyy = (float)((dy+0.5)*inv_fy - 0.5); 2973 sy = cvFloor(fyy); 2974 fyy -= sy; 2975 2976 yofs[dy] = sy; 2977 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE); 2978 ibeta[dy*2 + 1] = saturate_cast<short>(fyy * INTER_RESIZE_COEF_SCALE); 2979 } 2980 2981 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); 2982 UMat coeffs; 2983 Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs); 2984 2985 k.create("resizeLN", ocl::imgproc::resize_oclsrc, 2986 format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s " 2987 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " 2988 "-D INTER_RESIZE_COEF_BITS=%d", 2989 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), 2990 ocl::convertTypeStr(depth, wdepth, cn, buf[0]), 2991 ocl::convertTypeStr(wdepth, depth, cn, buf[1]), 2992 cn, INTER_RESIZE_COEF_BITS)); 2993 if (k.empty()) 2994 return false; 2995 2996 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), 2997 ocl::KernelArg::PtrReadOnly(coeffs)); 2998 } 
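// The (disabled) integer path above precomputes, for every destination column
// dx, the left source column and a pair of fixed-point bilinear weights
//     fxx = (dx + 0.5)*inv_fx - 0.5;  sx = cvFloor(fxx);  fxx -= sx;
//     ialpha[2*dx]   = saturate_cast<short>((1.f - fxx)*INTER_RESIZE_COEF_SCALE);
//     ialpha[2*dx+1] = saturate_cast<short>(fxx*INTER_RESIZE_COEF_SCALE);
// (and likewise yofs/ibeta for rows), then uploads the whole table as "coeffs"
// so the resizeLN kernel can interpolate with fixed-point arithmetic only.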
2999 else 3000 { 3001 int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn); 3002 k.create("resizeLN", ocl::imgproc::resize_oclsrc, 3003 format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s " 3004 "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d " 3005 "-D INTER_RESIZE_COEF_BITS=%d", 3006 depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), 3007 ocl::convertTypeStr(depth, wdepth, cn, buf[0]), 3008 ocl::convertTypeStr(wdepth, depth, cn, buf[1]), 3009 cn, INTER_RESIZE_COEF_BITS)); 3010 if (k.empty()) 3011 return false; 3012 3013 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), 3014 (float)inv_fx, (float)inv_fy); 3015 } 3016 } 3017 else if (interpolation == INTER_NEAREST) 3018 { 3019 k.create("resizeNN", ocl::imgproc::resize_oclsrc, 3020 format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d", 3021 ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn)); 3022 if (k.empty()) 3023 return false; 3024 3025 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), 3026 (float)inv_fx, (float)inv_fy); 3027 } 3028 else if (interpolation == INTER_AREA) 3029 { 3030 int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F); 3031 int wtype = CV_MAKE_TYPE(wdepth, cn); 3032 3033 char cvt[2][40]; 3034 String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d", 3035 ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype), 3036 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn); 3037 3038 UMat alphaOcl, tabofsOcl, mapOcl; 3039 UMat dmap, smap; 3040 3041 if (is_area_fast) 3042 { 3043 int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn); 3044 buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST" 3045 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff", 3046 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]), 3047 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]), 3048 iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y)); 3049 3050 k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption); 3051 if (k.empty()) 3052 return false; 3053 } 3054 else 3055 { 3056 buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0])); 3057 k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption); 3058 if (k.empty()) 3059 return false; 3060 3061 int xytab_size = (ssize.width + ssize.height) << 1; 3062 int tabofs_size = dsize.height + dsize.width + 2; 3063 3064 AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size); 3065 AutoBuffer<float> _xyalpha_tab(xytab_size); 3066 int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1); 3067 float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1); 3068 int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1; 3069 3070 ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab); 3071 ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab); 3072 3073 // loading precomputed arrays to GPU 3074 Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl); 3075 Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl); 3076 Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl); 3077 } 3078 3079 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst); 3080 3081 if (is_area_fast) 3082 k.args(srcarg, dstarg); 3083 else 3084 
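// General-area path: for an output coordinate d the kernel sums the source
// samples map_tab[ofs_tab[d]] .. map_tab[ofs_tab[d+1]-1], each weighted by the
// matching alpha_tab entry (the weights of one output sample sum to 1, up to
// border clipping). The fast path above needs no tables: the scale is integral.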
k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl), 3085 ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl)); 3086 3087 return k.run(2, globalsize, NULL, false); 3088 } 3089 3090 return k.run(2, globalsize, 0, false); 3091 } 3092 3093 #endif 3094 3095 } 3096 3097 ////////////////////////////////////////////////////////////////////////////////////////// 3098 3099 void cv::resize( InputArray _src, OutputArray _dst, Size dsize, 3100 double inv_scale_x, double inv_scale_y, int interpolation ) 3101 { 3102 static ResizeFunc linear_tab[] = 3103 { 3104 resizeGeneric_< 3105 HResizeLinear<uchar, int, short, 3106 INTER_RESIZE_COEF_SCALE, 3107 HResizeLinearVec_8u32s>, 3108 VResizeLinear<uchar, int, short, 3109 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, 3110 VResizeLinearVec_32s8u> >, 3111 0, 3112 resizeGeneric_< 3113 HResizeLinear<ushort, float, float, 1, 3114 HResizeLinearVec_16u32f>, 3115 VResizeLinear<ushort, float, float, Cast<float, ushort>, 3116 VResizeLinearVec_32f16u> >, 3117 resizeGeneric_< 3118 HResizeLinear<short, float, float, 1, 3119 HResizeLinearVec_16s32f>, 3120 VResizeLinear<short, float, float, Cast<float, short>, 3121 VResizeLinearVec_32f16s> >, 3122 0, 3123 resizeGeneric_< 3124 HResizeLinear<float, float, float, 1, 3125 HResizeLinearVec_32f>, 3126 VResizeLinear<float, float, float, Cast<float, float>, 3127 VResizeLinearVec_32f> >, 3128 resizeGeneric_< 3129 HResizeLinear<double, double, float, 1, 3130 HResizeNoVec>, 3131 VResizeLinear<double, double, float, Cast<double, double>, 3132 VResizeNoVec> >, 3133 0 3134 }; 3135 3136 static ResizeFunc cubic_tab[] = 3137 { 3138 resizeGeneric_< 3139 HResizeCubic<uchar, int, short>, 3140 VResizeCubic<uchar, int, short, 3141 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, 3142 VResizeCubicVec_32s8u> >, 3143 0, 3144 resizeGeneric_< 3145 HResizeCubic<ushort, float, float>, 3146 VResizeCubic<ushort, float, float, Cast<float, ushort>, 3147 VResizeCubicVec_32f16u> >, 3148 resizeGeneric_< 3149 HResizeCubic<short, float, float>, 3150 VResizeCubic<short, float, float, Cast<float, short>, 3151 VResizeCubicVec_32f16s> >, 3152 0, 3153 resizeGeneric_< 3154 HResizeCubic<float, float, float>, 3155 VResizeCubic<float, float, float, Cast<float, float>, 3156 VResizeCubicVec_32f> >, 3157 resizeGeneric_< 3158 HResizeCubic<double, double, float>, 3159 VResizeCubic<double, double, float, Cast<double, double>, 3160 VResizeNoVec> >, 3161 0 3162 }; 3163 3164 static ResizeFunc lanczos4_tab[] = 3165 { 3166 resizeGeneric_<HResizeLanczos4<uchar, int, short>, 3167 VResizeLanczos4<uchar, int, short, 3168 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, 3169 VResizeNoVec> >, 3170 0, 3171 resizeGeneric_<HResizeLanczos4<ushort, float, float>, 3172 VResizeLanczos4<ushort, float, float, Cast<float, ushort>, 3173 VResizeLanczos4Vec_32f16u> >, 3174 resizeGeneric_<HResizeLanczos4<short, float, float>, 3175 VResizeLanczos4<short, float, float, Cast<float, short>, 3176 VResizeLanczos4Vec_32f16s> >, 3177 0, 3178 resizeGeneric_<HResizeLanczos4<float, float, float>, 3179 VResizeLanczos4<float, float, float, Cast<float, float>, 3180 VResizeLanczos4Vec_32f> >, 3181 resizeGeneric_<HResizeLanczos4<double, double, float>, 3182 VResizeLanczos4<double, double, float, Cast<double, double>, 3183 VResizeNoVec> >, 3184 0 3185 }; 3186 3187 static ResizeAreaFastFunc areafast_tab[] = 3188 { 3189 resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >, 3190 0, 3191 resizeAreaFast_<ushort, float, 
ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >, 3192 resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >, 3193 0, 3194 resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>, 3195 resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >, 3196 0 3197 }; 3198 3199 static ResizeAreaFunc area_tab[] = 3200 { 3201 resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>, 3202 resizeArea_<short, float>, 0, resizeArea_<float, float>, 3203 resizeArea_<double, double>, 0 3204 }; 3205 3206 Size ssize = _src.size(); 3207 3208 CV_Assert( ssize.area() > 0 ); 3209 CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) ); 3210 if( dsize.area() == 0 ) 3211 { 3212 dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x), 3213 saturate_cast<int>(ssize.height*inv_scale_y)); 3214 CV_Assert( dsize.area() > 0 ); 3215 } 3216 else 3217 { 3218 inv_scale_x = (double)dsize.width/ssize.width; 3219 inv_scale_y = (double)dsize.height/ssize.height; 3220 } 3221 3222 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10, 3223 ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation)) 3224 3225 Mat src = _src.getMat(); 3226 _dst.create(dsize, src.type()); 3227 Mat dst = _dst.getMat(); 3228 3229 #ifdef HAVE_TEGRA_OPTIMIZATION 3230 if (tegra::useTegra() && tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation)) 3231 return; 3232 #endif 3233 3234 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 3235 double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y; 3236 int k, sx, sy, dx, dy; 3237 3238 int iscale_x = saturate_cast<int>(scale_x); 3239 int iscale_y = saturate_cast<int>(scale_y); 3240 3241 bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON && 3242 std::abs(scale_y - iscale_y) < DBL_EPSILON; 3243 3244 #if IPP_VERSION_X100 >= 701 3245 CV_IPP_CHECK() 3246 { 3247 #define IPP_RESIZE_EPS 1e-10 3248 3249 double ex = fabs((double)dsize.width / src.cols - inv_scale_x) / inv_scale_x; 3250 double ey = fabs((double)dsize.height / src.rows - inv_scale_y) / inv_scale_y; 3251 3252 if ( ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) && 3253 (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) && 3254 !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U)) 3255 { 3256 int mode = -1; 3257 if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2) 3258 mode = ippLinear; 3259 else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4) 3260 mode = ippCubic; 3261 3262 if( mode >= 0 && (cn == 1 || cn == 3 || cn == 4) && 3263 (depth == CV_16U || depth == CV_16S || depth == CV_32F || 3264 (depth == CV_64F && mode == ippLinear))) 3265 { 3266 bool ok = true; 3267 Range range(0, src.rows); 3268 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok); 3269 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 3270 if( ok ) 3271 { 3272 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 3273 return; 3274 } 3275 setIppErrorStatus(); 3276 } 3277 } 3278 #undef IPP_RESIZE_EPS 3279 } 3280 #endif 3281 3282 if( interpolation == INTER_NEAREST ) 3283 { 3284 resizeNN( src, dst, inv_scale_x, inv_scale_y ); 3285 return; 3286 } 3287 3288 { 3289 // in case of scale_x && scale_y is equal to 2 3290 // INTER_AREA (fast) also is equal to INTER_LINEAR 3291 if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 ) 3292 
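// For an exact 2x decimation the bilinear sample position is
//     fx = (dx + 0.5)*2 - 0.5 = 2*dx + 0.5,
// i.e. exactly midway between columns 2*dx and 2*dx+1 (and similarly for
// rows), so bilinear interpolation reduces to the same 2x2 average that the
// fast INTER_AREA branch computes; hence the switch below.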
interpolation = INTER_AREA; 3293 3294 // true "area" interpolation is only implemented for the case (scale_x <= 1 && scale_y <= 1). 3295 // In other cases it is emulated using some variant of bilinear interpolation 3296 if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 ) 3297 { 3298 if( is_area_fast ) 3299 { 3300 int area = iscale_x*iscale_y; 3301 size_t srcstep = src.step / src.elemSize1(); 3302 AutoBuffer<int> _ofs(area + dsize.width*cn); 3303 int* ofs = _ofs; 3304 int* xofs = ofs + area; 3305 ResizeAreaFastFunc func = areafast_tab[depth]; 3306 CV_Assert( func != 0 ); 3307 3308 for( sy = 0, k = 0; sy < iscale_y; sy++ ) 3309 for( sx = 0; sx < iscale_x; sx++ ) 3310 ofs[k++] = (int)(sy*srcstep + sx*cn); 3311 3312 for( dx = 0; dx < dsize.width; dx++ ) 3313 { 3314 int j = dx * cn; 3315 sx = iscale_x * j; 3316 for( k = 0; k < cn; k++ ) 3317 xofs[j + k] = sx + k; 3318 } 3319 3320 func( src, dst, ofs, xofs, iscale_x, iscale_y ); 3321 return; 3322 } 3323 3324 ResizeAreaFunc func = area_tab[depth]; 3325 CV_Assert( func != 0 && cn <= 4 ); 3326 3327 AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2); 3328 DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2; 3329 3330 int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab); 3331 int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab); 3332 3333 AutoBuffer<int> _tabofs(dsize.height + 1); 3334 int* tabofs = _tabofs; 3335 for( k = 0, dy = 0; k < ytab_size; k++ ) 3336 { 3337 if( k == 0 || ytab[k].di != ytab[k-1].di ) 3338 { 3339 assert( ytab[k].di == dy ); 3340 tabofs[dy++] = k; 3341 } 3342 } 3343 tabofs[dy] = ytab_size; 3344 3345 func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs ); 3346 return; 3347 } 3348 } 3349 3350 int xmin = 0, xmax = dsize.width, width = dsize.width*cn; 3351 bool area_mode = interpolation == INTER_AREA; 3352 bool fixpt = depth == CV_8U; 3353 float fx, fy; 3354 ResizeFunc func=0; 3355 int ksize=0, ksize2; 3356 if( interpolation == INTER_CUBIC ) 3357 ksize = 4, func = cubic_tab[depth]; 3358 else if( interpolation == INTER_LANCZOS4 ) 3359 ksize = 8, func = lanczos4_tab[depth]; 3360 else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA ) 3361 ksize = 2, func = linear_tab[depth]; 3362 else 3363 CV_Error( CV_StsBadArg, "Unknown interpolation method" ); 3364 ksize2 = ksize/2; 3365 3366 CV_Assert( func != 0 ); 3367 3368 AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize)); 3369 int* xofs = (int*)(uchar*)_buffer; 3370 int* yofs = xofs + width; 3371 float* alpha = (float*)(yofs + dsize.height); 3372 short* ialpha = (short*)alpha; 3373 float* beta = alpha + width*ksize; 3374 short* ibeta = ialpha + width*ksize; 3375 float cbuf[MAX_ESIZE]; 3376 3377 for( dx = 0; dx < dsize.width; dx++ ) 3378 { 3379 if( !area_mode ) 3380 { 3381 fx = (float)((dx+0.5)*scale_x - 0.5); 3382 sx = cvFloor(fx); 3383 fx -= sx; 3384 } 3385 else 3386 { 3387 sx = cvFloor(dx*scale_x); 3388 fx = (float)((dx+1) - (sx+1)*inv_scale_x); 3389 fx = fx <= 0 ? 
0.f : fx - cvFloor(fx); 3390 } 3391 3392 if( sx < ksize2-1 ) 3393 { 3394 xmin = dx+1; 3395 if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) 3396 fx = 0, sx = 0; 3397 } 3398 3399 if( sx + ksize2 >= ssize.width ) 3400 { 3401 xmax = std::min( xmax, dx ); 3402 if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4)) 3403 fx = 0, sx = ssize.width-1; 3404 } 3405 3406 for( k = 0, sx *= cn; k < cn; k++ ) 3407 xofs[dx*cn + k] = sx + k; 3408 3409 if( interpolation == INTER_CUBIC ) 3410 interpolateCubic( fx, cbuf ); 3411 else if( interpolation == INTER_LANCZOS4 ) 3412 interpolateLanczos4( fx, cbuf ); 3413 else 3414 { 3415 cbuf[0] = 1.f - fx; 3416 cbuf[1] = fx; 3417 } 3418 if( fixpt ) 3419 { 3420 for( k = 0; k < ksize; k++ ) 3421 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE); 3422 for( ; k < cn*ksize; k++ ) 3423 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize]; 3424 } 3425 else 3426 { 3427 for( k = 0; k < ksize; k++ ) 3428 alpha[dx*cn*ksize + k] = cbuf[k]; 3429 for( ; k < cn*ksize; k++ ) 3430 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize]; 3431 } 3432 } 3433 3434 for( dy = 0; dy < dsize.height; dy++ ) 3435 { 3436 if( !area_mode ) 3437 { 3438 fy = (float)((dy+0.5)*scale_y - 0.5); 3439 sy = cvFloor(fy); 3440 fy -= sy; 3441 } 3442 else 3443 { 3444 sy = cvFloor(dy*scale_y); 3445 fy = (float)((dy+1) - (sy+1)*inv_scale_y); 3446 fy = fy <= 0 ? 0.f : fy - cvFloor(fy); 3447 } 3448 3449 yofs[dy] = sy; 3450 if( interpolation == INTER_CUBIC ) 3451 interpolateCubic( fy, cbuf ); 3452 else if( interpolation == INTER_LANCZOS4 ) 3453 interpolateLanczos4( fy, cbuf ); 3454 else 3455 { 3456 cbuf[0] = 1.f - fy; 3457 cbuf[1] = fy; 3458 } 3459 3460 if( fixpt ) 3461 { 3462 for( k = 0; k < ksize; k++ ) 3463 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE); 3464 } 3465 else 3466 { 3467 for( k = 0; k < ksize; k++ ) 3468 beta[dy*ksize + k] = cbuf[k]; 3469 } 3470 } 3471 3472 func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs, 3473 fixpt ? 
(void*)ibeta : (void*)beta, xmin, xmax, ksize ); 3474 } 3475 3476 3477 /****************************************************************************************\ 3478 * General warping (affine, perspective, remap) * 3479 \****************************************************************************************/ 3480 3481 namespace cv 3482 { 3483 3484 template<typename T> 3485 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy, 3486 int borderType, const Scalar& _borderValue ) 3487 { 3488 Size ssize = _src.size(), dsize = _dst.size(); 3489 int cn = _src.channels(); 3490 const T* S0 = _src.ptr<T>(); 3491 size_t sstep = _src.step/sizeof(S0[0]); 3492 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]), 3493 saturate_cast<T>(_borderValue[1]), 3494 saturate_cast<T>(_borderValue[2]), 3495 saturate_cast<T>(_borderValue[3])); 3496 int dx, dy; 3497 3498 unsigned width1 = ssize.width, height1 = ssize.height; 3499 3500 if( _dst.isContinuous() && _xy.isContinuous() ) 3501 { 3502 dsize.width *= dsize.height; 3503 dsize.height = 1; 3504 } 3505 3506 for( dy = 0; dy < dsize.height; dy++ ) 3507 { 3508 T* D = _dst.ptr<T>(dy); 3509 const short* XY = _xy.ptr<short>(dy); 3510 3511 if( cn == 1 ) 3512 { 3513 for( dx = 0; dx < dsize.width; dx++ ) 3514 { 3515 int sx = XY[dx*2], sy = XY[dx*2+1]; 3516 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 3517 D[dx] = S0[sy*sstep + sx]; 3518 else 3519 { 3520 if( borderType == BORDER_REPLICATE ) 3521 { 3522 sx = clip(sx, 0, ssize.width); 3523 sy = clip(sy, 0, ssize.height); 3524 D[dx] = S0[sy*sstep + sx]; 3525 } 3526 else if( borderType == BORDER_CONSTANT ) 3527 D[dx] = cval[0]; 3528 else if( borderType != BORDER_TRANSPARENT ) 3529 { 3530 sx = borderInterpolate(sx, ssize.width, borderType); 3531 sy = borderInterpolate(sy, ssize.height, borderType); 3532 D[dx] = S0[sy*sstep + sx]; 3533 } 3534 } 3535 } 3536 } 3537 else 3538 { 3539 for( dx = 0; dx < dsize.width; dx++, D += cn ) 3540 { 3541 int sx = XY[dx*2], sy = XY[dx*2+1], k; 3542 const T *S; 3543 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 3544 { 3545 if( cn == 3 ) 3546 { 3547 S = S0 + sy*sstep + sx*3; 3548 D[0] = S[0], D[1] = S[1], D[2] = S[2]; 3549 } 3550 else if( cn == 4 ) 3551 { 3552 S = S0 + sy*sstep + sx*4; 3553 D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3]; 3554 } 3555 else 3556 { 3557 S = S0 + sy*sstep + sx*cn; 3558 for( k = 0; k < cn; k++ ) 3559 D[k] = S[k]; 3560 } 3561 } 3562 else if( borderType != BORDER_TRANSPARENT ) 3563 { 3564 if( borderType == BORDER_REPLICATE ) 3565 { 3566 sx = clip(sx, 0, ssize.width); 3567 sy = clip(sy, 0, ssize.height); 3568 S = S0 + sy*sstep + sx*cn; 3569 } 3570 else if( borderType == BORDER_CONSTANT ) 3571 S = &cval[0]; 3572 else 3573 { 3574 sx = borderInterpolate(sx, ssize.width, borderType); 3575 sy = borderInterpolate(sy, ssize.height, borderType); 3576 S = S0 + sy*sstep + sx*cn; 3577 } 3578 for( k = 0; k < cn; k++ ) 3579 D[k] = S[k]; 3580 } 3581 } 3582 } 3583 } 3584 } 3585 3586 3587 struct RemapNoVec 3588 { 3589 int operator()( const Mat&, void*, const short*, const ushort*, 3590 const void*, int ) const { return 0; } 3591 }; 3592 3593 #if CV_SSE2 3594 3595 struct RemapVec_8u 3596 { 3597 int operator()( const Mat& _src, void* _dst, const short* XY, 3598 const ushort* FXY, const void* _wtab, int width ) const 3599 { 3600 int cn = _src.channels(), x = 0, sstep = (int)_src.step; 3601 3602 if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) || 3603 sstep > 0x8000 ) 3604 return 0; 3605 3606 const uchar *S0 = _src.ptr(), *S1 = 
_src.ptr(1); 3607 const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0]; 3608 uchar* D = (uchar*)_dst; 3609 __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2); 3610 __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16)); 3611 __m128i z = _mm_setzero_si128(); 3612 int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4]; 3613 3614 if( cn == 1 ) 3615 { 3616 for( ; x <= width - 8; x += 8 ) 3617 { 3618 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); 3619 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8)); 3620 __m128i v0, v1, v2, v3, a0, a1, b0, b1; 3621 unsigned i0, i1; 3622 3623 xy0 = _mm_madd_epi16( xy0, xy2ofs ); 3624 xy1 = _mm_madd_epi16( xy1, xy2ofs ); 3625 _mm_store_si128( (__m128i*)iofs0, xy0 ); 3626 _mm_store_si128( (__m128i*)iofs1, xy1 ); 3627 3628 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16); 3629 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16); 3630 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 3631 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16); 3632 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16); 3633 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 3634 v0 = _mm_unpacklo_epi8(v0, z); 3635 v1 = _mm_unpacklo_epi8(v1, z); 3636 3637 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)), 3638 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4))); 3639 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)), 3640 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4))); 3641 b0 = _mm_unpacklo_epi64(a0, a1); 3642 b1 = _mm_unpackhi_epi64(a0, a1); 3643 v0 = _mm_madd_epi16(v0, b0); 3644 v1 = _mm_madd_epi16(v1, b1); 3645 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta); 3646 3647 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16); 3648 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16); 3649 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 3650 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16); 3651 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16); 3652 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1)); 3653 v2 = _mm_unpacklo_epi8(v2, z); 3654 v3 = _mm_unpacklo_epi8(v3, z); 3655 3656 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)), 3657 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4))); 3658 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)), 3659 _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4))); 3660 b0 = _mm_unpacklo_epi64(a0, a1); 3661 b1 = _mm_unpackhi_epi64(a0, a1); 3662 v2 = _mm_madd_epi16(v2, b0); 3663 v3 = _mm_madd_epi16(v3, b1); 3664 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta); 3665 3666 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS); 3667 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS); 3668 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z); 3669 _mm_storel_epi64( (__m128i*)(D + x), v0 ); 3670 } 3671 } 3672 else if( cn == 3 ) 3673 { 3674 for( ; x <= width - 5; x += 4, D += 12 ) 3675 { 3676 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); 3677 __m128i u0, v0, u1, v1; 3678 3679 xy0 = _mm_madd_epi16( xy0, xy2ofs ); 3680 _mm_store_si128( (__m128i*)iofs0, xy0 ); 3681 const __m128i *w0, *w1; 3682 w0 = (const __m128i*)(wtab + FXY[x]*16); 3683 w1 = (const __m128i*)(wtab + FXY[x+1]*16); 3684 3685 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), 3686 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3))); 3687 v0 = 
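/* The single-channel SSE2 loop above is a vectorized form of the scalar fixed-point
   bilinear sum used elsewhere in this file.  Per destination pixel (sketch, with the
   four table weights w[0..3] selected by FXY[x]):

       int t = S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3];
       D[x]  = saturate_cast<uchar>((t + INTER_REMAP_COEF_SCALE/2) >> INTER_REMAP_COEF_BITS);

   _mm_madd_epi16 performs the pairwise multiply-adds, `delta` supplies the rounding
   term, and _mm_srai_epi32 followed by the saturating packs does the descale. */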
_mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), 3688 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3))); 3689 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), 3690 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3))); 3691 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), 3692 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3))); 3693 u0 = _mm_unpacklo_epi8(u0, z); 3694 v0 = _mm_unpacklo_epi8(v0, z); 3695 u1 = _mm_unpacklo_epi8(u1, z); 3696 v1 = _mm_unpacklo_epi8(v1, z); 3697 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 3698 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 3699 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 3700 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 3701 u0 = _mm_slli_si128(u0, 4); 3702 u0 = _mm_packs_epi32(u0, u1); 3703 u0 = _mm_packus_epi16(u0, u0); 3704 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1)); 3705 3706 w0 = (const __m128i*)(wtab + FXY[x+2]*16); 3707 w1 = (const __m128i*)(wtab + FXY[x+3]*16); 3708 3709 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), 3710 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3))); 3711 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), 3712 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3))); 3713 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), 3714 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3))); 3715 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), 3716 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3))); 3717 u0 = _mm_unpacklo_epi8(u0, z); 3718 v0 = _mm_unpacklo_epi8(v0, z); 3719 u1 = _mm_unpacklo_epi8(u1, z); 3720 v1 = _mm_unpacklo_epi8(v1, z); 3721 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 3722 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 3723 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 3724 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 3725 u0 = _mm_slli_si128(u0, 4); 3726 u0 = _mm_packs_epi32(u0, u1); 3727 u0 = _mm_packus_epi16(u0, u0); 3728 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1)); 3729 } 3730 } 3731 else if( cn == 4 ) 3732 { 3733 for( ; x <= width - 4; x += 4, D += 16 ) 3734 { 3735 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2)); 3736 __m128i u0, v0, u1, v1; 3737 3738 xy0 = _mm_madd_epi16( xy0, xy2ofs ); 3739 _mm_store_si128( (__m128i*)iofs0, xy0 ); 3740 const __m128i *w0, *w1; 3741 w0 = (const __m128i*)(wtab + FXY[x]*16); 3742 w1 = (const __m128i*)(wtab + FXY[x+1]*16); 3743 3744 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])), 3745 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4))); 3746 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])), 3747 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4))); 3748 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])), 3749 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4))); 3750 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])), 3751 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4))); 3752 u0 = _mm_unpacklo_epi8(u0, z); 3753 v0 = _mm_unpacklo_epi8(v0, z); 3754 u1 = _mm_unpacklo_epi8(u1, z); 3755 v1 = _mm_unpacklo_epi8(v1, z); 3756 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 3757 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 3758 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 3759 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), 
INTER_REMAP_COEF_BITS); 3760 u0 = _mm_packs_epi32(u0, u1); 3761 u0 = _mm_packus_epi16(u0, u0); 3762 _mm_storel_epi64((__m128i*)D, u0); 3763 3764 w0 = (const __m128i*)(wtab + FXY[x+2]*16); 3765 w1 = (const __m128i*)(wtab + FXY[x+3]*16); 3766 3767 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])), 3768 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4))); 3769 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])), 3770 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4))); 3771 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])), 3772 _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4))); 3773 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])), 3774 _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4))); 3775 u0 = _mm_unpacklo_epi8(u0, z); 3776 v0 = _mm_unpacklo_epi8(v0, z); 3777 u1 = _mm_unpacklo_epi8(u1, z); 3778 v1 = _mm_unpacklo_epi8(v1, z); 3779 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1])); 3780 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1])); 3781 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS); 3782 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS); 3783 u0 = _mm_packs_epi32(u0, u1); 3784 u0 = _mm_packus_epi16(u0, u0); 3785 _mm_storel_epi64((__m128i*)(D + 8), u0); 3786 } 3787 } 3788 3789 return x; 3790 } 3791 }; 3792 3793 #else 3794 3795 typedef RemapNoVec RemapVec_8u; 3796 3797 #endif 3798 3799 3800 template<class CastOp, class VecOp, typename AT> 3801 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy, 3802 const Mat& _fxy, const void* _wtab, 3803 int borderType, const Scalar& _borderValue ) 3804 { 3805 typedef typename CastOp::rtype T; 3806 typedef typename CastOp::type1 WT; 3807 Size ssize = _src.size(), dsize = _dst.size(); 3808 int k, cn = _src.channels(); 3809 const AT* wtab = (const AT*)_wtab; 3810 const T* S0 = _src.ptr<T>(); 3811 size_t sstep = _src.step/sizeof(S0[0]); 3812 T cval[CV_CN_MAX]; 3813 int dx, dy; 3814 CastOp castOp; 3815 VecOp vecOp; 3816 3817 for( k = 0; k < cn; k++ ) 3818 cval[k] = saturate_cast<T>(_borderValue[k & 3]); 3819 3820 unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0); 3821 CV_Assert( ssize.area() > 0 ); 3822 #if CV_SSE2 3823 if( _src.type() == CV_8UC3 ) 3824 width1 = std::max(ssize.width-2, 0); 3825 #endif 3826 3827 for( dy = 0; dy < dsize.height; dy++ ) 3828 { 3829 T* D = _dst.ptr<T>(dy); 3830 const short* XY = _xy.ptr<short>(dy); 3831 const ushort* FXY = _fxy.ptr<ushort>(dy); 3832 int X0 = 0; 3833 bool prevInlier = false; 3834 3835 for( dx = 0; dx <= dsize.width; dx++ ) 3836 { 3837 bool curInlier = dx < dsize.width ? 
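/* remapBilinear processes each row as alternating runs of "inlier" pixels (all four
   source taps strictly inside the image) and "outlier" pixels (border handling
   needed).  The state machine here only does work at run boundaries: when curInlier
   flips, X1 marks the end of the previous run, dx is rewound to its start, and the
   whole run is handled at once - vecOp plus the unrolled per-channel loops for inlier
   runs, borderInterpolate/constant fill for outlier runs.  Conceptually a row splits
   into something like

       [inlier 0..120)  [outlier 120..128)  [inlier 128..dsize.width)

   and the extra iteration at dx == dsize.width forces the last run to be flushed. */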
3838 (unsigned)XY[dx*2] < width1 && 3839 (unsigned)XY[dx*2+1] < height1 : !prevInlier; 3840 if( curInlier == prevInlier ) 3841 continue; 3842 3843 int X1 = dx; 3844 dx = X0; 3845 X0 = X1; 3846 prevInlier = curInlier; 3847 3848 if( !curInlier ) 3849 { 3850 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx ); 3851 D += len*cn; 3852 dx += len; 3853 3854 if( cn == 1 ) 3855 { 3856 for( ; dx < X1; dx++, D++ ) 3857 { 3858 int sx = XY[dx*2], sy = XY[dx*2+1]; 3859 const AT* w = wtab + FXY[dx]*4; 3860 const T* S = S0 + sy*sstep + sx; 3861 *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3])); 3862 } 3863 } 3864 else if( cn == 2 ) 3865 for( ; dx < X1; dx++, D += 2 ) 3866 { 3867 int sx = XY[dx*2], sy = XY[dx*2+1]; 3868 const AT* w = wtab + FXY[dx]*4; 3869 const T* S = S0 + sy*sstep + sx*2; 3870 WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3]; 3871 WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3]; 3872 D[0] = castOp(t0); D[1] = castOp(t1); 3873 } 3874 else if( cn == 3 ) 3875 for( ; dx < X1; dx++, D += 3 ) 3876 { 3877 int sx = XY[dx*2], sy = XY[dx*2+1]; 3878 const AT* w = wtab + FXY[dx]*4; 3879 const T* S = S0 + sy*sstep + sx*3; 3880 WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3]; 3881 WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3]; 3882 WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3]; 3883 D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2); 3884 } 3885 else if( cn == 4 ) 3886 for( ; dx < X1; dx++, D += 4 ) 3887 { 3888 int sx = XY[dx*2], sy = XY[dx*2+1]; 3889 const AT* w = wtab + FXY[dx]*4; 3890 const T* S = S0 + sy*sstep + sx*4; 3891 WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3]; 3892 WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3]; 3893 D[0] = castOp(t0); D[1] = castOp(t1); 3894 t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3]; 3895 t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3]; 3896 D[2] = castOp(t0); D[3] = castOp(t1); 3897 } 3898 else 3899 for( ; dx < X1; dx++, D += cn ) 3900 { 3901 int sx = XY[dx*2], sy = XY[dx*2+1]; 3902 const AT* w = wtab + FXY[dx]*4; 3903 const T* S = S0 + sy*sstep + sx*cn; 3904 for( k = 0; k < cn; k++ ) 3905 { 3906 WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3]; 3907 D[k] = castOp(t0); 3908 } 3909 } 3910 } 3911 else 3912 { 3913 if( borderType == BORDER_TRANSPARENT && cn != 3 ) 3914 { 3915 D += (X1 - dx)*cn; 3916 dx = X1; 3917 continue; 3918 } 3919 3920 if( cn == 1 ) 3921 for( ; dx < X1; dx++, D++ ) 3922 { 3923 int sx = XY[dx*2], sy = XY[dx*2+1]; 3924 if( borderType == BORDER_CONSTANT && 3925 (sx >= ssize.width || sx+1 < 0 || 3926 sy >= ssize.height || sy+1 < 0) ) 3927 { 3928 D[0] = cval[0]; 3929 } 3930 else 3931 { 3932 int sx0, sx1, sy0, sy1; 3933 T v0, v1, v2, v3; 3934 const AT* w = wtab + FXY[dx]*4; 3935 if( borderType == BORDER_REPLICATE ) 3936 { 3937 sx0 = clip(sx, 0, ssize.width); 3938 sx1 = clip(sx+1, 0, ssize.width); 3939 sy0 = clip(sy, 0, ssize.height); 3940 sy1 = clip(sy+1, 0, ssize.height); 3941 v0 = S0[sy0*sstep + sx0]; 3942 v1 = S0[sy0*sstep + sx1]; 3943 v2 = S0[sy1*sstep + sx0]; 3944 v3 = S0[sy1*sstep + sx1]; 3945 } 3946 else 3947 { 3948 sx0 = borderInterpolate(sx, ssize.width, borderType); 3949 sx1 = borderInterpolate(sx+1, ssize.width, borderType); 3950 sy0 = borderInterpolate(sy, ssize.height, borderType); 3951 sy1 = borderInterpolate(sy+1, ssize.height, borderType); 3952 v0 = sx0 >= 0 && sy0 >= 0 ? 
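/* Each entry of the bilinear weight table holds the four products for one quantized
   fractional position, indexed by FXY[dx] (hence wtab + FXY[dx]*4):

       w[0] = (1-fx)*(1-fy)   w[1] = fx*(1-fy)
       w[2] = (1-fx)*fy       w[3] = fx*fy

   e.g. for fx = fy = 0.5 all four weights are 0.25, which in the short (fixed-point)
   table is roughly 0.25 * INTER_REMAP_COEF_SCALE per tap.  The unrolled cn == 1..4
   branches above simply apply these four weights to the 2x2 source neighbourhood. */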
S0[sy0*sstep + sx0] : cval[0]; 3953 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0]; 3954 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0]; 3955 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0]; 3956 } 3957 D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3])); 3958 } 3959 } 3960 else 3961 for( ; dx < X1; dx++, D += cn ) 3962 { 3963 int sx = XY[dx*2], sy = XY[dx*2+1]; 3964 if( borderType == BORDER_CONSTANT && 3965 (sx >= ssize.width || sx+1 < 0 || 3966 sy >= ssize.height || sy+1 < 0) ) 3967 { 3968 for( k = 0; k < cn; k++ ) 3969 D[k] = cval[k]; 3970 } 3971 else 3972 { 3973 int sx0, sx1, sy0, sy1; 3974 const T *v0, *v1, *v2, *v3; 3975 const AT* w = wtab + FXY[dx]*4; 3976 if( borderType == BORDER_REPLICATE ) 3977 { 3978 sx0 = clip(sx, 0, ssize.width); 3979 sx1 = clip(sx+1, 0, ssize.width); 3980 sy0 = clip(sy, 0, ssize.height); 3981 sy1 = clip(sy+1, 0, ssize.height); 3982 v0 = S0 + sy0*sstep + sx0*cn; 3983 v1 = S0 + sy0*sstep + sx1*cn; 3984 v2 = S0 + sy1*sstep + sx0*cn; 3985 v3 = S0 + sy1*sstep + sx1*cn; 3986 } 3987 else if( borderType == BORDER_TRANSPARENT && 3988 ((unsigned)sx >= (unsigned)(ssize.width-1) || 3989 (unsigned)sy >= (unsigned)(ssize.height-1))) 3990 continue; 3991 else 3992 { 3993 sx0 = borderInterpolate(sx, ssize.width, borderType); 3994 sx1 = borderInterpolate(sx+1, ssize.width, borderType); 3995 sy0 = borderInterpolate(sy, ssize.height, borderType); 3996 sy1 = borderInterpolate(sy+1, ssize.height, borderType); 3997 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0]; 3998 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0]; 3999 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0]; 4000 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0]; 4001 } 4002 for( k = 0; k < cn; k++ ) 4003 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3])); 4004 } 4005 } 4006 } 4007 } 4008 } 4009 } 4010 4011 4012 template<class CastOp, typename AT, int ONE> 4013 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy, 4014 const Mat& _fxy, const void* _wtab, 4015 int borderType, const Scalar& _borderValue ) 4016 { 4017 typedef typename CastOp::rtype T; 4018 typedef typename CastOp::type1 WT; 4019 Size ssize = _src.size(), dsize = _dst.size(); 4020 int cn = _src.channels(); 4021 const AT* wtab = (const AT*)_wtab; 4022 const T* S0 = _src.ptr<T>(); 4023 size_t sstep = _src.step/sizeof(S0[0]); 4024 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]), 4025 saturate_cast<T>(_borderValue[1]), 4026 saturate_cast<T>(_borderValue[2]), 4027 saturate_cast<T>(_borderValue[3])); 4028 int dx, dy; 4029 CastOp castOp; 4030 int borderType1 = borderType != BORDER_TRANSPARENT ? 
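/* borderInterpolate() (used in the outlier branches above) maps an out-of-range
   coordinate to a valid source index according to the border mode; only
   BORDER_CONSTANT yields a negative index, which is why those taps fall back to
   cval.  Illustrative values for a row of length 5 and coordinate -1:

       borderInterpolate(-1, 5, BORDER_REPLICATE)   == 0
       borderInterpolate(-1, 5, BORDER_REFLECT_101) == 1
       borderInterpolate(-1, 5, BORDER_WRAP)        == 4
       borderInterpolate(-1, 5, BORDER_CONSTANT)    == -1
*/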
borderType : BORDER_REFLECT_101; 4031 4032 unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0); 4033 4034 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() ) 4035 { 4036 dsize.width *= dsize.height; 4037 dsize.height = 1; 4038 } 4039 4040 for( dy = 0; dy < dsize.height; dy++ ) 4041 { 4042 T* D = _dst.ptr<T>(dy); 4043 const short* XY = _xy.ptr<short>(dy); 4044 const ushort* FXY = _fxy.ptr<ushort>(dy); 4045 4046 for( dx = 0; dx < dsize.width; dx++, D += cn ) 4047 { 4048 int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1; 4049 const AT* w = wtab + FXY[dx]*16; 4050 int i, k; 4051 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 4052 { 4053 const T* S = S0 + sy*sstep + sx*cn; 4054 for( k = 0; k < cn; k++ ) 4055 { 4056 WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3]; 4057 S += sstep; 4058 sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7]; 4059 S += sstep; 4060 sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11]; 4061 S += sstep; 4062 sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15]; 4063 S += 1 - sstep*3; 4064 D[k] = castOp(sum); 4065 } 4066 } 4067 else 4068 { 4069 int x[4], y[4]; 4070 if( borderType == BORDER_TRANSPARENT && 4071 ((unsigned)(sx+1) >= (unsigned)ssize.width || 4072 (unsigned)(sy+1) >= (unsigned)ssize.height) ) 4073 continue; 4074 4075 if( borderType1 == BORDER_CONSTANT && 4076 (sx >= ssize.width || sx+4 <= 0 || 4077 sy >= ssize.height || sy+4 <= 0)) 4078 { 4079 for( k = 0; k < cn; k++ ) 4080 D[k] = cval[k]; 4081 continue; 4082 } 4083 4084 for( i = 0; i < 4; i++ ) 4085 { 4086 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn; 4087 y[i] = borderInterpolate(sy + i, ssize.height, borderType1); 4088 } 4089 4090 for( k = 0; k < cn; k++, S0++, w -= 16 ) 4091 { 4092 WT cv = cval[k], sum = cv*ONE; 4093 for( i = 0; i < 4; i++, w += 4 ) 4094 { 4095 int yi = y[i]; 4096 const T* S = S0 + yi*sstep; 4097 if( yi < 0 ) 4098 continue; 4099 if( x[0] >= 0 ) 4100 sum += (S[x[0]] - cv)*w[0]; 4101 if( x[1] >= 0 ) 4102 sum += (S[x[1]] - cv)*w[1]; 4103 if( x[2] >= 0 ) 4104 sum += (S[x[2]] - cv)*w[2]; 4105 if( x[3] >= 0 ) 4106 sum += (S[x[3]] - cv)*w[3]; 4107 } 4108 D[k] = castOp(sum); 4109 } 4110 S0 -= cn; 4111 } 4112 } 4113 } 4114 } 4115 4116 4117 template<class CastOp, typename AT, int ONE> 4118 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy, 4119 const Mat& _fxy, const void* _wtab, 4120 int borderType, const Scalar& _borderValue ) 4121 { 4122 typedef typename CastOp::rtype T; 4123 typedef typename CastOp::type1 WT; 4124 Size ssize = _src.size(), dsize = _dst.size(); 4125 int cn = _src.channels(); 4126 const AT* wtab = (const AT*)_wtab; 4127 const T* S0 = _src.ptr<T>(); 4128 size_t sstep = _src.step/sizeof(S0[0]); 4129 Scalar_<T> cval(saturate_cast<T>(_borderValue[0]), 4130 saturate_cast<T>(_borderValue[1]), 4131 saturate_cast<T>(_borderValue[2]), 4132 saturate_cast<T>(_borderValue[3])); 4133 int dx, dy; 4134 CastOp castOp; 4135 int borderType1 = borderType != BORDER_TRANSPARENT ? 
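/* Border handling trick used by remapBicubic above and remapLanczos4 below: the
   accumulator starts at cval[k]*ONE and every in-range tap adds (S - cval[k])*w.
   Because the kernel weights sum to ONE (INTER_REMAP_COEF_SCALE in the fixed-point
   instantiation, 1 in the float ones - see the cubic/lanczos4 dispatch tables further
   down), any tap that falls outside the image implicitly contributes cval[k]*w, so a
   fully out-of-range window degenerates to the constant border value.  For one
   channel:

       sum = cv*ONE + sum_valid( (S_i - cv) * w_i )
           = sum_valid( S_i * w_i ) + cv * (ONE - sum_valid(w_i))
*/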
borderType : BORDER_REFLECT_101; 4136 4137 unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0); 4138 4139 if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() ) 4140 { 4141 dsize.width *= dsize.height; 4142 dsize.height = 1; 4143 } 4144 4145 for( dy = 0; dy < dsize.height; dy++ ) 4146 { 4147 T* D = _dst.ptr<T>(dy); 4148 const short* XY = _xy.ptr<short>(dy); 4149 const ushort* FXY = _fxy.ptr<ushort>(dy); 4150 4151 for( dx = 0; dx < dsize.width; dx++, D += cn ) 4152 { 4153 int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3; 4154 const AT* w = wtab + FXY[dx]*64; 4155 const T* S = S0 + sy*sstep + sx*cn; 4156 int i, k; 4157 if( (unsigned)sx < width1 && (unsigned)sy < height1 ) 4158 { 4159 for( k = 0; k < cn; k++ ) 4160 { 4161 WT sum = 0; 4162 for( int r = 0; r < 8; r++, S += sstep, w += 8 ) 4163 sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] + 4164 S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7]; 4165 w -= 64; 4166 S -= sstep*8 - 1; 4167 D[k] = castOp(sum); 4168 } 4169 } 4170 else 4171 { 4172 int x[8], y[8]; 4173 if( borderType == BORDER_TRANSPARENT && 4174 ((unsigned)(sx+3) >= (unsigned)ssize.width || 4175 (unsigned)(sy+3) >= (unsigned)ssize.height) ) 4176 continue; 4177 4178 if( borderType1 == BORDER_CONSTANT && 4179 (sx >= ssize.width || sx+8 <= 0 || 4180 sy >= ssize.height || sy+8 <= 0)) 4181 { 4182 for( k = 0; k < cn; k++ ) 4183 D[k] = cval[k]; 4184 continue; 4185 } 4186 4187 for( i = 0; i < 8; i++ ) 4188 { 4189 x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn; 4190 y[i] = borderInterpolate(sy + i, ssize.height, borderType1); 4191 } 4192 4193 for( k = 0; k < cn; k++, S0++, w -= 64 ) 4194 { 4195 WT cv = cval[k], sum = cv*ONE; 4196 for( i = 0; i < 8; i++, w += 8 ) 4197 { 4198 int yi = y[i]; 4199 const T* S1 = S0 + yi*sstep; 4200 if( yi < 0 ) 4201 continue; 4202 if( x[0] >= 0 ) 4203 sum += (S1[x[0]] - cv)*w[0]; 4204 if( x[1] >= 0 ) 4205 sum += (S1[x[1]] - cv)*w[1]; 4206 if( x[2] >= 0 ) 4207 sum += (S1[x[2]] - cv)*w[2]; 4208 if( x[3] >= 0 ) 4209 sum += (S1[x[3]] - cv)*w[3]; 4210 if( x[4] >= 0 ) 4211 sum += (S1[x[4]] - cv)*w[4]; 4212 if( x[5] >= 0 ) 4213 sum += (S1[x[5]] - cv)*w[5]; 4214 if( x[6] >= 0 ) 4215 sum += (S1[x[6]] - cv)*w[6]; 4216 if( x[7] >= 0 ) 4217 sum += (S1[x[7]] - cv)*w[7]; 4218 } 4219 D[k] = castOp(sum); 4220 } 4221 S0 -= cn; 4222 } 4223 } 4224 } 4225 } 4226 4227 4228 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy, 4229 int borderType, const Scalar& _borderValue ); 4230 4231 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy, 4232 const Mat& _fxy, const void* _wtab, 4233 int borderType, const Scalar& _borderValue); 4234 4235 class RemapInvoker : 4236 public ParallelLoopBody 4237 { 4238 public: 4239 RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1, 4240 const Mat *_m2, int _borderType, const Scalar &_borderValue, 4241 int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) : 4242 ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2), 4243 borderType(_borderType), borderValue(_borderValue), 4244 planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab) 4245 { 4246 } 4247 4248 virtual void operator() (const Range& range) const 4249 { 4250 int x, y, x1, y1; 4251 const int buf_size = 1 << 14; 4252 int brows0 = std::min(128, dst->rows), map_depth = m1->depth(); 4253 int bcols0 = std::min(buf_size/brows0, dst->cols); 4254 brows0 = std::min(buf_size/bcols0, dst->rows); 4255 #if CV_SSE2 4256 bool useSIMD = 
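/* This invoker processes the destination in small tiles so that the temporary
   coordinate buffer (_bufxy, CV_16SC2) and fraction buffer (_bufa, CV_16UC1) stay
   cache-resident.  With buf_size = 1 << 14 map entries, e.g. a 1920x1080 destination
   gives

       brows0 = min(128, 1080)         = 128
       bcols0 = min(16384/128, 1920)   = 128
       brows0 = min(16384/128, 1080)   = 128      // 128 x 128 tiles

   Each tile prepares its slice of the maps in the packed short format (or passes
   CV_16SC2 maps through directly) and then calls nnfunc/ifunc on just that block. */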
checkHardwareSupport(CV_CPU_SSE2); 4257 #endif 4258 4259 Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa; 4260 if( !nnfunc ) 4261 _bufa.create(brows0, bcols0, CV_16UC1); 4262 4263 for( y = range.start; y < range.end; y += brows0 ) 4264 { 4265 for( x = 0; x < dst->cols; x += bcols0 ) 4266 { 4267 int brows = std::min(brows0, range.end - y); 4268 int bcols = std::min(bcols0, dst->cols - x); 4269 Mat dpart(*dst, Rect(x, y, bcols, brows)); 4270 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows)); 4271 4272 if( nnfunc ) 4273 { 4274 if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format 4275 bufxy = (*m1)(Rect(x, y, bcols, brows)); 4276 else if( map_depth != CV_32F ) 4277 { 4278 for( y1 = 0; y1 < brows; y1++ ) 4279 { 4280 short* XY = bufxy.ptr<short>(y1); 4281 const short* sXY = m1->ptr<short>(y+y1) + x*2; 4282 const ushort* sA = m2->ptr<ushort>(y+y1) + x; 4283 4284 for( x1 = 0; x1 < bcols; x1++ ) 4285 { 4286 int a = sA[x1] & (INTER_TAB_SIZE2-1); 4287 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0]; 4288 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1]; 4289 } 4290 } 4291 } 4292 else if( !planar_input ) 4293 (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth()); 4294 else 4295 { 4296 for( y1 = 0; y1 < brows; y1++ ) 4297 { 4298 short* XY = bufxy.ptr<short>(y1); 4299 const float* sX = m1->ptr<float>(y+y1) + x; 4300 const float* sY = m2->ptr<float>(y+y1) + x; 4301 x1 = 0; 4302 4303 #if CV_SSE2 4304 if( useSIMD ) 4305 { 4306 for( ; x1 <= bcols - 8; x1 += 8 ) 4307 { 4308 __m128 fx0 = _mm_loadu_ps(sX + x1); 4309 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); 4310 __m128 fy0 = _mm_loadu_ps(sY + x1); 4311 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); 4312 __m128i ix0 = _mm_cvtps_epi32(fx0); 4313 __m128i ix1 = _mm_cvtps_epi32(fx1); 4314 __m128i iy0 = _mm_cvtps_epi32(fy0); 4315 __m128i iy1 = _mm_cvtps_epi32(fy1); 4316 ix0 = _mm_packs_epi32(ix0, ix1); 4317 iy0 = _mm_packs_epi32(iy0, iy1); 4318 ix1 = _mm_unpacklo_epi16(ix0, iy0); 4319 iy1 = _mm_unpackhi_epi16(ix0, iy0); 4320 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); 4321 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); 4322 } 4323 } 4324 #endif 4325 4326 for( ; x1 < bcols; x1++ ) 4327 { 4328 XY[x1*2] = saturate_cast<short>(sX[x1]); 4329 XY[x1*2+1] = saturate_cast<short>(sY[x1]); 4330 } 4331 } 4332 } 4333 nnfunc( *src, dpart, bufxy, borderType, borderValue ); 4334 continue; 4335 } 4336 4337 Mat bufa(_bufa, Rect(0, 0, bcols, brows)); 4338 for( y1 = 0; y1 < brows; y1++ ) 4339 { 4340 short* XY = bufxy.ptr<short>(y1); 4341 ushort* A = bufa.ptr<ushort>(y1); 4342 4343 if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) ) 4344 { 4345 bufxy = (*m1)(Rect(x, y, bcols, brows)); 4346 4347 const ushort* sA = m2->ptr<ushort>(y+y1) + x; 4348 x1 = 0; 4349 4350 #if CV_NEON 4351 uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1); 4352 for ( ; x1 <= bcols - 8; x1 += 8) 4353 vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale)); 4354 #elif CV_SSE2 4355 __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1); 4356 for ( ; x1 <= bcols - 8; x1 += 8) 4357 _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale)); 4358 #endif 4359 4360 for( ; x1 < bcols; x1++ ) 4361 A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1)); 4362 } 4363 else if( planar_input ) 4364 { 4365 const float* sX = m1->ptr<float>(y+y1) + x; 4366 const float* sY = m2->ptr<float>(y+y1) + x; 4367 4368 x1 = 0; 4369 #if CV_SSE2 4370 if( useSIMD ) 4371 { 4372 __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE); 4373 __m128i mask = 
_mm_set1_epi32(INTER_TAB_SIZE-1); 4374 for( ; x1 <= bcols - 8; x1 += 8 ) 4375 { 4376 __m128 fx0 = _mm_loadu_ps(sX + x1); 4377 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4); 4378 __m128 fy0 = _mm_loadu_ps(sY + x1); 4379 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4); 4380 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale)); 4381 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale)); 4382 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale)); 4383 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale)); 4384 __m128i mx0 = _mm_and_si128(ix0, mask); 4385 __m128i mx1 = _mm_and_si128(ix1, mask); 4386 __m128i my0 = _mm_and_si128(iy0, mask); 4387 __m128i my1 = _mm_and_si128(iy1, mask); 4388 mx0 = _mm_packs_epi32(mx0, mx1); 4389 my0 = _mm_packs_epi32(my0, my1); 4390 my0 = _mm_slli_epi16(my0, INTER_BITS); 4391 mx0 = _mm_or_si128(mx0, my0); 4392 _mm_storeu_si128((__m128i*)(A + x1), mx0); 4393 ix0 = _mm_srai_epi32(ix0, INTER_BITS); 4394 ix1 = _mm_srai_epi32(ix1, INTER_BITS); 4395 iy0 = _mm_srai_epi32(iy0, INTER_BITS); 4396 iy1 = _mm_srai_epi32(iy1, INTER_BITS); 4397 ix0 = _mm_packs_epi32(ix0, ix1); 4398 iy0 = _mm_packs_epi32(iy0, iy1); 4399 ix1 = _mm_unpacklo_epi16(ix0, iy0); 4400 iy1 = _mm_unpackhi_epi16(ix0, iy0); 4401 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1); 4402 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1); 4403 } 4404 } 4405 #elif CV_NEON 4406 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); 4407 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); 4408 4409 for( ; x1 <= bcols - 4; x1 += 4 ) 4410 { 4411 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)), 4412 v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale)); 4413 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, 4414 vandq_s32(v_sy, v_scale2)); 4415 vst1_u16(A + x1, vqmovun_s32(v_v)); 4416 4417 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), 4418 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); 4419 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); 4420 } 4421 #endif 4422 4423 for( ; x1 < bcols; x1++ ) 4424 { 4425 int sx = cvRound(sX[x1]*INTER_TAB_SIZE); 4426 int sy = cvRound(sY[x1]*INTER_TAB_SIZE); 4427 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1)); 4428 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS); 4429 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS); 4430 A[x1] = (ushort)v; 4431 } 4432 } 4433 else 4434 { 4435 const float* sXY = m1->ptr<float>(y+y1) + x*2; 4436 x1 = 0; 4437 4438 #if CV_NEON 4439 float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE); 4440 int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE); 4441 4442 for( ; x1 <= bcols - 4; x1 += 4 ) 4443 { 4444 float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1)); 4445 int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale)); 4446 int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale)); 4447 int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3, 4448 vandq_s32(v_sy, v_scale2)); 4449 vst1_u16(A + x1, vqmovun_s32(v_v)); 4450 4451 int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)), 4452 vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS))); 4453 vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1])); 4454 } 4455 #endif 4456 4457 for( x1 = 0; x1 < bcols; x1++ ) 4458 { 4459 int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE); 4460 int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE); 4461 int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & 
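/* The scalar conversion loops in this block illustrate the packed fixed-point map
   layout used by the interpolating remap kernels: with INTER_BITS == 5
   (INTER_TAB_SIZE == 32) each float coordinate is split into a short integer part
   (stored in bufxy) and a 5-bit fraction, and the x/y fractions are combined into one
   weight-table index (stored in bufa).  Worked example for a map value (10.3, 4.75):

       sx = cvRound(10.3 * 32) = 330  ->  XY[0] = 330 >> 5 = 10,  sx & 31 = 10   (10/32 = 0.3125)
       sy = cvRound(4.75 * 32) = 152  ->  XY[1] = 152 >> 5 = 4,   sy & 31 = 24
       A  = 24*32 + 10 = 778          // index into the 32x32 bilinear weight table
*/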
(INTER_TAB_SIZE-1)); 4462 XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS); 4463 XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS); 4464 A[x1] = (ushort)v; 4465 } 4466 } 4467 } 4468 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue); 4469 } 4470 } 4471 } 4472 4473 private: 4474 const Mat* src; 4475 Mat* dst; 4476 const Mat *m1, *m2; 4477 int borderType; 4478 Scalar borderValue; 4479 int planar_input; 4480 RemapNNFunc nnfunc; 4481 RemapFunc ifunc; 4482 const void *ctab; 4483 }; 4484 4485 #ifdef HAVE_OPENCL 4486 4487 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2, 4488 int interpolation, int borderType, const Scalar& borderValue) 4489 { 4490 const ocl::Device & dev = ocl::Device::getDefault(); 4491 int cn = _src.channels(), type = _src.type(), depth = _src.depth(), 4492 rowsPerWI = dev.isIntel() ? 4 : 1; 4493 4494 if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST) 4495 || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1) 4496 return false; 4497 4498 UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat(); 4499 4500 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) || 4501 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) ) 4502 { 4503 if (map1.type() != CV_16SC2) 4504 std::swap(map1, map2); 4505 } 4506 else 4507 CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) ); 4508 4509 _dst.create(map1.size(), type); 4510 UMat dst = _dst.getUMat(); 4511 4512 String kernelName = "remap"; 4513 if (map1.type() == CV_32FC2 && map2.empty()) 4514 kernelName += "_32FC2"; 4515 else if (map1.type() == CV_16SC2) 4516 { 4517 kernelName += "_16SC2"; 4518 if (!map2.empty()) 4519 kernelName += "_16UC1"; 4520 } 4521 else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) 4522 kernelName += "_2_32FC1"; 4523 else 4524 CV_Error(Error::StsBadArg, "Unsupported map types"); 4525 4526 static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" }; 4527 static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP", 4528 "BORDER_REFLECT_101", "BORDER_TRANSPARENT" }; 4529 String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d", 4530 interMap[interpolation], borderMap[borderType], 4531 ocl::typeToStr(type), rowsPerWI); 4532 4533 if (interpolation != INTER_NEAREST) 4534 { 4535 char cvt[3][40]; 4536 int wdepth = std::max(CV_32F, depth); 4537 buildOptions = buildOptions 4538 + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s" 4539 " -D convertToWT2=%s -D WT2=%s", 4540 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), 4541 ocl::convertTypeStr(wdepth, depth, cn, cvt[0]), 4542 ocl::convertTypeStr(depth, wdepth, cn, cvt[1]), 4543 ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]), 4544 ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2))); 4545 } 4546 int scalarcn = cn == 3 ? 
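/* For 3-channel images the border value is padded to 4 channels before being handed
   to the OpenCL kernel, e.g. a CV_8UC3 source gives

       scalarcn = 4, sctype = CV_MAKETYPE(CV_8U, 4)    // == CV_8UC4
       Mat scalar(1, 1, sctype, borderValue);          // packed border value

   presumably so the constant matches the vector element type used on the device
   (explanatory note only). */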
4 : cn; 4547 int sctype = CV_MAKETYPE(depth, scalarcn); 4548 buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d", 4549 ocl::typeToStr(type), ocl::typeToStr(depth), 4550 cn, ocl::typeToStr(sctype), depth); 4551 4552 ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions); 4553 4554 Mat scalar(1, 1, sctype, borderValue); 4555 ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst), 4556 map1arg = ocl::KernelArg::ReadOnlyNoSize(map1), 4557 scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize()); 4558 4559 if (map2.empty()) 4560 k.args(srcarg, dstarg, map1arg, scalararg); 4561 else 4562 k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg); 4563 4564 size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI }; 4565 return k.run(2, globalThreads, NULL, false); 4566 } 4567 4568 #endif 4569 4570 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0 4571 4572 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi, 4573 const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep, 4574 void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation); 4575 4576 class IPPRemapInvoker : 4577 public ParallelLoopBody 4578 { 4579 public: 4580 IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc, 4581 int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) : 4582 ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc), 4583 ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok) 4584 { 4585 *ok = true; 4586 } 4587 4588 virtual void operator() (const Range & range) const 4589 { 4590 IppiRect srcRoiRect = { 0, 0, src.cols, src.rows }; 4591 Mat dstRoi = dst.rowRange(range); 4592 IppiSize dstRoiSize = ippiSize(dstRoi.size()); 4593 int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 4594 4595 if (borderType == BORDER_CONSTANT && 4596 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth)) 4597 { 4598 *ok = false; 4599 return; 4600 } 4601 4602 if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect, 4603 map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step, 4604 dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0) 4605 *ok = false; 4606 else 4607 { 4608 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 4609 } 4610 } 4611 4612 private: 4613 Mat & src, & dst, & map1, & map2; 4614 ippiRemap ippFunc; 4615 int ippInterpolation, borderType; 4616 Scalar borderValue; 4617 bool * ok; 4618 }; 4619 4620 #endif 4621 4622 } 4623 4624 void cv::remap( InputArray _src, OutputArray _dst, 4625 InputArray _map1, InputArray _map2, 4626 int interpolation, int borderType, const Scalar& borderValue ) 4627 { 4628 static RemapNNFunc nn_tab[] = 4629 { 4630 remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>, 4631 remapNearest<int>, remapNearest<float>, remapNearest<double>, 0 4632 }; 4633 4634 static RemapFunc linear_tab[] = 4635 { 4636 remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0, 4637 remapBilinear<Cast<float, ushort>, RemapNoVec, float>, 4638 remapBilinear<Cast<float, short>, RemapNoVec, float>, 0, 4639 remapBilinear<Cast<float, float>, RemapNoVec, float>, 4640 remapBilinear<Cast<double, double>, RemapNoVec, float>, 0 4641 }; 4642 4643 static 
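/* nn_tab/linear_tab above (and the cubic/lanczos4 tables that follow) are indexed by
   matrix depth (CV_8U..CV_64F); null entries mark depths without a remap
   implementation (CV_8S, CV_32S).  Only the CV_8U row uses the fixed-point cast and
   the SSE2 RemapVec_8u path; the other depths fall back to float weights.  Dispatch
   further below is simply

       ifunc = linear_tab[depth];   // 0 => unsupported depth, caught by CV_Assert
*/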
RemapFunc cubic_tab[] = 4644 { 4645 remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0, 4646 remapBicubic<Cast<float, ushort>, float, 1>, 4647 remapBicubic<Cast<float, short>, float, 1>, 0, 4648 remapBicubic<Cast<float, float>, float, 1>, 4649 remapBicubic<Cast<double, double>, float, 1>, 0 4650 }; 4651 4652 static RemapFunc lanczos4_tab[] = 4653 { 4654 remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0, 4655 remapLanczos4<Cast<float, ushort>, float, 1>, 4656 remapLanczos4<Cast<float, short>, float, 1>, 0, 4657 remapLanczos4<Cast<float, float>, float, 1>, 4658 remapLanczos4<Cast<double, double>, float, 1>, 0 4659 }; 4660 4661 CV_Assert( _map1.size().area() > 0 ); 4662 CV_Assert( _map2.empty() || (_map2.size() == _map1.size())); 4663 4664 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 4665 ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue)) 4666 4667 Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat(); 4668 _dst.create( map1.size(), src.type() ); 4669 Mat dst = _dst.getMat(); 4670 if( dst.data == src.data ) 4671 src = src.clone(); 4672 4673 if( interpolation == INTER_AREA ) 4674 interpolation = INTER_LINEAR; 4675 4676 int type = src.type(), depth = CV_MAT_DEPTH(type); 4677 4678 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0 4679 CV_IPP_CHECK() 4680 { 4681 if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) && 4682 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 && 4683 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT)) 4684 { 4685 int ippInterpolation = 4686 interpolation == INTER_NEAREST ? IPPI_INTER_NN : 4687 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC; 4688 4689 ippiRemap ippFunc = 4690 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R : 4691 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R : 4692 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R : 4693 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R : 4694 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R : 4695 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R : 4696 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R : 4697 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R : 4698 type == CV_32FC4 ? 
(ippiRemap)ippiRemap_32f_C4R : 0; 4699 4700 if (ippFunc) 4701 { 4702 bool ok; 4703 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation, 4704 borderType, borderValue, &ok); 4705 Range range(0, dst.rows); 4706 parallel_for_(range, invoker, dst.total() / (double)(1 << 16)); 4707 4708 if (ok) 4709 { 4710 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 4711 return; 4712 } 4713 setIppErrorStatus(); 4714 } 4715 } 4716 } 4717 #endif 4718 4719 RemapNNFunc nnfunc = 0; 4720 RemapFunc ifunc = 0; 4721 const void* ctab = 0; 4722 bool fixpt = depth == CV_8U; 4723 bool planar_input = false; 4724 4725 if( interpolation == INTER_NEAREST ) 4726 { 4727 nnfunc = nn_tab[depth]; 4728 CV_Assert( nnfunc != 0 ); 4729 } 4730 else 4731 { 4732 if( interpolation == INTER_LINEAR ) 4733 ifunc = linear_tab[depth]; 4734 else if( interpolation == INTER_CUBIC ) 4735 ifunc = cubic_tab[depth]; 4736 else if( interpolation == INTER_LANCZOS4 ) 4737 ifunc = lanczos4_tab[depth]; 4738 else 4739 CV_Error( CV_StsBadArg, "Unknown interpolation method" ); 4740 CV_Assert( ifunc != 0 ); 4741 ctab = initInterTab2D( interpolation, fixpt ); 4742 } 4743 4744 const Mat *m1 = &map1, *m2 = &map2; 4745 4746 if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) || 4747 (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) ) 4748 { 4749 if( map1.type() != CV_16SC2 ) 4750 std::swap(m1, m2); 4751 } 4752 else 4753 { 4754 CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) || 4755 (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) ); 4756 planar_input = map1.channels() == 1; 4757 } 4758 4759 RemapInvoker invoker(src, dst, m1, m2, 4760 borderType, borderValue, planar_input, nnfunc, ifunc, 4761 ctab); 4762 parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16)); 4763 } 4764 4765 4766 void cv::convertMaps( InputArray _map1, InputArray _map2, 4767 OutputArray _dstmap1, OutputArray _dstmap2, 4768 int dstm1type, bool nninterpolate ) 4769 { 4770 Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2; 4771 Size size = map1.size(); 4772 const Mat *m1 = &map1, *m2 = &map2; 4773 int m1type = m1->type(), m2type = m2->type(); 4774 4775 CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) || 4776 (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) || 4777 (m1type == CV_32FC1 && m2type == CV_32FC1) || 4778 (m1type == CV_32FC2 && m2->empty()) ); 4779 4780 if( m2type == CV_16SC2 ) 4781 { 4782 std::swap( m1, m2 ); 4783 std::swap( m1type, m2type ); 4784 } 4785 4786 if( dstm1type <= 0 ) 4787 dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2; 4788 CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 ); 4789 _dstmap1.create( size, dstm1type ); 4790 dstmap1 = _dstmap1.getMat(); 4791 4792 if( !nninterpolate && dstm1type != CV_32FC2 ) 4793 { 4794 _dstmap2.create( size, dstm1type == CV_16SC2 ? 
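/* Typical use of cv::remap (above) and cv::convertMaps (this function); an
   illustrative sketch only, where src/mapx/mapy are placeholder variables: absolute
   float maps are converted once to the packed CV_16SC2 + CV_16UC1 representation, so
   that repeated remap() calls skip the per-pixel float-to-fixed conversion done in
   RemapInvoker.

       #include <opencv2/imgproc.hpp>

       cv::Mat src, dst, mapx, mapy;     // mapx/mapy: CV_32FC1, same size
       cv::Mat map1, map2;
       cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2, false);
       cv::remap(src, dst, map1, map2, cv::INTER_LINEAR,
                 cv::BORDER_CONSTANT, cv::Scalar::all(0));
*/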
CV_16UC1 : CV_32FC1 ); 4795 dstmap2 = _dstmap2.getMat(); 4796 } 4797 else 4798 _dstmap2.release(); 4799 4800 if( m1type == dstm1type || (nninterpolate && 4801 ((m1type == CV_16SC2 && dstm1type == CV_32FC2) || 4802 (m1type == CV_32FC2 && dstm1type == CV_16SC2))) ) 4803 { 4804 m1->convertTo( dstmap1, dstmap1.type() ); 4805 if( !dstmap2.empty() && dstmap2.type() == m2->type() ) 4806 m2->copyTo( dstmap2 ); 4807 return; 4808 } 4809 4810 if( m1type == CV_32FC1 && dstm1type == CV_32FC2 ) 4811 { 4812 Mat vdata[] = { *m1, *m2 }; 4813 merge( vdata, 2, dstmap1 ); 4814 return; 4815 } 4816 4817 if( m1type == CV_32FC2 && dstm1type == CV_32FC1 ) 4818 { 4819 Mat mv[] = { dstmap1, dstmap2 }; 4820 split( *m1, mv ); 4821 return; 4822 } 4823 4824 if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) && 4825 dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) ) 4826 { 4827 size.width *= size.height; 4828 size.height = 1; 4829 } 4830 4831 #if CV_SSE2 4832 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); 4833 #endif 4834 #if CV_SSE4_1 4835 bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 4836 #endif 4837 4838 const float scale = 1.f/INTER_TAB_SIZE; 4839 int x, y; 4840 for( y = 0; y < size.height; y++ ) 4841 { 4842 const float* src1f = m1->ptr<float>(y); 4843 const float* src2f = m2->ptr<float>(y); 4844 const short* src1 = (const short*)src1f; 4845 const ushort* src2 = (const ushort*)src2f; 4846 4847 float* dst1f = dstmap1.ptr<float>(y); 4848 float* dst2f = dstmap2.ptr<float>(y); 4849 short* dst1 = (short*)dst1f; 4850 ushort* dst2 = (ushort*)dst2f; 4851 x = 0; 4852 4853 if( m1type == CV_32FC1 && dstm1type == CV_16SC2 ) 4854 { 4855 if( nninterpolate ) 4856 { 4857 #if CV_NEON 4858 for( ; x <= size.width - 8; x += 8 ) 4859 { 4860 int16x8x2_t v_dst; 4861 v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), 4862 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))); 4863 v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))), 4864 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4)))); 4865 4866 vst2q_s16(dst1 + (x << 1), v_dst); 4867 } 4868 #elif CV_SSE4_1 4869 if (useSSE4_1) 4870 { 4871 for( ; x <= size.width - 16; x += 16 ) 4872 { 4873 __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), 4874 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))); 4875 __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)), 4876 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12))); 4877 4878 __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)), 4879 _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4))); 4880 __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)), 4881 _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12))); 4882 4883 _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3); 4884 4885 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0); 4886 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1); 4887 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2); 4888 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3); 4889 } 4890 } 4891 #endif 4892 for( ; x < size.width; x++ ) 4893 { 4894 dst1[x*2] = saturate_cast<short>(src1f[x]); 4895 dst1[x*2+1] = saturate_cast<short>(src2f[x]); 4896 } 4897 } 4898 else 4899 { 4900 #if CV_NEON 4901 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); 4902 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); 4903 4904 for( ; x <= size.width - 8; x += 8 ) 4905 { 4906 int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), 
v_scale)); 4907 int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale)); 4908 int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale)); 4909 int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale)); 4910 4911 int16x8x2_t v_dst; 4912 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), 4913 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); 4914 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), 4915 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); 4916 4917 vst2q_s16(dst1 + (x << 1), v_dst); 4918 4919 uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), 4920 vandq_s32(v_ix0, v_mask))); 4921 uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), 4922 vandq_s32(v_ix1, v_mask))); 4923 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); 4924 } 4925 #elif CV_SSE4_1 4926 if (useSSE4_1) 4927 { 4928 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); 4929 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); 4930 4931 for( ; x <= size.width - 16; x += 16 ) 4932 { 4933 __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its)); 4934 __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its)); 4935 __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its)); 4936 __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its)); 4937 4938 __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), 4939 _mm_srai_epi32(v_ix1, INTER_BITS)); 4940 __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), 4941 _mm_srai_epi32(v_iy1, INTER_BITS)); 4942 __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), 4943 _mm_and_si128(v_ix0, v_its1)); 4944 __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), 4945 _mm_and_si128(v_ix1, v_its1)); 4946 _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21)); 4947 4948 v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its)); 4949 v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its)); 4950 v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its)); 4951 v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its)); 4952 4953 __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS), 4954 _mm_srai_epi32(v_ix1, INTER_BITS)); 4955 __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS), 4956 _mm_srai_epi32(v_iy1, INTER_BITS)); 4957 v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS), 4958 _mm_and_si128(v_ix0, v_its1)); 4959 v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS), 4960 _mm_and_si128(v_ix1, v_its1)); 4961 _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21)); 4962 4963 _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13); 4964 4965 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10); 4966 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11); 4967 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12); 4968 _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13); 4969 } 4970 } 4971 #endif 4972 for( ; x < size.width; x++ ) 4973 { 4974 int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE); 4975 int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE); 4976 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); 4977 dst1[x*2+1] = saturate_cast<short>(iy >> 
INTER_BITS); 4978 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); 4979 } 4980 } 4981 } 4982 else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 ) 4983 { 4984 if( nninterpolate ) 4985 { 4986 #if CV_NEON 4987 for( ; x <= (size.width << 1) - 8; x += 8 ) 4988 vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))), 4989 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))))); 4990 #elif CV_SSE2 4991 for( ; x <= (size.width << 1) - 8; x += 8 ) 4992 { 4993 _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)), 4994 _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)))); 4995 } 4996 #endif 4997 for( ; x < size.width; x++ ) 4998 { 4999 dst1[x*2] = saturate_cast<short>(src1f[x*2]); 5000 dst1[x*2+1] = saturate_cast<short>(src1f[x*2+1]); 5001 } 5002 } 5003 else 5004 { 5005 #if CV_NEON 5006 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE); 5007 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); 5008 5009 for( ; x <= size.width - 8; x += 8 ) 5010 { 5011 float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8); 5012 int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale)); 5013 int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale)); 5014 int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale)); 5015 int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale)); 5016 5017 int16x8x2_t v_dst; 5018 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)), 5019 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS))); 5020 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)), 5021 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS))); 5022 5023 vst2q_s16(dst1 + (x << 1), v_dst); 5024 5025 uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS), 5026 vandq_s32(v_ix0, v_mask))); 5027 uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS), 5028 vandq_s32(v_ix1, v_mask))); 5029 vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1)); 5030 } 5031 #elif CV_SSE4_1 5032 if (useSSE4_1) 5033 { 5034 __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE); 5035 __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1); 5036 __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16); 5037 5038 for( ; x <= size.width - 4; x += 4 ) 5039 { 5040 __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its)); 5041 __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its)); 5042 5043 __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS), 5044 _mm_srai_epi32(v_src1, INTER_BITS)); 5045 _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1); 5046 5047 // x0 y0 x1 y1 . . . 5048 v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1), 5049 _mm_and_si128(v_src1, v_its1)); 5050 __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . . 5051 _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . . 
5052 _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2)); 5053 } 5054 } 5055 #endif 5056 for( ; x < size.width; x++ ) 5057 { 5058 int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE); 5059 int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE); 5060 dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS); 5061 dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS); 5062 dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1))); 5063 } 5064 } 5065 } 5066 else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 ) 5067 { 5068 #if CV_NEON 5069 uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1); 5070 uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1); 5071 float32x4_t v_scale = vdupq_n_f32(scale); 5072 5073 for( ; x <= size.width - 8; x += 8) 5074 { 5075 uint32x4_t v_fxy1, v_fxy2; 5076 if (src2) 5077 { 5078 uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2); 5079 v_fxy1 = vmovl_u16(vget_low_u16(v_src2)); 5080 v_fxy2 = vmovl_u16(vget_high_u16(v_src2)); 5081 } 5082 else 5083 v_fxy1 = v_fxy2 = v_zero; 5084 5085 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); 5086 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), 5087 v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask))); 5088 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), 5089 v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS))); 5090 vst1q_f32(dst1f + x, v_dst1); 5091 vst1q_f32(dst2f + x, v_dst2); 5092 5093 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), 5094 v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask))); 5095 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), 5096 v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS))); 5097 vst1q_f32(dst1f + x + 4, v_dst1); 5098 vst1q_f32(dst2f + x + 4, v_dst2); 5099 } 5100 #elif CV_SSE2 5101 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); 5102 __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); 5103 __m128 v_scale = _mm_set1_ps(scale); 5104 5105 for( ; x <= size.width - 16; x += 16) 5106 { 5107 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); 5108 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8)); 5109 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16)); 5110 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24)); 5111 5112 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21); 5113 5114 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; 5115 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); 5116 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)), 5117 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 5118 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)), 5119 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 5120 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); 5121 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)), 5122 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 5123 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)), 5124 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 5125 5126 v_fxy = src2 ? 
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero; 5127 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); 5128 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)), 5129 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 5130 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)), 5131 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 5132 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero); 5133 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)), 5134 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask))))); 5135 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)), 5136 _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS))))); 5137 } 5138 #endif 5139 for( ; x < size.width; x++ ) 5140 { 5141 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0; 5142 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; 5143 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; 5144 } 5145 } 5146 else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 ) 5147 { 5148 #if CV_NEON 5149 int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1); 5150 int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1); 5151 float32x4_t v_scale = vdupq_n_f32(scale); 5152 5153 for( ; x <= size.width - 8; x += 8) 5154 { 5155 int32x4_t v_fxy1, v_fxy2; 5156 if (src2) 5157 { 5158 int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2); 5159 v_fxy1 = vmovl_s16(vget_low_s16(v_src2)); 5160 v_fxy2 = vmovl_s16(vget_high_s16(v_src2)); 5161 } 5162 else 5163 v_fxy1 = v_fxy2 = v_zero; 5164 5165 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1)); 5166 float32x4x2_t v_dst; 5167 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))), 5168 v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask))); 5169 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))), 5170 v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS))); 5171 vst2q_f32(dst1f + (x << 1), v_dst); 5172 5173 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))), 5174 v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask))); 5175 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))), 5176 v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS))); 5177 vst2q_f32(dst1f + (x << 1) + 8, v_dst); 5178 } 5179 #elif CV_SSE2 5180 if (useSSE2) 5181 { 5182 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1); 5183 __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1); 5184 __m128 v_scale = _mm_set1_ps(scale); 5185 5186 for ( ; x <= size.width - 8; x += 8) 5187 { 5188 __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); 5189 __m128i v_fxy = src2 ? 
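/* This and the previous branch reconstruct float coordinates from the packed
   representation: integer part from the CV_16SC2 pair plus the fraction index scaled
   by 1/INTER_TAB_SIZE.  Continuing the earlier worked example (scale == 1.f/32):

       src1[x*2]   = 10, fxy & 31 = 10  ->  dst x = 10 + 10*(1.f/32) = 10.3125
       src1[x*2+1] = 4,  fxy >> 5 = 24  ->  dst y = 4  + 24*(1.f/32) = 4.75

   Descriptive note: the SSE2 body that follows appears to write both halves of each
   iteration to dst1f + x*2; the scalar loop after it shows the intended per-pixel
   result for this branch. */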
_mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero; 5190 __m128i v_fxy1 = _mm_and_si128(v_fxy, v_mask); 5191 __m128i v_fxy2 = _mm_srli_epi16(v_fxy, INTER_BITS); 5192 5193 __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_fxy1, v_fxy2)), v_scale); 5194 _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v_src, v_zero)), v_add)); 5195 5196 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_fxy1, v_fxy2)), v_scale); 5197 _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v_src, v_zero)), v_add)); 5198 } 5199 } 5200 #endif 5201 for( ; x < size.width; x++ ) 5202 { 5203 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0; 5204 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale; 5205 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale; 5206 } 5207 } 5208 else 5209 CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" ); 5210 } 5211 } 5212 5213 5214 namespace cv 5215 { 5216 5217 class WarpAffineInvoker : 5218 public ParallelLoopBody 5219 { 5220 public: 5221 WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType, 5222 const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) : 5223 ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation), 5224 borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta), 5225 M(_M) 5226 { 5227 } 5228 5229 virtual void operator() (const Range& range) const 5230 { 5231 const int BLOCK_SZ = 64; 5232 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ]; 5233 const int AB_BITS = MAX(10, (int)INTER_BITS); 5234 const int AB_SCALE = 1 << AB_BITS; 5235 int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1; 5236 #if CV_SSE2 5237 bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2); 5238 #endif 5239 #if CV_SSE4_1 5240 bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 5241 #endif 5242 5243 int bh0 = std::min(BLOCK_SZ/2, dst.rows); 5244 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols); 5245 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows); 5246 5247 for( y = range.start; y < range.end; y += bh0 ) 5248 { 5249 for( x = 0; x < dst.cols; x += bw0 ) 5250 { 5251 int bw = std::min( bw0, dst.cols - x); 5252 int bh = std::min( bh0, range.end - y); 5253 5254 Mat _XY(bh, bw, CV_16SC2, XY), matA; 5255 Mat dpart(dst, Rect(x, y, bw, bh)); 5256 5257 for( y1 = 0; y1 < bh; y1++ ) 5258 { 5259 short* xy = XY + y1*bw*2; 5260 int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta; 5261 int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta; 5262 5263 if( interpolation == INTER_NEAREST ) 5264 { 5265 x1 = 0; 5266 #if CV_NEON 5267 int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0); 5268 for( ; x1 <= bw - 8; x1 += 8 ) 5269 { 5270 int16x8x2_t v_dst; 5271 v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)), 5272 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS))); 5273 v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)), 5274 vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS))); 5275 5276 vst2q_s16(xy + (x1 << 1), v_dst); 5277 } 5278 #elif CV_SSE4_1 5279 if (useSSE4_1) 5280 { 5281 __m128i v_X0 = _mm_set1_epi32(X0); 5282 __m128i v_Y0 = _mm_set1_epi32(Y0); 5283 for ( ; x1 <= bw - 16; x1 += 16) 5284 { 5285 __m128i v_x0 = 
_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS), 5286 _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS)); 5287 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS), 5288 _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS)); 5289 5290 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS), 5291 _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS)); 5292 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS), 5293 _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS)); 5294 5295 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1); 5296 5297 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0); 5298 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1); 5299 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0); 5300 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1); 5301 } 5302 } 5303 #endif 5304 for( ; x1 < bw; x1++ ) 5305 { 5306 int X = (X0 + adelta[x+x1]) >> AB_BITS; 5307 int Y = (Y0 + bdelta[x+x1]) >> AB_BITS; 5308 xy[x1*2] = saturate_cast<short>(X); 5309 xy[x1*2+1] = saturate_cast<short>(Y); 5310 } 5311 } 5312 else 5313 { 5314 short* alpha = A + y1*bw; 5315 x1 = 0; 5316 #if CV_SSE2 5317 if( useSSE2 ) 5318 { 5319 __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1); 5320 __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0); 5321 for( ; x1 <= bw - 8; x1 += 8 ) 5322 { 5323 __m128i tx0, tx1, ty0, ty1; 5324 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX); 5325 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY); 5326 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX); 5327 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY); 5328 5329 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS); 5330 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS); 5331 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS); 5332 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS); 5333 5334 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask), 5335 _mm_and_si128(tx1, fxy_mask)); 5336 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask), 5337 _mm_and_si128(ty1, fxy_mask)); 5338 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS), 5339 _mm_srai_epi32(tx1, INTER_BITS)); 5340 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS), 5341 _mm_srai_epi32(ty1, INTER_BITS)); 5342 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS)); 5343 5344 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0)); 5345 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0)); 5346 _mm_storeu_si128((__m128i*)(alpha + x1), fx_); 5347 } 5348 } 5349 #elif CV_NEON 5350 int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1); 5351 for( ; x1 <= bw - 8; x1 += 8 ) 5352 { 5353 int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS); 5354 int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS); 5355 int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS); 
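                    // Note on the fixed-point scheme used here (and in the scalar tail below):
                    // X0/Y0 carry the per-row affine terms scaled by AB_SCALE = 1 << AB_BITS, while
                    // adelta/bdelta carry the per-column terms M[0]*x and M[3]*x at the same scale.
                    // For the linear/cubic path the sum is shifted right by AB_BITS - INTER_BITS, so
                    // the low INTER_BITS bits form the sub-pixel fraction that indexes the interpolation
                    // tables and the remaining high bits are the integer source coordinate stored in xy.
                    // E.g. with AB_BITS = 10 and INTER_BITS = 5, a source x of 3.25 is carried as
                    // 3328 (3.25 * 1024); 3328 >> 5 = 104, then 104 >> 5 = 3 is the pixel column and
                    // 104 & 31 = 8 encodes the 0.25 fraction (8/32). Illustrative numbers only.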
5356 int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS); 5357 5358 int16x8x2_t v_xy; 5359 v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS))); 5360 v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS))); 5361 5362 vst2q_s16(xy + (x1 << 1), v_xy); 5363 5364 int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS), 5365 vandq_s32(v_X0, v_mask))); 5366 int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS), 5367 vandq_s32(v_X1, v_mask))); 5368 vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1)); 5369 } 5370 #endif 5371 for( ; x1 < bw; x1++ ) 5372 { 5373 int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS); 5374 int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS); 5375 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS); 5376 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS); 5377 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + 5378 (X & (INTER_TAB_SIZE-1))); 5379 } 5380 } 5381 } 5382 5383 if( interpolation == INTER_NEAREST ) 5384 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue ); 5385 else 5386 { 5387 Mat _matA(bh, bw, CV_16U, A); 5388 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue ); 5389 } 5390 } 5391 } 5392 } 5393 5394 private: 5395 Mat src; 5396 Mat dst; 5397 int interpolation, borderType; 5398 Scalar borderValue; 5399 int *adelta, *bdelta; 5400 double *M; 5401 }; 5402 5403 5404 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0 5405 class IPPWarpAffineInvoker : 5406 public ParallelLoopBody 5407 { 5408 public: 5409 IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType, 5410 const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) : 5411 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs), 5412 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok) 5413 { 5414 *ok = true; 5415 } 5416 5417 virtual void operator() (const Range& range) const 5418 { 5419 IppiSize srcsize = { src.cols, src.rows }; 5420 IppiRect srcroi = { 0, 0, src.cols, src.rows }; 5421 IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start }; 5422 int cnn = src.channels(); 5423 if( borderType == BORDER_CONSTANT ) 5424 { 5425 IppiSize setSize = { dst.cols, range.end - range.start }; 5426 void *dataPointer = dst.ptr(range.start); 5427 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) ) 5428 { 5429 *ok = false; 5430 return; 5431 } 5432 } 5433 5434 // Aug 2013: problem in IPP 7.1, 8.0 : sometimes function return ippStsCoeffErr 5435 IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), 5436 (int)dst.step[0], dstroi, coeffs, mode ); 5437 if( status < 0) 5438 *ok = false; 5439 else 5440 { 5441 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 5442 } 5443 } 5444 private: 5445 Mat &src; 5446 Mat &dst; 5447 int mode; 5448 double (&coeffs)[2][3]; 5449 int borderType; 5450 Scalar borderValue; 5451 ippiWarpAffineBackFunc func; 5452 bool *ok; 5453 const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&); 5454 }; 5455 #endif 5456 5457 #ifdef HAVE_OPENCL 5458 5459 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 }; 5460 5461 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0, 5462 Size dsize, int flags, 
int borderType, const Scalar& borderValue, 5463 int op_type) 5464 { 5465 CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE); 5466 const ocl::Device & dev = ocl::Device::getDefault(); 5467 5468 int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 5469 const bool doubleSupport = dev.doubleFPConfig() > 0; 5470 5471 int interpolation = flags & INTER_MAX; 5472 if( interpolation == INTER_AREA ) 5473 interpolation = INTER_LINEAR; 5474 int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1; 5475 5476 if ( !(borderType == cv::BORDER_CONSTANT && 5477 (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) || 5478 (!doubleSupport && depth == CV_64F) || cn > 4) 5479 return false; 5480 5481 const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" }; 5482 ocl::ProgramSource program = op_type == OCL_OP_AFFINE ? 5483 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc; 5484 const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective"; 5485 5486 int scalarcn = cn == 3 ? 4 : cn; 5487 bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE; 5488 int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth); 5489 int sctype = CV_MAKETYPE(wdepth, scalarcn); 5490 5491 ocl::Kernel k; 5492 String opts; 5493 if (interpolation == INTER_NEAREST) 5494 { 5495 opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d", 5496 ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "", 5497 ocl::typeToStr(CV_MAT_DEPTH(type)), 5498 ocl::typeToStr(sctype), cn, rowsPerWI); 5499 } 5500 else 5501 { 5502 char cvt[2][50]; 5503 opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d" 5504 " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d", 5505 interpolationMap[interpolation], ocl::typeToStr(type), 5506 ocl::typeToStr(CV_MAT_DEPTH(type)), 5507 ocl::typeToStr(sctype), 5508 ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth, 5509 ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), 5510 ocl::convertTypeStr(wdepth, depth, cn, cvt[1]), 5511 doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI); 5512 } 5513 5514 k.create(kernelName, program, opts); 5515 if (k.empty()) 5516 return false; 5517 5518 double borderBuf[] = { 0, 0, 0, 0 }; 5519 scalarToRawData(borderValue, borderBuf, sctype); 5520 5521 UMat src = _src.getUMat(), M0; 5522 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() ); 5523 UMat dst = _dst.getUMat(); 5524 5525 double M[9]; 5526 int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3); 5527 Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat(); 5528 CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) && 5529 M1.rows == matRows && M1.cols == 3 ); 5530 M1.convertTo(matM, matM.type()); 5531 5532 if( !(flags & WARP_INVERSE_MAP) ) 5533 { 5534 if (op_type == OCL_OP_PERSPECTIVE) 5535 invert(matM, matM); 5536 else 5537 { 5538 double D = M[0]*M[4] - M[1]*M[3]; 5539 D = D != 0 ? 1./D : 0; 5540 double A11 = M[4]*D, A22=M[0]*D; 5541 M[0] = A11; M[1] *= -D; 5542 M[3] *= -D; M[4] = A22; 5543 double b1 = -M[0]*M[2] - M[1]*M[5]; 5544 double b2 = -M[3]*M[2] - M[4]*M[5]; 5545 M[2] = b1; M[5] = b2; 5546 } 5547 } 5548 matM.convertTo(M0, doubleSupport ? 
CV_64F : CV_32F); 5549 5550 k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0), 5551 ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype))); 5552 5553 size_t globalThreads[2] = { dst.cols, (dst.rows + rowsPerWI - 1) / rowsPerWI }; 5554 return k.run(2, globalThreads, NULL, false); 5555 } 5556 5557 #endif 5558 5559 } 5560 5561 5562 void cv::warpAffine( InputArray _src, OutputArray _dst, 5563 InputArray _M0, Size dsize, 5564 int flags, int borderType, const Scalar& borderValue ) 5565 { 5566 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 5567 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, 5568 borderValue, OCL_OP_AFFINE)) 5569 5570 Mat src = _src.getMat(), M0 = _M0.getMat(); 5571 _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() ); 5572 Mat dst = _dst.getMat(); 5573 CV_Assert( src.cols > 0 && src.rows > 0 ); 5574 if( dst.data == src.data ) 5575 src = src.clone(); 5576 5577 double M[6]; 5578 Mat matM(2, 3, CV_64F, M); 5579 int interpolation = flags & INTER_MAX; 5580 if( interpolation == INTER_AREA ) 5581 interpolation = INTER_LINEAR; 5582 5583 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 ); 5584 M0.convertTo(matM, matM.type()); 5585 5586 #ifdef HAVE_TEGRA_OPTIMIZATION 5587 if( tegra::useTegra() && tegra::warpAffine(src, dst, M, flags, borderType, borderValue) ) 5588 return; 5589 #endif 5590 5591 if( !(flags & WARP_INVERSE_MAP) ) 5592 { 5593 double D = M[0]*M[4] - M[1]*M[3]; 5594 D = D != 0 ? 1./D : 0; 5595 double A11 = M[4]*D, A22=M[0]*D; 5596 M[0] = A11; M[1] *= -D; 5597 M[3] *= -D; M[4] = A22; 5598 double b1 = -M[0]*M[2] - M[1]*M[5]; 5599 double b2 = -M[3]*M[2] - M[4]*M[5]; 5600 M[2] = b1; M[5] = b2; 5601 } 5602 5603 int x; 5604 AutoBuffer<int> _abdelta(dst.cols*2); 5605 int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols; 5606 const int AB_BITS = MAX(10, (int)INTER_BITS); 5607 const int AB_SCALE = 1 << AB_BITS; 5608 5609 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0 5610 CV_IPP_CHECK() 5611 { 5612 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 5613 if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) && 5614 ( cn == 1 || cn == 3 || cn == 4 ) && 5615 ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) && 5616 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) ) 5617 { 5618 ippiWarpAffineBackFunc ippFunc = 0; 5619 if ((flags & WARP_INVERSE_MAP) != 0) 5620 { 5621 ippFunc = 5622 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R : 5623 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R : 5624 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R : 5625 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R : 5626 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R : 5627 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R : 5628 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R : 5629 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R : 5630 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R : 5631 0; 5632 } 5633 else 5634 { 5635 ippFunc = 5636 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R : 5637 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R : 5638 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R : 5639 type == CV_16UC1 ? 
(ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R : 5640 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R : 5641 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R : 5642 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R : 5643 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R : 5644 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R : 5645 0; 5646 } 5647 int mode = 5648 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 5649 interpolation == INTER_NEAREST ? IPPI_INTER_NN : 5650 interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 5651 0; 5652 CV_Assert(mode && ippFunc); 5653 5654 double coeffs[2][3]; 5655 for( int i = 0; i < 2; i++ ) 5656 for( int j = 0; j < 3; j++ ) 5657 coeffs[i][j] = matM.at<double>(i, j); 5658 5659 bool ok; 5660 Range range(0, dst.rows); 5661 IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok); 5662 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 5663 if( ok ) 5664 { 5665 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 5666 return; 5667 } 5668 setIppErrorStatus(); 5669 } 5670 } 5671 #endif 5672 5673 for( x = 0; x < dst.cols; x++ ) 5674 { 5675 adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE); 5676 bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE); 5677 } 5678 5679 Range range(0, dst.rows); 5680 WarpAffineInvoker invoker(src, dst, interpolation, borderType, 5681 borderValue, adelta, bdelta, M); 5682 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 5683 } 5684 5685 5686 namespace cv 5687 { 5688 5689 class WarpPerspectiveInvoker : 5690 public ParallelLoopBody 5691 { 5692 public: 5693 WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation, 5694 int _borderType, const Scalar &_borderValue) : 5695 ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation), 5696 borderType(_borderType), borderValue(_borderValue) 5697 { 5698 } 5699 5700 virtual void operator() (const Range& range) const 5701 { 5702 const int BLOCK_SZ = 32; 5703 short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ]; 5704 int x, y, x1, y1, width = dst.cols, height = dst.rows; 5705 5706 int bh0 = std::min(BLOCK_SZ/2, height); 5707 int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width); 5708 bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height); 5709 5710 #if CV_SSE4_1 5711 bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1); 5712 __m128d v_M0 = _mm_set1_pd(M[0]); 5713 __m128d v_M3 = _mm_set1_pd(M[3]); 5714 __m128d v_M6 = _mm_set1_pd(M[6]); 5715 __m128d v_intmax = _mm_set1_pd((double)INT_MAX); 5716 __m128d v_intmin = _mm_set1_pd((double)INT_MIN); 5717 __m128d v_2 = _mm_set1_pd(2), 5718 v_zero = _mm_setzero_pd(), 5719 v_1 = _mm_set1_pd(1), 5720 v_its = _mm_set1_pd(INTER_TAB_SIZE); 5721 __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1); 5722 #endif 5723 5724 for( y = range.start; y < range.end; y += bh0 ) 5725 { 5726 for( x = 0; x < width; x += bw0 ) 5727 { 5728 int bw = std::min( bw0, width - x); 5729 int bh = std::min( bh0, range.end - y); // height 5730 5731 Mat _XY(bh, bw, CV_16SC2, XY), matA; 5732 Mat dpart(dst, Rect(x, y, bw, bh)); 5733 5734 for( y1 = 0; y1 < bh; y1++ ) 5735 { 5736 short* xy = XY + y1*bw*2; 5737 double X0 = M[0]*x + M[1]*(y + y1) + M[2]; 5738 double Y0 = M[3]*x + M[4]*(y + y1) + M[5]; 5739 double W0 = M[6]*x + M[7]*(y + y1) + M[8]; 5740 5741 if( interpolation == INTER_NEAREST ) 5742 { 5743 x1 = 0; 5744 5745 #if CV_SSE4_1 5746 if (haveSSE4_1) 5747 { 5748 __m128d v_X0d = _mm_set1_pd(X0); 5749 __m128d v_Y0d = _mm_set1_pd(Y0); 5750 __m128d 
v_W0 = _mm_set1_pd(W0); 5751 __m128d v_x1 = _mm_set_pd(1, 0); 5752 5753 for( ; x1 <= bw - 16; x1 += 16 ) 5754 { 5755 // 0-3 5756 __m128i v_X0, v_Y0; 5757 { 5758 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5759 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5760 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5761 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5762 v_x1 = _mm_add_pd(v_x1, v_2); 5763 5764 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5765 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5766 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5767 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5768 v_x1 = _mm_add_pd(v_x1, v_2); 5769 5770 v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5771 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5772 v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5773 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5774 } 5775 5776 // 4-8 5777 __m128i v_X1, v_Y1; 5778 { 5779 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5780 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5781 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5782 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5783 v_x1 = _mm_add_pd(v_x1, v_2); 5784 5785 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5786 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5787 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5788 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5789 v_x1 = _mm_add_pd(v_x1, v_2); 5790 5791 v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5792 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5793 v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5794 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5795 } 5796 5797 // 8-11 5798 __m128i v_X2, v_Y2; 5799 { 5800 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5801 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5802 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5803 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5804 v_x1 = _mm_add_pd(v_x1, v_2); 5805 5806 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5807 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5808 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5809 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5810 v_x1 = _mm_add_pd(v_x1, v_2); 5811 5812 v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5813 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5814 v_Y2 = 
_mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5815 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5816 } 5817 5818 // 12-15 5819 __m128i v_X3, v_Y3; 5820 { 5821 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5822 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5823 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5824 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5825 v_x1 = _mm_add_pd(v_x1, v_2); 5826 5827 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5828 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W)); 5829 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5830 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5831 v_x1 = _mm_add_pd(v_x1, v_2); 5832 5833 v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5834 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5835 v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5836 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5837 } 5838 5839 // convert to 16s 5840 v_X0 = _mm_packs_epi32(v_X0, v_X1); 5841 v_X1 = _mm_packs_epi32(v_X2, v_X3); 5842 v_Y0 = _mm_packs_epi32(v_Y0, v_Y1); 5843 v_Y1 = _mm_packs_epi32(v_Y2, v_Y3); 5844 5845 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); 5846 5847 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); 5848 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); 5849 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); 5850 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); 5851 } 5852 } 5853 #endif 5854 5855 for( ; x1 < bw; x1++ ) 5856 { 5857 double W = W0 + M[6]*x1; 5858 W = W ? 
1./W : 0; 5859 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W)); 5860 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W)); 5861 int X = saturate_cast<int>(fX); 5862 int Y = saturate_cast<int>(fY); 5863 5864 xy[x1*2] = saturate_cast<short>(X); 5865 xy[x1*2+1] = saturate_cast<short>(Y); 5866 } 5867 } 5868 else 5869 { 5870 short* alpha = A + y1*bw; 5871 x1 = 0; 5872 5873 #if CV_SSE4_1 5874 if (haveSSE4_1) 5875 { 5876 __m128d v_X0d = _mm_set1_pd(X0); 5877 __m128d v_Y0d = _mm_set1_pd(Y0); 5878 __m128d v_W0 = _mm_set1_pd(W0); 5879 __m128d v_x1 = _mm_set_pd(1, 0); 5880 5881 for( ; x1 <= bw - 16; x1 += 16 ) 5882 { 5883 // 0-3 5884 __m128i v_X0, v_Y0; 5885 { 5886 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5887 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5888 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5889 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5890 v_x1 = _mm_add_pd(v_x1, v_2); 5891 5892 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5893 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5894 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5895 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5896 v_x1 = _mm_add_pd(v_x1, v_2); 5897 5898 v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5899 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5900 v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5901 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5902 } 5903 5904 // 4-8 5905 __m128i v_X1, v_Y1; 5906 { 5907 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5908 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5909 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5910 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5911 v_x1 = _mm_add_pd(v_x1, v_2); 5912 5913 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5914 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5915 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5916 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5917 v_x1 = _mm_add_pd(v_x1, v_2); 5918 5919 v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5920 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5921 v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5922 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5923 } 5924 5925 // 8-11 5926 __m128i v_X2, v_Y2; 5927 { 5928 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5929 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5930 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5931 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5932 v_x1 = _mm_add_pd(v_x1, v_2); 5933 5934 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5935 
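                            // v_W now holds the projective denominator W for the next pair of pixels.
                            // The andnot/cmpeq-with-zero on the following line zeroes those lanes where
                            // W == 0 (instead of letting the division produce +/-Inf); elsewhere v_W
                            // becomes INTER_TAB_SIZE/W, so the clamped v_fX*/v_fY* products carry
                            // INTER_BITS fractional bits, matching the scalar tail below where
                            // W = W ? INTER_TAB_SIZE/W : 0.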
v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5936 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5937 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5938 v_x1 = _mm_add_pd(v_x1, v_2); 5939 5940 v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5941 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5942 v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5943 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5944 } 5945 5946 // 12-15 5947 __m128i v_X3, v_Y3; 5948 { 5949 __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5950 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5951 __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5952 __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5953 v_x1 = _mm_add_pd(v_x1, v_2); 5954 5955 v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0); 5956 v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W)); 5957 __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W))); 5958 __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W))); 5959 v_x1 = _mm_add_pd(v_x1, v_2); 5960 5961 v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)), 5962 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1)))); 5963 v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)), 5964 _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1)))); 5965 } 5966 5967 // store alpha 5968 __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS), 5969 _mm_and_si128(v_X0, v_itsi1)); 5970 __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS), 5971 _mm_and_si128(v_X1, v_itsi1)); 5972 _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1)); 5973 5974 v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS), 5975 _mm_and_si128(v_X2, v_itsi1)); 5976 v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS), 5977 _mm_and_si128(v_X3, v_itsi1)); 5978 _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1)); 5979 5980 // convert to 16s 5981 v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS)); 5982 v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS)); 5983 v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS)); 5984 v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS)); 5985 5986 _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1); 5987 5988 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0); 5989 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1); 5990 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0); 5991 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1); 5992 } 5993 } 5994 #endif 5995 5996 for( ; x1 < bw; x1++ ) 5997 { 5998 double W = W0 + M[6]*x1; 5999 W = W ? 
INTER_TAB_SIZE/W : 0; 6000 double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W)); 6001 double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W)); 6002 int X = saturate_cast<int>(fX); 6003 int Y = saturate_cast<int>(fY); 6004 6005 xy[x1*2] = saturate_cast<short>(X >> INTER_BITS); 6006 xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS); 6007 alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + 6008 (X & (INTER_TAB_SIZE-1))); 6009 } 6010 } 6011 } 6012 6013 if( interpolation == INTER_NEAREST ) 6014 remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue ); 6015 else 6016 { 6017 Mat _matA(bh, bw, CV_16U, A); 6018 remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue ); 6019 } 6020 } 6021 } 6022 } 6023 6024 private: 6025 Mat src; 6026 Mat dst; 6027 double* M; 6028 int interpolation, borderType; 6029 Scalar borderValue; 6030 }; 6031 6032 6033 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0 6034 class IPPWarpPerspectiveInvoker : 6035 public ParallelLoopBody 6036 { 6037 public: 6038 IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation, 6039 int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) : 6040 ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs), 6041 borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok) 6042 { 6043 *ok = true; 6044 } 6045 6046 virtual void operator() (const Range& range) const 6047 { 6048 IppiSize srcsize = {src.cols, src.rows}; 6049 IppiRect srcroi = {0, 0, src.cols, src.rows}; 6050 IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start}; 6051 int cnn = src.channels(); 6052 6053 if( borderType == BORDER_CONSTANT ) 6054 { 6055 IppiSize setSize = {dst.cols, range.end - range.start}; 6056 void *dataPointer = dst.ptr(range.start); 6057 if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) ) 6058 { 6059 *ok = false; 6060 return; 6061 } 6062 } 6063 6064 IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode); 6065 if (status != ippStsNoErr) 6066 *ok = false; 6067 else 6068 { 6069 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 6070 } 6071 } 6072 private: 6073 Mat &src; 6074 Mat &dst; 6075 int mode; 6076 double (&coeffs)[3][3]; 6077 int borderType; 6078 const Scalar borderValue; 6079 ippiWarpPerspectiveFunc func; 6080 bool *ok; 6081 6082 const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&); 6083 }; 6084 #endif 6085 } 6086 6087 void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0, 6088 Size dsize, int flags, int borderType, const Scalar& borderValue ) 6089 { 6090 CV_Assert( _src.total() > 0 ); 6091 6092 CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(), 6093 ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue, 6094 OCL_OP_PERSPECTIVE)) 6095 6096 Mat src = _src.getMat(), M0 = _M0.getMat(); 6097 _dst.create( dsize.area() == 0 ? 
src.size() : dsize, src.type() ); 6098 Mat dst = _dst.getMat(); 6099 6100 if( dst.data == src.data ) 6101 src = src.clone(); 6102 6103 double M[9]; 6104 Mat matM(3, 3, CV_64F, M); 6105 int interpolation = flags & INTER_MAX; 6106 if( interpolation == INTER_AREA ) 6107 interpolation = INTER_LINEAR; 6108 6109 CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 ); 6110 M0.convertTo(matM, matM.type()); 6111 6112 #ifdef HAVE_TEGRA_OPTIMIZATION 6113 if( tegra::useTegra() && tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) ) 6114 return; 6115 #endif 6116 6117 6118 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0 6119 CV_IPP_CHECK() 6120 { 6121 int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type); 6122 if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) && 6123 (cn == 1 || cn == 3 || cn == 4) && 6124 ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) && 6125 (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC)) 6126 { 6127 ippiWarpPerspectiveFunc ippFunc = 0; 6128 if ((flags & WARP_INVERSE_MAP) != 0) 6129 { 6130 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R : 6131 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R : 6132 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R : 6133 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R : 6134 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R : 6135 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R : 6136 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R : 6137 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R : 6138 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0; 6139 } 6140 else 6141 { 6142 ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R : 6143 type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R : 6144 type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R : 6145 type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R : 6146 type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R : 6147 type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R : 6148 type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R : 6149 type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R : 6150 type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0; 6151 } 6152 int mode = 6153 interpolation == INTER_NEAREST ? IPPI_INTER_NN : 6154 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : 6155 interpolation == INTER_CUBIC ? 
IPPI_INTER_CUBIC : 0; 6156 CV_Assert(mode && ippFunc); 6157 6158 double coeffs[3][3]; 6159 for( int i = 0; i < 3; i++ ) 6160 for( int j = 0; j < 3; j++ ) 6161 coeffs[i][j] = matM.at<double>(i, j); 6162 6163 bool ok; 6164 Range range(0, dst.rows); 6165 IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok); 6166 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 6167 if( ok ) 6168 { 6169 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT); 6170 return; 6171 } 6172 setIppErrorStatus(); 6173 } 6174 } 6175 #endif 6176 6177 if( !(flags & WARP_INVERSE_MAP) ) 6178 invert(matM, matM); 6179 6180 Range range(0, dst.rows); 6181 WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue); 6182 parallel_for_(range, invoker, dst.total()/(double)(1<<16)); 6183 } 6184 6185 6186 cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale ) 6187 { 6188 angle *= CV_PI/180; 6189 double alpha = cos(angle)*scale; 6190 double beta = sin(angle)*scale; 6191 6192 Mat M(2, 3, CV_64F); 6193 double* m = M.ptr<double>(); 6194 6195 m[0] = alpha; 6196 m[1] = beta; 6197 m[2] = (1-alpha)*center.x - beta*center.y; 6198 m[3] = -beta; 6199 m[4] = alpha; 6200 m[5] = beta*center.x + (1-alpha)*center.y; 6201 6202 return M; 6203 } 6204 6205 /* Calculates coefficients of perspective transformation 6206 * which maps (xi,yi) to (ui,vi), (i=1,2,3,4): 6207 * 6208 * c00*xi + c01*yi + c02 6209 * ui = --------------------- 6210 * c20*xi + c21*yi + c22 6211 * 6212 * c10*xi + c11*yi + c12 6213 * vi = --------------------- 6214 * c20*xi + c21*yi + c22 6215 * 6216 * Coefficients are calculated by solving linear system: 6217 * / x0 y0 1 0 0 0 -x0*u0 -y0*u0 \ /c00\ /u0\ 6218 * | x1 y1 1 0 0 0 -x1*u1 -y1*u1 | |c01| |u1| 6219 * | x2 y2 1 0 0 0 -x2*u2 -y2*u2 | |c02| |u2| 6220 * | x3 y3 1 0 0 0 -x3*u3 -y3*u3 |.|c10|=|u3|, 6221 * | 0 0 0 x0 y0 1 -x0*v0 -y0*v0 | |c11| |v0| 6222 * | 0 0 0 x1 y1 1 -x1*v1 -y1*v1 | |c12| |v1| 6223 * | 0 0 0 x2 y2 1 -x2*v2 -y2*v2 | |c20| |v2| 6224 * \ 0 0 0 x3 y3 1 -x3*v3 -y3*v3 / \c21/ \v3/ 6225 * 6226 * where: 6227 * cij - matrix coefficients, c22 = 1 6228 */ 6229 cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] ) 6230 { 6231 Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr()); 6232 double a[8][8], b[8]; 6233 Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b); 6234 6235 for( int i = 0; i < 4; ++i ) 6236 { 6237 a[i][0] = a[i+4][3] = src[i].x; 6238 a[i][1] = a[i+4][4] = src[i].y; 6239 a[i][2] = a[i+4][5] = 1; 6240 a[i][3] = a[i][4] = a[i][5] = 6241 a[i+4][0] = a[i+4][1] = a[i+4][2] = 0; 6242 a[i][6] = -src[i].x*dst[i].x; 6243 a[i][7] = -src[i].y*dst[i].x; 6244 a[i+4][6] = -src[i].x*dst[i].y; 6245 a[i+4][7] = -src[i].y*dst[i].y; 6246 b[i] = dst[i].x; 6247 b[i+4] = dst[i].y; 6248 } 6249 6250 solve( A, B, X, DECOMP_SVD ); 6251 M.ptr<double>()[8] = 1.; 6252 6253 return M; 6254 } 6255 6256 /* Calculates coefficients of affine transformation 6257 * which maps (xi,yi) to (ui,vi), (i=1,2,3): 6258 * 6259 * ui = c00*xi + c01*yi + c02 6260 * 6261 * vi = c10*xi + c11*yi + c12 6262 * 6263 * Coefficients are calculated by solving linear system: 6264 * / x0 y0 1 0 0 0 \ /c00\ /u0\ 6265 * | x1 y1 1 0 0 0 | |c01| |u1| 6266 * | x2 y2 1 0 0 0 | |c02| |u2| 6267 * | 0 0 0 x0 y0 1 | |c10| |v0| 6268 * | 0 0 0 x1 y1 1 | |c11| |v1| 6269 * \ 0 0 0 x2 y2 1 / |c12| |v2| 6270 * 6271 * where: 6272 * cij - matrix coefficients 6273 */ 6274 6275 cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] ) 6276 { 6277 Mat M(2, 3, CV_64F), X(6, 1, 
CV_64F, M.ptr()); 6278 double a[6*6], b[6]; 6279 Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b); 6280 6281 for( int i = 0; i < 3; i++ ) 6282 { 6283 int j = i*12; 6284 int k = i*12+6; 6285 a[j] = a[k+3] = src[i].x; 6286 a[j+1] = a[k+4] = src[i].y; 6287 a[j+2] = a[k+5] = 1; 6288 a[j+3] = a[j+4] = a[j+5] = 0; 6289 a[k] = a[k+1] = a[k+2] = 0; 6290 b[i*2] = dst[i].x; 6291 b[i*2+1] = dst[i].y; 6292 } 6293 6294 solve( A, B, X ); 6295 return M; 6296 } 6297 6298 void cv::invertAffineTransform(InputArray _matM, OutputArray __iM) 6299 { 6300 Mat matM = _matM.getMat(); 6301 CV_Assert(matM.rows == 2 && matM.cols == 3); 6302 __iM.create(2, 3, matM.type()); 6303 Mat _iM = __iM.getMat(); 6304 6305 if( matM.type() == CV_32F ) 6306 { 6307 const float* M = matM.ptr<float>(); 6308 float* iM = _iM.ptr<float>(); 6309 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0])); 6310 6311 double D = M[0]*M[step+1] - M[1]*M[step]; 6312 D = D != 0 ? 1./D : 0; 6313 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; 6314 double b1 = -A11*M[2] - A12*M[step+2]; 6315 double b2 = -A21*M[2] - A22*M[step+2]; 6316 6317 iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1; 6318 iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2; 6319 } 6320 else if( matM.type() == CV_64F ) 6321 { 6322 const double* M = matM.ptr<double>(); 6323 double* iM = _iM.ptr<double>(); 6324 int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0])); 6325 6326 double D = M[0]*M[step+1] - M[1]*M[step]; 6327 D = D != 0 ? 1./D : 0; 6328 double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D; 6329 double b1 = -A11*M[2] - A12*M[step+2]; 6330 double b2 = -A21*M[2] - A22*M[step+2]; 6331 6332 iM[0] = A11; iM[1] = A12; iM[2] = b1; 6333 iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2; 6334 } 6335 else 6336 CV_Error( CV_StsUnsupportedFormat, "" ); 6337 } 6338 6339 cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst) 6340 { 6341 Mat src = _src.getMat(), dst = _dst.getMat(); 6342 CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4); 6343 return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data); 6344 } 6345 6346 cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst) 6347 { 6348 Mat src = _src.getMat(), dst = _dst.getMat(); 6349 CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3); 6350 return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data); 6351 } 6352 6353 CV_IMPL void 6354 cvResize( const CvArr* srcarr, CvArr* dstarr, int method ) 6355 { 6356 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 6357 CV_Assert( src.type() == dst.type() ); 6358 cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols, 6359 (double)dst.rows/src.rows, method ); 6360 } 6361 6362 6363 CV_IMPL void 6364 cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr, 6365 int flags, CvScalar fillval ) 6366 { 6367 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 6368 cv::Mat matrix = cv::cvarrToMat(marr); 6369 CV_Assert( src.type() == dst.type() ); 6370 cv::warpAffine( src, dst, matrix, dst.size(), flags, 6371 (flags & CV_WARP_FILL_OUTLIERS) ? 
cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT, 6372 fillval ); 6373 } 6374 6375 CV_IMPL void 6376 cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr, 6377 int flags, CvScalar fillval ) 6378 { 6379 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr); 6380 cv::Mat matrix = cv::cvarrToMat(marr); 6381 CV_Assert( src.type() == dst.type() ); 6382 cv::warpPerspective( src, dst, matrix, dst.size(), flags, 6383 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT, 6384 fillval ); 6385 } 6386 6387 CV_IMPL void 6388 cvRemap( const CvArr* srcarr, CvArr* dstarr, 6389 const CvArr* _mapx, const CvArr* _mapy, 6390 int flags, CvScalar fillval ) 6391 { 6392 cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst; 6393 cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy); 6394 CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() ); 6395 cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX, 6396 (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT, 6397 fillval ); 6398 CV_Assert( dst0.data == dst.data ); 6399 } 6400 6401 6402 CV_IMPL CvMat* 6403 cv2DRotationMatrix( CvPoint2D32f center, double angle, 6404 double scale, CvMat* matrix ) 6405 { 6406 cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale); 6407 CV_Assert( M.size() == M0.size() ); 6408 M.convertTo(M0, M0.type()); 6409 return matrix; 6410 } 6411 6412 6413 CV_IMPL CvMat* 6414 cvGetPerspectiveTransform( const CvPoint2D32f* src, 6415 const CvPoint2D32f* dst, 6416 CvMat* matrix ) 6417 { 6418 cv::Mat M0 = cv::cvarrToMat(matrix), 6419 M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst); 6420 CV_Assert( M.size() == M0.size() ); 6421 M.convertTo(M0, M0.type()); 6422 return matrix; 6423 } 6424 6425 6426 CV_IMPL CvMat* 6427 cvGetAffineTransform( const CvPoint2D32f* src, 6428 const CvPoint2D32f* dst, 6429 CvMat* matrix ) 6430 { 6431 cv::Mat M0 = cv::cvarrToMat(matrix), 6432 M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst); 6433 CV_Assert( M.size() == M0.size() ); 6434 M.convertTo(M0, M0.type()); 6435 return matrix; 6436 } 6437 6438 6439 CV_IMPL void 6440 cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 ) 6441 { 6442 cv::Mat map1 = cv::cvarrToMat(arr1), map2; 6443 cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2; 6444 6445 if( arr2 ) 6446 map2 = cv::cvarrToMat(arr2); 6447 if( dstarr2 ) 6448 { 6449 dstmap2 = cv::cvarrToMat(dstarr2); 6450 if( dstmap2.type() == CV_16SC1 ) 6451 dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step); 6452 } 6453 6454 cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false ); 6455 } 6456 6457 /****************************************************************************************\ 6458 * Log-Polar Transform * 6459 \****************************************************************************************/ 6460 6461 /* now it is done via Remap; more correct implementation should use 6462 some super-sampling technique outside of the "fovea" circle */ 6463 CV_IMPL void 6464 cvLogPolar( const CvArr* srcarr, CvArr* dstarr, 6465 CvPoint2D32f center, double M, int flags ) 6466 { 6467 cv::Ptr<CvMat> mapx, mapy; 6468 6469 CvMat srcstub, *src = cvGetMat(srcarr, &srcstub); 6470 CvMat dststub, *dst = cvGetMat(dstarr, &dststub); 6471 CvSize ssize, dsize; 6472 6473 if( !CV_ARE_TYPES_EQ( src, dst )) 6474 CV_Error( CV_StsUnmatchedFormats, "" ); 6475 6476 if( M 
<= 0 ) 6477 CV_Error( CV_StsOutOfRange, "M should be >0" ); 6478 6479 ssize = cvGetMatSize(src); 6480 dsize = cvGetMatSize(dst); 6481 6482 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F )); 6483 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F )); 6484 6485 if( !(flags & CV_WARP_INVERSE_MAP) ) 6486 { 6487 int phi, rho; 6488 cv::AutoBuffer<double> _exp_tab(dsize.width); 6489 double* exp_tab = _exp_tab; 6490 6491 for( rho = 0; rho < dst->width; rho++ ) 6492 exp_tab[rho] = std::exp(rho/M); 6493 6494 for( phi = 0; phi < dsize.height; phi++ ) 6495 { 6496 double cp = cos(phi*2*CV_PI/dsize.height); 6497 double sp = sin(phi*2*CV_PI/dsize.height); 6498 float* mx = (float*)(mapx->data.ptr + phi*mapx->step); 6499 float* my = (float*)(mapy->data.ptr + phi*mapy->step); 6500 6501 for( rho = 0; rho < dsize.width; rho++ ) 6502 { 6503 double r = exp_tab[rho]; 6504 double x = r*cp + center.x; 6505 double y = r*sp + center.y; 6506 6507 mx[rho] = (float)x; 6508 my[rho] = (float)y; 6509 } 6510 } 6511 } 6512 else 6513 { 6514 int x, y; 6515 CvMat bufx, bufy, bufp, bufa; 6516 double ascale = ssize.height/(2*CV_PI); 6517 cv::AutoBuffer<float> _buf(4*dsize.width); 6518 float* buf = _buf; 6519 6520 bufx = cvMat( 1, dsize.width, CV_32F, buf ); 6521 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width ); 6522 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 ); 6523 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 ); 6524 6525 for( x = 0; x < dsize.width; x++ ) 6526 bufx.data.fl[x] = (float)x - center.x; 6527 6528 for( y = 0; y < dsize.height; y++ ) 6529 { 6530 float* mx = (float*)(mapx->data.ptr + y*mapx->step); 6531 float* my = (float*)(mapy->data.ptr + y*mapy->step); 6532 6533 for( x = 0; x < dsize.width; x++ ) 6534 bufy.data.fl[x] = (float)y - center.y; 6535 6536 #if 1 6537 cvCartToPolar( &bufx, &bufy, &bufp, &bufa ); 6538 6539 for( x = 0; x < dsize.width; x++ ) 6540 bufp.data.fl[x] += 1.f; 6541 6542 cvLog( &bufp, &bufp ); 6543 6544 for( x = 0; x < dsize.width; x++ ) 6545 { 6546 double rho = bufp.data.fl[x]*M; 6547 double phi = bufa.data.fl[x]*ascale; 6548 6549 mx[x] = (float)rho; 6550 my[x] = (float)phi; 6551 } 6552 #else 6553 for( x = 0; x < dsize.width; x++ ) 6554 { 6555 double xx = bufx.data.fl[x]; 6556 double yy = bufy.data.fl[x]; 6557 6558 double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M; 6559 double a = atan2(yy,xx); 6560 if( a < 0 ) 6561 a = 2*CV_PI + a; 6562 a *= ascale; 6563 6564 mx[x] = (float)p; 6565 my[x] = (float)a; 6566 } 6567 #endif 6568 } 6569 } 6570 6571 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) ); 6572 } 6573 6574 void cv::logPolar( InputArray _src, OutputArray _dst, 6575 Point2f center, double M, int flags ) 6576 { 6577 Mat src = _src.getMat(); 6578 _dst.create( src.size(), src.type() ); 6579 CvMat c_src = src, c_dst = _dst.getMat(); 6580 cvLogPolar( &c_src, &c_dst, center, M, flags ); 6581 } 6582 6583 /**************************************************************************************** 6584 Linear-Polar Transform 6585 J.L. 
Blanco, Apr 2009 6586 ****************************************************************************************/ 6587 CV_IMPL 6588 void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr, 6589 CvPoint2D32f center, double maxRadius, int flags ) 6590 { 6591 cv::Ptr<CvMat> mapx, mapy; 6592 6593 CvMat srcstub, *src = (CvMat*)srcarr; 6594 CvMat dststub, *dst = (CvMat*)dstarr; 6595 CvSize ssize, dsize; 6596 6597 src = cvGetMat( srcarr, &srcstub,0,0 ); 6598 dst = cvGetMat( dstarr, &dststub,0,0 ); 6599 6600 if( !CV_ARE_TYPES_EQ( src, dst )) 6601 CV_Error( CV_StsUnmatchedFormats, "" ); 6602 6603 ssize.width = src->cols; 6604 ssize.height = src->rows; 6605 dsize.width = dst->cols; 6606 dsize.height = dst->rows; 6607 6608 mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F )); 6609 mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F )); 6610 6611 if( !(flags & CV_WARP_INVERSE_MAP) ) 6612 { 6613 int phi, rho; 6614 6615 for( phi = 0; phi < dsize.height; phi++ ) 6616 { 6617 double cp = cos(phi*2*CV_PI/dsize.height); 6618 double sp = sin(phi*2*CV_PI/dsize.height); 6619 float* mx = (float*)(mapx->data.ptr + phi*mapx->step); 6620 float* my = (float*)(mapy->data.ptr + phi*mapy->step); 6621 6622 for( rho = 0; rho < dsize.width; rho++ ) 6623 { 6624 double r = maxRadius*(rho+1)/dsize.width; 6625 double x = r*cp + center.x; 6626 double y = r*sp + center.y; 6627 6628 mx[rho] = (float)x; 6629 my[rho] = (float)y; 6630 } 6631 } 6632 } 6633 else 6634 { 6635 int x, y; 6636 CvMat bufx, bufy, bufp, bufa; 6637 const double ascale = ssize.height/(2*CV_PI); 6638 const double pscale = ssize.width/maxRadius; 6639 6640 cv::AutoBuffer<float> _buf(4*dsize.width); 6641 float* buf = _buf; 6642 6643 bufx = cvMat( 1, dsize.width, CV_32F, buf ); 6644 bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width ); 6645 bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 ); 6646 bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 ); 6647 6648 for( x = 0; x < dsize.width; x++ ) 6649 bufx.data.fl[x] = (float)x - center.x; 6650 6651 for( y = 0; y < dsize.height; y++ ) 6652 { 6653 float* mx = (float*)(mapx->data.ptr + y*mapx->step); 6654 float* my = (float*)(mapy->data.ptr + y*mapy->step); 6655 6656 for( x = 0; x < dsize.width; x++ ) 6657 bufy.data.fl[x] = (float)y - center.y; 6658 6659 cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 ); 6660 6661 for( x = 0; x < dsize.width; x++ ) 6662 bufp.data.fl[x] += 1.f; 6663 6664 for( x = 0; x < dsize.width; x++ ) 6665 { 6666 double rho = bufp.data.fl[x]*pscale; 6667 double phi = bufa.data.fl[x]*ascale; 6668 mx[x] = (float)rho; 6669 my[x] = (float)phi; 6670 } 6671 } 6672 } 6673 6674 cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) ); 6675 } 6676 6677 void cv::linearPolar( InputArray _src, OutputArray _dst, 6678 Point2f center, double maxRadius, int flags ) 6679 { 6680 Mat src = _src.getMat(); 6681 _dst.create( src.size(), src.type() ); 6682 CvMat c_src = src, c_dst = _dst.getMat(); 6683 cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags ); 6684 } 6685 6686 /* End of file. */ 6687
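/* Appended usage sketch (kept out of the build with #if 0): a minimal, hedged example of the
   public entry points implemented above -- getRotationMatrix2D + warpAffine for a rotation about
   the image centre, and getPerspectiveTransform + warpPerspective for a four-point mapping.
   The image size, angle and quad corners below are arbitrary illustrative values, not anything
   this file depends on. */
#if 0
static void warpUsageSketch()
{
    cv::Mat src(480, 640, CV_8UC3, cv::Scalar(64, 128, 192));

    // Rotate 30 degrees about the centre; R is the 2x3 CV_64F matrix built by getRotationMatrix2D.
    cv::Point2f center(src.cols * 0.5f, src.rows * 0.5f);
    cv::Mat R = cv::getRotationMatrix2D(center, 30.0, 1.0);
    cv::Mat rotated;
    cv::warpAffine(src, rotated, R, src.size(), cv::INTER_LINEAR,
                   cv::BORDER_CONSTANT, cv::Scalar());

    // The forward map can be undone either with WARP_INVERSE_MAP or by inverting R explicitly.
    cv::Mat Rinv;
    cv::invertAffineTransform(R, Rinv);

    // Map the full image quad onto an arbitrary convex quad with a homography.
    cv::Point2f srcQuad[4], dstQuad[4];
    srcQuad[0] = cv::Point2f(0.f, 0.f);
    srcQuad[1] = cv::Point2f((float)src.cols - 1, 0.f);
    srcQuad[2] = cv::Point2f((float)src.cols - 1, (float)src.rows - 1);
    srcQuad[3] = cv::Point2f(0.f, (float)src.rows - 1);
    dstQuad[0] = cv::Point2f(40.f, 20.f);
    dstQuad[1] = cv::Point2f((float)src.cols - 80, 40.f);
    dstQuad[2] = cv::Point2f((float)src.cols - 40, (float)src.rows - 30);
    dstQuad[3] = cv::Point2f(20.f, (float)src.rows - 60);
    cv::Mat H = cv::getPerspectiveTransform(srcQuad, dstQuad);  // 3x3 CV_64F
    cv::Mat warped;
    cv::warpPerspective(src, warped, H, src.size());
}
#endif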