/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistributions of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistributions in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

/* ////////////////////////////////////////////////////////////////////
//
//  Geometrical transforms on images and matrices: rotation, zoom etc.
//
// */

#include "precomp.hpp"
#include "opencl_kernels_imgproc.hpp"

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7)
static IppStatus sts = ippInit();
#endif

namespace cv
{
#if IPP_VERSION_X100 >= 701
    typedef IppStatus (CV_STDCALL* ippiResizeFunc)(const void*, int, const void*, int, IppiPoint, IppiSize, IppiBorderType, void*, void*, Ipp8u*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetBufferSize)(void*, IppiSize, Ipp32u, int*);
    typedef IppStatus (CV_STDCALL* ippiResizeGetSrcOffset)(void*, IppiPoint, IppiPoint*);
#endif

#if defined (HAVE_IPP) && (IPP_VERSION_MAJOR >= 7) && 0
    typedef IppStatus (CV_STDCALL* ippiSetFunc)(const void*, void *, int, IppiSize);
    typedef IppStatus (CV_STDCALL* ippiWarpPerspectiveFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [3][3], int);
    typedef IppStatus (CV_STDCALL* ippiWarpAffineBackFunc)(const void*, IppiSize, int, IppiRect, void *, int, IppiRect, double [2][3], int);

    template <int channels, typename Type>
    bool IPPSetSimple(cv::Scalar value, void *dataPointer, int step, IppiSize &size, ippiSetFunc func)
    {
        Type values[channels];
        for( int i = 0; i < channels; i++ )
            values[i] = saturate_cast<Type>(value[i]);
        return func(values, dataPointer, step, size) >= 0;
    }

    static bool IPPSet(const cv::Scalar &value, void *dataPointer, int step, IppiSize &size, int channels, int depth)
    {
        if( channels == 1 )
        {
            switch( depth )
            {
            case CV_8U:
                return ippiSet_8u_C1R(saturate_cast<Ipp8u>(value[0]), (Ipp8u *)dataPointer, step, size) >= 0;
            case CV_16U:
                return ippiSet_16u_C1R(saturate_cast<Ipp16u>(value[0]), (Ipp16u *)dataPointer, step, size) >= 0;
            case CV_32F:
                return ippiSet_32f_C1R(saturate_cast<Ipp32f>(value[0]), (Ipp32f *)dataPointer, step, size) >= 0;
            }
        }
        else
        {
            if( channels == 3 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<3, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C3R);
                case CV_16U:
                    return IPPSetSimple<3, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C3R);
                case CV_32F:
                    return IPPSetSimple<3, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C3R);
                }
            }
            else if( channels == 4 )
            {
                switch( depth )
                {
                case CV_8U:
                    return IPPSetSimple<4, Ipp8u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_8u_C4R);
                case CV_16U:
                    return IPPSetSimple<4, Ipp16u>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_16u_C4R);
                case CV_32F:
                    return IPPSetSimple<4, Ipp32f>(value, dataPointer, step, size, (ippiSetFunc)ippiSet_32f_C4R);
                }
            }
        }
        return false;
    }
#endif

/************** interpolation formulas and tables ***************/

const int INTER_RESIZE_COEF_BITS=11;
const int INTER_RESIZE_COEF_SCALE=1 << INTER_RESIZE_COEF_BITS;

const int INTER_REMAP_COEF_BITS=15;
const int INTER_REMAP_COEF_SCALE=1 << INTER_REMAP_COEF_BITS;
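
/* A note on the fixed-point scales above (a sketch of the arithmetic, derived
   from the constants as defined here): interpolation weights are stored as
   integers scaled by 2^INTER_RESIZE_COEF_BITS for resize and by
   2^INTER_REMAP_COEF_BITS for remap. For example, a bilinear weight of 0.25
   becomes cvRound(0.25*INTER_RESIZE_COEF_SCALE) == 512, and a weight pair
   (w, 1-w) sums (up to rounding) to INTER_RESIZE_COEF_SCALE, so results are
   brought back to pixel range with a single rounding right shift (see
   FixedPtCast below). */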

static uchar NNDeltaTab_i[INTER_TAB_SIZE2][2];

static float BilinearTab_f[INTER_TAB_SIZE2][2][2];
static short BilinearTab_i[INTER_TAB_SIZE2][2][2];

#if CV_SSE2 || CV_NEON
static short BilinearTab_iC4_buf[INTER_TAB_SIZE2+2][2][8];
static short (*BilinearTab_iC4)[2][8] = (short (*)[2][8])alignPtr(BilinearTab_iC4_buf, 16);
#endif

static float BicubicTab_f[INTER_TAB_SIZE2][4][4];
static short BicubicTab_i[INTER_TAB_SIZE2][4][4];

static float Lanczos4Tab_f[INTER_TAB_SIZE2][8][8];
static short Lanczos4Tab_i[INTER_TAB_SIZE2][8][8];

static inline void interpolateLinear( float x, float* coeffs )
{
    coeffs[0] = 1.f - x;
    coeffs[1] = x;
}

static inline void interpolateCubic( float x, float* coeffs )
{
    const float A = -0.75f;

    coeffs[0] = ((A*(x + 1) - 5*A)*(x + 1) + 8*A)*(x + 1) - 4*A;
    coeffs[1] = ((A + 2)*x - (A + 3))*x*x + 1;
    coeffs[2] = ((A + 2)*(1 - x) - (A + 3))*(1 - x)*(1 - x) + 1;
    coeffs[3] = 1.f - coeffs[0] - coeffs[1] - coeffs[2];
}
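
/* interpolateCubic() evaluates the standard cubic convolution kernel (Keys'
   family) with parameter A = -0.75 at the four sample distances x+1, x, 1-x
   and 2-x. coeffs[3] is derived as 1 minus the other three, so the four
   weights always sum to exactly 1 and a constant signal passes through the
   filter unchanged. */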

static inline void interpolateLanczos4( float x, float* coeffs )
{
    static const double s45 = 0.70710678118654752440084436210485;
    static const double cs[][2]=
    {{1, 0}, {-s45, -s45}, {0, 1}, {s45, -s45}, {-1, 0}, {s45, s45}, {0, -1}, {-s45, s45}};

    if( x < FLT_EPSILON )
    {
        for( int i = 0; i < 8; i++ )
            coeffs[i] = 0;
        coeffs[3] = 1;
        return;
    }

    float sum = 0;
    double y0=-(x+3)*CV_PI*0.25, s0 = sin(y0), c0=cos(y0);
    for(int i = 0; i < 8; i++ )
    {
        double y = -(x+3-i)*CV_PI*0.25;
        coeffs[i] = (float)((cs[i][0]*s0 + cs[i][1]*c0)/(y*y));
        sum += coeffs[i];
    }

    sum = 1.f/sum;
    for(int i = 0; i < 8; i++ )
        coeffs[i] *= sum;
}
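
/* interpolateLanczos4() computes the 8 taps of the Lanczos (a = 4) windowed
   sinc. Instead of calling sin() per tap, it evaluates one sin/cos pair at y0
   and rotates it through multiples of pi/4 via the cs[][] table (angle
   addition); the common factor that the full product
   sin(pi*t)*sin(pi*t/4)/t^2 would carry is sign-folded into cs[][] and
   absorbed by the final renormalization, which also makes the truncated
   kernel sum to exactly 1. */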

static void initInterTab1D(int method, float* tab, int tabsz)
{
    float scale = 1.f/tabsz;
    if( method == INTER_LINEAR )
    {
        for( int i = 0; i < tabsz; i++, tab += 2 )
            interpolateLinear( i*scale, tab );
    }
    else if( method == INTER_CUBIC )
    {
        for( int i = 0; i < tabsz; i++, tab += 4 )
            interpolateCubic( i*scale, tab );
    }
    else if( method == INTER_LANCZOS4 )
    {
        for( int i = 0; i < tabsz; i++, tab += 8 )
            interpolateLanczos4( i*scale, tab );
    }
    else
        CV_Error( CV_StsBadArg, "Unknown interpolation method" );
}


static const void* initInterTab2D( int method, bool fixpt )
{
    static bool inittab[INTER_MAX+1] = {false};
    float* tab = 0;
    short* itab = 0;
    int ksize = 0;
    if( method == INTER_LINEAR )
        tab = BilinearTab_f[0][0], itab = BilinearTab_i[0][0], ksize=2;
    else if( method == INTER_CUBIC )
        tab = BicubicTab_f[0][0], itab = BicubicTab_i[0][0], ksize=4;
    else if( method == INTER_LANCZOS4 )
        tab = Lanczos4Tab_f[0][0], itab = Lanczos4Tab_i[0][0], ksize=8;
    else
        CV_Error( CV_StsBadArg, "Unknown/unsupported interpolation type" );

    if( !inittab[method] )
    {
        AutoBuffer<float> _tab(8*INTER_TAB_SIZE);
        int i, j, k1, k2;
        initInterTab1D(method, _tab, INTER_TAB_SIZE);
        for( i = 0; i < INTER_TAB_SIZE; i++ )
            for( j = 0; j < INTER_TAB_SIZE; j++, tab += ksize*ksize, itab += ksize*ksize )
            {
                int isum = 0;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][0] = j < INTER_TAB_SIZE/2;
                NNDeltaTab_i[i*INTER_TAB_SIZE+j][1] = i < INTER_TAB_SIZE/2;

                for( k1 = 0; k1 < ksize; k1++ )
                {
                    float vy = _tab[i*ksize + k1];
                    for( k2 = 0; k2 < ksize; k2++ )
                    {
                        float v = vy*_tab[j*ksize + k2];
                        tab[k1*ksize + k2] = v;
                        isum += itab[k1*ksize + k2] = saturate_cast<short>(v*INTER_REMAP_COEF_SCALE);
                    }
                }

                if( isum != INTER_REMAP_COEF_SCALE )
                {
                    int diff = isum - INTER_REMAP_COEF_SCALE;
                    int ksize2 = ksize/2, Mk1=ksize2, Mk2=ksize2, mk1=ksize2, mk2=ksize2;
                    for( k1 = ksize2; k1 < ksize2+2; k1++ )
                        for( k2 = ksize2; k2 < ksize2+2; k2++ )
                        {
                            if( itab[k1*ksize+k2] < itab[mk1*ksize+mk2] )
                                mk1 = k1, mk2 = k2;
                            else if( itab[k1*ksize+k2] > itab[Mk1*ksize+Mk2] )
                                Mk1 = k1, Mk2 = k2;
                        }
                    if( diff < 0 )
                        itab[Mk1*ksize + Mk2] = (short)(itab[Mk1*ksize + Mk2] - diff);
                    else
                        itab[mk1*ksize + mk2] = (short)(itab[mk1*ksize + mk2] - diff);
                }
            }
        tab -= INTER_TAB_SIZE2*ksize*ksize;
        itab -= INTER_TAB_SIZE2*ksize*ksize;
#if CV_SSE2 || CV_NEON
        if( method == INTER_LINEAR )
        {
            for( i = 0; i < INTER_TAB_SIZE2; i++ )
                for( j = 0; j < 4; j++ )
                {
                    BilinearTab_iC4[i][0][j*2] = BilinearTab_i[i][0][0];
                    BilinearTab_iC4[i][0][j*2+1] = BilinearTab_i[i][0][1];
                    BilinearTab_iC4[i][1][j*2] = BilinearTab_i[i][1][0];
                    BilinearTab_iC4[i][1][j*2+1] = BilinearTab_i[i][1][1];
                }
        }
#endif
        inittab[method] = true;
    }
    return fixpt ? (const void*)itab : (const void*)tab;
}
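
/* The 2D tables filled above are outer products of the 1D kernels: entry
   (i, j) stores the ksize x ksize matrix tab1D[i] * tab1D[j]^T. For the
   fixed-point tables, rounding each product to a short can leave a kernel
   summing to slightly more or less than INTER_REMAP_COEF_SCALE; the fix-up
   loop pushes the whole difference into the largest (or smallest) of the
   four central taps, so every integer kernel sums to exactly 2^15 and a
   constant image survives remapping bit-exactly. */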

#ifndef __MINGW32__
static bool initAllInterTab2D()
{
    return  initInterTab2D( INTER_LINEAR, false ) &&
            initInterTab2D( INTER_LINEAR, true ) &&
            initInterTab2D( INTER_CUBIC, false ) &&
            initInterTab2D( INTER_CUBIC, true ) &&
            initInterTab2D( INTER_LANCZOS4, false ) &&
            initInterTab2D( INTER_LANCZOS4, true );
}

static volatile bool doInitAllInterTab2D = initAllInterTab2D();
#endif

template<typename ST, typename DT> struct Cast
{
    typedef ST type1;
    typedef DT rtype;

    DT operator()(ST val) const { return saturate_cast<DT>(val); }
};

template<typename ST, typename DT, int bits> struct FixedPtCast
{
    typedef ST type1;
    typedef DT rtype;
    enum { SHIFT = bits, DELTA = 1 << (bits-1) };

    DT operator()(ST val) const { return saturate_cast<DT>((val + DELTA)>>SHIFT); }
};

/****************************************************************************************\
*                                         Resize                                         *
\****************************************************************************************/

class resizeNNInvoker :
    public ParallelLoopBody
{
public:
    resizeNNInvoker(const Mat& _src, Mat &_dst, int *_x_ofs, int _pix_size4, double _ify) :
        ParallelLoopBody(), src(_src), dst(_dst), x_ofs(_x_ofs), pix_size4(_pix_size4),
        ify(_ify)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int y, x, pix_size = (int)src.elemSize();

        for( y = range.start; y < range.end; y++ )
        {
            uchar* D = dst.data + dst.step*y;
            int sy = std::min(cvFloor(y*ify), ssize.height-1);
            const uchar* S = src.ptr(sy);

            switch( pix_size )
            {
            case 1:
                for( x = 0; x <= dsize.width - 2; x += 2 )
                {
                    uchar t0 = S[x_ofs[x]];
                    uchar t1 = S[x_ofs[x+1]];
                    D[x] = t0;
                    D[x+1] = t1;
                }

                for( ; x < dsize.width; x++ )
                    D[x] = S[x_ofs[x]];
                break;
            case 2:
                for( x = 0; x < dsize.width; x++ )
                    *(ushort*)(D + x*2) = *(ushort*)(S + x_ofs[x]);
                break;
            case 3:
                for( x = 0; x < dsize.width; x++, D += 3 )
                {
                    const uchar* _tS = S + x_ofs[x];
                    D[0] = _tS[0]; D[1] = _tS[1]; D[2] = _tS[2];
                }
                break;
            case 4:
                for( x = 0; x < dsize.width; x++ )
                    *(int*)(D + x*4) = *(int*)(S + x_ofs[x]);
                break;
            case 6:
                for( x = 0; x < dsize.width; x++, D += 6 )
                {
                    const ushort* _tS = (const ushort*)(S + x_ofs[x]);
                    ushort* _tD = (ushort*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            case 8:
                for( x = 0; x < dsize.width; x++, D += 8 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1];
                }
                break;
            case 12:
                for( x = 0; x < dsize.width; x++, D += 12 )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    _tD[0] = _tS[0]; _tD[1] = _tS[1]; _tD[2] = _tS[2];
                }
                break;
            default:
                for( x = 0; x < dsize.width; x++, D += pix_size )
                {
                    const int* _tS = (const int*)(S + x_ofs[x]);
                    int* _tD = (int*)D;
                    for( int k = 0; k < pix_size4; k++ )
                        _tD[k] = _tS[k];
                }
            }
        }
    }

private:
    const Mat src;
    Mat dst;
    int* x_ofs, pix_size4;
    double ify;

    resizeNNInvoker(const resizeNNInvoker&);
    resizeNNInvoker& operator=(const resizeNNInvoker&);
};

static void
resizeNN( const Mat& src, Mat& dst, double fx, double fy )
{
    Size ssize = src.size(), dsize = dst.size();
    AutoBuffer<int> _x_ofs(dsize.width);
    int* x_ofs = _x_ofs;
    int pix_size = (int)src.elemSize();
    int pix_size4 = (int)(pix_size / sizeof(int));
    double ifx = 1./fx, ify = 1./fy;
    int x;

    for( x = 0; x < dsize.width; x++ )
    {
        int sx = cvFloor(x*ifx);
        x_ofs[x] = std::min(sx, ssize.width-1)*pix_size;
    }

    Range range(0, dsize.height);
    resizeNNInvoker invoker(src, dst, x_ofs, pix_size4, ify);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
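
/* resizeNN() precomputes x_ofs[] once, so the per-row loop is a pure gather:
   each destination column maps to a source byte offset with no multiplies in
   the inner switch cases, and common pixel sizes (1-4, 6, 8 and 12 bytes) get
   dedicated copy loops. A minimal usage sketch through the public API
   (hypothetical caller, not part of this file):

       cv::Mat src = cv::imread("input.png"), dst;
       cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_NEAREST);
*/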


struct VResizeNoVec
{
    int operator()(const uchar**, uchar*, const uchar*, int ) const { return 0; }
};

struct HResizeNoVec
{
    int operator()(const uchar**, uchar**, int, const int*,
        const uchar*, int, int, int, int, int) const { return 0; }
};

#if CV_SSE2

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1];
        int x = 0;
        __m128i b0 = _mm_set1_epi16(beta[0]), b1 = _mm_set1_epi16(beta[1]);
        __m128i delta = _mm_set1_epi16(2);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_load_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_load_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_load_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128i x0, x1, x2, y0, y1, y2;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));
                x0 = _mm_packs_epi32(_mm_srai_epi32(x0, 4), _mm_srai_epi32(x1, 4));
                y0 = _mm_packs_epi32(_mm_srai_epi32(y0, 4), _mm_srai_epi32(y1, 4));

                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 8));
                x2 = _mm_loadu_si128((const __m128i*)(S0 + x + 12));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 8));
                y2 = _mm_loadu_si128((const __m128i*)(S1 + x + 12));
                x1 = _mm_packs_epi32(_mm_srai_epi32(x1, 4), _mm_srai_epi32(x2, 4));
                y1 = _mm_packs_epi32(_mm_srai_epi32(y1, 4), _mm_srai_epi32(y2, 4));

                x0 = _mm_adds_epi16(_mm_mulhi_epi16( x0, b0 ), _mm_mulhi_epi16( y0, b1 ));
                x1 = _mm_adds_epi16(_mm_mulhi_epi16( x1, b0 ), _mm_mulhi_epi16( y1, b1 ));

                x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
                x1 = _mm_srai_epi16(_mm_adds_epi16(x1, delta), 2);
                _mm_storeu_si128( (__m128i*)(dst + x), _mm_packus_epi16(x0, x1));
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128i x0, y0;
            x0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S0 + x)), 4);
            y0 = _mm_srai_epi32(_mm_loadu_si128((const __m128i*)(S1 + x)), 4);
            x0 = _mm_packs_epi32(x0, x0);
            y0 = _mm_packs_epi32(y0, y0);
            x0 = _mm_adds_epi16(_mm_mulhi_epi16(x0, b0), _mm_mulhi_epi16(y0, b1));
            x0 = _mm_srai_epi16(_mm_adds_epi16(x0, delta), 2);
            x0 = _mm_packus_epi16(x0, x0);
            *(int*)(dst + x) = _mm_cvtsi128_si32(x0);
        }

        return x;
    }
};
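
/* Fixed-point bookkeeping for the block above (derived from the Q11
   coefficient scale used by the bilinear resize path): the input rows are
   horizontal-pass sums in Q11 (pixel value * 2^11) and beta[] is also Q11,
   so the exact result needs a >> 22 at the end. The vector code splits that
   shift into three parts: >> 4 so each lane fits a signed 16-bit pack, an
   implicit >> 16 inside _mm_mulhi_epi16, and a final rounding (v + 2) >> 2;
   4 + 16 + 2 == 22 bits in total. */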


template<int shiftval> struct VResizeLinearVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        ushort* dst = (ushort*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_load_ps(S0 + x + 8);
                x1 = _mm_load_ps(S0 + x + 12);
                y0 = _mm_load_ps(S1 + x + 8);
                y1 = _mm_load_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }
        else
            for( ; x <= width - 16; x += 16 )
            {
                __m128 x0, x1, y0, y1;
                __m128i t0, t1, t2;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t0 = _mm_add_epi16(_mm_packs_epi32(t0, t2), postshift);

                x0 = _mm_loadu_ps(S0 + x + 8);
                x1 = _mm_loadu_ps(S0 + x + 12);
                y0 = _mm_loadu_ps(S1 + x + 8);
                y1 = _mm_loadu_ps(S1 + x + 12);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));
                t1 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
                t2 = _mm_add_epi32(_mm_cvtps_epi32(x1), preshift);
                t1 = _mm_add_epi16(_mm_packs_epi32(t1, t2), postshift);

                _mm_storeu_si128( (__m128i*)(dst + x), t0);
                _mm_storeu_si128( (__m128i*)(dst + x + 8), t1);
            }

        for( ; x < width - 4; x += 4 )
        {
            __m128 x0, y0;
            __m128i t0;
            x0 = _mm_loadu_ps(S0 + x);
            y0 = _mm_loadu_ps(S1 + x);

            x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
            t0 = _mm_add_epi32(_mm_cvtps_epi32(x0), preshift);
            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t0), postshift);
            _mm_storel_epi64( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeLinearVec_32f16<SHRT_MIN> VResizeLinearVec_32f16u;
typedef VResizeLinearVec_32f16<0> VResizeLinearVec_32f16s;

struct VResizeLinearVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]);

        if( (((size_t)S0|(size_t)S1)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_load_ps(S0 + x);
                x1 = _mm_load_ps(S0 + x + 4);
                y0 = _mm_load_ps(S1 + x);
                y1 = _mm_load_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128 x0, x1, y0, y1;
                x0 = _mm_loadu_ps(S0 + x);
                x1 = _mm_loadu_ps(S0 + x + 4);
                y0 = _mm_loadu_ps(S1 + x);
                y1 = _mm_loadu_ps(S1 + x + 4);

                x0 = _mm_add_ps(_mm_mul_ps(x0, b0), _mm_mul_ps(y0, b1));
                x1 = _mm_add_ps(_mm_mul_ps(x1, b0), _mm_mul_ps(y1, b1));

                _mm_storeu_ps( dst + x, x0);
                _mm_storeu_ps( dst + x + 4, x1);
            }

        return x;
    }
};


struct VResizeCubicVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const int** src = (const int**)_src;
        const short* beta = (const short*)_beta;
        const int *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        int x = 0;
        float scale = 1.f/(INTER_RESIZE_COEF_SCALE*INTER_RESIZE_COEF_SCALE);
        __m128 b0 = _mm_set1_ps(beta[0]*scale), b1 = _mm_set1_ps(beta[1]*scale),
            b2 = _mm_set1_ps(beta[2]*scale), b3 = _mm_set1_ps(beta[3]*scale);

        if( (((size_t)S0|(size_t)S1|(size_t)S2|(size_t)S3)&15) == 0 )
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_load_si128((const __m128i*)(S0 + x));
                x1 = _mm_load_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S1 + x));
                y1 = _mm_load_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_load_si128((const __m128i*)(S2 + x));
                x1 = _mm_load_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_load_si128((const __m128i*)(S3 + x));
                y1 = _mm_load_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }
        else
            for( ; x <= width - 8; x += 8 )
            {
                __m128i x0, x1, y0, y1;
                __m128 s0, s1, f0, f1;
                x0 = _mm_loadu_si128((const __m128i*)(S0 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S0 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S1 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S1 + x + 4));

                s0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b0);
                s1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b0);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b1);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b1);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_loadu_si128((const __m128i*)(S2 + x));
                x1 = _mm_loadu_si128((const __m128i*)(S2 + x + 4));
                y0 = _mm_loadu_si128((const __m128i*)(S3 + x));
                y1 = _mm_loadu_si128((const __m128i*)(S3 + x + 4));

                f0 = _mm_mul_ps(_mm_cvtepi32_ps(x0), b2);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(x1), b2);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);
                f0 = _mm_mul_ps(_mm_cvtepi32_ps(y0), b3);
                f1 = _mm_mul_ps(_mm_cvtepi32_ps(y1), b3);
                s0 = _mm_add_ps(s0, f0);
                s1 = _mm_add_ps(s1, f1);

                x0 = _mm_cvtps_epi32(s0);
                x1 = _mm_cvtps_epi32(s1);

                x0 = _mm_packs_epi32(x0, x1);
                _mm_storel_epi64( (__m128i*)(dst + x), _mm_packus_epi16(x0, x0));
            }

        return x;
    }
};


template<int shiftval> struct VResizeCubicVec_32f16
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE2) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        ushort* dst = (ushort*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);
        __m128i preshift = _mm_set1_epi32(shiftval);
        __m128i postshift = _mm_set1_epi16((short)shiftval);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            __m128i t0, t1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            t0 = _mm_add_epi32(_mm_cvtps_epi32(s0), preshift);
            t1 = _mm_add_epi32(_mm_cvtps_epi32(s1), preshift);

            t0 = _mm_add_epi16(_mm_packs_epi32(t0, t1), postshift);
            _mm_storeu_si128( (__m128i*)(dst + x), t0);
        }

        return x;
    }
};

typedef VResizeCubicVec_32f16<SHRT_MIN> VResizeCubicVec_32f16u;
typedef VResizeCubicVec_32f16<0> VResizeCubicVec_32f16s;

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        if( !checkHardwareSupport(CV_CPU_SSE) )
            return 0;

        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        __m128 b0 = _mm_set1_ps(beta[0]), b1 = _mm_set1_ps(beta[1]),
            b2 = _mm_set1_ps(beta[2]), b3 = _mm_set1_ps(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 x0, x1, y0, y1, s0, s1;
            x0 = _mm_loadu_ps(S0 + x);
            x1 = _mm_loadu_ps(S0 + x + 4);
            y0 = _mm_loadu_ps(S1 + x);
            y1 = _mm_loadu_ps(S1 + x + 4);

            s0 = _mm_mul_ps(x0, b0);
            s1 = _mm_mul_ps(x1, b0);
            y0 = _mm_mul_ps(y0, b1);
            y1 = _mm_mul_ps(y1, b1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            x0 = _mm_loadu_ps(S2 + x);
            x1 = _mm_loadu_ps(S2 + x + 4);
            y0 = _mm_loadu_ps(S3 + x);
            y1 = _mm_loadu_ps(S3 + x + 4);

            x0 = _mm_mul_ps(x0, b2);
            x1 = _mm_mul_ps(x1, b2);
            y0 = _mm_mul_ps(y0, b3);
            y1 = _mm_mul_ps(y1, b3);
            s0 = _mm_add_ps(s0, x0);
            s1 = _mm_add_ps(s1, x1);
            s0 = _mm_add_ps(s0, y0);
            s1 = _mm_add_ps(s1, y1);

            _mm_storeu_ps( dst + x, s0);
            _mm_storeu_ps( dst + x + 4, s1);
        }

        return x;
    }
};

#if CV_SSE4_1

struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;

#endif

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            __m128 v_dst0 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst0 = _mm_add_ps(v_dst0, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            __m128 v_dst1 = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x + 4));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x + 4)));
            v_dst1 = _mm_add_ps(v_dst1, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x + 4)));

            __m128i v_dsti0 = _mm_cvtps_epi32(v_dst0);
            __m128i v_dsti1 = _mm_cvtps_epi32(v_dst1);

            _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dsti0, v_dsti1));
        }

        return x;
    }
};


struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;

        __m128 v_b0 = _mm_set1_ps(beta[0]), v_b1 = _mm_set1_ps(beta[1]),
               v_b2 = _mm_set1_ps(beta[2]), v_b3 = _mm_set1_ps(beta[3]),
               v_b4 = _mm_set1_ps(beta[4]), v_b5 = _mm_set1_ps(beta[5]),
               v_b6 = _mm_set1_ps(beta[6]), v_b7 = _mm_set1_ps(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            __m128 v_dst = _mm_mul_ps(v_b0, _mm_loadu_ps(S0 + x));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b1, _mm_loadu_ps(S1 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b2, _mm_loadu_ps(S2 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b3, _mm_loadu_ps(S3 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b4, _mm_loadu_ps(S4 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b5, _mm_loadu_ps(S5 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b6, _mm_loadu_ps(S6 + x)));
            v_dst = _mm_add_ps(v_dst, _mm_mul_ps(v_b7, _mm_loadu_ps(S7 + x)));

            _mm_storeu_ps(dst + x, v_dst);
        }

        return x;
    }
};


#elif CV_NEON

struct VResizeLinearVec_32s8u
{
    int operator()(const uchar** _src, uchar* dst, const uchar* _beta, int width ) const
    {
        const int** src = (const int**)_src, *S0 = src[0], *S1 = src[1];
        const short* beta = (const short*)_beta;
        int x = 0;
        int16x8_t v_b0 = vdupq_n_s16(beta[0]), v_b1 = vdupq_n_s16(beta[1]), v_delta = vdupq_n_s16(2);

        for( ; x <= width - 16; x += 16)
        {
            int32x4_t v_src00 = vshrq_n_s32(vld1q_s32(S0 + x), 4), v_src10 = vshrq_n_s32(vld1q_s32(S1 + x), 4);
            int32x4_t v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 4), 4), v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 4), 4);

            int16x8_t v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            int16x8_t v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst0 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst0 = vshrq_n_s16(vaddq_s16(v_dst0, v_delta), 2);

            v_src00 = vshrq_n_s32(vld1q_s32(S0 + x + 8), 4);
            v_src10 = vshrq_n_s32(vld1q_s32(S1 + x + 8), 4);
            v_src01 = vshrq_n_s32(vld1q_s32(S0 + x + 12), 4);
            v_src11 = vshrq_n_s32(vld1q_s32(S1 + x + 12), 4);

            v_src0 = vcombine_s16(vmovn_s32(v_src00), vmovn_s32(v_src01));
            v_src1 = vcombine_s16(vmovn_s32(v_src10), vmovn_s32(v_src11));

            int16x8_t v_dst1 = vaddq_s16(vshrq_n_s16(vqdmulhq_s16(v_src0, v_b0), 1),
                                         vshrq_n_s16(vqdmulhq_s16(v_src1, v_b1), 1));
            v_dst1 = vshrq_n_s16(vaddq_s16(v_dst1, v_delta), 2);

            vst1q_u8(dst + x, vcombine_u8(vqmovun_s16(v_dst0), vqmovun_s16(v_dst1)));
        }

        return x;
    }
};
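
/* The NEON version mirrors the SSE2 fixed-point scheme above, with one
   twist: vqdmulhq_s16 is a doubling high multiply, (a * b * 2) >> 16, so
   each product is shifted right by one extra bit before the two terms are
   added, restoring the plain (a * b) >> 16 behaviour of _mm_mulhi_epi16. */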
   1073 
   1074 struct VResizeLinearVec_32f16u
   1075 {
   1076     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
   1077     {
   1078         const float** src = (const float**)_src;
   1079         const float* beta = (const float*)_beta;
   1080         const float *S0 = src[0], *S1 = src[1];
   1081         ushort* dst = (ushort*)_dst;
   1082         int x = 0;
   1083 
   1084         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
   1085 
   1086         for( ; x <= width - 8; x += 8 )
   1087         {
   1088             float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
   1089             float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
   1090 
   1091             float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
   1092             float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
   1093 
   1094             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
   1095                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
   1096         }
   1097 
   1098         return x;
   1099     }
   1100 };
   1101 
   1102 struct VResizeLinearVec_32f16s
   1103 {
   1104     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
   1105     {
   1106         const float** src = (const float**)_src;
   1107         const float* beta = (const float*)_beta;
   1108         const float *S0 = src[0], *S1 = src[1];
   1109         short* dst = (short*)_dst;
   1110         int x = 0;
   1111 
   1112         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
   1113 
   1114         for( ; x <= width - 8; x += 8 )
   1115         {
   1116             float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
   1117             float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
   1118 
   1119             float32x4_t v_dst0 = vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1);
   1120             float32x4_t v_dst1 = vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1);
   1121 
   1122             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
   1123                                             vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
   1124         }
   1125 
   1126         return x;
   1127     }
   1128 };
   1129 
   1130 struct VResizeLinearVec_32f
   1131 {
   1132     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
   1133     {
   1134         const float** src = (const float**)_src;
   1135         const float* beta = (const float*)_beta;
   1136         const float *S0 = src[0], *S1 = src[1];
   1137         float* dst = (float*)_dst;
   1138         int x = 0;
   1139 
   1140         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]);
   1141 
   1142         for( ; x <= width - 8; x += 8 )
   1143         {
   1144             float32x4_t v_src00 = vld1q_f32(S0 + x), v_src01 = vld1q_f32(S0 + x + 4);
   1145             float32x4_t v_src10 = vld1q_f32(S1 + x), v_src11 = vld1q_f32(S1 + x + 4);
   1146 
   1147             vst1q_f32(dst + x, vmlaq_f32(vmulq_f32(v_src00, v_b0), v_src10, v_b1));
   1148             vst1q_f32(dst + x + 4, vmlaq_f32(vmulq_f32(v_src01, v_b0), v_src11, v_b1));
   1149         }
   1150 
   1151         return x;
   1152     }
   1153 };
   1154 
   1155 typedef VResizeNoVec VResizeCubicVec_32s8u;
   1156 
   1157 struct VResizeCubicVec_32f16u
   1158 {
   1159     int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
   1160     {
   1161         const float** src = (const float**)_src;
   1162         const float* beta = (const float*)_beta;
   1163         const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
   1164         ushort* dst = (ushort*)_dst;
   1165         int x = 0;
   1166         float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
   1167                     v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);
   1168 
   1169         for( ; x <= width - 8; x += 8 )
   1170         {
   1171             float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
   1172                                                                          v_b1, vld1q_f32(S1 + x)),
   1173                                                                          v_b2, vld1q_f32(S2 + x)),
   1174                                                                          v_b3, vld1q_f32(S3 + x));
   1175             float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
   1176                                                                          v_b1, vld1q_f32(S1 + x + 4)),
   1177                                                                          v_b2, vld1q_f32(S2 + x + 4)),
   1178                                                                          v_b3, vld1q_f32(S3 + x + 4));
   1179 
   1180             vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst0)),
   1181                                             vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
   1182         }
   1183 
   1184         return x;
   1185     }
   1186 };
   1187 
struct VResizeCubicVec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        short* dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                         v_b1, vld1q_f32(S1 + x + 4)),
                                                                         v_b2, vld1q_f32(S2 + x + 4)),
                                                                         v_b3, vld1q_f32(S3 + x + 4));

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst0)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeCubicVec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]);

        for( ; x <= width - 8; x += 8 )
        {
            vst1q_f32(dst + x, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                       v_b1, vld1q_f32(S1 + x)),
                                                                       v_b2, vld1q_f32(S2 + x)),
                                                                       v_b3, vld1q_f32(S3 + x)));
            vst1q_f32(dst + x + 4, vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                                          v_b1, vld1q_f32(S1 + x + 4)),
                                                                          v_b2, vld1q_f32(S2 + x + 4)),
                                                                          v_b3, vld1q_f32(S3 + x + 4)));
        }

        return x;
    }
};

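/* NEON vertical pass for Lanczos4: the same layout as the cubic kernels
   above, extended to an 8-tap sum over eight buffered rows. The accumulation
   is split into two 4-tap halves (v_dst0, v_dst1) that are added at the end,
   which keeps the multiply-accumulate dependency chains short. */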
struct VResizeLanczos4Vec_32f16u
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        ushort * dst = (ushort*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_u16(dst + x, vcombine_u16(vqmovn_u32(cv_vrndq_u32_f32(v_dst)),
                                            vqmovn_u32(cv_vrndq_u32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f16s
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        short * dst = (short*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 8; x += 8 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            float32x4_t v_dst = vaddq_f32(v_dst0, v_dst1);

            v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x + 4)),
                                                             v_b1, vld1q_f32(S1 + x + 4)),
                                                             v_b2, vld1q_f32(S2 + x + 4)),
                                                             v_b3, vld1q_f32(S3 + x + 4));
            v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x + 4)),
                                                             v_b5, vld1q_f32(S5 + x + 4)),
                                                             v_b6, vld1q_f32(S6 + x + 4)),
                                                             v_b7, vld1q_f32(S7 + x + 4));
            v_dst1 = vaddq_f32(v_dst0, v_dst1);

            vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(v_dst)),
                                            vqmovn_s32(cv_vrndq_s32_f32(v_dst1))));
        }

        return x;
    }
};

struct VResizeLanczos4Vec_32f
{
    int operator()(const uchar** _src, uchar* _dst, const uchar* _beta, int width ) const
    {
        const float** src = (const float**)_src;
        const float* beta = (const float*)_beta;
        const float *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3],
                    *S4 = src[4], *S5 = src[5], *S6 = src[6], *S7 = src[7];
        float* dst = (float*)_dst;
        int x = 0;
        float32x4_t v_b0 = vdupq_n_f32(beta[0]), v_b1 = vdupq_n_f32(beta[1]),
                    v_b2 = vdupq_n_f32(beta[2]), v_b3 = vdupq_n_f32(beta[3]),
                    v_b4 = vdupq_n_f32(beta[4]), v_b5 = vdupq_n_f32(beta[5]),
                    v_b6 = vdupq_n_f32(beta[6]), v_b7 = vdupq_n_f32(beta[7]);

        for( ; x <= width - 4; x += 4 )
        {
            float32x4_t v_dst0 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b0, vld1q_f32(S0 + x)),
                                                                         v_b1, vld1q_f32(S1 + x)),
                                                                         v_b2, vld1q_f32(S2 + x)),
                                                                         v_b3, vld1q_f32(S3 + x));
            float32x4_t v_dst1 = vmlaq_f32(vmlaq_f32(vmlaq_f32(vmulq_f32(v_b4, vld1q_f32(S4 + x)),
                                                                         v_b5, vld1q_f32(S5 + x)),
                                                                         v_b6, vld1q_f32(S6 + x)),
                                                                         v_b7, vld1q_f32(S7 + x));
            vst1q_f32(dst + x, vaddq_f32(v_dst0, v_dst1));
        }

        return x;
    }
};

#else

typedef VResizeNoVec VResizeLinearVec_32s8u;
typedef VResizeNoVec VResizeLinearVec_32f16u;
typedef VResizeNoVec VResizeLinearVec_32f16s;
typedef VResizeNoVec VResizeLinearVec_32f;

typedef VResizeNoVec VResizeCubicVec_32s8u;
typedef VResizeNoVec VResizeCubicVec_32f16u;
typedef VResizeNoVec VResizeCubicVec_32f16s;
typedef VResizeNoVec VResizeCubicVec_32f;

typedef VResizeNoVec VResizeLanczos4Vec_32f16u;
typedef VResizeNoVec VResizeLanczos4Vec_32f16s;
typedef VResizeNoVec VResizeLanczos4Vec_32f;

#endif

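// The horizontal (row) resampling pass has no SIMD implementation in this
// file; every element-type combination uses the scalar HResizeNoVec below.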
typedef HResizeNoVec HResizeLinearVec_8u32s;
typedef HResizeNoVec HResizeLinearVec_16u32f;
typedef HResizeNoVec HResizeLinearVec_16s32f;
typedef HResizeNoVec HResizeLinearVec_32f;
typedef HResizeNoVec HResizeLinearVec_64f;


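/* Horizontal pass of the separable linear resize. For each output column dx,
   the source sample at xofs[dx] is blended with its right neighbour one
   pixel (cn elements) away:

       D[dx] = S[xofs[dx]]*alpha[dx*2] + S[xofs[dx] + cn]*alpha[dx*2 + 1];

   T is the source type, WT the wider row-buffer type, AT the coefficient
   type, and ONE the representation of a unit weight (1 for floating-point
   coefficients, the fixed-point scale for integer ones). Columns at and
   beyond xmax would read past the end of the row, so they replicate the
   single in-range sample instead. Rows are handled two at a time, exposing
   two independent computation streams the compiler can interleave. */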
template<typename T, typename WT, typename AT, int ONE, class VecOp>
struct HResizeLinear
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        int dx, k;
        VecOp vecOp;

        int dx0 = vecOp((const uchar**)src, (uchar**)dst, count,
            xofs, (const uchar*)alpha, swidth, dwidth, cn, xmin, xmax );

        for( k = 0; k <= count - 2; k++ )
        {
            const T *S0 = src[k], *S1 = src[k+1];
            WT *D0 = dst[k], *D1 = dst[k+1];
            for( dx = dx0; dx < xmax; dx++ )
            {
                int sx = xofs[dx];
                WT a0 = alpha[dx*2], a1 = alpha[dx*2+1];
                WT t0 = S0[sx]*a0 + S0[sx + cn]*a1;
                WT t1 = S1[sx]*a0 + S1[sx + cn]*a1;
                D0[dx] = t0; D1[dx] = t1;
            }

            for( ; dx < dwidth; dx++ )
            {
                int sx = xofs[dx];
                D0[dx] = WT(S0[sx]*ONE); D1[dx] = WT(S1[sx]*ONE);
            }
        }

        for( ; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            for( dx = 0; dx < xmax; dx++ )
            {
                int sx = xofs[dx];
                D[dx] = S[sx]*alpha[dx*2] + S[sx+cn]*alpha[dx*2+1];
            }

            for( ; dx < dwidth; dx++ )
                D[dx] = WT(S[xofs[dx]]*ONE);
        }
    }
};


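/* Vertical pass of the separable linear resize: blends the same column of
   two horizontally-resized row buffers, dst[x] = castOp(S0[x]*b0 + S1[x]*b1).
   The SIMD functor processes a prefix of the row and returns the number of
   elements it handled; the scalar loops finish the remainder. */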
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLinear
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        WT b0 = beta[0], b1 = beta[1];
        const WT *S0 = src[0], *S1 = src[1];
        CastOp castOp;
        VecOp vecOp;

        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            WT t0, t1;
            t0 = S0[x]*b0 + S1[x]*b1;
            t1 = S0[x+1]*b0 + S1[x+1]*b1;
            dst[x] = castOp(t0); dst[x+1] = castOp(t1);
            t0 = S0[x+2]*b0 + S1[x+2]*b1;
            t1 = S0[x+3]*b0 + S1[x+3]*b1;
            dst[x+2] = castOp(t0); dst[x+3] = castOp(t1);
        }
        #endif
        for( ; x < width; x++ )
            dst[x] = castOp(S0[x]*b0 + S1[x]*b1);
    }
};

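/* Specialization for the 8-bit fixed-point pipeline. The row buffers hold
   values scaled by 2^INTER_RESIZE_COEF_BITS and the betas carry the same
   scale again, so each product b*S has 2*INTER_RESIZE_COEF_BITS fractional
   bits (the FixedPtCast parameter above). Assuming INTER_RESIZE_COEF_BITS
   == 11, the expression

       (((b0*(S0[x] >> 4)) >> 16) + ((b1*(S1[x] >> 4)) >> 16) + 2) >> 2

   shifts right by 4+16+2 = 22 = 2*INTER_RESIZE_COEF_BITS in total, with the
   "+ 2" giving round-to-nearest on the final 2-bit shift. Pre-shifting S by
   4 loses the lowest bits deliberately; this appears intended to keep the
   scalar tail bit-exact with a 16-bit multiply-high SIMD kernel (the SSE2
   implementation is not part of this excerpt). */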
template<>
struct VResizeLinear<uchar, int, short, FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>, VResizeLinearVec_32s8u>
{
    typedef uchar value_type;
    typedef int buf_type;
    typedef short alpha_type;

    void operator()(const buf_type** src, value_type* dst, const alpha_type* beta, int width ) const
    {
        alpha_type b0 = beta[0], b1 = beta[1];
        const buf_type *S0 = src[0], *S1 = src[1];
        VResizeLinearVec_32s8u vecOp;

        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            dst[x+0] = uchar(( ((b0 * (S0[x+0] >> 4)) >> 16) + ((b1 * (S1[x+0] >> 4)) >> 16) + 2)>>2);
            dst[x+1] = uchar(( ((b0 * (S0[x+1] >> 4)) >> 16) + ((b1 * (S1[x+1] >> 4)) >> 16) + 2)>>2);
            dst[x+2] = uchar(( ((b0 * (S0[x+2] >> 4)) >> 16) + ((b1 * (S1[x+2] >> 4)) >> 16) + 2)>>2);
            dst[x+3] = uchar(( ((b0 * (S0[x+3] >> 4)) >> 16) + ((b1 * (S1[x+3] >> 4)) >> 16) + 2)>>2);
        }
        #endif
        for( ; x < width; x++ )
            dst[x] = uchar(( ((b0 * (S0[x] >> 4)) >> 16) + ((b1 * (S1[x] >> 4)) >> 16) + 2)>>2);
    }
};


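/* Horizontal pass for bicubic interpolation: a 4-tap filter centred on
   xofs[dx]. Output columns outside [xmin, xmax) may reference samples beyond
   the row, so their taps are folded back channel-by-channel to the nearest
   valid sample; interior columns take the unchecked 4-tap fast path. */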
template<typename T, typename WT, typename AT>
struct HResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int k = 0; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            int dx = 0, limit = xmin;
            for(;;)
            {
                for( ; dx < limit; dx++, alpha += 4 )
                {
                    int j, sx = xofs[dx] - cn;
                    WT v = 0;
                    for( j = 0; j < 4; j++ )
                    {
                        int sxj = sx + j*cn;
                        if( (unsigned)sxj >= (unsigned)swidth )
                        {
                            while( sxj < 0 )
                                sxj += cn;
                            while( sxj >= swidth )
                                sxj -= cn;
                        }
                        v += S[sxj]*alpha[j];
                    }
                    D[dx] = v;
                }
                if( limit == dwidth )
                    break;
                for( ; dx < xmax; dx++, alpha += 4 )
                {
                    int sx = xofs[dx];
                    D[dx] = S[sx-cn]*alpha[0] + S[sx]*alpha[1] +
                        S[sx+cn]*alpha[2] + S[sx+cn*2]*alpha[3];
                }
                limit = dwidth;
            }
            alpha -= dwidth*4;
        }
    }
};


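// Vertical 4-tap cubic pass over four buffered rows; the SIMD functor
// handles the head of the row and the scalar loop completes the tail.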
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeCubic
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        WT b0 = beta[0], b1 = beta[1], b2 = beta[2], b3 = beta[3];
        const WT *S0 = src[0], *S1 = src[1], *S2 = src[2], *S3 = src[3];
        CastOp castOp;
        VecOp vecOp;

        int x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        for( ; x < width; x++ )
            dst[x] = castOp(S0[x]*b0 + S1[x]*b1 + S2[x]*b2 + S3[x]*b3);
    }
};


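/* Horizontal pass for Lanczos4: an 8-tap filter spanning xofs[dx] - 3*cn ..
   xofs[dx] + 4*cn, with the same border folding as the cubic case. alpha
   advances 8 coefficients per output column and is rewound by dwidth*8
   after each row. */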
template<typename T, typename WT, typename AT>
struct HResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const T** src, WT** dst, int count,
                    const int* xofs, const AT* alpha,
                    int swidth, int dwidth, int cn, int xmin, int xmax ) const
    {
        for( int k = 0; k < count; k++ )
        {
            const T *S = src[k];
            WT *D = dst[k];
            int dx = 0, limit = xmin;
            for(;;)
            {
                for( ; dx < limit; dx++, alpha += 8 )
                {
                    int j, sx = xofs[dx] - cn*3;
                    WT v = 0;
                    for( j = 0; j < 8; j++ )
                    {
                        int sxj = sx + j*cn;
                        if( (unsigned)sxj >= (unsigned)swidth )
                        {
                            while( sxj < 0 )
                                sxj += cn;
                            while( sxj >= swidth )
                                sxj -= cn;
                        }
                        v += S[sxj]*alpha[j];
                    }
                    D[dx] = v;
                }
                if( limit == dwidth )
                    break;
                for( ; dx < xmax; dx++, alpha += 8 )
                {
                    int sx = xofs[dx];
                    D[dx] = S[sx-cn*3]*alpha[0] + S[sx-cn*2]*alpha[1] +
                        S[sx-cn]*alpha[2] + S[sx]*alpha[3] +
                        S[sx+cn]*alpha[4] + S[sx+cn*2]*alpha[5] +
                        S[sx+cn*3]*alpha[6] + S[sx+cn*4]*alpha[7];
                }
                limit = dwidth;
            }
            alpha -= dwidth*8;
        }
    }
};


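// Vertical 8-tap Lanczos pass; the unrolled scalar loop accumulates four
// output columns at a time across the eight buffered rows.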
template<typename T, typename WT, typename AT, class CastOp, class VecOp>
struct VResizeLanczos4
{
    typedef T value_type;
    typedef WT buf_type;
    typedef AT alpha_type;

    void operator()(const WT** src, T* dst, const AT* beta, int width ) const
    {
        CastOp castOp;
        VecOp vecOp;
        int k, x = vecOp((const uchar**)src, (uchar*)dst, (const uchar*)beta, width);
        #if CV_ENABLE_UNROLLED
        for( ; x <= width - 4; x += 4 )
        {
            WT b = beta[0];
            const WT* S = src[0];
            WT s0 = S[x]*b, s1 = S[x+1]*b, s2 = S[x+2]*b, s3 = S[x+3]*b;

            for( k = 1; k < 8; k++ )
            {
                b = beta[k]; S = src[k];
                s0 += S[x]*b; s1 += S[x+1]*b;
                s2 += S[x+2]*b; s3 += S[x+3]*b;
            }

            dst[x] = castOp(s0); dst[x+1] = castOp(s1);
            dst[x+2] = castOp(s2); dst[x+3] = castOp(s3);
        }
        #endif
        for( ; x < width; x++ )
        {
            dst[x] = castOp(src[0][x]*beta[0] + src[1][x]*beta[1] +
                src[2][x]*beta[2] + src[3][x]*beta[3] + src[4][x]*beta[4] +
                src[5][x]*beta[5] + src[6][x]*beta[6] + src[7][x]*beta[7]);
        }
    }
};


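// Clamp x into the half-open interval [a, b); used to replicate border rows
// when the vertical filter window extends outside the source image.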
static inline int clip(int x, int a, int b)
{
    return x >= a ? (x < b ? x : b-1) : a;
}

static const int MAX_ESIZE=16;

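/* Drives one horizontal strip of the generic resize. For every destination
   row it gathers the ksize source rows the vertical filter needs,
   horizontally resizes only those not already present among the ksize row
   buffers (prev_sy remembers which source row each buffer holds, so rows
   shared between consecutive destination rows are reused), then applies the
   vertical filter to the buffered rows. */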
template <typename HResize, typename VResize>
class resizeGeneric_Invoker :
    public ParallelLoopBody
{
public:
    typedef typename HResize::value_type T;
    typedef typename HResize::buf_type WT;
    typedef typename HResize::alpha_type AT;

    resizeGeneric_Invoker(const Mat& _src, Mat &_dst, const int *_xofs, const int *_yofs,
        const AT* _alpha, const AT* __beta, const Size& _ssize, const Size &_dsize,
        int _ksize, int _xmin, int _xmax) :
        ParallelLoopBody(), src(_src), dst(_dst), xofs(_xofs), yofs(_yofs),
        alpha(_alpha), _beta(__beta), ssize(_ssize), dsize(_dsize),
        ksize(_ksize), xmin(_xmin), xmax(_xmax)
    {
        CV_Assert(ksize <= MAX_ESIZE);
    }

#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Warray-bounds"
#endif
    virtual void operator() (const Range& range) const
    {
        int dy, cn = src.channels();
        HResize hresize;
        VResize vresize;

        int bufstep = (int)alignSize(dsize.width, 16);
        AutoBuffer<WT> _buffer(bufstep*ksize);
        const T* srows[MAX_ESIZE]={0};
        WT* rows[MAX_ESIZE]={0};
        int prev_sy[MAX_ESIZE];

        for(int k = 0; k < ksize; k++ )
        {
            prev_sy[k] = -1;
            rows[k] = (WT*)_buffer + bufstep*k;
        }

        const AT* beta = _beta + ksize * range.start;

        for( dy = range.start; dy < range.end; dy++, beta += ksize )
        {
            int sy0 = yofs[dy], k0=ksize, k1=0, ksize2 = ksize/2;

            for(int k = 0; k < ksize; k++ )
            {
                int sy = clip(sy0 - ksize2 + 1 + k, 0, ssize.height);
                for( k1 = std::max(k1, k); k1 < ksize; k1++ )
                {
                    if( sy == prev_sy[k1] ) // if the sy-th row has been computed already, reuse it.
                    {
                        if( k1 > k )
                            memcpy( rows[k], rows[k1], bufstep*sizeof(rows[0][0]) );
                        break;
                    }
                }
                if( k1 == ksize )
                    k0 = std::min(k0, k); // remember the first row that needs to be computed
                srows[k] = src.template ptr<T>(sy);
                prev_sy[k] = sy;
            }

            if( k0 < ksize )
                hresize( (const T**)(srows + k0), (WT**)(rows + k0), ksize - k0, xofs, (const AT*)(alpha),
                        ssize.width, dsize.width, cn, xmin, xmax );
            vresize( (const WT**)rows, (T*)(dst.data + dst.step*dy), beta, dsize.width );
        }
    }
#if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ == 8)
# pragma GCC diagnostic pop
#endif

private:
    Mat src;
    Mat dst;
    const int* xofs, *yofs;
    const AT* alpha, *_beta;
    Size ssize, dsize;
    const int ksize, xmin, xmax;

    resizeGeneric_Invoker& operator = (const resizeGeneric_Invoker&);
};

template<class HResize, class VResize>
static void resizeGeneric_( const Mat& src, Mat& dst,
                            const int* xofs, const void* _alpha,
                            const int* yofs, const void* _beta,
                            int xmin, int xmax, int ksize )
{
    typedef typename HResize::alpha_type AT;

    const AT* beta = (const AT*)_beta;
    Size ssize = src.size(), dsize = dst.size();
    int cn = src.channels();
    ssize.width *= cn;
    dsize.width *= cn;
    xmin *= cn;
    xmax *= cn;
    // image resize is a separable operation: the horizontal pass (hresize)
    // and the vertical pass (vresize) run independently inside the invoker

    Range range(0, dsize.height);
    resizeGeneric_Invoker<HResize, VResize> invoker(src, dst, xofs, yofs, (const AT*)_alpha, beta,
        ssize, dsize, ksize, xmin, xmax);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}

template <typename T, typename WT>
struct ResizeAreaFastNoVec
{
    ResizeAreaFastNoVec(int, int) { }
    ResizeAreaFastNoVec(int, int, int, int) { }
    int operator() (const T*, T*, int) const
    { return 0; }
};

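/* Fast INTER_AREA path for 2x decimation: each destination pixel is the
   average of a 2x2 block of source pixels, computed as
   (a + b + c + d + 2) >> 2 for integer types and (a + b + c + d)*0.25f for
   float. The SIMD classes below vectorize that average and return how many
   destination elements they produced; the scalar caller completes the row. */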
#if CV_NEON

class ResizeAreaFastVec_SIMD_8u
{
public:
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const uchar* S, uchar* D, int w) const
    {
        int dx = 0;
        const uchar* S0 = S, * S1 = S0 + step;

        uint16x8_t v_2 = vdupq_n_u16(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 16; dx += 16, S0 += 32, S1 += 32, D += 16)
            {
                uint8x16x2_t v_row0 = vld2q_u8(S0), v_row1 = vld2q_u8(S1);

                uint16x8_t v_dst0 = vaddl_u8(vget_low_u8(v_row0.val[0]), vget_low_u8(v_row0.val[1]));
                v_dst0 = vaddq_u16(v_dst0, vaddl_u8(vget_low_u8(v_row1.val[0]), vget_low_u8(v_row1.val[1])));
                v_dst0 = vshrq_n_u16(vaddq_u16(v_dst0, v_2), 2);

                uint16x8_t v_dst1 = vaddl_u8(vget_high_u8(v_row0.val[0]), vget_high_u8(v_row0.val[1]));
                v_dst1 = vaddq_u16(v_dst1, vaddl_u8(vget_high_u8(v_row1.val[0]), vget_high_u8(v_row1.val[1])));
                v_dst1 = vshrq_n_u16(vaddq_u16(v_dst1, v_2), 2);

                vst1q_u8(D, vcombine_u8(vmovn_u16(v_dst0), vmovn_u16(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                uint8x16_t v_row0 = vld1q_u8(S0), v_row1 = vld1q_u8(S1);

                uint16x8_t v_row00 = vmovl_u8(vget_low_u8(v_row0));
                uint16x8_t v_row01 = vmovl_u8(vget_high_u8(v_row0));
                uint16x8_t v_row10 = vmovl_u8(vget_low_u8(v_row1));
                uint16x8_t v_row11 = vmovl_u8(vget_high_u8(v_row1));

                uint16x4_t v_p0 = vadd_u16(vadd_u16(vget_low_u16(v_row00), vget_high_u16(v_row00)),
                                           vadd_u16(vget_low_u16(v_row10), vget_high_u16(v_row10)));
                uint16x4_t v_p1 = vadd_u16(vadd_u16(vget_low_u16(v_row01), vget_high_u16(v_row01)),
                                           vadd_u16(vget_low_u16(v_row11), vget_high_u16(v_row11)));
                uint16x8_t v_dst = vshrq_n_u16(vaddq_u16(vcombine_u16(v_p0, v_p1), v_2), 2);

                vst1_u8(D, vmovn_u16(v_dst));
            }
        }

        return dx;
    }

private:
    int cn, step;
};

class ResizeAreaFastVec_SIMD_16u
{
public:
    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const ushort * S, ushort * D, int w) const
    {
        int dx = 0;
        const ushort * S0 = S, * S1 = (const ushort *)((const uchar *)(S0) + step);

        uint32x4_t v_2 = vdupq_n_u32(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                uint16x8x2_t v_row0 = vld2q_u16(S0), v_row1 = vld2q_u16(S1);

                uint32x4_t v_dst0 = vaddl_u16(vget_low_u16(v_row0.val[0]), vget_low_u16(v_row0.val[1]));
                v_dst0 = vaddq_u32(v_dst0, vaddl_u16(vget_low_u16(v_row1.val[0]), vget_low_u16(v_row1.val[1])));
                v_dst0 = vshrq_n_u32(vaddq_u32(v_dst0, v_2), 2);

                uint32x4_t v_dst1 = vaddl_u16(vget_high_u16(v_row0.val[0]), vget_high_u16(v_row0.val[1]));
                v_dst1 = vaddq_u32(v_dst1, vaddl_u16(vget_high_u16(v_row1.val[0]), vget_high_u16(v_row1.val[1])));
                v_dst1 = vshrq_n_u32(vaddq_u32(v_dst1, v_2), 2);

                vst1q_u16(D, vcombine_u16(vmovn_u32(v_dst0), vmovn_u32(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                uint16x8_t v_row0 = vld1q_u16(S0), v_row1 = vld1q_u16(S1);
                uint32x4_t v_dst = vaddq_u32(vaddl_u16(vget_low_u16(v_row0), vget_high_u16(v_row0)),
                                             vaddl_u16(vget_low_u16(v_row1), vget_high_u16(v_row1)));
                vst1_u16(D, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst, v_2), 2)));
            }
        }

        return dx;
    }

private:
    int cn, step;
};

class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
    {
    }

    int operator() (const short * S, short * D, int w) const
    {
        int dx = 0;
        const short * S0 = S, * S1 = (const short *)((const uchar *)(S0) + step);

        int32x4_t v_2 = vdupq_n_s32(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                int16x8x2_t v_row0 = vld2q_s16(S0), v_row1 = vld2q_s16(S1);

                int32x4_t v_dst0 = vaddl_s16(vget_low_s16(v_row0.val[0]), vget_low_s16(v_row0.val[1]));
                v_dst0 = vaddq_s32(v_dst0, vaddl_s16(vget_low_s16(v_row1.val[0]), vget_low_s16(v_row1.val[1])));
                v_dst0 = vshrq_n_s32(vaddq_s32(v_dst0, v_2), 2);

                int32x4_t v_dst1 = vaddl_s16(vget_high_s16(v_row0.val[0]), vget_high_s16(v_row0.val[1]));
                v_dst1 = vaddq_s32(v_dst1, vaddl_s16(vget_high_s16(v_row1.val[0]), vget_high_s16(v_row1.val[1])));
                v_dst1 = vshrq_n_s32(vaddq_s32(v_dst1, v_2), 2);

                vst1q_s16(D, vcombine_s16(vmovn_s32(v_dst0), vmovn_s32(v_dst1)));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                int16x8_t v_row0 = vld1q_s16(S0), v_row1 = vld1q_s16(S1);
                int32x4_t v_dst = vaddq_s32(vaddl_s16(vget_low_s16(v_row0), vget_high_s16(v_row0)),
                                            vaddl_s16(vget_low_s16(v_row1), vget_high_s16(v_row1)));
                vst1_s16(D, vmovn_s32(vshrq_n_s32(vaddq_s32(v_dst, v_2), 2)));
            }
        }

        return dx;
    }

private:
    int cn, step;
};

struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
    {
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
    }

    int operator() (const float * S, float * D, int w) const
    {
        if (!fast_mode)
            return 0;

        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
        int dx = 0;

        float32x4_t v_025 = vdupq_n_f32(0.25f);

        if (cn == 1)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                float32x4x2_t v_row0 = vld2q_f32(S0), v_row1 = vld2q_f32(S1);

                float32x4_t v_dst0 = vaddq_f32(v_row0.val[0], v_row0.val[1]);
                float32x4_t v_dst1 = vaddq_f32(v_row1.val[0], v_row1.val[1]);

                vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                float32x4_t v_dst0 = vaddq_f32(vld1q_f32(S0), vld1q_f32(S0 + 4));
                float32x4_t v_dst1 = vaddq_f32(vld1q_f32(S1), vld1q_f32(S1 + 4));

                vst1q_f32(D, vmulq_f32(vaddq_f32(v_dst0, v_dst1), v_025));
            }
        }

        return dx;
    }

private:
    int cn;
    bool fast_mode;
    int step;
};

#elif CV_SSE2

class ResizeAreaFastVec_SIMD_8u
{
public:
    ResizeAreaFastVec_SIMD_8u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const uchar* S, uchar* D, int w) const
    {
        if (!use_simd)
            return 0;

        int dx = 0;
        const uchar* S0 = S;
        const uchar* S1 = S0 + step;
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi16(2);

        if (cn == 1)
        {
            __m128i masklow = _mm_set1_epi16(0x00ff);
            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i s0 = _mm_add_epi16(_mm_srli_epi16(r0, 8), _mm_and_si128(r0, masklow));
                __m128i s1 = _mm_add_epi16(_mm_srli_epi16(r1, 8), _mm_and_si128(r1, masklow));
                s0 = _mm_add_epi16(_mm_add_epi16(s0, s1), delta2);
                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);

                _mm_storel_epi64((__m128i*)D, s0);
            }
        }
        else if (cn == 3)
            // 6 output bytes per step, but the second 8-byte store lands at
            // D + 3 and touches D[3..10], so 11 valid output bytes are
            // required per iteration (hence the w - 11 bound)
            for ( ; dx <= w - 11; dx += 6, S0 += 12, S1 += 12, D += 6)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
                __m128i r0_16h = _mm_unpacklo_epi8(_mm_srli_si128(r0, 6), zero);
                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
                __m128i r1_16h = _mm_unpacklo_epi8(_mm_srli_si128(r1, 6), zero);

                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 6));
                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 6));
                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
                _mm_storel_epi64((__m128i*)D, s0);

                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 6));
                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 6));
                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
                s0 = _mm_packus_epi16(_mm_srli_epi16(s0, 2), zero);
                _mm_storel_epi64((__m128i*)(D+3), s0);
            }
        else
        {
            CV_Assert(cn == 4);
            int v[] = { 0, 0, -1, -1 };
            __m128i mask = _mm_loadu_si128((const __m128i*)v);

            for ( ; dx <= w - 8; dx += 8, S0 += 16, S1 += 16, D += 8)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_16l = _mm_unpacklo_epi8(r0, zero);
                __m128i r0_16h = _mm_unpackhi_epi8(r0, zero);
                __m128i r1_16l = _mm_unpacklo_epi8(r1, zero);
                __m128i r1_16h = _mm_unpackhi_epi8(r1, zero);

                __m128i s0 = _mm_add_epi16(r0_16l, _mm_srli_si128(r0_16l, 8));
                __m128i s1 = _mm_add_epi16(r1_16l, _mm_srli_si128(r1_16l, 8));
                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
                __m128i res0 = _mm_srli_epi16(s0, 2);

                s0 = _mm_add_epi16(r0_16h, _mm_srli_si128(r0_16h, 8));
                s1 = _mm_add_epi16(r1_16h, _mm_srli_si128(r1_16h, 8));
                s0 = _mm_add_epi16(s1, _mm_add_epi16(s0, delta2));
                __m128i res1 = _mm_srli_epi16(s0, 2);
                s0 = _mm_packus_epi16(_mm_or_si128(_mm_andnot_si128(mask, res0),
                                                   _mm_and_si128(mask, _mm_slli_si128(res1, 8))), zero);
                _mm_storel_epi64((__m128i*)(D), s0);
            }
        }

        return dx;
    }

private:
    int cn;
    bool use_simd;
    int step;
};

class ResizeAreaFastVec_SIMD_16u
{
public:
    ResizeAreaFastVec_SIMD_16u(int _cn, int _step) :
        cn(_cn), step(_step)
    {
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const ushort* S, ushort* D, int w) const
    {
        if (!use_simd)
            return 0;

        int dx = 0;
        const ushort* S0 = (const ushort*)S;
        const ushort* S1 = (const ushort*)((const uchar*)(S) + step);
        __m128i masklow = _mm_set1_epi32(0x0000ffff);
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi32(2);

// _mm_packus_epi32 is an SSE4.1 instruction; emulate it on SSE2 by keeping
// only the low 16 bits of each lane before the signed pack (safe here
// because the 2x2 averages of ushort values always fit in 16 bits)
#define _mm_packus_epi32(a, zero) _mm_packs_epi32(_mm_srai_epi32(_mm_slli_epi32(a, 16), 16), zero)

        if (cn == 1)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i s0 = _mm_add_epi32(_mm_srli_epi32(r0, 16), _mm_and_si128(r0, masklow));
                __m128i s1 = _mm_add_epi32(_mm_srli_epi32(r1, 16), _mm_and_si128(r1, masklow));
                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
                s0 = _mm_srli_epi32(s0, 2);
                s0 = _mm_packus_epi32(s0, zero);

                _mm_storel_epi64((__m128i*)D, s0);
            }
        }
        else if (cn == 3)
            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_16l = _mm_unpacklo_epi16(r0, zero);
                __m128i r0_16h = _mm_unpacklo_epi16(_mm_srli_si128(r0, 6), zero);
                __m128i r1_16l = _mm_unpacklo_epi16(r1, zero);
                __m128i r1_16h = _mm_unpacklo_epi16(_mm_srli_si128(r1, 6), zero);

                __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
                __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
                s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
                s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
                _mm_storel_epi64((__m128i*)D, s0);
            }
        else
        {
            CV_Assert(cn == 4);
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_32l = _mm_unpacklo_epi16(r0, zero);
                __m128i r0_32h = _mm_unpackhi_epi16(r0, zero);
                __m128i r1_32l = _mm_unpacklo_epi16(r1, zero);
                __m128i r1_32h = _mm_unpackhi_epi16(r1, zero);

                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
                s0 = _mm_packus_epi32(_mm_srli_epi32(s0, 2), zero);
                _mm_storel_epi64((__m128i*)D, s0);
            }
        }

#undef _mm_packus_epi32

        return dx;
    }

private:
    int cn;
    int step;
    bool use_simd;
};

class ResizeAreaFastVec_SIMD_16s
{
public:
    ResizeAreaFastVec_SIMD_16s(int _cn, int _step) :
        cn(_cn), step(_step)
    {
        use_simd = checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const short* S, short* D, int w) const
    {
        if (!use_simd)
            return 0;

        int dx = 0;
        const short* S0 = (const short*)S;
        const short* S1 = (const short*)((const uchar*)(S) + step);
        __m128i masklow = _mm_set1_epi32(0x0000ffff);
        __m128i zero = _mm_setzero_si128();
        __m128i delta2 = _mm_set1_epi32(2);

        if (cn == 1)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i s0 = _mm_add_epi32(_mm_srai_epi32(r0, 16),
                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r0, masklow), 16), 16));
                __m128i s1 = _mm_add_epi32(_mm_srai_epi32(r1, 16),
                    _mm_srai_epi32(_mm_slli_epi32(_mm_and_si128(r1, masklow), 16), 16));
                s0 = _mm_add_epi32(_mm_add_epi32(s0, s1), delta2);
                s0 = _mm_srai_epi32(s0, 2);
                s0 = _mm_packs_epi32(s0, zero);

                _mm_storel_epi64((__m128i*)D, s0);
            }
        }
        else if (cn == 3)
            for ( ; dx <= w - 4; dx += 3, S0 += 6, S1 += 6, D += 3)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
                __m128i r0_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r0, 6)), 16);
                __m128i r1_16l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
                __m128i r1_16h = _mm_srai_epi32(_mm_unpacklo_epi16(zero, _mm_srli_si128(r1, 6)), 16);

                __m128i s0 = _mm_add_epi32(r0_16l, r0_16h);
                __m128i s1 = _mm_add_epi32(r1_16l, r1_16h);
                s0 = _mm_add_epi32(delta2, _mm_add_epi32(s0, s1));
                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
                _mm_storel_epi64((__m128i*)D, s0);
            }
        else
        {
            CV_Assert(cn == 4);
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                __m128i r0 = _mm_loadu_si128((const __m128i*)S0);
                __m128i r1 = _mm_loadu_si128((const __m128i*)S1);

                __m128i r0_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r0), 16);
                __m128i r0_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r0), 16);
                __m128i r1_32l = _mm_srai_epi32(_mm_unpacklo_epi16(zero, r1), 16);
                __m128i r1_32h = _mm_srai_epi32(_mm_unpackhi_epi16(zero, r1), 16);

                __m128i s0 = _mm_add_epi32(r0_32l, r0_32h);
                __m128i s1 = _mm_add_epi32(r1_32l, r1_32h);
                s0 = _mm_add_epi32(s1, _mm_add_epi32(s0, delta2));
                s0 = _mm_packs_epi32(_mm_srai_epi32(s0, 2), zero);
                _mm_storel_epi64((__m128i*)D, s0);
            }
        }

        return dx;
    }

private:
    int cn;
    int step;
    bool use_simd;
};

struct ResizeAreaFastVec_SIMD_32f
{
    ResizeAreaFastVec_SIMD_32f(int _scale_x, int _scale_y, int _cn, int _step) :
        cn(_cn), step(_step)
    {
        fast_mode = _scale_x == 2 && _scale_y == 2 && (cn == 1 || cn == 4);
        fast_mode = fast_mode && checkHardwareSupport(CV_CPU_SSE2);
    }

    int operator() (const float * S, float * D, int w) const
    {
        if (!fast_mode)
            return 0;

        const float * S0 = S, * S1 = (const float *)((const uchar *)(S0) + step);
        int dx = 0;

        __m128 v_025 = _mm_set1_ps(0.25f);

        if (cn == 1)
        {
            // shuffle masks gathering the even-indexed (2,0,2,0) and the
            // odd-indexed (3,1,3,1) floats of each register pair
            const int shuffle_lo = _MM_SHUFFLE(2, 0, 2, 0), shuffle_hi = _MM_SHUFFLE(3, 1, 3, 1);
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                __m128 v_row00 = _mm_loadu_ps(S0), v_row01 = _mm_loadu_ps(S0 + 4),
                       v_row10 = _mm_loadu_ps(S1), v_row11 = _mm_loadu_ps(S1 + 4);

                __m128 v_dst0 = _mm_add_ps(_mm_shuffle_ps(v_row00, v_row01, shuffle_lo),
                                           _mm_shuffle_ps(v_row00, v_row01, shuffle_hi));
                __m128 v_dst1 = _mm_add_ps(_mm_shuffle_ps(v_row10, v_row11, shuffle_lo),
                                           _mm_shuffle_ps(v_row10, v_row11, shuffle_hi));

                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
            }
        }
        else if (cn == 4)
        {
            for ( ; dx <= w - 4; dx += 4, S0 += 8, S1 += 8, D += 4)
            {
                __m128 v_dst0 = _mm_add_ps(_mm_loadu_ps(S0), _mm_loadu_ps(S0 + 4));
                __m128 v_dst1 = _mm_add_ps(_mm_loadu_ps(S1), _mm_loadu_ps(S1 + 4));

                _mm_storeu_ps(D, _mm_mul_ps(_mm_add_ps(v_dst0, v_dst1), v_025));
            }
        }

        return dx;
    }

private:
    int cn;
    bool fast_mode;
    int step;
};

#else

typedef ResizeAreaFastNoVec<uchar, uchar> ResizeAreaFastVec_SIMD_8u;
typedef ResizeAreaFastNoVec<ushort, ushort> ResizeAreaFastVec_SIMD_16u;
typedef ResizeAreaFastNoVec<short, short> ResizeAreaFastVec_SIMD_16s;
typedef ResizeAreaFastNoVec<float, float> ResizeAreaFastVec_SIMD_32f;

#endif

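/* Scalar wrapper around the SIMD 2x2 averaging above: it runs the vector op
   first, then finishes the remaining 2x2 blocks per channel. The wrapper is
   only engaged for exact 2x decimation with 1, 3 or 4 channels; otherwise it
   returns 0 and the generic area-resize code handles the whole row. */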
template<typename T, typename SIMDVecOp>
struct ResizeAreaFastVec
{
    ResizeAreaFastVec(int _scale_x, int _scale_y, int _cn, int _step) :
        scale_x(_scale_x), scale_y(_scale_y), cn(_cn), step(_step), vecOp(_cn, _step)
    {
        fast_mode = scale_x == 2 && scale_y == 2 && (cn == 1 || cn == 3 || cn == 4);
    }

    int operator() (const T* S, T* D, int w) const
    {
        if (!fast_mode)
            return 0;

        const T* nextS = (const T*)((const uchar*)S + step);
        int dx = vecOp(S, D, w);

        if (cn == 1)
            for( ; dx < w; ++dx )
            {
                int index = dx*2;
                D[dx] = (T)((S[index] + S[index+1] + nextS[index] + nextS[index+1] + 2) >> 2);
            }
        else if (cn == 3)
            for( ; dx < w; dx += 3 )
            {
                int index = dx*2;
                D[dx] = (T)((S[index] + S[index+3] + nextS[index] + nextS[index+3] + 2) >> 2);
                D[dx+1] = (T)((S[index+1] + S[index+4] + nextS[index+1] + nextS[index+4] + 2) >> 2);
                D[dx+2] = (T)((S[index+2] + S[index+5] + nextS[index+2] + nextS[index+5] + 2) >> 2);
            }
        else
            {
                CV_Assert(cn == 4);
                for( ; dx < w; dx += 4 )
                {
                    int index = dx*2;
                    D[dx] = (T)((S[index] + S[index+4] + nextS[index] + nextS[index+4] + 2) >> 2);
                    D[dx+1] = (T)((S[index+1] + S[index+5] + nextS[index+1] + nextS[index+5] + 2) >> 2);
                    D[dx+2] = (T)((S[index+2] + S[index+6] + nextS[index+2] + nextS[index+6] + 2) >> 2);
                    D[dx+3] = (T)((S[index+3] + S[index+7] + nextS[index+3] + nextS[index+7] + 2) >> 2);
                }
            }

        return dx;
    }

private:
    int scale_x, scale_y;
    int cn;
    bool fast_mode;
    int step;
    SIMDVecOp vecOp;
};

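/* Parallel body of the fast INTER_AREA path for integer scale factors.
   Interior destination pixels average a full scale_x x scale_y block through
   the precomputed ofs[] offsets; pixels whose source block hangs over the
   right or bottom border are averaged over only the samples that exist. */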
template <typename T, typename WT, typename VecOp>
class resizeAreaFast_Invoker :
    public ParallelLoopBody
{
public:
    resizeAreaFast_Invoker(const Mat &_src, Mat &_dst,
        int _scale_x, int _scale_y, const int* _ofs, const int* _xofs) :
        ParallelLoopBody(), src(_src), dst(_dst), scale_x(_scale_x),
        scale_y(_scale_y), ofs(_ofs), xofs(_xofs)
    {
    }

    virtual void operator() (const Range& range) const
    {
        Size ssize = src.size(), dsize = dst.size();
        int cn = src.channels();
        int area = scale_x*scale_y;
        float scale = 1.f/(area);
        int dwidth1 = (ssize.width/scale_x)*cn;
        dsize.width *= cn;
        ssize.width *= cn;
        int dy, dx, k = 0;

        VecOp vop(scale_x, scale_y, src.channels(), (int)src.step/*, area_ofs*/);

        for( dy = range.start; dy < range.end; dy++ )
        {
            T* D = (T*)(dst.data + dst.step*dy);
            int sy0 = dy*scale_y;
            int w = sy0 + scale_y <= ssize.height ? dwidth1 : 0;

            if( sy0 >= ssize.height )
            {
                for( dx = 0; dx < dsize.width; dx++ )
                    D[dx] = 0;
                continue;
            }

            dx = vop(src.template ptr<T>(sy0), D, w);
            for( ; dx < w; dx++ )
            {
                const T* S = src.template ptr<T>(sy0) + xofs[dx];
                WT sum = 0;
                k = 0;
                #if CV_ENABLE_UNROLLED
                for( ; k <= area - 4; k += 4 )
                    sum += S[ofs[k]] + S[ofs[k+1]] + S[ofs[k+2]] + S[ofs[k+3]];
                #endif
                for( ; k < area; k++ )
                    sum += S[ofs[k]];

                D[dx] = saturate_cast<T>(sum * scale);
            }

            for( ; dx < dsize.width; dx++ )
            {
                WT sum = 0;
                int count = 0, sx0 = xofs[dx];
                if( sx0 >= ssize.width )
                {
                    D[dx] = 0;
                    continue; // no source samples fall under this pixel; skip to avoid the 0/0 division below
                }

                for( int sy = 0; sy < scale_y; sy++ )
                {
                    if( sy0 + sy >= ssize.height )
                        break;
                    const T* S = src.template ptr<T>(sy0 + sy) + sx0;
                    for( int sx = 0; sx < scale_x*cn; sx += cn )
                    {
                        if( sx0 + sx >= ssize.width )
                            break;
                        sum += S[sx];
                        count++;
                    }
                }

                D[dx] = saturate_cast<T>((float)sum/count);
            }
        }
    }

private:
    Mat src;
    Mat dst;
    int scale_x, scale_y;
    const int *ofs, *xofs;
};

   2494 template<typename T, typename WT, typename VecOp>
   2495 static void resizeAreaFast_( const Mat& src, Mat& dst, const int* ofs, const int* xofs,
   2496                              int scale_x, int scale_y )
   2497 {
   2498     Range range(0, dst.rows);
   2499     resizeAreaFast_Invoker<T, WT, VecOp> invoker(src, dst, scale_x,
   2500         scale_y, ofs, xofs);
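             // the third argument is the suggested stripe count: dst.total()/2^16
             // targets roughly 64K destination pixels of work per parallel stripe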
   2501     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
   2502 }
   2503 
   2504 struct DecimateAlpha
   2505 {
   2506     int si, di;
   2507     float alpha;
   2508 };
   2509 
   2510 
   2511 template<typename T, typename WT> class ResizeArea_Invoker :
   2512     public ParallelLoopBody
   2513 {
   2514 public:
   2515     ResizeArea_Invoker( const Mat& _src, Mat& _dst,
   2516                         const DecimateAlpha* _xtab, int _xtab_size,
   2517                         const DecimateAlpha* _ytab, int _ytab_size,
   2518                         const int* _tabofs )
   2519     {
   2520         src = &_src;
   2521         dst = &_dst;
   2522         xtab0 = _xtab;
   2523         xtab_size0 = _xtab_size;
   2524         ytab = _ytab;
   2525         ytab_size = _ytab_size;
   2526         tabofs = _tabofs;
   2527     }
   2528 
   2529     virtual void operator() (const Range& range) const
   2530     {
   2531         Size dsize = dst->size();
   2532         int cn = dst->channels();
   2533         dsize.width *= cn;
   2534         AutoBuffer<WT> _buffer(dsize.width*2);
   2535         const DecimateAlpha* xtab = xtab0;
   2536         int xtab_size = xtab_size0;
   2537         WT *buf = _buffer, *sum = buf + dsize.width;
   2538         int j_start = tabofs[range.start], j_end = tabofs[range.end], j, k, dx, prev_dy = ytab[j_start].di;
   2539 
   2540         for( dx = 0; dx < dsize.width; dx++ )
   2541             sum[dx] = (WT)0;
   2542 
   2543         for( j = j_start; j < j_end; j++ )
   2544         {
   2545             WT beta = ytab[j].alpha;
   2546             int dy = ytab[j].di;
   2547             int sy = ytab[j].si;
   2548 
   2549             {
   2550                 const T* S = src->template ptr<T>(sy);
   2551                 for( dx = 0; dx < dsize.width; dx++ )
   2552                     buf[dx] = (WT)0;
   2553 
   2554                 if( cn == 1 )
   2555                     for( k = 0; k < xtab_size; k++ )
   2556                     {
   2557                         int dxn = xtab[k].di;
   2558                         WT alpha = xtab[k].alpha;
   2559                         buf[dxn] += S[xtab[k].si]*alpha;
   2560                     }
   2561                 else if( cn == 2 )
   2562                     for( k = 0; k < xtab_size; k++ )
   2563                     {
   2564                         int sxn = xtab[k].si;
   2565                         int dxn = xtab[k].di;
   2566                         WT alpha = xtab[k].alpha;
   2567                         WT t0 = buf[dxn] + S[sxn]*alpha;
   2568                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
   2569                         buf[dxn] = t0; buf[dxn+1] = t1;
   2570                     }
   2571                 else if( cn == 3 )
   2572                     for( k = 0; k < xtab_size; k++ )
   2573                     {
   2574                         int sxn = xtab[k].si;
   2575                         int dxn = xtab[k].di;
   2576                         WT alpha = xtab[k].alpha;
   2577                         WT t0 = buf[dxn] + S[sxn]*alpha;
   2578                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
   2579                         WT t2 = buf[dxn+2] + S[sxn+2]*alpha;
   2580                         buf[dxn] = t0; buf[dxn+1] = t1; buf[dxn+2] = t2;
   2581                     }
   2582                 else if( cn == 4 )
   2583                 {
   2584                     for( k = 0; k < xtab_size; k++ )
   2585                     {
   2586                         int sxn = xtab[k].si;
   2587                         int dxn = xtab[k].di;
   2588                         WT alpha = xtab[k].alpha;
   2589                         WT t0 = buf[dxn] + S[sxn]*alpha;
   2590                         WT t1 = buf[dxn+1] + S[sxn+1]*alpha;
   2591                         buf[dxn] = t0; buf[dxn+1] = t1;
   2592                         t0 = buf[dxn+2] + S[sxn+2]*alpha;
   2593                         t1 = buf[dxn+3] + S[sxn+3]*alpha;
   2594                         buf[dxn+2] = t0; buf[dxn+3] = t1;
   2595                     }
   2596                 }
   2597                 else
   2598                 {
   2599                     for( k = 0; k < xtab_size; k++ )
   2600                     {
   2601                         int sxn = xtab[k].si;
   2602                         int dxn = xtab[k].di;
   2603                         WT alpha = xtab[k].alpha;
   2604                         for( int c = 0; c < cn; c++ )
   2605                             buf[dxn + c] += S[sxn + c]*alpha;
   2606                     }
   2607                 }
   2608             }
   2609 
   2610             if( dy != prev_dy )
   2611             {
   2612                 T* D = dst->template ptr<T>(prev_dy);
   2613 
   2614                 for( dx = 0; dx < dsize.width; dx++ )
   2615                 {
   2616                     D[dx] = saturate_cast<T>(sum[dx]);
   2617                     sum[dx] = beta*buf[dx];
   2618                 }
   2619                 prev_dy = dy;
   2620             }
   2621             else
   2622             {
   2623                 for( dx = 0; dx < dsize.width; dx++ )
   2624                     sum[dx] += beta*buf[dx];
   2625             }
   2626         }
   2627 
   2628         {
   2629         T* D = dst->template ptr<T>(prev_dy);
   2630         for( dx = 0; dx < dsize.width; dx++ )
   2631             D[dx] = saturate_cast<T>(sum[dx]);
   2632         }
   2633     }
   2634 
   2635 private:
   2636     const Mat* src;
   2637     Mat* dst;
   2638     const DecimateAlpha* xtab0;
   2639     const DecimateAlpha* ytab;
   2640     int xtab_size0, ytab_size;
   2641     const int* tabofs;
   2642 };
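         // The invoker above works row by row: each source row is first resampled
         // horizontally into "buf" using xtab, then accumulated into "sum" with the
         // vertical weight beta; "sum" is flushed to the destination whenever ytab
         // moves on to the next destination row (di changes).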
   2643 
   2644 
   2645 template <typename T, typename WT>
   2646 static void resizeArea_( const Mat& src, Mat& dst,
   2647                          const DecimateAlpha* xtab, int xtab_size,
   2648                          const DecimateAlpha* ytab, int ytab_size,
   2649                          const int* tabofs )
   2650 {
   2651     parallel_for_(Range(0, dst.rows),
   2652                  ResizeArea_Invoker<T, WT>(src, dst, xtab, xtab_size, ytab, ytab_size, tabofs),
   2653                  dst.total()/((double)(1 << 16)));
   2654 }
   2655 
   2656 
   2657 typedef void (*ResizeFunc)( const Mat& src, Mat& dst,
   2658                             const int* xofs, const void* alpha,
   2659                             const int* yofs, const void* beta,
   2660                             int xmin, int xmax, int ksize );
   2661 
   2662 typedef void (*ResizeAreaFastFunc)( const Mat& src, Mat& dst,
   2663                                     const int* ofs, const int *xofs,
   2664                                     int scale_x, int scale_y );
   2665 
   2666 typedef void (*ResizeAreaFunc)( const Mat& src, Mat& dst,
   2667                                 const DecimateAlpha* xtab, int xtab_size,
   2668                                 const DecimateAlpha* ytab, int ytab_size,
   2669                                 const int* yofs);
   2670 
   2671 
   2672 static int computeResizeAreaTab( int ssize, int dsize, int cn, double scale, DecimateAlpha* tab )
   2673 {
   2674     int k = 0;
   2675     for(int dx = 0; dx < dsize; dx++ )
   2676     {
   2677         double fsx1 = dx * scale;
   2678         double fsx2 = fsx1 + scale;
   2679         double cellWidth = std::min(scale, ssize - fsx1);
   2680 
   2681         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
   2682 
   2683         sx2 = std::min(sx2, ssize - 1);
   2684         sx1 = std::min(sx1, sx2);
   2685 
   2686         if( sx1 - fsx1 > 1e-3 )
   2687         {
   2688             assert( k < ssize*2 );
   2689             tab[k].di = dx * cn;
   2690             tab[k].si = (sx1 - 1) * cn;
   2691             tab[k++].alpha = (float)((sx1 - fsx1) / cellWidth);
   2692         }
   2693 
   2694         for(int sx = sx1; sx < sx2; sx++ )
   2695         {
   2696             assert( k < ssize*2 );
   2697             tab[k].di = dx * cn;
   2698             tab[k].si = sx * cn;
   2699             tab[k++].alpha = float(1.0 / cellWidth);
   2700         }
   2701 
   2702         if( fsx2 - sx2 > 1e-3 )
   2703         {
   2704             assert( k < ssize*2 );
   2705             tab[k].di = dx * cn;
   2706             tab[k].si = sx2 * cn;
   2707             tab[k++].alpha = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
   2708         }
   2709     }
   2710     return k;
   2711 }
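         // Worked example: ssize = 5, dsize = 2, scale = 2.5, cn = 1 produces
         //   dst[0] = 0.4*src[0] + 0.4*src[1] + 0.2*src[2]
         //   dst[1] = 0.2*src[2] + 0.4*src[3] + 0.4*src[4]
         // i.e. src[2] straddles the cell boundary and is split between both cells,
         // and the weights of every destination element sum to 1.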
   2712 
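         // The macros below wire up the IPP resize pipeline for a given element TYPE
         // and channel count CN: pick the ippiResize{Linear|Cubic}_<TYPE>_<CN>R worker,
         // query the required specification-structure size, allocate the spec buffer,
         // and run the corresponding Init function on it.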
   2713 #define CHECK_IPP_STATUS(STATUS) if (STATUS < 0) { *ok = false; return; }
   2714 
   2715 #define SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN) \
   2716     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
   2717     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
   2718     specBuf.allocate(specSize);\
   2719     pSpec = (uchar*)specBuf;\
   2720     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_32f*)pSpec));
   2721 
   2722 #define SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(TYPE, CN) \
   2723     if (mode == (int)ippCubic) { *ok = false; return; } \
   2724     func = (ippiResizeFunc)ippiResizeLinear_##TYPE##_##CN##R; \
   2725     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
   2726     specBuf.allocate(specSize);\
   2727     pSpec = (uchar*)specBuf;\
   2728     CHECK_IPP_STATUS(ippiResizeLinearInit_##TYPE(srcSize, dstSize, (IppiResizeSpec_64f*)pSpec));\
   2729     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE;\
   2730     getSrcOffsetFunc =  (ippiResizeGetSrcOffset) ippiResizeGetSrcOffset_##TYPE;
   2731 
   2732 #define SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN) \
   2733     func = (ippiResizeFunc)ippiResizeCubic_##TYPE##_##CN##R; \
   2734     CHECK_IPP_STATUS(ippiResizeGetSize_##TYPE(srcSize, dstSize, (IppiInterpolationType)mode, 0, &specSize, &initSize));\
   2735     specBuf.allocate(specSize);\
   2736     pSpec = (uchar*)specBuf;\
   2737     AutoBuffer<uchar> buf(initSize);\
   2738     uchar* pInit = (uchar*)buf;\
   2739     CHECK_IPP_STATUS(ippiResizeCubicInit_##TYPE(srcSize, dstSize, 0.f, 0.75f, (IppiResizeSpec_32f*)pSpec, pInit));
   2740 
   2741 #define SET_IPP_RESIZE_PTR(TYPE, CN) \
   2742     if (mode == (int)ippLinear)     { SET_IPP_RESIZE_LINEAR_FUNC_PTR(TYPE, CN);} \
   2743     else if (mode == (int)ippCubic) { SET_IPP_RESIZE_CUBIC_FUNC_PTR(TYPE, CN);} \
   2744     else { *ok = false; return; } \
   2745     getBufferSizeFunc = (ippiResizeGetBufferSize)ippiResizeGetBufferSize_##TYPE; \
   2746     getSrcOffsetFunc =  (ippiResizeGetSrcOffset)ippiResizeGetSrcOffset_##TYPE;
   2747 
   2748 #if IPP_VERSION_X100 >= 701
   2749 class IPPresizeInvoker :
   2750     public ParallelLoopBody
   2751 {
   2752 public:
   2753     IPPresizeInvoker(const Mat & _src, Mat & _dst, double _inv_scale_x, double _inv_scale_y, int _mode, bool *_ok) :
   2754         ParallelLoopBody(), src(_src), dst(_dst), inv_scale_x(_inv_scale_x),
   2755         inv_scale_y(_inv_scale_y), pSpec(NULL), mode(_mode),
   2756         func(NULL), getBufferSizeFunc(NULL), getSrcOffsetFunc(NULL), ok(_ok)
   2757     {
   2758         *ok = true;
   2759         IppiSize srcSize, dstSize;
   2760         int type = src.type(), specSize = 0, initSize = 0;
   2761         srcSize.width  = src.cols;
   2762         srcSize.height = src.rows;
   2763         dstSize.width  = dst.cols;
   2764         dstSize.height = dst.rows;
   2765 
   2766         switch (type)
   2767         {
   2768 #if 0 // disabled since it breaks tests for CascadeClassifier
   2769             case CV_8UC1:  SET_IPP_RESIZE_PTR(8u,C1);  break;
   2770             case CV_8UC3:  SET_IPP_RESIZE_PTR(8u,C3);  break;
   2771             case CV_8UC4:  SET_IPP_RESIZE_PTR(8u,C4);  break;
   2772 #endif
   2773             case CV_16UC1: SET_IPP_RESIZE_PTR(16u,C1); break;
   2774             case CV_16UC3: SET_IPP_RESIZE_PTR(16u,C3); break;
   2775             case CV_16UC4: SET_IPP_RESIZE_PTR(16u,C4); break;
   2776             case CV_16SC1: SET_IPP_RESIZE_PTR(16s,C1); break;
   2777             case CV_16SC3: SET_IPP_RESIZE_PTR(16s,C3); break;
   2778             case CV_16SC4: SET_IPP_RESIZE_PTR(16s,C4); break;
   2779             case CV_32FC1: SET_IPP_RESIZE_PTR(32f,C1); break;
   2780             case CV_32FC3: SET_IPP_RESIZE_PTR(32f,C3); break;
   2781             case CV_32FC4: SET_IPP_RESIZE_PTR(32f,C4); break;
   2782             case CV_64FC1: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C1); break;
   2783             case CV_64FC3: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C3); break;
   2784             case CV_64FC4: SET_IPP_RESIZE_LINEAR_FUNC_64_PTR(64f,C4); break;
   2785             default: { *ok = false; return; } break;
   2786         }
   2787     }
   2788 
   2789     ~IPPresizeInvoker()
   2790     {
   2791     }
   2792 
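             // Each parallel stripe [range.start, range.end) of source rows is mapped
             // to the destination tile it produces via inv_scale_y; getSrcOffsetFunc
             // then reports which source offset IPP actually needs for that tile.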
   2793     virtual void operator() (const Range& range) const
   2794     {
   2795         if (*ok == false)
   2796             return;
   2797 
   2798         int cn = src.channels();
   2799         int dsty = min(cvRound(range.start * inv_scale_y), dst.rows);
   2800         int dstwidth  = min(cvRound(src.cols * inv_scale_x), dst.cols);
   2801         int dstheight = min(cvRound(range.end * inv_scale_y), dst.rows);
   2802 
   2803         IppiPoint dstOffset = { 0, dsty }, srcOffset = {0, 0};
   2804         IppiSize  dstSize   = { dstwidth, dstheight - dsty };
   2805         int bufsize = 0, itemSize = (int)src.elemSize1();
   2806 
   2807         CHECK_IPP_STATUS(getBufferSizeFunc(pSpec, dstSize, cn, &bufsize));
   2808         CHECK_IPP_STATUS(getSrcOffsetFunc(pSpec, dstOffset, &srcOffset));
   2809 
   2810         const Ipp8u* pSrc = src.ptr<Ipp8u>(srcOffset.y) + srcOffset.x * cn * itemSize;
   2811         Ipp8u* pDst = dst.ptr<Ipp8u>(dstOffset.y) + dstOffset.x * cn * itemSize;
   2812 
   2813         AutoBuffer<uchar> buf(bufsize + 64);
   2814         uchar* bufptr = alignPtr((uchar*)buf, 32);
   2815 
   2816         if( func( pSrc, (int)src.step[0], pDst, (int)dst.step[0], dstOffset, dstSize, ippBorderRepl, 0, pSpec, bufptr ) < 0 )
   2817             *ok = false;
   2818         else
   2819         {
   2820             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   2821         }
   2822     }
   2823 private:
   2824     const Mat & src;
   2825     Mat & dst;
   2826     double inv_scale_x;
   2827     double inv_scale_y;
   2828     void *pSpec;
   2829     AutoBuffer<uchar> specBuf;
   2830     int mode;
   2831     ippiResizeFunc func;
   2832     ippiResizeGetBufferSize getBufferSizeFunc;
   2833     ippiResizeGetSrcOffset getSrcOffsetFunc;
   2834     bool *ok;
   2835     const IPPresizeInvoker& operator= (const IPPresizeInvoker&);
   2836 };
   2837 
   2838 #endif
   2839 
   2840 #ifdef HAVE_OPENCL
   2841 
   2842 static void ocl_computeResizeAreaTabs(int ssize, int dsize, double scale, int * const map_tab,
   2843                                       float * const alpha_tab, int * const ofs_tab)
   2844 {
   2845     int k = 0, dx = 0;
   2846     for ( ; dx < dsize; dx++)
   2847     {
   2848         ofs_tab[dx] = k;
   2849 
   2850         double fsx1 = dx * scale;
   2851         double fsx2 = fsx1 + scale;
   2852         double cellWidth = std::min(scale, ssize - fsx1);
   2853 
   2854         int sx1 = cvCeil(fsx1), sx2 = cvFloor(fsx2);
   2855 
   2856         sx2 = std::min(sx2, ssize - 1);
   2857         sx1 = std::min(sx1, sx2);
   2858 
   2859         if (sx1 - fsx1 > 1e-3)
   2860         {
   2861             map_tab[k] = sx1 - 1;
   2862             alpha_tab[k++] = (float)((sx1 - fsx1) / cellWidth);
   2863         }
   2864 
   2865         for (int sx = sx1; sx < sx2; sx++)
   2866         {
   2867             map_tab[k] = sx;
   2868             alpha_tab[k++] = float(1.0 / cellWidth);
   2869         }
   2870 
   2871         if (fsx2 - sx2 > 1e-3)
   2872         {
   2873             map_tab[k] = sx2;
   2874             alpha_tab[k++] = (float)(std::min(std::min(fsx2 - sx2, 1.), cellWidth) / cellWidth);
   2875         }
   2876     }
   2877     ofs_tab[dx] = k;
   2878 }
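         // Same table construction as computeResizeAreaTab above, but flattened for the
         // OpenCL kernel: map_tab/alpha_tab hold the (si, alpha) pairs and ofs_tab gives,
         // per destination index, the starting position of its run inside those arrays.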
   2879 
   2880 static bool ocl_resize( InputArray _src, OutputArray _dst, Size dsize,
   2881                         double fx, double fy, int interpolation)
   2882 {
   2883     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   2884 
   2885     double inv_fx = 1.0 / fx, inv_fy = 1.0 / fy;
   2886     float inv_fxf = (float)inv_fx, inv_fyf = (float)inv_fy;
    2887     int iscale_x = saturate_cast<int>(inv_fx), iscale_y = saturate_cast<int>(inv_fy);
   2888     bool is_area_fast = std::abs(inv_fx - iscale_x) < DBL_EPSILON &&
   2889         std::abs(inv_fy - iscale_y) < DBL_EPSILON;
   2890 
    2891     // when both scale_x and scale_y are equal to 2,
    2892     // INTER_AREA (fast) produces the same result as INTER_LINEAR,
    2893     if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
    2894         /*interpolation = INTER_AREA*/(void)0; // but INTER_AREA is slower here, so keep INTER_LINEAR
   2895 
   2896     if( !(cn <= 4 &&
   2897            (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR ||
   2898             (interpolation == INTER_AREA && inv_fx >= 1 && inv_fy >= 1) )) )
   2899         return false;
   2900 
   2901     UMat src = _src.getUMat();
   2902     _dst.create(dsize, type);
   2903     UMat dst = _dst.getUMat();
   2904 
   2905     Size ssize = src.size();
   2906     ocl::Kernel k;
    2907     size_t globalsize[] = { (size_t)dst.cols, (size_t)dst.rows };
   2908 
   2909     ocl::Image2D srcImage;
   2910 
   2911     // See if this could be done with a sampler.  We stick with integer
   2912     // datatypes because the observed error is low.
   2913     bool useSampler = (interpolation == INTER_LINEAR && ocl::Device::getDefault().imageSupport() &&
   2914                        ocl::Image2D::canCreateAlias(src) && depth <= 4 &&
   2915                        ocl::Image2D::isFormatSupported(depth, cn, true) &&
   2916                        src.offset==0);
   2917     if (useSampler)
   2918     {
   2919         int wdepth = std::max(depth, CV_32S);
   2920         char buf[2][32];
   2921         cv::String compileOpts = format("-D USE_SAMPLER -D depth=%d -D T=%s -D T1=%s "
   2922                         "-D convertToDT=%s -D cn=%d",
   2923                         depth, ocl::typeToStr(type), ocl::typeToStr(depth),
   2924                         ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
   2925                         cn);
   2926         k.create("resizeSampler", ocl::imgproc::resize_oclsrc, compileOpts);
   2927 
   2928         if (k.empty())
   2929             useSampler = false;
   2930         else
   2931         {
   2932             // Convert the input into an OpenCL image type, using normalized channel data types
   2933             // and aliasing the UMat.
   2934             srcImage = ocl::Image2D(src, true, true);
   2935             k.args(srcImage, ocl::KernelArg::WriteOnly(dst),
   2936                    (float)inv_fx, (float)inv_fy);
   2937         }
   2938     }
   2939 
   2940     if (interpolation == INTER_LINEAR && !useSampler)
   2941     {
   2942         char buf[2][32];
   2943 
    2944         // the integer (fixed-point) path is slower because of the CPU-side table setup, so it is disabled
   2945         if (depth == CV_8U && ((void)0, 0))
   2946         {
   2947             AutoBuffer<uchar> _buffer((dsize.width + dsize.height)*(sizeof(int) + sizeof(short)*2));
   2948             int* xofs = (int*)(uchar*)_buffer, * yofs = xofs + dsize.width;
   2949             short* ialpha = (short*)(yofs + dsize.height), * ibeta = ialpha + dsize.width*2;
   2950             float fxx, fyy;
   2951             int sx, sy;
   2952 
   2953             for (int dx = 0; dx < dsize.width; dx++)
   2954             {
   2955                 fxx = (float)((dx+0.5)*inv_fx - 0.5);
   2956                 sx = cvFloor(fxx);
   2957                 fxx -= sx;
   2958 
   2959                 if (sx < 0)
   2960                     fxx = 0, sx = 0;
   2961 
   2962                 if (sx >= ssize.width-1)
   2963                     fxx = 0, sx = ssize.width-1;
   2964 
   2965                 xofs[dx] = sx;
   2966                 ialpha[dx*2 + 0] = saturate_cast<short>((1.f - fxx) * INTER_RESIZE_COEF_SCALE);
   2967                 ialpha[dx*2 + 1] = saturate_cast<short>(fxx         * INTER_RESIZE_COEF_SCALE);
   2968             }
   2969 
   2970             for (int dy = 0; dy < dsize.height; dy++)
   2971             {
   2972                 fyy = (float)((dy+0.5)*inv_fy - 0.5);
   2973                 sy = cvFloor(fyy);
   2974                 fyy -= sy;
   2975 
   2976                 yofs[dy] = sy;
   2977                 ibeta[dy*2 + 0] = saturate_cast<short>((1.f - fyy) * INTER_RESIZE_COEF_SCALE);
   2978                 ibeta[dy*2 + 1] = saturate_cast<short>(fyy         * INTER_RESIZE_COEF_SCALE);
   2979             }
   2980 
   2981             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
   2982             UMat coeffs;
   2983             Mat(1, static_cast<int>(_buffer.size()), CV_8UC1, (uchar *)_buffer).copyTo(coeffs);
   2984 
   2985             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
   2986                      format("-D INTER_LINEAR_INTEGER -D depth=%d -D T=%s -D T1=%s "
   2987                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
   2988                             "-D INTER_RESIZE_COEF_BITS=%d",
   2989                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
   2990                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
   2991                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
   2992                             cn, INTER_RESIZE_COEF_BITS));
   2993             if (k.empty())
   2994                 return false;
   2995 
   2996             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
   2997                    ocl::KernelArg::PtrReadOnly(coeffs));
   2998         }
   2999         else
   3000         {
   3001             int wdepth = std::max(depth, CV_32S), wtype = CV_MAKETYPE(wdepth, cn);
   3002             k.create("resizeLN", ocl::imgproc::resize_oclsrc,
   3003                      format("-D INTER_LINEAR -D depth=%d -D T=%s -D T1=%s "
   3004                             "-D WT=%s -D convertToWT=%s -D convertToDT=%s -D cn=%d "
   3005                             "-D INTER_RESIZE_COEF_BITS=%d",
   3006                             depth, ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
   3007                             ocl::convertTypeStr(depth, wdepth, cn, buf[0]),
   3008                             ocl::convertTypeStr(wdepth, depth, cn, buf[1]),
   3009                             cn, INTER_RESIZE_COEF_BITS));
   3010             if (k.empty())
   3011                 return false;
   3012 
   3013             k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
   3014                    (float)inv_fx, (float)inv_fy);
   3015         }
   3016     }
   3017     else if (interpolation == INTER_NEAREST)
   3018     {
   3019         k.create("resizeNN", ocl::imgproc::resize_oclsrc,
   3020                  format("-D INTER_NEAREST -D T=%s -D T1=%s -D cn=%d",
   3021                         ocl::vecopTypeToStr(type), ocl::vecopTypeToStr(depth), cn));
   3022         if (k.empty())
   3023             return false;
   3024 
   3025         k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst),
   3026                (float)inv_fx, (float)inv_fy);
   3027     }
   3028     else if (interpolation == INTER_AREA)
   3029     {
   3030         int wdepth = std::max(depth, is_area_fast ? CV_32S : CV_32F);
   3031         int wtype = CV_MAKE_TYPE(wdepth, cn);
   3032 
   3033         char cvt[2][40];
   3034         String buildOption = format("-D INTER_AREA -D T=%s -D T1=%s -D WTV=%s -D convertToWTV=%s -D cn=%d",
   3035                                     ocl::typeToStr(type), ocl::typeToStr(depth), ocl::typeToStr(wtype),
   3036                                     ocl::convertTypeStr(depth, wdepth, cn, cvt[0]), cn);
   3037 
   3038         UMat alphaOcl, tabofsOcl, mapOcl;
   3039         UMat dmap, smap;
   3040 
   3041         if (is_area_fast)
   3042         {
   3043             int wdepth2 = std::max(CV_32F, depth), wtype2 = CV_MAKE_TYPE(wdepth2, cn);
   3044             buildOption = buildOption + format(" -D convertToT=%s -D WT2V=%s -D convertToWT2V=%s -D INTER_AREA_FAST"
   3045                                                 " -D XSCALE=%d -D YSCALE=%d -D SCALE=%ff",
   3046                                                 ocl::convertTypeStr(wdepth2, depth, cn, cvt[0]),
   3047                                                 ocl::typeToStr(wtype2), ocl::convertTypeStr(wdepth, wdepth2, cn, cvt[1]),
   3048                                     iscale_x, iscale_y, 1.0f / (iscale_x * iscale_y));
   3049 
   3050             k.create("resizeAREA_FAST", ocl::imgproc::resize_oclsrc, buildOption);
   3051             if (k.empty())
   3052                 return false;
   3053         }
   3054         else
   3055         {
   3056             buildOption = buildOption + format(" -D convertToT=%s", ocl::convertTypeStr(wdepth, depth, cn, cvt[0]));
   3057             k.create("resizeAREA", ocl::imgproc::resize_oclsrc, buildOption);
   3058             if (k.empty())
   3059                 return false;
   3060 
   3061             int xytab_size = (ssize.width + ssize.height) << 1;
   3062             int tabofs_size = dsize.height + dsize.width + 2;
   3063 
   3064             AutoBuffer<int> _xymap_tab(xytab_size), _xyofs_tab(tabofs_size);
   3065             AutoBuffer<float> _xyalpha_tab(xytab_size);
   3066             int * xmap_tab = _xymap_tab, * ymap_tab = _xymap_tab + (ssize.width << 1);
   3067             float * xalpha_tab = _xyalpha_tab, * yalpha_tab = _xyalpha_tab + (ssize.width << 1);
   3068             int * xofs_tab = _xyofs_tab, * yofs_tab = _xyofs_tab + dsize.width + 1;
   3069 
   3070             ocl_computeResizeAreaTabs(ssize.width, dsize.width, inv_fx, xmap_tab, xalpha_tab, xofs_tab);
   3071             ocl_computeResizeAreaTabs(ssize.height, dsize.height, inv_fy, ymap_tab, yalpha_tab, yofs_tab);
   3072 
    3073             // upload the precomputed tables to the GPU
   3074             Mat(1, xytab_size, CV_32FC1, (void *)_xyalpha_tab).copyTo(alphaOcl);
   3075             Mat(1, xytab_size, CV_32SC1, (void *)_xymap_tab).copyTo(mapOcl);
   3076             Mat(1, tabofs_size, CV_32SC1, (void *)_xyofs_tab).copyTo(tabofsOcl);
   3077         }
   3078 
   3079         ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst);
   3080 
   3081         if (is_area_fast)
   3082             k.args(srcarg, dstarg);
   3083         else
   3084             k.args(srcarg, dstarg, inv_fxf, inv_fyf, ocl::KernelArg::PtrReadOnly(tabofsOcl),
   3085                    ocl::KernelArg::PtrReadOnly(mapOcl), ocl::KernelArg::PtrReadOnly(alphaOcl));
   3086 
   3087         return k.run(2, globalsize, NULL, false);
   3088     }
   3089 
    3090     return k.run(2, globalsize, NULL, false);
   3091 }
   3092 
   3093 #endif
   3094 
   3095 }
   3096 
   3097 //////////////////////////////////////////////////////////////////////////////////////////
   3098 
   3099 void cv::resize( InputArray _src, OutputArray _dst, Size dsize,
   3100                  double inv_scale_x, double inv_scale_y, int interpolation )
   3101 {
   3102     static ResizeFunc linear_tab[] =
   3103     {
   3104         resizeGeneric_<
   3105             HResizeLinear<uchar, int, short,
   3106                 INTER_RESIZE_COEF_SCALE,
   3107                 HResizeLinearVec_8u32s>,
   3108             VResizeLinear<uchar, int, short,
   3109                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
   3110                 VResizeLinearVec_32s8u> >,
   3111         0,
   3112         resizeGeneric_<
   3113             HResizeLinear<ushort, float, float, 1,
   3114                 HResizeLinearVec_16u32f>,
   3115             VResizeLinear<ushort, float, float, Cast<float, ushort>,
   3116                 VResizeLinearVec_32f16u> >,
   3117         resizeGeneric_<
   3118             HResizeLinear<short, float, float, 1,
   3119                 HResizeLinearVec_16s32f>,
   3120             VResizeLinear<short, float, float, Cast<float, short>,
   3121                 VResizeLinearVec_32f16s> >,
   3122         0,
   3123         resizeGeneric_<
   3124             HResizeLinear<float, float, float, 1,
   3125                 HResizeLinearVec_32f>,
   3126             VResizeLinear<float, float, float, Cast<float, float>,
   3127                 VResizeLinearVec_32f> >,
   3128         resizeGeneric_<
   3129             HResizeLinear<double, double, float, 1,
   3130                 HResizeNoVec>,
   3131             VResizeLinear<double, double, float, Cast<double, double>,
   3132                 VResizeNoVec> >,
   3133         0
   3134     };
   3135 
   3136     static ResizeFunc cubic_tab[] =
   3137     {
   3138         resizeGeneric_<
   3139             HResizeCubic<uchar, int, short>,
   3140             VResizeCubic<uchar, int, short,
   3141                 FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
   3142                 VResizeCubicVec_32s8u> >,
   3143         0,
   3144         resizeGeneric_<
   3145             HResizeCubic<ushort, float, float>,
   3146             VResizeCubic<ushort, float, float, Cast<float, ushort>,
   3147             VResizeCubicVec_32f16u> >,
   3148         resizeGeneric_<
   3149             HResizeCubic<short, float, float>,
   3150             VResizeCubic<short, float, float, Cast<float, short>,
   3151             VResizeCubicVec_32f16s> >,
   3152         0,
   3153         resizeGeneric_<
   3154             HResizeCubic<float, float, float>,
   3155             VResizeCubic<float, float, float, Cast<float, float>,
   3156             VResizeCubicVec_32f> >,
   3157         resizeGeneric_<
   3158             HResizeCubic<double, double, float>,
   3159             VResizeCubic<double, double, float, Cast<double, double>,
   3160             VResizeNoVec> >,
   3161         0
   3162     };
   3163 
   3164     static ResizeFunc lanczos4_tab[] =
   3165     {
   3166         resizeGeneric_<HResizeLanczos4<uchar, int, short>,
   3167             VResizeLanczos4<uchar, int, short,
   3168             FixedPtCast<int, uchar, INTER_RESIZE_COEF_BITS*2>,
   3169             VResizeNoVec> >,
   3170         0,
   3171         resizeGeneric_<HResizeLanczos4<ushort, float, float>,
   3172             VResizeLanczos4<ushort, float, float, Cast<float, ushort>,
   3173             VResizeLanczos4Vec_32f16u> >,
   3174         resizeGeneric_<HResizeLanczos4<short, float, float>,
   3175             VResizeLanczos4<short, float, float, Cast<float, short>,
   3176             VResizeLanczos4Vec_32f16s> >,
   3177         0,
   3178         resizeGeneric_<HResizeLanczos4<float, float, float>,
   3179             VResizeLanczos4<float, float, float, Cast<float, float>,
   3180             VResizeLanczos4Vec_32f> >,
   3181         resizeGeneric_<HResizeLanczos4<double, double, float>,
   3182             VResizeLanczos4<double, double, float, Cast<double, double>,
   3183             VResizeNoVec> >,
   3184         0
   3185     };
   3186 
   3187     static ResizeAreaFastFunc areafast_tab[] =
   3188     {
   3189         resizeAreaFast_<uchar, int, ResizeAreaFastVec<uchar, ResizeAreaFastVec_SIMD_8u> >,
   3190         0,
   3191         resizeAreaFast_<ushort, float, ResizeAreaFastVec<ushort, ResizeAreaFastVec_SIMD_16u> >,
   3192         resizeAreaFast_<short, float, ResizeAreaFastVec<short, ResizeAreaFastVec_SIMD_16s> >,
   3193         0,
   3194         resizeAreaFast_<float, float, ResizeAreaFastVec_SIMD_32f>,
   3195         resizeAreaFast_<double, double, ResizeAreaFastNoVec<double, double> >,
   3196         0
   3197     };
   3198 
   3199     static ResizeAreaFunc area_tab[] =
   3200     {
   3201         resizeArea_<uchar, float>, 0, resizeArea_<ushort, float>,
   3202         resizeArea_<short, float>, 0, resizeArea_<float, float>,
   3203         resizeArea_<double, double>, 0
   3204     };
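             // All of the dispatch tables above are indexed by CV_MAT_DEPTH(type);
             // null entries mark depths with no implementation (CV_8S and CV_32S).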
   3205 
   3206     Size ssize = _src.size();
   3207 
   3208     CV_Assert( ssize.area() > 0 );
   3209     CV_Assert( dsize.area() > 0 || (inv_scale_x > 0 && inv_scale_y > 0) );
   3210     if( dsize.area() == 0 )
   3211     {
   3212         dsize = Size(saturate_cast<int>(ssize.width*inv_scale_x),
   3213                      saturate_cast<int>(ssize.height*inv_scale_y));
   3214         CV_Assert( dsize.area() > 0 );
   3215     }
   3216     else
   3217     {
   3218         inv_scale_x = (double)dsize.width/ssize.width;
   3219         inv_scale_y = (double)dsize.height/ssize.height;
   3220     }
   3221 
   3222     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat() && _src.cols() > 10 && _src.rows() > 10,
   3223                ocl_resize(_src, _dst, dsize, inv_scale_x, inv_scale_y, interpolation))
   3224 
   3225     Mat src = _src.getMat();
   3226     _dst.create(dsize, src.type());
   3227     Mat dst = _dst.getMat();
   3228 
   3229 #ifdef HAVE_TEGRA_OPTIMIZATION
   3230     if (tegra::useTegra() && tegra::resize(src, dst, (float)inv_scale_x, (float)inv_scale_y, interpolation))
   3231         return;
   3232 #endif
   3233 
   3234     int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   3235     double scale_x = 1./inv_scale_x, scale_y = 1./inv_scale_y;
   3236     int k, sx, sy, dx, dy;
   3237 
   3238     int iscale_x = saturate_cast<int>(scale_x);
   3239     int iscale_y = saturate_cast<int>(scale_y);
   3240 
   3241     bool is_area_fast = std::abs(scale_x - iscale_x) < DBL_EPSILON &&
   3242             std::abs(scale_y - iscale_y) < DBL_EPSILON;
   3243 
   3244 #if IPP_VERSION_X100 >= 701
   3245     CV_IPP_CHECK()
   3246     {
   3247 #define IPP_RESIZE_EPS 1e-10
   3248 
   3249         double ex = fabs((double)dsize.width / src.cols  - inv_scale_x) / inv_scale_x;
   3250         double ey = fabs((double)dsize.height / src.rows - inv_scale_y) / inv_scale_y;
   3251 
   3252         if ( ((ex < IPP_RESIZE_EPS && ey < IPP_RESIZE_EPS && depth != CV_64F) || (ex == 0 && ey == 0 && depth == CV_64F)) &&
   3253              (interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
   3254              !(interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 && depth == CV_8U))
   3255         {
   3256             int mode = -1;
   3257             if (interpolation == INTER_LINEAR && src.rows >= 2 && src.cols >= 2)
   3258                 mode = ippLinear;
   3259             else if (interpolation == INTER_CUBIC && src.rows >= 4 && src.cols >= 4)
   3260                 mode = ippCubic;
   3261 
   3262             if( mode >= 0 && (cn == 1 || cn == 3 || cn == 4) &&
   3263                 (depth == CV_16U || depth == CV_16S || depth == CV_32F ||
   3264                 (depth == CV_64F && mode == ippLinear)))
   3265             {
   3266                 bool ok = true;
   3267                 Range range(0, src.rows);
   3268                 IPPresizeInvoker invoker(src, dst, inv_scale_x, inv_scale_y, mode, &ok);
   3269                 parallel_for_(range, invoker, dst.total()/(double)(1<<16));
   3270                 if( ok )
   3271                 {
   3272                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   3273                     return;
   3274                 }
   3275                 setIppErrorStatus();
   3276             }
   3277         }
   3278 #undef IPP_RESIZE_EPS
   3279     }
   3280 #endif
   3281 
   3282     if( interpolation == INTER_NEAREST )
   3283     {
   3284         resizeNN( src, dst, inv_scale_x, inv_scale_y );
   3285         return;
   3286     }
   3287 
   3288     {
    3289         // when both scale_x and scale_y are equal to 2,
    3290         // INTER_AREA (fast) produces the same result as INTER_LINEAR
   3291         if( interpolation == INTER_LINEAR && is_area_fast && iscale_x == 2 && iscale_y == 2 )
   3292             interpolation = INTER_AREA;
   3293 
    3294         // true "area" interpolation is only implemented for the case (scale_x >= 1 && scale_y >= 1), i.e. shrinking.
   3295         // In other cases it is emulated using some variant of bilinear interpolation
   3296         if( interpolation == INTER_AREA && scale_x >= 1 && scale_y >= 1 )
   3297         {
   3298             if( is_area_fast )
   3299             {
   3300                 int area = iscale_x*iscale_y;
   3301                 size_t srcstep = src.step / src.elemSize1();
   3302                 AutoBuffer<int> _ofs(area + dsize.width*cn);
   3303                 int* ofs = _ofs;
   3304                 int* xofs = ofs + area;
   3305                 ResizeAreaFastFunc func = areafast_tab[depth];
   3306                 CV_Assert( func != 0 );
   3307 
   3308                 for( sy = 0, k = 0; sy < iscale_y; sy++ )
   3309                     for( sx = 0; sx < iscale_x; sx++ )
   3310                         ofs[k++] = (int)(sy*srcstep + sx*cn);
   3311 
   3312                 for( dx = 0; dx < dsize.width; dx++ )
   3313                 {
   3314                     int j = dx * cn;
   3315                     sx = iscale_x * j;
   3316                     for( k = 0; k < cn; k++ )
   3317                         xofs[j + k] = sx + k;
   3318                 }
   3319 
   3320                 func( src, dst, ofs, xofs, iscale_x, iscale_y );
   3321                 return;
   3322             }
   3323 
   3324             ResizeAreaFunc func = area_tab[depth];
   3325             CV_Assert( func != 0 && cn <= 4 );
   3326 
   3327             AutoBuffer<DecimateAlpha> _xytab((ssize.width + ssize.height)*2);
   3328             DecimateAlpha* xtab = _xytab, *ytab = xtab + ssize.width*2;
   3329 
   3330             int xtab_size = computeResizeAreaTab(ssize.width, dsize.width, cn, scale_x, xtab);
   3331             int ytab_size = computeResizeAreaTab(ssize.height, dsize.height, 1, scale_y, ytab);
   3332 
   3333             AutoBuffer<int> _tabofs(dsize.height + 1);
   3334             int* tabofs = _tabofs;
   3335             for( k = 0, dy = 0; k < ytab_size; k++ )
   3336             {
   3337                 if( k == 0 || ytab[k].di != ytab[k-1].di )
   3338                 {
   3339                     assert( ytab[k].di == dy );
   3340                     tabofs[dy++] = k;
   3341                 }
   3342             }
   3343             tabofs[dy] = ytab_size;
   3344 
   3345             func( src, dst, xtab, xtab_size, ytab, ytab_size, tabofs );
   3346             return;
   3347         }
   3348     }
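             // General separable path: build, per destination column and row, a kernel
             // of "ksize" taps (2 for linear, 4 for cubic, 8 for Lanczos4). [xmin, xmax)
             // marks the column range whose taps stay fully inside the source row, so
             // the inner loops can skip border handling there.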
   3349 
   3350     int xmin = 0, xmax = dsize.width, width = dsize.width*cn;
   3351     bool area_mode = interpolation == INTER_AREA;
   3352     bool fixpt = depth == CV_8U;
   3353     float fx, fy;
   3354     ResizeFunc func=0;
   3355     int ksize=0, ksize2;
   3356     if( interpolation == INTER_CUBIC )
   3357         ksize = 4, func = cubic_tab[depth];
   3358     else if( interpolation == INTER_LANCZOS4 )
   3359         ksize = 8, func = lanczos4_tab[depth];
   3360     else if( interpolation == INTER_LINEAR || interpolation == INTER_AREA )
   3361         ksize = 2, func = linear_tab[depth];
   3362     else
   3363         CV_Error( CV_StsBadArg, "Unknown interpolation method" );
   3364     ksize2 = ksize/2;
   3365 
   3366     CV_Assert( func != 0 );
   3367 
   3368     AutoBuffer<uchar> _buffer((width + dsize.height)*(sizeof(int) + sizeof(float)*ksize));
   3369     int* xofs = (int*)(uchar*)_buffer;
   3370     int* yofs = xofs + width;
   3371     float* alpha = (float*)(yofs + dsize.height);
   3372     short* ialpha = (short*)alpha;
   3373     float* beta = alpha + width*ksize;
   3374     short* ibeta = ialpha + width*ksize;
   3375     float cbuf[MAX_ESIZE];
   3376 
   3377     for( dx = 0; dx < dsize.width; dx++ )
   3378     {
   3379         if( !area_mode )
   3380         {
   3381             fx = (float)((dx+0.5)*scale_x - 0.5);
   3382             sx = cvFloor(fx);
   3383             fx -= sx;
   3384         }
   3385         else
   3386         {
   3387             sx = cvFloor(dx*scale_x);
   3388             fx = (float)((dx+1) - (sx+1)*inv_scale_x);
   3389             fx = fx <= 0 ? 0.f : fx - cvFloor(fx);
   3390         }
   3391 
   3392         if( sx < ksize2-1 )
   3393         {
   3394             xmin = dx+1;
   3395             if( sx < 0 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
   3396                 fx = 0, sx = 0;
   3397         }
   3398 
   3399         if( sx + ksize2 >= ssize.width )
   3400         {
   3401             xmax = std::min( xmax, dx );
   3402             if( sx >= ssize.width-1 && (interpolation != INTER_CUBIC && interpolation != INTER_LANCZOS4))
   3403                 fx = 0, sx = ssize.width-1;
   3404         }
   3405 
   3406         for( k = 0, sx *= cn; k < cn; k++ )
   3407             xofs[dx*cn + k] = sx + k;
   3408 
   3409         if( interpolation == INTER_CUBIC )
   3410             interpolateCubic( fx, cbuf );
   3411         else if( interpolation == INTER_LANCZOS4 )
   3412             interpolateLanczos4( fx, cbuf );
   3413         else
   3414         {
   3415             cbuf[0] = 1.f - fx;
   3416             cbuf[1] = fx;
   3417         }
   3418         if( fixpt )
   3419         {
   3420             for( k = 0; k < ksize; k++ )
   3421                 ialpha[dx*cn*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
   3422             for( ; k < cn*ksize; k++ )
   3423                 ialpha[dx*cn*ksize + k] = ialpha[dx*cn*ksize + k - ksize];
   3424         }
   3425         else
   3426         {
   3427             for( k = 0; k < ksize; k++ )
   3428                 alpha[dx*cn*ksize + k] = cbuf[k];
   3429             for( ; k < cn*ksize; k++ )
   3430                 alpha[dx*cn*ksize + k] = alpha[dx*cn*ksize + k - ksize];
   3431         }
   3432     }
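             // For 8-bit images the coefficients are stored as fixed-point shorts scaled
             // by INTER_RESIZE_COEF_SCALE (= 2^INTER_RESIZE_COEF_BITS); otherwise they
             // stay float. The first ksize taps are computed once per column and then
             // replicated for each extra channel. The vertical table below is analogous.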
   3433 
   3434     for( dy = 0; dy < dsize.height; dy++ )
   3435     {
   3436         if( !area_mode )
   3437         {
   3438             fy = (float)((dy+0.5)*scale_y - 0.5);
   3439             sy = cvFloor(fy);
   3440             fy -= sy;
   3441         }
   3442         else
   3443         {
   3444             sy = cvFloor(dy*scale_y);
   3445             fy = (float)((dy+1) - (sy+1)*inv_scale_y);
   3446             fy = fy <= 0 ? 0.f : fy - cvFloor(fy);
   3447         }
   3448 
   3449         yofs[dy] = sy;
   3450         if( interpolation == INTER_CUBIC )
   3451             interpolateCubic( fy, cbuf );
   3452         else if( interpolation == INTER_LANCZOS4 )
   3453             interpolateLanczos4( fy, cbuf );
   3454         else
   3455         {
   3456             cbuf[0] = 1.f - fy;
   3457             cbuf[1] = fy;
   3458         }
   3459 
   3460         if( fixpt )
   3461         {
   3462             for( k = 0; k < ksize; k++ )
   3463                 ibeta[dy*ksize + k] = saturate_cast<short>(cbuf[k]*INTER_RESIZE_COEF_SCALE);
   3464         }
   3465         else
   3466         {
   3467             for( k = 0; k < ksize; k++ )
   3468                 beta[dy*ksize + k] = cbuf[k];
   3469         }
   3470     }
   3471 
   3472     func( src, dst, xofs, fixpt ? (void*)ialpha : (void*)alpha, yofs,
   3473           fixpt ? (void*)ibeta : (void*)beta, xmin, xmax, ksize );
   3474 }
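         // Usage sketch (illustrative only; "input.png" is a placeholder path):
         //     cv::Mat src = cv::imread("input.png"), dst;
         //     cv::resize(src, dst, cv::Size(), 0.5, 0.5, cv::INTER_AREA);       // shrink by 2x
         //     cv::resize(src, dst, cv::Size(640, 480), 0, 0, cv::INTER_LINEAR); // exact target size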
   3475 
   3476 
   3477 /****************************************************************************************\
   3478 *                       General warping (affine, perspective, remap)                     *
   3479 \****************************************************************************************/
   3480 
   3481 namespace cv
   3482 {
   3483 
   3484 template<typename T>
   3485 static void remapNearest( const Mat& _src, Mat& _dst, const Mat& _xy,
   3486                           int borderType, const Scalar& _borderValue )
   3487 {
   3488     Size ssize = _src.size(), dsize = _dst.size();
   3489     int cn = _src.channels();
   3490     const T* S0 = _src.ptr<T>();
   3491     size_t sstep = _src.step/sizeof(S0[0]);
   3492     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
   3493         saturate_cast<T>(_borderValue[1]),
   3494         saturate_cast<T>(_borderValue[2]),
   3495         saturate_cast<T>(_borderValue[3]));
   3496     int dx, dy;
   3497 
   3498     unsigned width1 = ssize.width, height1 = ssize.height;
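             // casting to unsigned folds the two range checks into one: a negative sx
             // wraps to a huge value, so (unsigned)sx < width1 tests 0 <= sx < width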
   3499 
   3500     if( _dst.isContinuous() && _xy.isContinuous() )
   3501     {
   3502         dsize.width *= dsize.height;
   3503         dsize.height = 1;
   3504     }
   3505 
   3506     for( dy = 0; dy < dsize.height; dy++ )
   3507     {
   3508         T* D = _dst.ptr<T>(dy);
   3509         const short* XY = _xy.ptr<short>(dy);
   3510 
   3511         if( cn == 1 )
   3512         {
   3513             for( dx = 0; dx < dsize.width; dx++ )
   3514             {
   3515                 int sx = XY[dx*2], sy = XY[dx*2+1];
   3516                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
   3517                     D[dx] = S0[sy*sstep + sx];
   3518                 else
   3519                 {
   3520                     if( borderType == BORDER_REPLICATE )
   3521                     {
   3522                         sx = clip(sx, 0, ssize.width);
   3523                         sy = clip(sy, 0, ssize.height);
   3524                         D[dx] = S0[sy*sstep + sx];
   3525                     }
   3526                     else if( borderType == BORDER_CONSTANT )
   3527                         D[dx] = cval[0];
   3528                     else if( borderType != BORDER_TRANSPARENT )
   3529                     {
   3530                         sx = borderInterpolate(sx, ssize.width, borderType);
   3531                         sy = borderInterpolate(sy, ssize.height, borderType);
   3532                         D[dx] = S0[sy*sstep + sx];
   3533                     }
   3534                 }
   3535             }
   3536         }
   3537         else
   3538         {
   3539             for( dx = 0; dx < dsize.width; dx++, D += cn )
   3540             {
   3541                 int sx = XY[dx*2], sy = XY[dx*2+1], k;
   3542                 const T *S;
   3543                 if( (unsigned)sx < width1 && (unsigned)sy < height1 )
   3544                 {
   3545                     if( cn == 3 )
   3546                     {
   3547                         S = S0 + sy*sstep + sx*3;
   3548                         D[0] = S[0], D[1] = S[1], D[2] = S[2];
   3549                     }
   3550                     else if( cn == 4 )
   3551                     {
   3552                         S = S0 + sy*sstep + sx*4;
   3553                         D[0] = S[0], D[1] = S[1], D[2] = S[2], D[3] = S[3];
   3554                     }
   3555                     else
   3556                     {
   3557                         S = S0 + sy*sstep + sx*cn;
   3558                         for( k = 0; k < cn; k++ )
   3559                             D[k] = S[k];
   3560                     }
   3561                 }
   3562                 else if( borderType != BORDER_TRANSPARENT )
   3563                 {
   3564                     if( borderType == BORDER_REPLICATE )
   3565                     {
   3566                         sx = clip(sx, 0, ssize.width);
   3567                         sy = clip(sy, 0, ssize.height);
   3568                         S = S0 + sy*sstep + sx*cn;
   3569                     }
   3570                     else if( borderType == BORDER_CONSTANT )
   3571                         S = &cval[0];
   3572                     else
   3573                     {
   3574                         sx = borderInterpolate(sx, ssize.width, borderType);
   3575                         sy = borderInterpolate(sy, ssize.height, borderType);
   3576                         S = S0 + sy*sstep + sx*cn;
   3577                     }
   3578                     for( k = 0; k < cn; k++ )
   3579                         D[k] = S[k];
   3580                 }
   3581             }
   3582         }
   3583     }
   3584 }
   3585 
   3586 
   3587 struct RemapNoVec
   3588 {
   3589     int operator()( const Mat&, void*, const short*, const ushort*,
   3590                     const void*, int ) const { return 0; }
   3591 };
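         // Contract of the remap vector op: process as many destination pixels as the
         // SIMD width allows and return how many were handled; the scalar loop finishes
         // the rest. RemapNoVec returns 0 so everything falls back to scalar code.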
   3592 
   3593 #if CV_SSE2
   3594 
   3595 struct RemapVec_8u
   3596 {
   3597     int operator()( const Mat& _src, void* _dst, const short* XY,
   3598                     const ushort* FXY, const void* _wtab, int width ) const
   3599     {
   3600         int cn = _src.channels(), x = 0, sstep = (int)_src.step;
   3601 
   3602         if( (cn != 1 && cn != 3 && cn != 4) || !checkHardwareSupport(CV_CPU_SSE2) ||
   3603             sstep > 0x8000 )
   3604             return 0;
   3605 
   3606         const uchar *S0 = _src.ptr(), *S1 = _src.ptr(1);
   3607         const short* wtab = cn == 1 ? (const short*)_wtab : &BilinearTab_iC4[0][0][0];
   3608         uchar* D = (uchar*)_dst;
   3609         __m128i delta = _mm_set1_epi32(INTER_REMAP_COEF_SCALE/2);
   3610         __m128i xy2ofs = _mm_set1_epi32(cn + (sstep << 16));
   3611         __m128i z = _mm_setzero_si128();
   3612         int CV_DECL_ALIGNED(16) iofs0[4], iofs1[4];
   3613 
   3614         if( cn == 1 )
   3615         {
   3616             for( ; x <= width - 8; x += 8 )
   3617             {
   3618                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
   3619                 __m128i xy1 = _mm_loadu_si128( (const __m128i*)(XY + x*2 + 8));
   3620                 __m128i v0, v1, v2, v3, a0, a1, b0, b1;
   3621                 unsigned i0, i1;
   3622 
   3623                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
   3624                 xy1 = _mm_madd_epi16( xy1, xy2ofs );
   3625                 _mm_store_si128( (__m128i*)iofs0, xy0 );
   3626                 _mm_store_si128( (__m128i*)iofs1, xy1 );
   3627 
   3628                 i0 = *(ushort*)(S0 + iofs0[0]) + (*(ushort*)(S0 + iofs0[1]) << 16);
   3629                 i1 = *(ushort*)(S0 + iofs0[2]) + (*(ushort*)(S0 + iofs0[3]) << 16);
   3630                 v0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
   3631                 i0 = *(ushort*)(S1 + iofs0[0]) + (*(ushort*)(S1 + iofs0[1]) << 16);
   3632                 i1 = *(ushort*)(S1 + iofs0[2]) + (*(ushort*)(S1 + iofs0[3]) << 16);
   3633                 v1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
   3634                 v0 = _mm_unpacklo_epi8(v0, z);
   3635                 v1 = _mm_unpacklo_epi8(v1, z);
   3636 
   3637                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x]*4)),
   3638                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+1]*4)));
   3639                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+2]*4)),
   3640                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+3]*4)));
   3641                 b0 = _mm_unpacklo_epi64(a0, a1);
   3642                 b1 = _mm_unpackhi_epi64(a0, a1);
   3643                 v0 = _mm_madd_epi16(v0, b0);
   3644                 v1 = _mm_madd_epi16(v1, b1);
   3645                 v0 = _mm_add_epi32(_mm_add_epi32(v0, v1), delta);
   3646 
   3647                 i0 = *(ushort*)(S0 + iofs1[0]) + (*(ushort*)(S0 + iofs1[1]) << 16);
   3648                 i1 = *(ushort*)(S0 + iofs1[2]) + (*(ushort*)(S0 + iofs1[3]) << 16);
   3649                 v2 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
   3650                 i0 = *(ushort*)(S1 + iofs1[0]) + (*(ushort*)(S1 + iofs1[1]) << 16);
   3651                 i1 = *(ushort*)(S1 + iofs1[2]) + (*(ushort*)(S1 + iofs1[3]) << 16);
   3652                 v3 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(i0), _mm_cvtsi32_si128(i1));
   3653                 v2 = _mm_unpacklo_epi8(v2, z);
   3654                 v3 = _mm_unpacklo_epi8(v3, z);
   3655 
   3656                 a0 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+4]*4)),
   3657                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+5]*4)));
   3658                 a1 = _mm_unpacklo_epi32(_mm_loadl_epi64((__m128i*)(wtab+FXY[x+6]*4)),
   3659                                         _mm_loadl_epi64((__m128i*)(wtab+FXY[x+7]*4)));
   3660                 b0 = _mm_unpacklo_epi64(a0, a1);
   3661                 b1 = _mm_unpackhi_epi64(a0, a1);
   3662                 v2 = _mm_madd_epi16(v2, b0);
   3663                 v3 = _mm_madd_epi16(v3, b1);
   3664                 v2 = _mm_add_epi32(_mm_add_epi32(v2, v3), delta);
   3665 
   3666                 v0 = _mm_srai_epi32(v0, INTER_REMAP_COEF_BITS);
   3667                 v2 = _mm_srai_epi32(v2, INTER_REMAP_COEF_BITS);
   3668                 v0 = _mm_packus_epi16(_mm_packs_epi32(v0, v2), z);
   3669                 _mm_storel_epi64( (__m128i*)(D + x), v0 );
   3670             }
   3671         }
   3672         else if( cn == 3 )
   3673         {
   3674             for( ; x <= width - 5; x += 4, D += 12 )
   3675             {
   3676                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
   3677                 __m128i u0, v0, u1, v1;
   3678 
   3679                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
   3680                 _mm_store_si128( (__m128i*)iofs0, xy0 );
   3681                 const __m128i *w0, *w1;
   3682                 w0 = (const __m128i*)(wtab + FXY[x]*16);
   3683                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
   3684 
   3685                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
   3686                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 3)));
   3687                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
   3688                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 3)));
   3689                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
   3690                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 3)));
   3691                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
   3692                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 3)));
   3693                 u0 = _mm_unpacklo_epi8(u0, z);
   3694                 v0 = _mm_unpacklo_epi8(v0, z);
   3695                 u1 = _mm_unpacklo_epi8(u1, z);
   3696                 v1 = _mm_unpacklo_epi8(v1, z);
   3697                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
   3698                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
   3699                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
   3700                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
   3701                 u0 = _mm_slli_si128(u0, 4);
   3702                 u0 = _mm_packs_epi32(u0, u1);
   3703                 u0 = _mm_packus_epi16(u0, u0);
   3704                 _mm_storel_epi64((__m128i*)D, _mm_srli_si128(u0,1));
   3705 
   3706                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
   3707                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
   3708 
   3709                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
   3710                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 3)));
   3711                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
   3712                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 3)));
   3713                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
   3714                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 3)));
   3715                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
   3716                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 3)));
   3717                 u0 = _mm_unpacklo_epi8(u0, z);
   3718                 v0 = _mm_unpacklo_epi8(v0, z);
   3719                 u1 = _mm_unpacklo_epi8(u1, z);
   3720                 v1 = _mm_unpacklo_epi8(v1, z);
   3721                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
   3722                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
   3723                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
   3724                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
   3725                 u0 = _mm_slli_si128(u0, 4);
   3726                 u0 = _mm_packs_epi32(u0, u1);
   3727                 u0 = _mm_packus_epi16(u0, u0);
   3728                 _mm_storel_epi64((__m128i*)(D + 6), _mm_srli_si128(u0,1));
   3729             }
   3730         }
   3731         else if( cn == 4 )
   3732         {
   3733             for( ; x <= width - 4; x += 4, D += 16 )
   3734             {
   3735                 __m128i xy0 = _mm_loadu_si128( (const __m128i*)(XY + x*2));
   3736                 __m128i u0, v0, u1, v1;
   3737 
   3738                 xy0 = _mm_madd_epi16( xy0, xy2ofs );
   3739                 _mm_store_si128( (__m128i*)iofs0, xy0 );
   3740                 const __m128i *w0, *w1;
   3741                 w0 = (const __m128i*)(wtab + FXY[x]*16);
   3742                 w1 = (const __m128i*)(wtab + FXY[x+1]*16);
   3743 
   3744                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[0])),
   3745                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[0] + 4)));
   3746                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[0])),
   3747                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[0] + 4)));
   3748                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[1])),
   3749                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[1] + 4)));
   3750                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[1])),
   3751                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[1] + 4)));
   3752                 u0 = _mm_unpacklo_epi8(u0, z);
   3753                 v0 = _mm_unpacklo_epi8(v0, z);
   3754                 u1 = _mm_unpacklo_epi8(u1, z);
   3755                 v1 = _mm_unpacklo_epi8(v1, z);
   3756                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
   3757                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
   3758                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
   3759                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
   3760                 u0 = _mm_packs_epi32(u0, u1);
   3761                 u0 = _mm_packus_epi16(u0, u0);
   3762                 _mm_storel_epi64((__m128i*)D, u0);
   3763 
   3764                 w0 = (const __m128i*)(wtab + FXY[x+2]*16);
   3765                 w1 = (const __m128i*)(wtab + FXY[x+3]*16);
   3766 
   3767                 u0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[2])),
   3768                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[2] + 4)));
   3769                 v0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[2])),
   3770                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[2] + 4)));
   3771                 u1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S0 + iofs0[3])),
   3772                                        _mm_cvtsi32_si128(*(int*)(S0 + iofs0[3] + 4)));
   3773                 v1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(int*)(S1 + iofs0[3])),
   3774                                        _mm_cvtsi32_si128(*(int*)(S1 + iofs0[3] + 4)));
   3775                 u0 = _mm_unpacklo_epi8(u0, z);
   3776                 v0 = _mm_unpacklo_epi8(v0, z);
   3777                 u1 = _mm_unpacklo_epi8(u1, z);
   3778                 v1 = _mm_unpacklo_epi8(v1, z);
   3779                 u0 = _mm_add_epi32(_mm_madd_epi16(u0, w0[0]), _mm_madd_epi16(v0, w0[1]));
   3780                 u1 = _mm_add_epi32(_mm_madd_epi16(u1, w1[0]), _mm_madd_epi16(v1, w1[1]));
   3781                 u0 = _mm_srai_epi32(_mm_add_epi32(u0, delta), INTER_REMAP_COEF_BITS);
   3782                 u1 = _mm_srai_epi32(_mm_add_epi32(u1, delta), INTER_REMAP_COEF_BITS);
   3783                 u0 = _mm_packs_epi32(u0, u1);
   3784                 u0 = _mm_packus_epi16(u0, u0);
   3785                 _mm_storel_epi64((__m128i*)(D + 8), u0);
   3786             }
   3787         }
   3788 
   3789         return x;
   3790     }
   3791 };
   3792 
   3793 #else
   3794 
   3795 typedef RemapNoVec RemapVec_8u;
   3796 
   3797 #endif
   3798 
   3799 
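         /* remapBilinear: generic bilinear remap. Each destination row is scanned
            and split into runs of "inliers" (pixels whose 2x2 source neighborhood
            lies fully inside the image) and border runs. Inlier runs go through
            VecOp (SIMD where available) followed by channel-unrolled scalar code;
            border runs are handled per pixel via borderInterpolate(). FXY[dx] is
            the index of the (fx, fy) fractional position and selects a quadruple
            of precomputed weights at wtab + FXY[dx]*4.

            Illustrative use of the public API that ends up here (example only,
            not part of this file):

                Mat src = imread("in.png"), dst;
                Mat mapx(src.size(), CV_32FC1), mapy(src.size(), CV_32FC1);
                // ... fill mapx/mapy with source coordinates ...
                remap(src, dst, mapx, mapy, INTER_LINEAR, BORDER_CONSTANT, Scalar());
         */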
   3800 template<class CastOp, class VecOp, typename AT>
   3801 static void remapBilinear( const Mat& _src, Mat& _dst, const Mat& _xy,
   3802                            const Mat& _fxy, const void* _wtab,
   3803                            int borderType, const Scalar& _borderValue )
   3804 {
   3805     typedef typename CastOp::rtype T;
   3806     typedef typename CastOp::type1 WT;
   3807     Size ssize = _src.size(), dsize = _dst.size();
   3808     int k, cn = _src.channels();
   3809     const AT* wtab = (const AT*)_wtab;
   3810     const T* S0 = _src.ptr<T>();
   3811     size_t sstep = _src.step/sizeof(S0[0]);
   3812     T cval[CV_CN_MAX];
   3813     int dx, dy;
   3814     CastOp castOp;
   3815     VecOp vecOp;
   3816 
   3817     for( k = 0; k < cn; k++ )
   3818         cval[k] = saturate_cast<T>(_borderValue[k & 3]);
   3819 
   3820     unsigned width1 = std::max(ssize.width-1, 0), height1 = std::max(ssize.height-1, 0);
   3821     CV_Assert( ssize.area() > 0 );
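             // The SSE2 3-channel kernel loads 4 bytes per 3-byte pixel and so may
             // read past the end of a row; narrowing width1 routes the rightmost
             // columns to the scalar border path instead.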
   3822 #if CV_SSE2
   3823     if( _src.type() == CV_8UC3 )
   3824         width1 = std::max(ssize.width-2, 0);
   3825 #endif
   3826 
   3827     for( dy = 0; dy < dsize.height; dy++ )
   3828     {
   3829         T* D = _dst.ptr<T>(dy);
   3830         const short* XY = _xy.ptr<short>(dy);
   3831         const ushort* FXY = _fxy.ptr<ushort>(dy);
   3832         int X0 = 0;
   3833         bool prevInlier = false;
   3834 
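                 // Scan one position past the row end (dx == dsize.width) with an
                 // inverted classification so the last run is always flushed; on
                 // every flip, [dx, X1) below is the run that just ended.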
   3835         for( dx = 0; dx <= dsize.width; dx++ )
   3836         {
   3837             bool curInlier = dx < dsize.width ?
   3838                 (unsigned)XY[dx*2] < width1 &&
   3839                 (unsigned)XY[dx*2+1] < height1 : !prevInlier;
   3840             if( curInlier == prevInlier )
   3841                 continue;
   3842 
   3843             int X1 = dx;
   3844             dx = X0;
   3845             X0 = X1;
   3846             prevInlier = curInlier;
   3847 
   3848             if( !curInlier )
   3849             {
   3850                 int len = vecOp( _src, D, XY + dx*2, FXY + dx, wtab, X1 - dx );
   3851                 D += len*cn;
   3852                 dx += len;
   3853 
   3854                 if( cn == 1 )
   3855                 {
   3856                     for( ; dx < X1; dx++, D++ )
   3857                     {
   3858                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3859                         const AT* w = wtab + FXY[dx]*4;
   3860                         const T* S = S0 + sy*sstep + sx;
   3861                         *D = castOp(WT(S[0]*w[0] + S[1]*w[1] + S[sstep]*w[2] + S[sstep+1]*w[3]));
   3862                     }
   3863                 }
   3864                 else if( cn == 2 )
   3865                     for( ; dx < X1; dx++, D += 2 )
   3866                     {
   3867                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3868                         const AT* w = wtab + FXY[dx]*4;
   3869                         const T* S = S0 + sy*sstep + sx*2;
   3870                         WT t0 = S[0]*w[0] + S[2]*w[1] + S[sstep]*w[2] + S[sstep+2]*w[3];
   3871                         WT t1 = S[1]*w[0] + S[3]*w[1] + S[sstep+1]*w[2] + S[sstep+3]*w[3];
   3872                         D[0] = castOp(t0); D[1] = castOp(t1);
   3873                     }
   3874                 else if( cn == 3 )
   3875                     for( ; dx < X1; dx++, D += 3 )
   3876                     {
   3877                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3878                         const AT* w = wtab + FXY[dx]*4;
   3879                         const T* S = S0 + sy*sstep + sx*3;
   3880                         WT t0 = S[0]*w[0] + S[3]*w[1] + S[sstep]*w[2] + S[sstep+3]*w[3];
   3881                         WT t1 = S[1]*w[0] + S[4]*w[1] + S[sstep+1]*w[2] + S[sstep+4]*w[3];
   3882                         WT t2 = S[2]*w[0] + S[5]*w[1] + S[sstep+2]*w[2] + S[sstep+5]*w[3];
   3883                         D[0] = castOp(t0); D[1] = castOp(t1); D[2] = castOp(t2);
   3884                     }
   3885                 else if( cn == 4 )
   3886                     for( ; dx < X1; dx++, D += 4 )
   3887                     {
   3888                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3889                         const AT* w = wtab + FXY[dx]*4;
   3890                         const T* S = S0 + sy*sstep + sx*4;
   3891                         WT t0 = S[0]*w[0] + S[4]*w[1] + S[sstep]*w[2] + S[sstep+4]*w[3];
   3892                         WT t1 = S[1]*w[0] + S[5]*w[1] + S[sstep+1]*w[2] + S[sstep+5]*w[3];
   3893                         D[0] = castOp(t0); D[1] = castOp(t1);
   3894                         t0 = S[2]*w[0] + S[6]*w[1] + S[sstep+2]*w[2] + S[sstep+6]*w[3];
   3895                         t1 = S[3]*w[0] + S[7]*w[1] + S[sstep+3]*w[2] + S[sstep+7]*w[3];
   3896                         D[2] = castOp(t0); D[3] = castOp(t1);
   3897                     }
   3898                 else
   3899                     for( ; dx < X1; dx++, D += cn )
   3900                     {
   3901                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3902                         const AT* w = wtab + FXY[dx]*4;
   3903                         const T* S = S0 + sy*sstep + sx*cn;
   3904                         for( k = 0; k < cn; k++ )
   3905                         {
   3906                             WT t0 = S[k]*w[0] + S[k+cn]*w[1] + S[sstep+k]*w[2] + S[sstep+k+cn]*w[3];
   3907                             D[k] = castOp(t0);
   3908                         }
   3909                     }
   3910             }
   3911             else
   3912             {
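                         // With BORDER_TRANSPARENT, outlier pixels are simply left
                         // untouched. cn == 3 is excluded here because the SSE2 path
                         // narrows the inlier range (see width1 above), so some pixels
                         // that are actually inside the image land in this branch and
                         // still have to be interpolated; the per-pixel transparent
                         // check further below skips the truly out-of-range ones.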
   3913                 if( borderType == BORDER_TRANSPARENT && cn != 3 )
   3914                 {
   3915                     D += (X1 - dx)*cn;
   3916                     dx = X1;
   3917                     continue;
   3918                 }
   3919 
   3920                 if( cn == 1 )
   3921                     for( ; dx < X1; dx++, D++ )
   3922                     {
   3923                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3924                         if( borderType == BORDER_CONSTANT &&
   3925                             (sx >= ssize.width || sx+1 < 0 ||
   3926                              sy >= ssize.height || sy+1 < 0) )
   3927                         {
   3928                             D[0] = cval[0];
   3929                         }
   3930                         else
   3931                         {
   3932                             int sx0, sx1, sy0, sy1;
   3933                             T v0, v1, v2, v3;
   3934                             const AT* w = wtab + FXY[dx]*4;
   3935                             if( borderType == BORDER_REPLICATE )
   3936                             {
   3937                                 sx0 = clip(sx, 0, ssize.width);
   3938                                 sx1 = clip(sx+1, 0, ssize.width);
   3939                                 sy0 = clip(sy, 0, ssize.height);
   3940                                 sy1 = clip(sy+1, 0, ssize.height);
   3941                                 v0 = S0[sy0*sstep + sx0];
   3942                                 v1 = S0[sy0*sstep + sx1];
   3943                                 v2 = S0[sy1*sstep + sx0];
   3944                                 v3 = S0[sy1*sstep + sx1];
   3945                             }
   3946                             else
   3947                             {
   3948                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
   3949                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
   3950                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
   3951                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
   3952                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx0] : cval[0];
   3953                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0[sy0*sstep + sx1] : cval[0];
   3954                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx0] : cval[0];
   3955                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0[sy1*sstep + sx1] : cval[0];
   3956                             }
   3957                             D[0] = castOp(WT(v0*w[0] + v1*w[1] + v2*w[2] + v3*w[3]));
   3958                         }
   3959                     }
   3960                 else
   3961                     for( ; dx < X1; dx++, D += cn )
   3962                     {
   3963                         int sx = XY[dx*2], sy = XY[dx*2+1];
   3964                         if( borderType == BORDER_CONSTANT &&
   3965                             (sx >= ssize.width || sx+1 < 0 ||
   3966                              sy >= ssize.height || sy+1 < 0) )
   3967                         {
   3968                             for( k = 0; k < cn; k++ )
   3969                                 D[k] = cval[k];
   3970                         }
   3971                         else
   3972                         {
   3973                             int sx0, sx1, sy0, sy1;
   3974                             const T *v0, *v1, *v2, *v3;
   3975                             const AT* w = wtab + FXY[dx]*4;
   3976                             if( borderType == BORDER_REPLICATE )
   3977                             {
   3978                                 sx0 = clip(sx, 0, ssize.width);
   3979                                 sx1 = clip(sx+1, 0, ssize.width);
   3980                                 sy0 = clip(sy, 0, ssize.height);
   3981                                 sy1 = clip(sy+1, 0, ssize.height);
   3982                                 v0 = S0 + sy0*sstep + sx0*cn;
   3983                                 v1 = S0 + sy0*sstep + sx1*cn;
   3984                                 v2 = S0 + sy1*sstep + sx0*cn;
   3985                                 v3 = S0 + sy1*sstep + sx1*cn;
   3986                             }
   3987                             else if( borderType == BORDER_TRANSPARENT &&
   3988                                 ((unsigned)sx >= (unsigned)(ssize.width-1) ||
   3989                                 (unsigned)sy >= (unsigned)(ssize.height-1)))
   3990                                 continue;
   3991                             else
   3992                             {
   3993                                 sx0 = borderInterpolate(sx, ssize.width, borderType);
   3994                                 sx1 = borderInterpolate(sx+1, ssize.width, borderType);
   3995                                 sy0 = borderInterpolate(sy, ssize.height, borderType);
   3996                                 sy1 = borderInterpolate(sy+1, ssize.height, borderType);
   3997                                 v0 = sx0 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx0*cn : &cval[0];
   3998                                 v1 = sx1 >= 0 && sy0 >= 0 ? S0 + sy0*sstep + sx1*cn : &cval[0];
   3999                                 v2 = sx0 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx0*cn : &cval[0];
   4000                                 v3 = sx1 >= 0 && sy1 >= 0 ? S0 + sy1*sstep + sx1*cn : &cval[0];
   4001                             }
   4002                             for( k = 0; k < cn; k++ )
   4003                                 D[k] = castOp(WT(v0[k]*w[0] + v1[k]*w[1] + v2[k]*w[2] + v3[k]*w[3]));
   4004                         }
   4005                     }
   4006             }
   4007         }
   4008     }
   4009 }
   4010 
   4011 
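         /* remapBicubic: 4x4-tap bicubic remap; wtab holds 16 weights per
            fractional (fx, fy) position. For border pixels the accumulator is
            seeded with cval[k]*ONE and every available tap adds (S - cv)*w;
            because the 16 weights sum to ONE, each missing tap then implicitly
            contributes the border value without a separate pass.
         */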
   4012 template<class CastOp, typename AT, int ONE>
   4013 static void remapBicubic( const Mat& _src, Mat& _dst, const Mat& _xy,
   4014                           const Mat& _fxy, const void* _wtab,
   4015                           int borderType, const Scalar& _borderValue )
   4016 {
   4017     typedef typename CastOp::rtype T;
   4018     typedef typename CastOp::type1 WT;
   4019     Size ssize = _src.size(), dsize = _dst.size();
   4020     int cn = _src.channels();
   4021     const AT* wtab = (const AT*)_wtab;
   4022     const T* S0 = _src.ptr<T>();
   4023     size_t sstep = _src.step/sizeof(S0[0]);
   4024     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
   4025         saturate_cast<T>(_borderValue[1]),
   4026         saturate_cast<T>(_borderValue[2]),
   4027         saturate_cast<T>(_borderValue[3]));
   4028     int dx, dy;
   4029     CastOp castOp;
   4030     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
   4031 
   4032     unsigned width1 = std::max(ssize.width-3, 0), height1 = std::max(ssize.height-3, 0);
   4033 
   4034     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
   4035     {
   4036         dsize.width *= dsize.height;
   4037         dsize.height = 1;
   4038     }
   4039 
   4040     for( dy = 0; dy < dsize.height; dy++ )
   4041     {
   4042         T* D = _dst.ptr<T>(dy);
   4043         const short* XY = _xy.ptr<short>(dy);
   4044         const ushort* FXY = _fxy.ptr<ushort>(dy);
   4045 
   4046         for( dx = 0; dx < dsize.width; dx++, D += cn )
   4047         {
   4048             int sx = XY[dx*2]-1, sy = XY[dx*2+1]-1;
   4049             const AT* w = wtab + FXY[dx]*16;
   4050             int i, k;
   4051             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
   4052             {
   4053                 const T* S = S0 + sy*sstep + sx*cn;
   4054                 for( k = 0; k < cn; k++ )
   4055                 {
   4056                     WT sum = S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3];
   4057                     S += sstep;
   4058                     sum += S[0]*w[4] + S[cn]*w[5] + S[cn*2]*w[6] + S[cn*3]*w[7];
   4059                     S += sstep;
   4060                     sum += S[0]*w[8] + S[cn]*w[9] + S[cn*2]*w[10] + S[cn*3]*w[11];
   4061                     S += sstep;
   4062                     sum += S[0]*w[12] + S[cn]*w[13] + S[cn*2]*w[14] + S[cn*3]*w[15];
   4063                     S += 1 - sstep*3;
   4064                     D[k] = castOp(sum);
   4065                 }
   4066             }
   4067             else
   4068             {
   4069                 int x[4], y[4];
   4070                 if( borderType == BORDER_TRANSPARENT &&
   4071                     ((unsigned)(sx+1) >= (unsigned)ssize.width ||
   4072                     (unsigned)(sy+1) >= (unsigned)ssize.height) )
   4073                     continue;
   4074 
   4075                 if( borderType1 == BORDER_CONSTANT &&
   4076                     (sx >= ssize.width || sx+4 <= 0 ||
   4077                     sy >= ssize.height || sy+4 <= 0))
   4078                 {
   4079                     for( k = 0; k < cn; k++ )
   4080                         D[k] = cval[k];
   4081                     continue;
   4082                 }
   4083 
   4084                 for( i = 0; i < 4; i++ )
   4085                 {
   4086                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
   4087                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
   4088                 }
   4089 
   4090                 for( k = 0; k < cn; k++, S0++, w -= 16 )
   4091                 {
   4092                     WT cv = cval[k], sum = cv*ONE;
   4093                     for( i = 0; i < 4; i++, w += 4 )
   4094                     {
   4095                         int yi = y[i];
   4096                         const T* S = S0 + yi*sstep;
   4097                         if( yi < 0 )
   4098                             continue;
   4099                         if( x[0] >= 0 )
   4100                             sum += (S[x[0]] - cv)*w[0];
   4101                         if( x[1] >= 0 )
   4102                             sum += (S[x[1]] - cv)*w[1];
   4103                         if( x[2] >= 0 )
   4104                             sum += (S[x[2]] - cv)*w[2];
   4105                         if( x[3] >= 0 )
   4106                             sum += (S[x[3]] - cv)*w[3];
   4107                     }
   4108                     D[k] = castOp(sum);
   4109                 }
   4110                 S0 -= cn;
   4111             }
   4112         }
   4113     }
   4114 }
   4115 
   4116 
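         /* remapLanczos4: same structure as remapBicubic, but with an 8x8 tap
            window (64 weights per fractional position, wtab + FXY[dx]*64) and
            the same (S - cv)*w seeding trick for border pixels.
         */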
   4117 template<class CastOp, typename AT, int ONE>
   4118 static void remapLanczos4( const Mat& _src, Mat& _dst, const Mat& _xy,
   4119                            const Mat& _fxy, const void* _wtab,
   4120                            int borderType, const Scalar& _borderValue )
   4121 {
   4122     typedef typename CastOp::rtype T;
   4123     typedef typename CastOp::type1 WT;
   4124     Size ssize = _src.size(), dsize = _dst.size();
   4125     int cn = _src.channels();
   4126     const AT* wtab = (const AT*)_wtab;
   4127     const T* S0 = _src.ptr<T>();
   4128     size_t sstep = _src.step/sizeof(S0[0]);
   4129     Scalar_<T> cval(saturate_cast<T>(_borderValue[0]),
   4130         saturate_cast<T>(_borderValue[1]),
   4131         saturate_cast<T>(_borderValue[2]),
   4132         saturate_cast<T>(_borderValue[3]));
   4133     int dx, dy;
   4134     CastOp castOp;
   4135     int borderType1 = borderType != BORDER_TRANSPARENT ? borderType : BORDER_REFLECT_101;
   4136 
   4137     unsigned width1 = std::max(ssize.width-7, 0), height1 = std::max(ssize.height-7, 0);
   4138 
   4139     if( _dst.isContinuous() && _xy.isContinuous() && _fxy.isContinuous() )
   4140     {
   4141         dsize.width *= dsize.height;
   4142         dsize.height = 1;
   4143     }
   4144 
   4145     for( dy = 0; dy < dsize.height; dy++ )
   4146     {
   4147         T* D = _dst.ptr<T>(dy);
   4148         const short* XY = _xy.ptr<short>(dy);
   4149         const ushort* FXY = _fxy.ptr<ushort>(dy);
   4150 
   4151         for( dx = 0; dx < dsize.width; dx++, D += cn )
   4152         {
   4153             int sx = XY[dx*2]-3, sy = XY[dx*2+1]-3;
   4154             const AT* w = wtab + FXY[dx]*64;
   4155             const T* S = S0 + sy*sstep + sx*cn;
   4156             int i, k;
   4157             if( (unsigned)sx < width1 && (unsigned)sy < height1 )
   4158             {
   4159                 for( k = 0; k < cn; k++ )
   4160                 {
   4161                     WT sum = 0;
   4162                     for( int r = 0; r < 8; r++, S += sstep, w += 8 )
   4163                         sum += S[0]*w[0] + S[cn]*w[1] + S[cn*2]*w[2] + S[cn*3]*w[3] +
   4164                             S[cn*4]*w[4] + S[cn*5]*w[5] + S[cn*6]*w[6] + S[cn*7]*w[7];
   4165                     w -= 64;
   4166                     S -= sstep*8 - 1;
   4167                     D[k] = castOp(sum);
   4168                 }
   4169             }
   4170             else
   4171             {
   4172                 int x[8], y[8];
   4173                 if( borderType == BORDER_TRANSPARENT &&
   4174                     ((unsigned)(sx+3) >= (unsigned)ssize.width ||
   4175                     (unsigned)(sy+3) >= (unsigned)ssize.height) )
   4176                     continue;
   4177 
   4178                 if( borderType1 == BORDER_CONSTANT &&
   4179                     (sx >= ssize.width || sx+8 <= 0 ||
   4180                     sy >= ssize.height || sy+8 <= 0))
   4181                 {
   4182                     for( k = 0; k < cn; k++ )
   4183                         D[k] = cval[k];
   4184                     continue;
   4185                 }
   4186 
   4187                 for( i = 0; i < 8; i++ )
   4188                 {
   4189                     x[i] = borderInterpolate(sx + i, ssize.width, borderType1)*cn;
   4190                     y[i] = borderInterpolate(sy + i, ssize.height, borderType1);
   4191                 }
   4192 
   4193                 for( k = 0; k < cn; k++, S0++, w -= 64 )
   4194                 {
   4195                     WT cv = cval[k], sum = cv*ONE;
   4196                     for( i = 0; i < 8; i++, w += 8 )
   4197                     {
   4198                         int yi = y[i];
   4199                         const T* S1 = S0 + yi*sstep;
   4200                         if( yi < 0 )
   4201                             continue;
   4202                         if( x[0] >= 0 )
   4203                             sum += (S1[x[0]] - cv)*w[0];
   4204                         if( x[1] >= 0 )
   4205                             sum += (S1[x[1]] - cv)*w[1];
   4206                         if( x[2] >= 0 )
   4207                             sum += (S1[x[2]] - cv)*w[2];
   4208                         if( x[3] >= 0 )
   4209                             sum += (S1[x[3]] - cv)*w[3];
   4210                         if( x[4] >= 0 )
   4211                             sum += (S1[x[4]] - cv)*w[4];
   4212                         if( x[5] >= 0 )
   4213                             sum += (S1[x[5]] - cv)*w[5];
   4214                         if( x[6] >= 0 )
   4215                             sum += (S1[x[6]] - cv)*w[6];
   4216                         if( x[7] >= 0 )
   4217                             sum += (S1[x[7]] - cv)*w[7];
   4218                     }
   4219                     D[k] = castOp(sum);
   4220                 }
   4221                 S0 -= cn;
   4222             }
   4223         }
   4224     }
   4225 }
   4226 
   4227 
   4228 typedef void (*RemapNNFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
   4229                             int borderType, const Scalar& _borderValue );
   4230 
   4231 typedef void (*RemapFunc)(const Mat& _src, Mat& _dst, const Mat& _xy,
   4232                           const Mat& _fxy, const void* _wtab,
   4233                           int borderType, const Scalar& _borderValue);
   4234 
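         /* RemapInvoker: parallel driver for remap. The destination is processed
            in tiles; for each tile the user-supplied maps are normalized into
            bufxy (CV_16SC2 integer source coordinates) and, for interpolating
            modes, bufa (CV_16UC1 indices into the coefficient table), after
            which nnfunc/ifunc performs the actual sampling.
         */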
   4235 class RemapInvoker :
   4236     public ParallelLoopBody
   4237 {
   4238 public:
   4239     RemapInvoker(const Mat& _src, Mat& _dst, const Mat *_m1,
   4240                  const Mat *_m2, int _borderType, const Scalar &_borderValue,
   4241                  int _planar_input, RemapNNFunc _nnfunc, RemapFunc _ifunc, const void *_ctab) :
   4242         ParallelLoopBody(), src(&_src), dst(&_dst), m1(_m1), m2(_m2),
   4243         borderType(_borderType), borderValue(_borderValue),
   4244         planar_input(_planar_input), nnfunc(_nnfunc), ifunc(_ifunc), ctab(_ctab)
   4245     {
   4246     }
   4247 
   4248     virtual void operator() (const Range& range) const
   4249     {
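                 // Pick a tile of at most buf_size (1 << 14) entries, at most 128
                 // rows and no wider than the destination, so the per-tile
                 // coordinate buffers below stay small and cache-friendly.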
   4250         int x, y, x1, y1;
   4251         const int buf_size = 1 << 14;
   4252         int brows0 = std::min(128, dst->rows), map_depth = m1->depth();
   4253         int bcols0 = std::min(buf_size/brows0, dst->cols);
   4254         brows0 = std::min(buf_size/bcols0, dst->rows);
   4255     #if CV_SSE2
   4256         bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);
   4257     #endif
   4258 
   4259         Mat _bufxy(brows0, bcols0, CV_16SC2), _bufa;
   4260         if( !nnfunc )
   4261             _bufa.create(brows0, bcols0, CV_16UC1);
   4262 
   4263         for( y = range.start; y < range.end; y += brows0 )
   4264         {
   4265             for( x = 0; x < dst->cols; x += bcols0 )
   4266             {
   4267                 int brows = std::min(brows0, range.end - y);
   4268                 int bcols = std::min(bcols0, dst->cols - x);
   4269                 Mat dpart(*dst, Rect(x, y, bcols, brows));
   4270                 Mat bufxy(_bufxy, Rect(0, 0, bcols, brows));
   4271 
   4272                 if( nnfunc )
   4273                 {
   4274                     if( m1->type() == CV_16SC2 && m2->empty() ) // the data is already in the right format
   4275                         bufxy = (*m1)(Rect(x, y, bcols, brows));
   4276                     else if( map_depth != CV_32F )
   4277                     {
   4278                         for( y1 = 0; y1 < brows; y1++ )
   4279                         {
   4280                             short* XY = bufxy.ptr<short>(y1);
   4281                             const short* sXY = m1->ptr<short>(y+y1) + x*2;
   4282                             const ushort* sA = m2->ptr<ushort>(y+y1) + x;
   4283 
   4284                             for( x1 = 0; x1 < bcols; x1++ )
   4285                             {
   4286                                 int a = sA[x1] & (INTER_TAB_SIZE2-1);
   4287                                 XY[x1*2] = sXY[x1*2] + NNDeltaTab_i[a][0];
   4288                                 XY[x1*2+1] = sXY[x1*2+1] + NNDeltaTab_i[a][1];
   4289                             }
   4290                         }
   4291                     }
   4292                     else if( !planar_input )
   4293                         (*m1)(Rect(x, y, bcols, brows)).convertTo(bufxy, bufxy.depth());
   4294                     else
   4295                     {
   4296                         for( y1 = 0; y1 < brows; y1++ )
   4297                         {
   4298                             short* XY = bufxy.ptr<short>(y1);
   4299                             const float* sX = m1->ptr<float>(y+y1) + x;
   4300                             const float* sY = m2->ptr<float>(y+y1) + x;
   4301                             x1 = 0;
   4302 
   4303                         #if CV_SSE2
   4304                             if( useSIMD )
   4305                             {
   4306                                 for( ; x1 <= bcols - 8; x1 += 8 )
   4307                                 {
   4308                                     __m128 fx0 = _mm_loadu_ps(sX + x1);
   4309                                     __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
   4310                                     __m128 fy0 = _mm_loadu_ps(sY + x1);
   4311                                     __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
   4312                                     __m128i ix0 = _mm_cvtps_epi32(fx0);
   4313                                     __m128i ix1 = _mm_cvtps_epi32(fx1);
   4314                                     __m128i iy0 = _mm_cvtps_epi32(fy0);
   4315                                     __m128i iy1 = _mm_cvtps_epi32(fy1);
   4316                                     ix0 = _mm_packs_epi32(ix0, ix1);
   4317                                     iy0 = _mm_packs_epi32(iy0, iy1);
   4318                                     ix1 = _mm_unpacklo_epi16(ix0, iy0);
   4319                                     iy1 = _mm_unpackhi_epi16(ix0, iy0);
   4320                                     _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
   4321                                     _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
   4322                                 }
   4323                             }
   4324                         #endif
   4325 
   4326                             for( ; x1 < bcols; x1++ )
   4327                             {
   4328                                 XY[x1*2] = saturate_cast<short>(sX[x1]);
   4329                                 XY[x1*2+1] = saturate_cast<short>(sY[x1]);
   4330                             }
   4331                         }
   4332                     }
   4333                     nnfunc( *src, dpart, bufxy, borderType, borderValue );
   4334                     continue;
   4335                 }
   4336 
   4337                 Mat bufa(_bufa, Rect(0, 0, bcols, brows));
   4338                 for( y1 = 0; y1 < brows; y1++ )
   4339                 {
   4340                     short* XY = bufxy.ptr<short>(y1);
   4341                     ushort* A = bufa.ptr<ushort>(y1);
   4342 
   4343                     if( m1->type() == CV_16SC2 && (m2->type() == CV_16UC1 || m2->type() == CV_16SC1) )
   4344                     {
   4345                         bufxy = (*m1)(Rect(x, y, bcols, brows));
   4346 
   4347                         const ushort* sA = m2->ptr<ushort>(y+y1) + x;
   4348                         x1 = 0;
   4349 
   4350                     #if CV_NEON
   4351                         uint16x8_t v_scale = vdupq_n_u16(INTER_TAB_SIZE2-1);
   4352                         for ( ; x1 <= bcols - 8; x1 += 8)
   4353                             vst1q_u16(A + x1, vandq_u16(vld1q_u16(sA + x1), v_scale));
   4354                     #elif CV_SSE2
   4355                         __m128i v_scale = _mm_set1_epi16(INTER_TAB_SIZE2-1);
   4356                         for ( ; x1 <= bcols - 8; x1 += 8)
   4357                             _mm_storeu_si128((__m128i *)(A + x1), _mm_and_si128(_mm_loadu_si128((const __m128i *)(sA + x1)), v_scale));
   4358                     #endif
   4359 
   4360                         for( ; x1 < bcols; x1++ )
   4361                             A[x1] = (ushort)(sA[x1] & (INTER_TAB_SIZE2-1));
   4362                     }
   4363                     else if( planar_input )
   4364                     {
   4365                         const float* sX = m1->ptr<float>(y+y1) + x;
   4366                         const float* sY = m2->ptr<float>(y+y1) + x;
   4367 
   4368                         x1 = 0;
   4369                     #if CV_SSE2
   4370                         if( useSIMD )
   4371                         {
   4372                             __m128 scale = _mm_set1_ps((float)INTER_TAB_SIZE);
   4373                             __m128i mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
   4374                             for( ; x1 <= bcols - 8; x1 += 8 )
   4375                             {
   4376                                 __m128 fx0 = _mm_loadu_ps(sX + x1);
   4377                                 __m128 fx1 = _mm_loadu_ps(sX + x1 + 4);
   4378                                 __m128 fy0 = _mm_loadu_ps(sY + x1);
   4379                                 __m128 fy1 = _mm_loadu_ps(sY + x1 + 4);
   4380                                 __m128i ix0 = _mm_cvtps_epi32(_mm_mul_ps(fx0, scale));
   4381                                 __m128i ix1 = _mm_cvtps_epi32(_mm_mul_ps(fx1, scale));
   4382                                 __m128i iy0 = _mm_cvtps_epi32(_mm_mul_ps(fy0, scale));
   4383                                 __m128i iy1 = _mm_cvtps_epi32(_mm_mul_ps(fy1, scale));
   4384                                 __m128i mx0 = _mm_and_si128(ix0, mask);
   4385                                 __m128i mx1 = _mm_and_si128(ix1, mask);
   4386                                 __m128i my0 = _mm_and_si128(iy0, mask);
   4387                                 __m128i my1 = _mm_and_si128(iy1, mask);
   4388                                 mx0 = _mm_packs_epi32(mx0, mx1);
   4389                                 my0 = _mm_packs_epi32(my0, my1);
   4390                                 my0 = _mm_slli_epi16(my0, INTER_BITS);
   4391                                 mx0 = _mm_or_si128(mx0, my0);
   4392                                 _mm_storeu_si128((__m128i*)(A + x1), mx0);
   4393                                 ix0 = _mm_srai_epi32(ix0, INTER_BITS);
   4394                                 ix1 = _mm_srai_epi32(ix1, INTER_BITS);
   4395                                 iy0 = _mm_srai_epi32(iy0, INTER_BITS);
   4396                                 iy1 = _mm_srai_epi32(iy1, INTER_BITS);
   4397                                 ix0 = _mm_packs_epi32(ix0, ix1);
   4398                                 iy0 = _mm_packs_epi32(iy0, iy1);
   4399                                 ix1 = _mm_unpacklo_epi16(ix0, iy0);
   4400                                 iy1 = _mm_unpackhi_epi16(ix0, iy0);
   4401                                 _mm_storeu_si128((__m128i*)(XY + x1*2), ix1);
   4402                                 _mm_storeu_si128((__m128i*)(XY + x1*2 + 8), iy1);
   4403                             }
   4404                         }
   4405                     #elif CV_NEON
   4406                         float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
   4407                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE - 1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
   4408 
   4409                         for( ; x1 <= bcols - 4; x1 += 4 )
   4410                         {
   4411                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sX + x1), v_scale)),
   4412                                       v_sy = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(sY + x1), v_scale));
   4413                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
   4414                                                       vandq_s32(v_sy, v_scale2));
   4415                             vst1_u16(A + x1, vqmovun_s32(v_v));
   4416 
   4417                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
   4418                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
   4419                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
   4420                         }
   4421                     #endif
   4422 
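                                 // Scalar tail: round to 1/INTER_TAB_SIZE units, keep
                                 // the integer parts in XY and pack the two fractional
                                 // parts (INTER_BITS each, INTER_TAB_SIZE == 1 << INTER_BITS)
                                 // into a single table index in A.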
   4423                         for( ; x1 < bcols; x1++ )
   4424                         {
   4425                             int sx = cvRound(sX[x1]*INTER_TAB_SIZE);
   4426                             int sy = cvRound(sY[x1]*INTER_TAB_SIZE);
   4427                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
   4428                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
   4429                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
   4430                             A[x1] = (ushort)v;
   4431                         }
   4432                     }
   4433                     else
   4434                     {
   4435                         const float* sXY = m1->ptr<float>(y+y1) + x*2;
   4436                         x1 = 0;
   4437 
   4438                     #if CV_NEON
   4439                         float32x4_t v_scale = vdupq_n_f32(INTER_TAB_SIZE);
   4440                         int32x4_t v_scale2 = vdupq_n_s32(INTER_TAB_SIZE-1), v_scale3 = vdupq_n_s32(INTER_TAB_SIZE);
   4441 
   4442                         for( ; x1 <= bcols - 4; x1 += 4 )
   4443                         {
   4444                             float32x4x2_t v_src = vld2q_f32(sXY + (x1 << 1));
   4445                             int32x4_t v_sx = cv_vrndq_s32_f32(vmulq_f32(v_src.val[0], v_scale));
   4446                             int32x4_t v_sy = cv_vrndq_s32_f32(vmulq_f32(v_src.val[1], v_scale));
   4447                             int32x4_t v_v = vmlaq_s32(vandq_s32(v_sx, v_scale2), v_scale3,
   4448                                                       vandq_s32(v_sy, v_scale2));
   4449                             vst1_u16(A + x1, vqmovun_s32(v_v));
   4450 
   4451                             int16x4x2_t v_dst = vzip_s16(vqmovn_s32(vshrq_n_s32(v_sx, INTER_BITS)),
   4452                                                          vqmovn_s32(vshrq_n_s32(v_sy, INTER_BITS)));
   4453                             vst1q_s16(XY + (x1 << 1), vcombine_s16(v_dst.val[0], v_dst.val[1]));
   4454                         }
   4455                     #endif
   4456 
    4457                         for( ; x1 < bcols; x1++ )
   4458                         {
   4459                             int sx = cvRound(sXY[x1*2]*INTER_TAB_SIZE);
   4460                             int sy = cvRound(sXY[x1*2+1]*INTER_TAB_SIZE);
   4461                             int v = (sy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (sx & (INTER_TAB_SIZE-1));
   4462                             XY[x1*2] = saturate_cast<short>(sx >> INTER_BITS);
   4463                             XY[x1*2+1] = saturate_cast<short>(sy >> INTER_BITS);
   4464                             A[x1] = (ushort)v;
   4465                         }
   4466                     }
   4467                 }
   4468                 ifunc(*src, dpart, bufxy, bufa, ctab, borderType, borderValue);
   4469             }
   4470         }
   4471     }
   4472 
   4473 private:
   4474     const Mat* src;
   4475     Mat* dst;
   4476     const Mat *m1, *m2;
   4477     int borderType;
   4478     Scalar borderValue;
   4479     int planar_input;
   4480     RemapNNFunc nnfunc;
   4481     RemapFunc ifunc;
   4482     const void *ctab;
   4483 };
   4484 
   4485 #ifdef HAVE_OPENCL
   4486 
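         /* ocl_remap: OpenCL implementation of remap. The kernel variant is
            chosen from the map layout (packed CV_16SC2 [+ CV_16UC1], a single
            CV_32FC2 map, or two CV_32FC1 maps), and the interpolation/border
            modes are baked in as build options. Returns false whenever the
            configuration is unsupported so the caller falls back to the CPU
            path.
         */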
   4487 static bool ocl_remap(InputArray _src, OutputArray _dst, InputArray _map1, InputArray _map2,
   4488                       int interpolation, int borderType, const Scalar& borderValue)
   4489 {
   4490     const ocl::Device & dev = ocl::Device::getDefault();
   4491     int cn = _src.channels(), type = _src.type(), depth = _src.depth(),
   4492             rowsPerWI = dev.isIntel() ? 4 : 1;
   4493 
   4494     if (borderType == BORDER_TRANSPARENT || !(interpolation == INTER_LINEAR || interpolation == INTER_NEAREST)
   4495             || _map1.type() == CV_16SC1 || _map2.type() == CV_16SC1)
   4496         return false;
   4497 
   4498     UMat src = _src.getUMat(), map1 = _map1.getUMat(), map2 = _map2.getUMat();
   4499 
   4500     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.empty())) ||
   4501         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.empty())) )
   4502     {
   4503         if (map1.type() != CV_16SC2)
   4504             std::swap(map1, map2);
   4505     }
   4506     else
   4507         CV_Assert( map1.type() == CV_32FC2 || (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
   4508 
   4509     _dst.create(map1.size(), type);
   4510     UMat dst = _dst.getUMat();
   4511 
   4512     String kernelName = "remap";
   4513     if (map1.type() == CV_32FC2 && map2.empty())
   4514         kernelName += "_32FC2";
   4515     else if (map1.type() == CV_16SC2)
   4516     {
   4517         kernelName += "_16SC2";
   4518         if (!map2.empty())
   4519             kernelName += "_16UC1";
   4520     }
   4521     else if (map1.type() == CV_32FC1 && map2.type() == CV_32FC1)
   4522         kernelName += "_2_32FC1";
   4523     else
   4524         CV_Error(Error::StsBadArg, "Unsupported map types");
   4525 
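             // Only INTER_NEAREST and INTER_LINEAR can reach this point (see the
             // guard above); the INTER_AREA slot repeats "INTER_LINEAR" so the
             // array can be indexed directly by the interpolation constant.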
   4526     static const char * const interMap[] = { "INTER_NEAREST", "INTER_LINEAR", "INTER_CUBIC", "INTER_LINEAR", "INTER_LANCZOS" };
   4527     static const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
   4528                            "BORDER_REFLECT_101", "BORDER_TRANSPARENT" };
   4529     String buildOptions = format("-D %s -D %s -D T=%s -D rowsPerWI=%d",
   4530                                  interMap[interpolation], borderMap[borderType],
   4531                                  ocl::typeToStr(type), rowsPerWI);
   4532 
   4533     if (interpolation != INTER_NEAREST)
   4534     {
   4535         char cvt[3][40];
   4536         int wdepth = std::max(CV_32F, depth);
   4537         buildOptions = buildOptions
   4538                       + format(" -D WT=%s -D convertToT=%s -D convertToWT=%s"
   4539                                " -D convertToWT2=%s -D WT2=%s",
   4540                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)),
   4541                                ocl::convertTypeStr(wdepth, depth, cn, cvt[0]),
   4542                                ocl::convertTypeStr(depth, wdepth, cn, cvt[1]),
   4543                                ocl::convertTypeStr(CV_32S, wdepth, 2, cvt[2]),
   4544                                ocl::typeToStr(CV_MAKE_TYPE(wdepth, 2)));
   4545     }
   4546     int scalarcn = cn == 3 ? 4 : cn;
   4547     int sctype = CV_MAKETYPE(depth, scalarcn);
   4548     buildOptions += format(" -D T=%s -D T1=%s -D cn=%d -D ST=%s -D depth=%d",
   4549                            ocl::typeToStr(type), ocl::typeToStr(depth),
   4550                            cn, ocl::typeToStr(sctype), depth);
   4551 
   4552     ocl::Kernel k(kernelName.c_str(), ocl::imgproc::remap_oclsrc, buildOptions);
   4553 
   4554     Mat scalar(1, 1, sctype, borderValue);
   4555     ocl::KernelArg srcarg = ocl::KernelArg::ReadOnly(src), dstarg = ocl::KernelArg::WriteOnly(dst),
   4556             map1arg = ocl::KernelArg::ReadOnlyNoSize(map1),
   4557             scalararg = ocl::KernelArg::Constant((void*)scalar.ptr(), scalar.elemSize());
   4558 
   4559     if (map2.empty())
   4560         k.args(srcarg, dstarg, map1arg, scalararg);
   4561     else
   4562         k.args(srcarg, dstarg, map1arg, ocl::KernelArg::ReadOnlyNoSize(map2), scalararg);
   4563 
    4564     size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
   4565     return k.run(2, globalThreads, NULL, false);
   4566 }
   4567 
   4568 #endif
   4569 
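         // NB: the trailing "&& 0" keeps this IPP-based remap path compiled out.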
   4570 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
   4571 
   4572 typedef IppStatus (CV_STDCALL * ippiRemap)(const void * pSrc, IppiSize srcSize, int srcStep, IppiRect srcRoi,
   4573                                            const Ipp32f* pxMap, int xMapStep, const Ipp32f* pyMap, int yMapStep,
   4574                                            void * pDst, int dstStep, IppiSize dstRoiSize, int interpolation);
   4575 
   4576 class IPPRemapInvoker :
   4577         public ParallelLoopBody
   4578 {
   4579 public:
   4580     IPPRemapInvoker(Mat & _src, Mat & _dst, Mat & _xmap, Mat & _ymap, ippiRemap _ippFunc,
   4581                     int _ippInterpolation, int _borderType, const Scalar & _borderValue, bool * _ok) :
   4582         ParallelLoopBody(), src(_src), dst(_dst), map1(_xmap), map2(_ymap), ippFunc(_ippFunc),
   4583         ippInterpolation(_ippInterpolation), borderType(_borderType), borderValue(_borderValue), ok(_ok)
   4584     {
   4585         *ok = true;
   4586     }
   4587 
   4588     virtual void operator() (const Range & range) const
   4589     {
   4590         IppiRect srcRoiRect = { 0, 0, src.cols, src.rows };
   4591         Mat dstRoi = dst.rowRange(range);
   4592         IppiSize dstRoiSize = ippiSize(dstRoi.size());
   4593         int type = dst.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   4594 
   4595         if (borderType == BORDER_CONSTANT &&
   4596                 !IPPSet(borderValue, dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, cn, depth))
   4597         {
   4598             *ok = false;
   4599             return;
   4600         }
   4601 
   4602         if (ippFunc(src.ptr(), ippiSize(src.size()), (int)src.step, srcRoiRect,
   4603                     map1.ptr<Ipp32f>(), (int)map1.step, map2.ptr<Ipp32f>(), (int)map2.step,
   4604                     dstRoi.ptr(), (int)dstRoi.step, dstRoiSize, ippInterpolation) < 0)
   4605             *ok = false;
   4606         else
   4607         {
   4608             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   4609         }
   4610     }
   4611 
   4612 private:
   4613     Mat & src, & dst, & map1, & map2;
   4614     ippiRemap ippFunc;
   4615     int ippInterpolation, borderType;
   4616     Scalar borderValue;
   4617     bool * ok;
   4618 };
   4619 
   4620 #endif
   4621 
   4622 }
   4623 
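         /* cv::remap: top-level dispatcher. Picks a per-depth implementation
            from the tables below (INTER_AREA falls back to INTER_LINEAR),
            normalizes fixed-point map pairs so that m1 is the CV_16SC2 map,
            and splits the work across rows with parallel_for_.
         */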
   4624 void cv::remap( InputArray _src, OutputArray _dst,
   4625                 InputArray _map1, InputArray _map2,
   4626                 int interpolation, int borderType, const Scalar& borderValue )
   4627 {
   4628     static RemapNNFunc nn_tab[] =
   4629     {
   4630         remapNearest<uchar>, remapNearest<schar>, remapNearest<ushort>, remapNearest<short>,
   4631         remapNearest<int>, remapNearest<float>, remapNearest<double>, 0
   4632     };
   4633 
   4634     static RemapFunc linear_tab[] =
   4635     {
   4636         remapBilinear<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, RemapVec_8u, short>, 0,
   4637         remapBilinear<Cast<float, ushort>, RemapNoVec, float>,
   4638         remapBilinear<Cast<float, short>, RemapNoVec, float>, 0,
   4639         remapBilinear<Cast<float, float>, RemapNoVec, float>,
   4640         remapBilinear<Cast<double, double>, RemapNoVec, float>, 0
   4641     };
   4642 
   4643     static RemapFunc cubic_tab[] =
   4644     {
   4645         remapBicubic<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
   4646         remapBicubic<Cast<float, ushort>, float, 1>,
   4647         remapBicubic<Cast<float, short>, float, 1>, 0,
   4648         remapBicubic<Cast<float, float>, float, 1>,
   4649         remapBicubic<Cast<double, double>, float, 1>, 0
   4650     };
   4651 
   4652     static RemapFunc lanczos4_tab[] =
   4653     {
   4654         remapLanczos4<FixedPtCast<int, uchar, INTER_REMAP_COEF_BITS>, short, INTER_REMAP_COEF_SCALE>, 0,
   4655         remapLanczos4<Cast<float, ushort>, float, 1>,
   4656         remapLanczos4<Cast<float, short>, float, 1>, 0,
   4657         remapLanczos4<Cast<float, float>, float, 1>,
   4658         remapLanczos4<Cast<double, double>, float, 1>, 0
   4659     };
   4660 
   4661     CV_Assert( _map1.size().area() > 0 );
   4662     CV_Assert( _map2.empty() || (_map2.size() == _map1.size()));
   4663 
   4664     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
   4665                ocl_remap(_src, _dst, _map1, _map2, interpolation, borderType, borderValue))
   4666 
   4667     Mat src = _src.getMat(), map1 = _map1.getMat(), map2 = _map2.getMat();
   4668     _dst.create( map1.size(), src.type() );
   4669     Mat dst = _dst.getMat();
   4670     if( dst.data == src.data )
   4671         src = src.clone();
   4672 
   4673     if( interpolation == INTER_AREA )
   4674         interpolation = INTER_LINEAR;
   4675 
   4676     int type = src.type(), depth = CV_MAT_DEPTH(type);
   4677 
   4678 #if IPP_VERSION_X100 >= 0 && !defined HAVE_IPP_ICV_ONLY && 0
   4679     CV_IPP_CHECK()
   4680     {
   4681         if ((interpolation == INTER_LINEAR || interpolation == INTER_CUBIC || interpolation == INTER_NEAREST) &&
   4682                 map1.type() == CV_32FC1 && map2.type() == CV_32FC1 &&
   4683                 (borderType == BORDER_CONSTANT || borderType == BORDER_TRANSPARENT))
   4684         {
   4685             int ippInterpolation =
   4686                 interpolation == INTER_NEAREST ? IPPI_INTER_NN :
   4687                 interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR : IPPI_INTER_CUBIC;
   4688 
   4689             ippiRemap ippFunc =
   4690                 type == CV_8UC1 ? (ippiRemap)ippiRemap_8u_C1R :
   4691                 type == CV_8UC3 ? (ippiRemap)ippiRemap_8u_C3R :
   4692                 type == CV_8UC4 ? (ippiRemap)ippiRemap_8u_C4R :
   4693                 type == CV_16UC1 ? (ippiRemap)ippiRemap_16u_C1R :
   4694                 type == CV_16UC3 ? (ippiRemap)ippiRemap_16u_C3R :
   4695                 type == CV_16UC4 ? (ippiRemap)ippiRemap_16u_C4R :
   4696                 type == CV_32FC1 ? (ippiRemap)ippiRemap_32f_C1R :
   4697                 type == CV_32FC3 ? (ippiRemap)ippiRemap_32f_C3R :
   4698                 type == CV_32FC4 ? (ippiRemap)ippiRemap_32f_C4R : 0;
   4699 
   4700             if (ippFunc)
   4701             {
   4702                 bool ok;
   4703                 IPPRemapInvoker invoker(src, dst, map1, map2, ippFunc, ippInterpolation,
   4704                                         borderType, borderValue, &ok);
   4705                 Range range(0, dst.rows);
   4706                 parallel_for_(range, invoker, dst.total() / (double)(1 << 16));
   4707 
   4708                 if (ok)
   4709                 {
   4710                     CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   4711                     return;
   4712                 }
   4713                 setIppErrorStatus();
   4714             }
   4715         }
   4716     }
   4717 #endif
   4718 
   4719     RemapNNFunc nnfunc = 0;
   4720     RemapFunc ifunc = 0;
   4721     const void* ctab = 0;
   4722     bool fixpt = depth == CV_8U;
   4723     bool planar_input = false;
   4724 
   4725     if( interpolation == INTER_NEAREST )
   4726     {
   4727         nnfunc = nn_tab[depth];
   4728         CV_Assert( nnfunc != 0 );
   4729     }
   4730     else
   4731     {
   4732         if( interpolation == INTER_LINEAR )
   4733             ifunc = linear_tab[depth];
   4734         else if( interpolation == INTER_CUBIC )
   4735             ifunc = cubic_tab[depth];
   4736         else if( interpolation == INTER_LANCZOS4 )
   4737             ifunc = lanczos4_tab[depth];
   4738         else
    4739             CV_Error( Error::StsBadArg, "Unknown interpolation method" );
   4740         CV_Assert( ifunc != 0 );
   4741         ctab = initInterTab2D( interpolation, fixpt );
   4742     }
   4743 
   4744     const Mat *m1 = &map1, *m2 = &map2;
   4745 
   4746     if( (map1.type() == CV_16SC2 && (map2.type() == CV_16UC1 || map2.type() == CV_16SC1 || map2.empty())) ||
   4747         (map2.type() == CV_16SC2 && (map1.type() == CV_16UC1 || map1.type() == CV_16SC1 || map1.empty())) )
   4748     {
   4749         if( map1.type() != CV_16SC2 )
   4750             std::swap(m1, m2);
   4751     }
   4752     else
   4753     {
   4754         CV_Assert( ((map1.type() == CV_32FC2 || map1.type() == CV_16SC2) && map2.empty()) ||
   4755             (map1.type() == CV_32FC1 && map2.type() == CV_32FC1) );
   4756         planar_input = map1.channels() == 1;
   4757     }
   4758 
   4759     RemapInvoker invoker(src, dst, m1, m2,
   4760                          borderType, borderValue, planar_input, nnfunc, ifunc,
   4761                          ctab);
   4762     parallel_for_(Range(0, dst.rows), invoker, dst.total()/(double)(1<<16));
   4763 }
   4764 
   4765 
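         /* cv::convertMaps: converts between the map representations accepted by
            remap(). The CV_16SC2 + CV_16UC1 pair is the fast fixed-point form:
            integer source coordinates plus per-pixel indices into the
            interpolation table. With nninterpolate the fractional part is
            rounded away and no second map is produced. Converting once up front
            makes repeated remap() calls cheaper (illustrative, example names
            only):

                Mat fixedXY, fixedA;
                convertMaps(mapx, mapy, fixedXY, fixedA, CV_16SC2);
                remap(src, dst, fixedXY, fixedA, INTER_LINEAR);
         */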
   4766 void cv::convertMaps( InputArray _map1, InputArray _map2,
   4767                       OutputArray _dstmap1, OutputArray _dstmap2,
   4768                       int dstm1type, bool nninterpolate )
   4769 {
   4770     Mat map1 = _map1.getMat(), map2 = _map2.getMat(), dstmap1, dstmap2;
   4771     Size size = map1.size();
   4772     const Mat *m1 = &map1, *m2 = &map2;
   4773     int m1type = m1->type(), m2type = m2->type();
   4774 
   4775     CV_Assert( (m1type == CV_16SC2 && (nninterpolate || m2type == CV_16UC1 || m2type == CV_16SC1)) ||
   4776                (m2type == CV_16SC2 && (nninterpolate || m1type == CV_16UC1 || m1type == CV_16SC1)) ||
   4777                (m1type == CV_32FC1 && m2type == CV_32FC1) ||
   4778                (m1type == CV_32FC2 && m2->empty()) );
   4779 
   4780     if( m2type == CV_16SC2 )
   4781     {
   4782         std::swap( m1, m2 );
   4783         std::swap( m1type, m2type );
   4784     }
   4785 
   4786     if( dstm1type <= 0 )
   4787         dstm1type = m1type == CV_16SC2 ? CV_32FC2 : CV_16SC2;
   4788     CV_Assert( dstm1type == CV_16SC2 || dstm1type == CV_32FC1 || dstm1type == CV_32FC2 );
   4789     _dstmap1.create( size, dstm1type );
   4790     dstmap1 = _dstmap1.getMat();
   4791 
   4792     if( !nninterpolate && dstm1type != CV_32FC2 )
   4793     {
   4794         _dstmap2.create( size, dstm1type == CV_16SC2 ? CV_16UC1 : CV_32FC1 );
   4795         dstmap2 = _dstmap2.getMat();
   4796     }
   4797     else
   4798         _dstmap2.release();
   4799 
   4800     if( m1type == dstm1type || (nninterpolate &&
   4801         ((m1type == CV_16SC2 && dstm1type == CV_32FC2) ||
   4802         (m1type == CV_32FC2 && dstm1type == CV_16SC2))) )
   4803     {
   4804         m1->convertTo( dstmap1, dstmap1.type() );
   4805         if( !dstmap2.empty() && dstmap2.type() == m2->type() )
   4806             m2->copyTo( dstmap2 );
   4807         return;
   4808     }
   4809 
   4810     if( m1type == CV_32FC1 && dstm1type == CV_32FC2 )
   4811     {
   4812         Mat vdata[] = { *m1, *m2 };
   4813         merge( vdata, 2, dstmap1 );
   4814         return;
   4815     }
   4816 
   4817     if( m1type == CV_32FC2 && dstm1type == CV_32FC1 )
   4818     {
   4819         Mat mv[] = { dstmap1, dstmap2 };
   4820         split( *m1, mv );
   4821         return;
   4822     }
   4823 
   4824     if( m1->isContinuous() && (m2->empty() || m2->isContinuous()) &&
   4825         dstmap1.isContinuous() && (dstmap2.empty() || dstmap2.isContinuous()) )
   4826     {
   4827         size.width *= size.height;
   4828         size.height = 1;
   4829     }
   4830 
   4831 #if CV_SSE2
   4832     bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
   4833 #endif
   4834 #if CV_SSE4_1
   4835     bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
   4836 #endif
   4837 
   4838     const float scale = 1.f/INTER_TAB_SIZE;
   4839     int x, y;
   4840     for( y = 0; y < size.height; y++ )
   4841     {
   4842         const float* src1f = m1->ptr<float>(y);
   4843         const float* src2f = m2->ptr<float>(y);
   4844         const short* src1 = (const short*)src1f;
   4845         const ushort* src2 = (const ushort*)src2f;
   4846 
   4847         float* dst1f = dstmap1.ptr<float>(y);
   4848         float* dst2f = dstmap2.ptr<float>(y);
   4849         short* dst1 = (short*)dst1f;
   4850         ushort* dst2 = (ushort*)dst2f;
   4851         x = 0;
   4852 
   4853         if( m1type == CV_32FC1 && dstm1type == CV_16SC2 )
   4854         {
   4855             if( nninterpolate )
   4856             {
   4857                 #if CV_NEON
   4858                 for( ; x <= size.width - 8; x += 8 )
   4859                 {
   4860                     int16x8x2_t v_dst;
   4861                     v_dst.val[0] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
   4862                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4))));
   4863                     v_dst.val[1] = vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x))),
   4864                                                 vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src2f + x + 4))));
   4865 
   4866                     vst2q_s16(dst1 + (x << 1), v_dst);
   4867                 }
   4868                 #elif CV_SSE4_1
   4869                 if (useSSE4_1)
   4870                 {
   4871                     for( ; x <= size.width - 16; x += 16 )
   4872                     {
   4873                         __m128i v_dst0 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
   4874                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4)));
   4875                         __m128i v_dst1 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 8)),
   4876                                                          _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 12)));
   4877 
   4878                         __m128i v_dst2 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x)),
   4879                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 4)));
   4880                         __m128i v_dst3 = _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 8)),
   4881                                                          _mm_cvtps_epi32(_mm_loadu_ps(src2f + x + 12)));
   4882 
   4883                         _mm_interleave_epi16(v_dst0, v_dst1, v_dst2, v_dst3);
   4884 
   4885                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst0);
   4886                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst1);
   4887                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst2);
   4888                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst3);
   4889                     }
   4890                 }
   4891                 #endif
   4892                 for( ; x < size.width; x++ )
   4893                 {
   4894                     dst1[x*2] = saturate_cast<short>(src1f[x]);
   4895                     dst1[x*2+1] = saturate_cast<short>(src2f[x]);
   4896                 }
   4897             }
   4898             else
   4899             {
   4900                 #if CV_NEON
   4901                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
   4902                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
   4903 
   4904                 for( ; x <= size.width - 8; x += 8 )
   4905                 {
   4906                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x), v_scale));
   4907                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src1f + x + 4), v_scale));
   4908                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x), v_scale));
   4909                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(vld1q_f32(src2f + x + 4), v_scale));
   4910 
   4911                     int16x8x2_t v_dst;
   4912                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
   4913                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
   4914                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
   4915                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
   4916 
   4917                     vst2q_s16(dst1 + (x << 1), v_dst);
   4918 
   4919                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
   4920                                                               vandq_s32(v_ix0, v_mask)));
   4921                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
   4922                                                               vandq_s32(v_ix1, v_mask)));
   4923                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
   4924                 }
   4925                 #elif CV_SSE4_1
   4926                 if (useSSE4_1)
   4927                 {
   4928                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
   4929                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
   4930 
   4931                     for( ; x <= size.width - 16; x += 16 )
   4932                     {
   4933                         __m128i v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x), v_its));
   4934                         __m128i v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 4), v_its));
   4935                         __m128i v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x), v_its));
   4936                         __m128i v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 4), v_its));
   4937 
   4938                         __m128i v_dst10 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
   4939                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
   4940                         __m128i v_dst12 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
   4941                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
   4942                         __m128i v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
   4943                                                         _mm_and_si128(v_ix0, v_its1));
   4944                         __m128i v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
   4945                                                         _mm_and_si128(v_ix1, v_its1));
   4946                         _mm_storeu_si128((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst20, v_dst21));
   4947 
   4948                         v_ix0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 8), v_its));
   4949                         v_ix1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x + 12), v_its));
   4950                         v_iy0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 8), v_its));
   4951                         v_iy1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src2f + x + 12), v_its));
   4952 
   4953                         __m128i v_dst11 = _mm_packs_epi32(_mm_srai_epi32(v_ix0, INTER_BITS),
   4954                                                           _mm_srai_epi32(v_ix1, INTER_BITS));
   4955                         __m128i v_dst13 = _mm_packs_epi32(_mm_srai_epi32(v_iy0, INTER_BITS),
   4956                                                           _mm_srai_epi32(v_iy1, INTER_BITS));
   4957                         v_dst20 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy0, v_its1), INTER_BITS),
   4958                                                 _mm_and_si128(v_ix0, v_its1));
   4959                         v_dst21 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_iy1, v_its1), INTER_BITS),
   4960                                                 _mm_and_si128(v_ix1, v_its1));
   4961                         _mm_storeu_si128((__m128i *)(dst2 + x + 8), _mm_packus_epi32(v_dst20, v_dst21));
   4962 
   4963                         _mm_interleave_epi16(v_dst10, v_dst11, v_dst12, v_dst13);
   4964 
   4965                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst10);
   4966                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 8), v_dst11);
   4967                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 16), v_dst12);
   4968                         _mm_storeu_si128((__m128i *)(dst1 + x * 2 + 24), v_dst13);
   4969                     }
   4970                 }
   4971                 #endif
   4972                 for( ; x < size.width; x++ )
   4973                 {
   4974                     int ix = saturate_cast<int>(src1f[x]*INTER_TAB_SIZE);
   4975                     int iy = saturate_cast<int>(src2f[x]*INTER_TAB_SIZE);
   4976                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
   4977                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
   4978                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
   4979                 }
   4980             }
   4981         }
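        // Worked example of the encoding above (illustrative): a source coordinate
        // of 10.25 becomes ix = cvRound(10.25*INTER_TAB_SIZE) = 328; the integer
        // part is 328 >> INTER_BITS = 10 and the fractional index is
        // 328 & (INTER_TAB_SIZE-1) = 8, i.e. 8/32 = 0.25.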
   4982         else if( m1type == CV_32FC2 && dstm1type == CV_16SC2 )
   4983         {
   4984             if( nninterpolate )
   4985             {
   4986                 #if CV_NEON
   4987                 for( ; x <= (size.width << 1) - 8; x += 8 )
   4988                     vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x))),
   4989                                                      vqmovn_s32(cv_vrndq_s32_f32(vld1q_f32(src1f + x + 4)))));
   4990                 #elif CV_SSE2
   4991                 for( ; x <= (size.width << 1) - 8; x += 8 )
   4992                 {
   4993                     _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packs_epi32(_mm_cvtps_epi32(_mm_loadu_ps(src1f + x)),
   4994                                                                             _mm_cvtps_epi32(_mm_loadu_ps(src1f + x + 4))));
   4995                 }
   4996                 #endif
                // x indexes flattened shorts here, matching the vector loops above,
                // so this tail also covers any remainder left when 2*width % 8 != 0
                for( ; x < size.width * 2; x++ )
                    dst1[x] = saturate_cast<short>(src1f[x]);
   5002             }
   5003             else
   5004             {
   5005                 #if CV_NEON
   5006                 float32x4_t v_scale = vdupq_n_f32((float)INTER_TAB_SIZE);
   5007                 int32x4_t v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
   5008 
   5009                 for( ; x <= size.width - 8; x += 8 )
   5010                 {
   5011                     float32x4x2_t v_src0 = vld2q_f32(src1f + (x << 1)), v_src1 = vld2q_f32(src1f + (x << 1) + 8);
   5012                     int32x4_t v_ix0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[0], v_scale));
   5013                     int32x4_t v_ix1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[0], v_scale));
   5014                     int32x4_t v_iy0 = cv_vrndq_s32_f32(vmulq_f32(v_src0.val[1], v_scale));
   5015                     int32x4_t v_iy1 = cv_vrndq_s32_f32(vmulq_f32(v_src1.val[1], v_scale));
   5016 
   5017                     int16x8x2_t v_dst;
   5018                     v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_ix0, INTER_BITS)),
   5019                                                 vqmovn_s32(vshrq_n_s32(v_ix1, INTER_BITS)));
   5020                     v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_iy0, INTER_BITS)),
   5021                                                 vqmovn_s32(vshrq_n_s32(v_iy1, INTER_BITS)));
   5022 
   5023                     vst2q_s16(dst1 + (x << 1), v_dst);
   5024 
   5025                     uint16x4_t v_dst0 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy0, v_mask), INTER_BITS),
   5026                                                               vandq_s32(v_ix0, v_mask)));
   5027                     uint16x4_t v_dst1 = vqmovun_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_iy1, v_mask), INTER_BITS),
   5028                                                               vandq_s32(v_ix1, v_mask)));
   5029                     vst1q_u16(dst2 + x, vcombine_u16(v_dst0, v_dst1));
   5030                 }
   5031                 #elif CV_SSE4_1
   5032                 if (useSSE4_1)
   5033                 {
   5034                     __m128 v_its = _mm_set1_ps(INTER_TAB_SIZE);
   5035                     __m128i v_its1 = _mm_set1_epi32(INTER_TAB_SIZE-1);
   5036                     __m128i v_y_mask = _mm_set1_epi32((INTER_TAB_SIZE-1) << 16);
   5037 
   5038                     for( ; x <= size.width - 4; x += 4 )
   5039                     {
   5040                         __m128i v_src0 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2), v_its));
   5041                         __m128i v_src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(src1f + x * 2 + 4), v_its));
   5042 
   5043                         __m128i v_dst1 = _mm_packs_epi32(_mm_srai_epi32(v_src0, INTER_BITS),
   5044                                                          _mm_srai_epi32(v_src1, INTER_BITS));
   5045                         _mm_storeu_si128((__m128i *)(dst1 + x * 2), v_dst1);
   5046 
   5047                         // x0 y0 x1 y1 . . .
   5048                         v_src0 = _mm_packs_epi32(_mm_and_si128(v_src0, v_its1),
   5049                                                  _mm_and_si128(v_src1, v_its1));
   5050                         __m128i v_dst2 = _mm_or_si128(_mm_srli_epi32(_mm_and_si128(v_src0, v_y_mask), 16 - INTER_BITS), // y0 0 y1 0 . . .
   5051                                                       _mm_and_si128(v_src0, v_its1)); // 0 x0 0 x1 . . .
   5052                         _mm_storel_epi64((__m128i *)(dst2 + x), _mm_packus_epi32(v_dst2, v_dst2));
   5053                     }
   5054                 }
   5055                 #endif
   5056                 for( ; x < size.width; x++ )
   5057                 {
   5058                     int ix = saturate_cast<int>(src1f[x*2]*INTER_TAB_SIZE);
   5059                     int iy = saturate_cast<int>(src1f[x*2+1]*INTER_TAB_SIZE);
   5060                     dst1[x*2] = saturate_cast<short>(ix >> INTER_BITS);
   5061                     dst1[x*2+1] = saturate_cast<short>(iy >> INTER_BITS);
   5062                     dst2[x] = (ushort)((iy & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE + (ix & (INTER_TAB_SIZE-1)));
   5063                 }
   5064             }
   5065         }
   5066         else if( m1type == CV_16SC2 && dstm1type == CV_32FC1 )
   5067         {
   5068             #if CV_NEON
   5069             uint16x8_t v_mask2 = vdupq_n_u16(INTER_TAB_SIZE2-1);
   5070             uint32x4_t v_zero = vdupq_n_u32(0u), v_mask = vdupq_n_u32(INTER_TAB_SIZE-1);
   5071             float32x4_t v_scale = vdupq_n_f32(scale);
   5072 
   5073             for( ; x <= size.width - 8; x += 8)
   5074             {
   5075                 uint32x4_t v_fxy1, v_fxy2;
   5076                 if (src2)
   5077                 {
   5078                     uint16x8_t v_src2 = vandq_u16(vld1q_u16(src2 + x), v_mask2);
   5079                     v_fxy1 = vmovl_u16(vget_low_u16(v_src2));
   5080                     v_fxy2 = vmovl_u16(vget_high_u16(v_src2));
   5081                 }
   5082                 else
   5083                     v_fxy1 = v_fxy2 = v_zero;
   5084 
   5085                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
   5086                 float32x4_t v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
   5087                                                v_scale, vcvtq_f32_u32(vandq_u32(v_fxy1, v_mask)));
   5088                 float32x4_t v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
   5089                                                v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy1, INTER_BITS)));
   5090                 vst1q_f32(dst1f + x, v_dst1);
   5091                 vst1q_f32(dst2f + x, v_dst2);
   5092 
   5093                 v_dst1 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
   5094                                    v_scale, vcvtq_f32_u32(vandq_u32(v_fxy2, v_mask)));
   5095                 v_dst2 = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
   5096                                    v_scale, vcvtq_f32_u32(vshrq_n_u32(v_fxy2, INTER_BITS)));
   5097                 vst1q_f32(dst1f + x + 4, v_dst1);
   5098                 vst1q_f32(dst2f + x + 4, v_dst2);
   5099             }
   5100             #elif CV_SSE2
   5101             __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
   5102             __m128i v_zero = _mm_setzero_si128(), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
   5103             __m128 v_scale = _mm_set1_ps(scale);
   5104 
   5105             for( ; x <= size.width - 16; x += 16)
   5106             {
   5107                 __m128i v_src10 = _mm_loadu_si128((__m128i const *)(src1 + x * 2));
   5108                 __m128i v_src11 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 8));
   5109                 __m128i v_src20 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 16));
   5110                 __m128i v_src21 = _mm_loadu_si128((__m128i const *)(src1 + x * 2 + 24));
   5111 
   5112                 _mm_deinterleave_epi16(v_src10, v_src11, v_src20, v_src21);
   5113 
   5114                 __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x)), v_mask2) : v_zero;
   5115                 __m128i v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero);
   5116                 _mm_storeu_ps(dst1f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src10), 16)),
   5117                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
   5118                 _mm_storeu_ps(dst2f + x, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src20), 16)),
   5119                                                     _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
   5120                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
   5121                 _mm_storeu_ps(dst1f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src10), 16)),
   5122                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
   5123                 _mm_storeu_ps(dst2f + x + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src20), 16)),
   5124                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
   5125 
   5126                 v_fxy = src2 ? _mm_and_si128(_mm_loadu_si128((__m128i const *)(src2 + x + 8)), v_mask2) : v_zero;
                v_fxy_p = _mm_unpacklo_epi16(v_fxy, v_zero); // low half first: elements x+8 .. x+11
   5128                 _mm_storeu_ps(dst1f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src11), 16)),
   5129                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
   5130                 _mm_storeu_ps(dst2f + x + 8, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src21), 16)),
   5131                                                         _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
   5132                 v_fxy_p = _mm_unpackhi_epi16(v_fxy, v_zero);
   5133                 _mm_storeu_ps(dst1f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src11), 16)),
   5134                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_and_si128(v_fxy_p, v_mask)))));
   5135                 _mm_storeu_ps(dst2f + x + 12, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src21), 16)),
   5136                                                          _mm_mul_ps(v_scale, _mm_cvtepi32_ps(_mm_srli_epi32(v_fxy_p, INTER_BITS)))));
   5137             }
   5138             #endif
   5139             for( ; x < size.width; x++ )
   5140             {
   5141                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1) : 0;
   5142                 dst1f[x] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
   5143                 dst2f[x] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
   5144             }
   5145         }
   5146         else if( m1type == CV_16SC2 && dstm1type == CV_32FC2 )
   5147         {
   5148             #if CV_NEON
   5149             int16x8_t v_mask2 = vdupq_n_s16(INTER_TAB_SIZE2-1);
   5150             int32x4_t v_zero = vdupq_n_s32(0), v_mask = vdupq_n_s32(INTER_TAB_SIZE-1);
   5151             float32x4_t v_scale = vdupq_n_f32(scale);
   5152 
   5153             for( ; x <= size.width - 8; x += 8)
   5154             {
   5155                 int32x4_t v_fxy1, v_fxy2;
   5156                 if (src2)
   5157                 {
   5158                     int16x8_t v_src2 = vandq_s16(vld1q_s16((short *)src2 + x), v_mask2);
   5159                     v_fxy1 = vmovl_s16(vget_low_s16(v_src2));
   5160                     v_fxy2 = vmovl_s16(vget_high_s16(v_src2));
   5161                 }
   5162                 else
   5163                     v_fxy1 = v_fxy2 = v_zero;
   5164 
   5165                 int16x8x2_t v_src = vld2q_s16(src1 + (x << 1));
   5166                 float32x4x2_t v_dst;
   5167                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[0]))),
   5168                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy1, v_mask)));
   5169                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src.val[1]))),
   5170                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy1, INTER_BITS)));
   5171                 vst2q_f32(dst1f + (x << 1), v_dst);
   5172 
   5173                 v_dst.val[0] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[0]))),
   5174                                          v_scale, vcvtq_f32_s32(vandq_s32(v_fxy2, v_mask)));
   5175                 v_dst.val[1] = vmlaq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src.val[1]))),
   5176                                          v_scale, vcvtq_f32_s32(vshrq_n_s32(v_fxy2, INTER_BITS)));
   5177                 vst2q_f32(dst1f + (x << 1) + 8, v_dst);
   5178             }
   5179             #elif CV_SSE2
   5180             if (useSSE2)
   5181             {
   5182                 __m128i v_mask2 = _mm_set1_epi16(INTER_TAB_SIZE2-1);
   5183                 __m128i v_zero = _mm_set1_epi32(0), v_mask = _mm_set1_epi32(INTER_TAB_SIZE-1);
   5184                 __m128 v_scale = _mm_set1_ps(scale);
   5185 

                for ( ; x <= size.width - 4; x += 4)
                {
                    __m128i v_src = _mm_loadu_si128((__m128i const *)(src1 + x * 2)); // x0 y0 x1 y1 x2 y2 x3 y3
                    __m128i v_fxy = src2 ? _mm_and_si128(_mm_loadl_epi64((__m128i const *)(src2 + x)), v_mask2) : v_zero;
                    v_fxy = _mm_unpacklo_epi16(v_fxy, v_zero);          // widen the 4 fxy values to 32 bits
                    __m128i v_fx = _mm_and_si128(v_fxy, v_mask);        // fractional x indices
                    __m128i v_fy = _mm_srli_epi32(v_fxy, INTER_BITS);   // fractional y indices

                    // interleave the fractions pairwise and add them to the sign-extended integer parts
                    __m128 v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi32(v_fx, v_fy)), v_scale);
                    _mm_storeu_ps(dst1f + x * 2, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(v_zero, v_src), 16)), v_add));

                    v_add = _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi32(v_fx, v_fy)), v_scale);
                    _mm_storeu_ps(dst1f + x * 2 + 4, _mm_add_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(v_zero, v_src), 16)), v_add));
                }
   5199             }
   5200             #endif
   5201             for( ; x < size.width; x++ )
   5202             {
   5203                 int fxy = src2 ? src2[x] & (INTER_TAB_SIZE2-1): 0;
   5204                 dst1f[x*2] = src1[x*2] + (fxy & (INTER_TAB_SIZE-1))*scale;
   5205                 dst1f[x*2+1] = src1[x*2+1] + (fxy >> INTER_BITS)*scale;
   5206             }
   5207         }
   5208         else
   5209             CV_Error( CV_StsNotImplemented, "Unsupported combination of input/output matrices" );
   5210     }
   5211 }
   5212 
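/* Round-trip sanity sketch for convertMaps() (illustrative only, not library
   code): converting float maps to the fixed-point form and back should agree
   to within the 0.5/INTER_TAB_SIZE rounding step for in-range coordinates:

       Mat fx(sz, CV_32FC1), fy(sz, CV_32FC1), xy, a, fx2, fy2;
       // ... fill fx and fy with source coordinates ...
       convertMaps(fx, fy, xy, a, CV_16SC2, false);
       convertMaps(xy, a, fx2, fy2, CV_32FC1, false);
       // expect: norm(fx, fx2, NORM_INF) <= 0.5/INTER_TAB_SIZE
*/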
   5213 
   5214 namespace cv
   5215 {
   5216 
   5217 class WarpAffineInvoker :
   5218     public ParallelLoopBody
   5219 {
   5220 public:
   5221     WarpAffineInvoker(const Mat &_src, Mat &_dst, int _interpolation, int _borderType,
   5222                       const Scalar &_borderValue, int *_adelta, int *_bdelta, double *_M) :
   5223         ParallelLoopBody(), src(_src), dst(_dst), interpolation(_interpolation),
   5224         borderType(_borderType), borderValue(_borderValue), adelta(_adelta), bdelta(_bdelta),
   5225         M(_M)
   5226     {
   5227     }
   5228 
   5229     virtual void operator() (const Range& range) const
   5230     {
   5231         const int BLOCK_SZ = 64;
   5232         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
   5233         const int AB_BITS = MAX(10, (int)INTER_BITS);
   5234         const int AB_SCALE = 1 << AB_BITS;
   5235         int round_delta = interpolation == INTER_NEAREST ? AB_SCALE/2 : AB_SCALE/INTER_TAB_SIZE/2, x, y, x1, y1;
   5236     #if CV_SSE2
   5237         bool useSSE2 = checkHardwareSupport(CV_CPU_SSE2);
   5238     #endif
   5239     #if CV_SSE4_1
   5240         bool useSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
   5241     #endif
   5242 
   5243         int bh0 = std::min(BLOCK_SZ/2, dst.rows);
   5244         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, dst.cols);
   5245         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, dst.rows);
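        // Each block holds at most BLOCK_SZ*BLOCK_SZ pixels (bw0*bh0 <= 64*64),
        // biased toward wide, short blocks so a strip of source rows stays hot in cache.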
   5246 
   5247         for( y = range.start; y < range.end; y += bh0 )
   5248         {
   5249             for( x = 0; x < dst.cols; x += bw0 )
   5250             {
   5251                 int bw = std::min( bw0, dst.cols - x);
   5252                 int bh = std::min( bh0, range.end - y);
   5253 
   5254                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
   5255                 Mat dpart(dst, Rect(x, y, bw, bh));
   5256 
   5257                 for( y1 = 0; y1 < bh; y1++ )
   5258                 {
   5259                     short* xy = XY + y1*bw*2;
   5260                     int X0 = saturate_cast<int>((M[1]*(y + y1) + M[2])*AB_SCALE) + round_delta;
   5261                     int Y0 = saturate_cast<int>((M[4]*(y + y1) + M[5])*AB_SCALE) + round_delta;
   5262 
   5263                     if( interpolation == INTER_NEAREST )
   5264                     {
   5265                         x1 = 0;
   5266                         #if CV_NEON
   5267                         int32x4_t v_X0 = vdupq_n_s32(X0), v_Y0 = vdupq_n_s32(Y0);
   5268                         for( ; x1 <= bw - 8; x1 += 8 )
   5269                         {
   5270                             int16x8x2_t v_dst;
   5271                             v_dst.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1)), AB_BITS)),
   5272                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS)));
   5273                             v_dst.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1)), AB_BITS)),
   5274                                                         vqmovn_s32(vshrq_n_s32(vaddq_s32(v_Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS)));
   5275 
   5276                             vst2q_s16(xy + (x1 << 1), v_dst);
   5277                         }
   5278                         #elif CV_SSE4_1
   5279                         if (useSSE4_1)
   5280                         {
   5281                             __m128i v_X0 = _mm_set1_epi32(X0);
   5282                             __m128i v_Y0 = _mm_set1_epi32(Y0);
   5283                             for ( ; x1 <= bw - 16; x1 += 16)
   5284                             {
   5285                                 __m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1))), AB_BITS),
   5286                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 4))), AB_BITS));
   5287                                 __m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 8))), AB_BITS),
   5288                                                                _mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x + x1 + 12))), AB_BITS));
   5289 
   5290                                 __m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1))), AB_BITS),
   5291                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 4))), AB_BITS));
   5292                                 __m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 8))), AB_BITS),
   5293                                                                _mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x + x1 + 12))), AB_BITS));
   5294 
   5295                                 _mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
   5296 
   5297                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
   5298                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
   5299                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
   5300                                 _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
   5301                             }
   5302                         }
   5303                         #endif
   5304                         for( ; x1 < bw; x1++ )
   5305                         {
   5306                             int X = (X0 + adelta[x+x1]) >> AB_BITS;
   5307                             int Y = (Y0 + bdelta[x+x1]) >> AB_BITS;
   5308                             xy[x1*2] = saturate_cast<short>(X);
   5309                             xy[x1*2+1] = saturate_cast<short>(Y);
   5310                         }
   5311                     }
   5312                     else
   5313                     {
   5314                         short* alpha = A + y1*bw;
   5315                         x1 = 0;
   5316                     #if CV_SSE2
   5317                         if( useSSE2 )
   5318                         {
   5319                             __m128i fxy_mask = _mm_set1_epi32(INTER_TAB_SIZE - 1);
   5320                             __m128i XX = _mm_set1_epi32(X0), YY = _mm_set1_epi32(Y0);
   5321                             for( ; x1 <= bw - 8; x1 += 8 )
   5322                             {
   5323                                 __m128i tx0, tx1, ty0, ty1;
   5324                                 tx0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1)), XX);
   5325                                 ty0 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1)), YY);
   5326                                 tx1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(adelta + x + x1 + 4)), XX);
   5327                                 ty1 = _mm_add_epi32(_mm_loadu_si128((const __m128i*)(bdelta + x + x1 + 4)), YY);
   5328 
   5329                                 tx0 = _mm_srai_epi32(tx0, AB_BITS - INTER_BITS);
   5330                                 ty0 = _mm_srai_epi32(ty0, AB_BITS - INTER_BITS);
   5331                                 tx1 = _mm_srai_epi32(tx1, AB_BITS - INTER_BITS);
   5332                                 ty1 = _mm_srai_epi32(ty1, AB_BITS - INTER_BITS);
   5333 
   5334                                 __m128i fx_ = _mm_packs_epi32(_mm_and_si128(tx0, fxy_mask),
   5335                                                             _mm_and_si128(tx1, fxy_mask));
   5336                                 __m128i fy_ = _mm_packs_epi32(_mm_and_si128(ty0, fxy_mask),
   5337                                                             _mm_and_si128(ty1, fxy_mask));
   5338                                 tx0 = _mm_packs_epi32(_mm_srai_epi32(tx0, INTER_BITS),
   5339                                                             _mm_srai_epi32(tx1, INTER_BITS));
   5340                                 ty0 = _mm_packs_epi32(_mm_srai_epi32(ty0, INTER_BITS),
   5341                                                     _mm_srai_epi32(ty1, INTER_BITS));
   5342                                 fx_ = _mm_adds_epi16(fx_, _mm_slli_epi16(fy_, INTER_BITS));
   5343 
   5344                                 _mm_storeu_si128((__m128i*)(xy + x1*2), _mm_unpacklo_epi16(tx0, ty0));
   5345                                 _mm_storeu_si128((__m128i*)(xy + x1*2 + 8), _mm_unpackhi_epi16(tx0, ty0));
   5346                                 _mm_storeu_si128((__m128i*)(alpha + x1), fx_);
   5347                             }
   5348                         }
   5349                     #elif CV_NEON
   5350                         int32x4_t v__X0 = vdupq_n_s32(X0), v__Y0 = vdupq_n_s32(Y0), v_mask = vdupq_n_s32(INTER_TAB_SIZE - 1);
   5351                         for( ; x1 <= bw - 8; x1 += 8 )
   5352                         {
   5353                             int32x4_t v_X0 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1)), AB_BITS - INTER_BITS);
   5354                             int32x4_t v_Y0 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1)), AB_BITS - INTER_BITS);
   5355                             int32x4_t v_X1 = vshrq_n_s32(vaddq_s32(v__X0, vld1q_s32(adelta + x + x1 + 4)), AB_BITS - INTER_BITS);
   5356                             int32x4_t v_Y1 = vshrq_n_s32(vaddq_s32(v__Y0, vld1q_s32(bdelta + x + x1 + 4)), AB_BITS - INTER_BITS);
   5357 
   5358                             int16x8x2_t v_xy;
   5359                             v_xy.val[0] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_X0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_X1, INTER_BITS)));
   5360                             v_xy.val[1] = vcombine_s16(vqmovn_s32(vshrq_n_s32(v_Y0, INTER_BITS)), vqmovn_s32(vshrq_n_s32(v_Y1, INTER_BITS)));
   5361 
   5362                             vst2q_s16(xy + (x1 << 1), v_xy);
   5363 
   5364                             int16x4_t v_alpha0 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y0, v_mask), INTER_BITS),
   5365                                                                      vandq_s32(v_X0, v_mask)));
   5366                             int16x4_t v_alpha1 = vmovn_s32(vaddq_s32(vshlq_n_s32(vandq_s32(v_Y1, v_mask), INTER_BITS),
   5367                                                                      vandq_s32(v_X1, v_mask)));
   5368                             vst1q_s16(alpha + x1, vcombine_s16(v_alpha0, v_alpha1));
   5369                         }
   5370                     #endif
   5371                         for( ; x1 < bw; x1++ )
   5372                         {
   5373                             int X = (X0 + adelta[x+x1]) >> (AB_BITS - INTER_BITS);
   5374                             int Y = (Y0 + bdelta[x+x1]) >> (AB_BITS - INTER_BITS);
   5375                             xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
   5376                             xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
   5377                             alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
   5378                                     (X & (INTER_TAB_SIZE-1)));
   5379                         }
   5380                     }
   5381                 }
   5382 
   5383                 if( interpolation == INTER_NEAREST )
   5384                     remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
   5385                 else
   5386                 {
   5387                     Mat _matA(bh, bw, CV_16U, A);
   5388                     remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
   5389                 }
   5390             }
   5391         }
   5392     }
   5393 
   5394 private:
   5395     Mat src;
   5396     Mat dst;
   5397     int interpolation, borderType;
   5398     Scalar borderValue;
   5399     int *adelta, *bdelta;
   5400     double *M;
   5401 };
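// Note on the fixed-point scheme used by WarpAffineInvoker: adelta[x] holds
// saturate_cast<int>(M[0]*x*AB_SCALE) and X0 carries the y and constant terms,
// so (X0 + adelta[x]) >> (AB_BITS - INTER_BITS) is the source x in units of
// 1/INTER_TAB_SIZE; with AB_BITS == 10 this stays exact for |x| up to about 2^21.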
   5402 
   5403 
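// NOTE: the trailing "&& 0" below compiles this IPP path out
// (see the ippStsCoeffErr remark inside the invoker).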
   5404 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
   5405 class IPPWarpAffineInvoker :
   5406     public ParallelLoopBody
   5407 {
   5408 public:
   5409     IPPWarpAffineInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[2][3], int &_interpolation, int _borderType,
   5410                          const Scalar &_borderValue, ippiWarpAffineBackFunc _func, bool *_ok) :
   5411         ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
   5412         borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
   5413     {
   5414         *ok = true;
   5415     }
   5416 
   5417     virtual void operator() (const Range& range) const
   5418     {
   5419         IppiSize srcsize = { src.cols, src.rows };
   5420         IppiRect srcroi = { 0, 0, src.cols, src.rows };
   5421         IppiRect dstroi = { 0, range.start, dst.cols, range.end - range.start };
   5422         int cnn = src.channels();
   5423         if( borderType == BORDER_CONSTANT )
   5424         {
   5425             IppiSize setSize = { dst.cols, range.end - range.start };
   5426             void *dataPointer = dst.ptr(range.start);
   5427             if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
   5428             {
   5429                 *ok = false;
   5430                 return;
   5431             }
   5432         }
   5433 
        // Aug 2013: problem in IPP 7.1 and 8.0: the function sometimes returns ippStsCoeffErr
   5435         IppStatus status = func( src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(),
   5436                                 (int)dst.step[0], dstroi, coeffs, mode );
   5437         if( status < 0)
   5438             *ok = false;
   5439         else
   5440         {
   5441             CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   5442         }
   5443     }
   5444 private:
   5445     Mat &src;
   5446     Mat &dst;
   5447     int mode;
   5448     double (&coeffs)[2][3];
   5449     int borderType;
   5450     Scalar borderValue;
   5451     ippiWarpAffineBackFunc func;
   5452     bool *ok;
   5453     const IPPWarpAffineInvoker& operator= (const IPPWarpAffineInvoker&);
   5454 };
   5455 #endif
   5456 
   5457 #ifdef HAVE_OPENCL
   5458 
   5459 enum { OCL_OP_PERSPECTIVE = 1, OCL_OP_AFFINE = 0 };
   5460 
   5461 static bool ocl_warpTransform(InputArray _src, OutputArray _dst, InputArray _M0,
   5462                               Size dsize, int flags, int borderType, const Scalar& borderValue,
   5463                               int op_type)
   5464 {
   5465     CV_Assert(op_type == OCL_OP_AFFINE || op_type == OCL_OP_PERSPECTIVE);
   5466     const ocl::Device & dev = ocl::Device::getDefault();
   5467 
   5468     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   5469     const bool doubleSupport = dev.doubleFPConfig() > 0;
   5470 
   5471     int interpolation = flags & INTER_MAX;
   5472     if( interpolation == INTER_AREA )
   5473         interpolation = INTER_LINEAR;
   5474     int rowsPerWI = dev.isIntel() && op_type == OCL_OP_AFFINE && interpolation <= INTER_LINEAR ? 4 : 1;
   5475 
   5476     if ( !(borderType == cv::BORDER_CONSTANT &&
   5477            (interpolation == cv::INTER_NEAREST || interpolation == cv::INTER_LINEAR || interpolation == cv::INTER_CUBIC)) ||
   5478          (!doubleSupport && depth == CV_64F) || cn > 4)
   5479         return false;
   5480 
   5481     const char * const interpolationMap[3] = { "NEAREST", "LINEAR", "CUBIC" };
   5482     ocl::ProgramSource program = op_type == OCL_OP_AFFINE ?
   5483                 ocl::imgproc::warp_affine_oclsrc : ocl::imgproc::warp_perspective_oclsrc;
   5484     const char * const kernelName = op_type == OCL_OP_AFFINE ? "warpAffine" : "warpPerspective";
   5485 
   5486     int scalarcn = cn == 3 ? 4 : cn;
   5487     bool is32f = !dev.isAMD() && (interpolation == INTER_CUBIC || interpolation == INTER_LINEAR) && op_type == OCL_OP_AFFINE;
   5488     int wdepth = interpolation == INTER_NEAREST ? depth : std::max(is32f ? CV_32F : CV_32S, depth);
   5489     int sctype = CV_MAKETYPE(wdepth, scalarcn);
   5490 
   5491     ocl::Kernel k;
   5492     String opts;
   5493     if (interpolation == INTER_NEAREST)
   5494     {
   5495         opts = format("-D INTER_NEAREST -D T=%s%s -D T1=%s -D ST=%s -D cn=%d -D rowsPerWI=%d",
   5496                       ocl::typeToStr(type), doubleSupport ? " -D DOUBLE_SUPPORT" : "",
   5497                       ocl::typeToStr(CV_MAT_DEPTH(type)),
   5498                       ocl::typeToStr(sctype), cn, rowsPerWI);
   5499     }
   5500     else
   5501     {
   5502         char cvt[2][50];
   5503         opts = format("-D INTER_%s -D T=%s -D T1=%s -D ST=%s -D WT=%s -D depth=%d"
   5504                       " -D convertToWT=%s -D convertToT=%s%s -D cn=%d -D rowsPerWI=%d",
   5505                       interpolationMap[interpolation], ocl::typeToStr(type),
   5506                       ocl::typeToStr(CV_MAT_DEPTH(type)),
   5507                       ocl::typeToStr(sctype),
   5508                       ocl::typeToStr(CV_MAKE_TYPE(wdepth, cn)), depth,
   5509                       ocl::convertTypeStr(depth, wdepth, cn, cvt[0]),
   5510                       ocl::convertTypeStr(wdepth, depth, cn, cvt[1]),
   5511                       doubleSupport ? " -D DOUBLE_SUPPORT" : "", cn, rowsPerWI);
   5512     }
   5513 
   5514     k.create(kernelName, program, opts);
   5515     if (k.empty())
   5516         return false;
   5517 
   5518     double borderBuf[] = { 0, 0, 0, 0 };
   5519     scalarToRawData(borderValue, borderBuf, sctype);
   5520 
   5521     UMat src = _src.getUMat(), M0;
   5522     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
   5523     UMat dst = _dst.getUMat();
   5524 
   5525     double M[9];
   5526     int matRows = (op_type == OCL_OP_AFFINE ? 2 : 3);
   5527     Mat matM(matRows, 3, CV_64F, M), M1 = _M0.getMat();
   5528     CV_Assert( (M1.type() == CV_32F || M1.type() == CV_64F) &&
   5529                M1.rows == matRows && M1.cols == 3 );
   5530     M1.convertTo(matM, matM.type());
   5531 
   5532     if( !(flags & WARP_INVERSE_MAP) )
   5533     {
   5534         if (op_type == OCL_OP_PERSPECTIVE)
   5535             invert(matM, matM);
   5536         else
   5537         {
   5538             double D = M[0]*M[4] - M[1]*M[3];
   5539             D = D != 0 ? 1./D : 0;
   5540             double A11 = M[4]*D, A22=M[0]*D;
   5541             M[0] = A11; M[1] *= -D;
   5542             M[3] *= -D; M[4] = A22;
   5543             double b1 = -M[0]*M[2] - M[1]*M[5];
   5544             double b2 = -M[3]*M[2] - M[4]*M[5];
   5545             M[2] = b1; M[5] = b2;
   5546         }
   5547     }
   5548     matM.convertTo(M0, doubleSupport ? CV_64F : CV_32F);
   5549 
   5550     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst), ocl::KernelArg::PtrReadOnly(M0),
   5551            ocl::KernelArg(0, 0, 0, 0, borderBuf, CV_ELEM_SIZE(sctype)));
   5552 
    size_t globalThreads[2] = { (size_t)dst.cols, ((size_t)dst.rows + rowsPerWI - 1) / rowsPerWI };
   5554     return k.run(2, globalThreads, NULL, false);
   5555 }
   5556 
   5557 #endif
   5558 
   5559 }
   5560 
   5561 
   5562 void cv::warpAffine( InputArray _src, OutputArray _dst,
   5563                      InputArray _M0, Size dsize,
   5564                      int flags, int borderType, const Scalar& borderValue )
   5565 {
   5566     CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
   5567                ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType,
   5568                                  borderValue, OCL_OP_AFFINE))
   5569 
   5570     Mat src = _src.getMat(), M0 = _M0.getMat();
   5571     _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
   5572     Mat dst = _dst.getMat();
   5573     CV_Assert( src.cols > 0 && src.rows > 0 );
   5574     if( dst.data == src.data )
   5575         src = src.clone();
   5576 
   5577     double M[6];
   5578     Mat matM(2, 3, CV_64F, M);
   5579     int interpolation = flags & INTER_MAX;
   5580     if( interpolation == INTER_AREA )
   5581         interpolation = INTER_LINEAR;
   5582 
   5583     CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 2 && M0.cols == 3 );
   5584     M0.convertTo(matM, matM.type());
   5585 
   5586 #ifdef HAVE_TEGRA_OPTIMIZATION
   5587     if( tegra::useTegra() && tegra::warpAffine(src, dst, M, flags, borderType, borderValue) )
   5588         return;
   5589 #endif
   5590 
   5591     if( !(flags & WARP_INVERSE_MAP) )
   5592     {
   5593         double D = M[0]*M[4] - M[1]*M[3];
   5594         D = D != 0 ? 1./D : 0;
   5595         double A11 = M[4]*D, A22=M[0]*D;
   5596         M[0] = A11; M[1] *= -D;
   5597         M[3] *= -D; M[4] = A22;
   5598         double b1 = -M[0]*M[2] - M[1]*M[5];
   5599         double b2 = -M[3]*M[2] - M[4]*M[5];
   5600         M[2] = b1; M[5] = b2;
   5601     }
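    // The block above inverts the 2x3 affine transform in place:
    // [A|b]^-1 = [A^-1 | -A^-1*b], with A^-1 = (1/det A) [ M[4] -M[1]; -M[3] M[0] ];
    // a singular A (det == 0) degenerates to the zero map instead of dividing by zero.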
   5602 
   5603     int x;
   5604     AutoBuffer<int> _abdelta(dst.cols*2);
   5605     int* adelta = &_abdelta[0], *bdelta = adelta + dst.cols;
   5606     const int AB_BITS = MAX(10, (int)INTER_BITS);
   5607     const int AB_SCALE = 1 << AB_BITS;
   5608 
   5609 #if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
   5610     CV_IPP_CHECK()
   5611     {
   5612         int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   5613         if( ( depth == CV_8U || depth == CV_16U || depth == CV_32F ) &&
   5614            ( cn == 1 || cn == 3 || cn == 4 ) &&
   5615            ( interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC) &&
   5616            ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT) )
   5617         {
   5618             ippiWarpAffineBackFunc ippFunc = 0;
   5619             if ((flags & WARP_INVERSE_MAP) != 0)
   5620             {
   5621                 ippFunc =
   5622                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C1R :
   5623                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C3R :
   5624                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_8u_C4R :
   5625                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C1R :
   5626                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C3R :
   5627                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_16u_C4R :
   5628                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C1R :
   5629                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C3R :
   5630                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffineBack_32f_C4R :
   5631                 0;
   5632             }
   5633             else
   5634             {
   5635                 ippFunc =
   5636                 type == CV_8UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C1R :
   5637                 type == CV_8UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C3R :
   5638                 type == CV_8UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_8u_C4R :
   5639                 type == CV_16UC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C1R :
   5640                 type == CV_16UC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C3R :
   5641                 type == CV_16UC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_16u_C4R :
   5642                 type == CV_32FC1 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C1R :
   5643                 type == CV_32FC3 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C3R :
   5644                 type == CV_32FC4 ? (ippiWarpAffineBackFunc)ippiWarpAffine_32f_C4R :
   5645                 0;
   5646             }
   5647             int mode =
   5648             interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
   5649             interpolation == INTER_NEAREST ? IPPI_INTER_NN :
   5650             interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC :
   5651             0;
   5652             CV_Assert(mode && ippFunc);
   5653 
   5654             double coeffs[2][3];
   5655             for( int i = 0; i < 2; i++ )
   5656                 for( int j = 0; j < 3; j++ )
   5657                     coeffs[i][j] = matM.at<double>(i, j);
   5658 
   5659             bool ok;
   5660             Range range(0, dst.rows);
   5661             IPPWarpAffineInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
   5662             parallel_for_(range, invoker, dst.total()/(double)(1<<16));
   5663             if( ok )
   5664             {
   5665                 CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
   5666                 return;
   5667             }
   5668             setIppErrorStatus();
   5669         }
   5670     }
   5671 #endif
   5672 
   5673     for( x = 0; x < dst.cols; x++ )
   5674     {
   5675         adelta[x] = saturate_cast<int>(M[0]*x*AB_SCALE);
   5676         bdelta[x] = saturate_cast<int>(M[3]*x*AB_SCALE);
   5677     }
   5678 
   5679     Range range(0, dst.rows);
   5680     WarpAffineInvoker invoker(src, dst, interpolation, borderType,
   5681                               borderValue, adelta, bdelta, M);
   5682     parallel_for_(range, invoker, dst.total()/(double)(1<<16));
   5683 }
   5684 
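/* Usage sketch (illustrative only): rotating an image 30 degrees about its
   center through the affine path above -- src and dst are hypothetical Mats:

       Mat R = getRotationMatrix2D(Point2f(src.cols*0.5f, src.rows*0.5f), 30, 1.0);
       warpAffine(src, dst, R, src.size(), INTER_LINEAR, BORDER_CONSTANT, Scalar());

   Passing WARP_INVERSE_MAP in flags skips the 2x3 inversion performed above. */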
   5685 
   5686 namespace cv
   5687 {
   5688 
   5689 class WarpPerspectiveInvoker :
   5690     public ParallelLoopBody
   5691 {
   5692 public:
   5693     WarpPerspectiveInvoker(const Mat &_src, Mat &_dst, double *_M, int _interpolation,
   5694                            int _borderType, const Scalar &_borderValue) :
   5695         ParallelLoopBody(), src(_src), dst(_dst), M(_M), interpolation(_interpolation),
   5696         borderType(_borderType), borderValue(_borderValue)
   5697     {
   5698     }
   5699 
   5700     virtual void operator() (const Range& range) const
   5701     {
   5702         const int BLOCK_SZ = 32;
   5703         short XY[BLOCK_SZ*BLOCK_SZ*2], A[BLOCK_SZ*BLOCK_SZ];
   5704         int x, y, x1, y1, width = dst.cols, height = dst.rows;
   5705 
   5706         int bh0 = std::min(BLOCK_SZ/2, height);
   5707         int bw0 = std::min(BLOCK_SZ*BLOCK_SZ/bh0, width);
   5708         bh0 = std::min(BLOCK_SZ*BLOCK_SZ/bw0, height);
   5709 
   5710         #if CV_SSE4_1
   5711         bool haveSSE4_1 = checkHardwareSupport(CV_CPU_SSE4_1);
   5712         __m128d v_M0 = _mm_set1_pd(M[0]);
   5713         __m128d v_M3 = _mm_set1_pd(M[3]);
   5714         __m128d v_M6 = _mm_set1_pd(M[6]);
   5715         __m128d v_intmax = _mm_set1_pd((double)INT_MAX);
   5716         __m128d v_intmin = _mm_set1_pd((double)INT_MIN);
   5717         __m128d v_2 = _mm_set1_pd(2),
   5718                 v_zero = _mm_setzero_pd(),
   5719                 v_1 = _mm_set1_pd(1),
   5720                 v_its = _mm_set1_pd(INTER_TAB_SIZE);
   5721         __m128i v_itsi1 = _mm_set1_epi32(INTER_TAB_SIZE - 1);
   5722         #endif
   5723 
   5724         for( y = range.start; y < range.end; y += bh0 )
   5725         {
   5726             for( x = 0; x < width; x += bw0 )
   5727             {
   5728                 int bw = std::min( bw0, width - x);
   5729                 int bh = std::min( bh0, range.end - y); // height
   5730 
   5731                 Mat _XY(bh, bw, CV_16SC2, XY), matA;
   5732                 Mat dpart(dst, Rect(x, y, bw, bh));
   5733 
   5734                 for( y1 = 0; y1 < bh; y1++ )
   5735                 {
   5736                     short* xy = XY + y1*bw*2;
   5737                     double X0 = M[0]*x + M[1]*(y + y1) + M[2];
   5738                     double Y0 = M[3]*x + M[4]*(y + y1) + M[5];
   5739                     double W0 = M[6]*x + M[7]*(y + y1) + M[8];
   5740 
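                    // Per pixel: X = (M[0]*x + M[1]*y + M[2])/W, Y = (M[3]*x + M[4]*y + M[5])/W,
                    // with W = M[6]*x + M[7]*y + M[8]; W == 0 is mapped to 0 (the andnot trick below).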
   5741                     if( interpolation == INTER_NEAREST )
   5742                     {
   5743                         x1 = 0;
   5744 
   5745                         #if CV_SSE4_1
   5746                         if (haveSSE4_1)
   5747                         {
   5748                             __m128d v_X0d = _mm_set1_pd(X0);
   5749                             __m128d v_Y0d = _mm_set1_pd(Y0);
   5750                             __m128d v_W0 = _mm_set1_pd(W0);
   5751                             __m128d v_x1 = _mm_set_pd(1, 0);
   5752 
   5753                             for( ; x1 <= bw - 16; x1 += 16 )
   5754                             {
   5755                                 // 0-3
   5756                                 __m128i v_X0, v_Y0;
   5757                                 {
   5758                                     __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
   5759                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
   5760                                     __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
   5761                                     __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
   5762                                     v_x1 = _mm_add_pd(v_x1, v_2);
   5763 
   5764                                     v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
   5765                                     v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
   5766                                     __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
   5767                                     __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
   5768                                     v_x1 = _mm_add_pd(v_x1, v_2);
   5769 
   5770                                     v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
   5771                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
   5772                                     v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
   5773                                                                           _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
   5774                                 }
   5775 
                                // 4-7
                                __m128i v_X1, v_Y1;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // 8-11
                                __m128i v_X2, v_Y2;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // 12-15
                                __m128i v_X3, v_Y3;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_1, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // convert to 16s
                                v_X0 = _mm_packs_epi32(v_X0, v_X1);
                                v_X1 = _mm_packs_epi32(v_X2, v_X3);
                                v_Y0 = _mm_packs_epi32(v_Y0, v_Y1);
                                v_Y1 = _mm_packs_epi32(v_Y2, v_Y3);

                                _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

                                _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
                            }
                        }
                        #endif

                        for( ; x1 < bw; x1++ )
                        {
                            double W = W0 + M[6]*x1;
                            W = W ? 1./W : 0;
                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
                            int X = saturate_cast<int>(fX);
                            int Y = saturate_cast<int>(fY);

                            xy[x1*2] = saturate_cast<short>(X);
                            xy[x1*2+1] = saturate_cast<short>(Y);
                        }
                    }
                    else
                    {
                        short* alpha = A + y1*bw;
                        x1 = 0;

                        #if CV_SSE4_1
                        if (haveSSE4_1)
                        {
                            __m128d v_X0d = _mm_set1_pd(X0);
                            __m128d v_Y0d = _mm_set1_pd(Y0);
                            __m128d v_W0 = _mm_set1_pd(W0);
                            __m128d v_x1 = _mm_set_pd(1, 0);

                            for( ; x1 <= bw - 16; x1 += 16 )
                            {
                                // 0-3
                                __m128i v_X0, v_Y0;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y0 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // 4-7
                                __m128i v_X1, v_Y1;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y1 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // 8-11
                                __m128i v_X2, v_Y2;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y2 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // 12-15
                                __m128i v_X3, v_Y3;
                                {
                                    __m128d v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY0 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_W = _mm_add_pd(_mm_mul_pd(v_M6, v_x1), v_W0);
                                    v_W = _mm_andnot_pd(_mm_cmpeq_pd(v_W, v_zero), _mm_div_pd(v_its, v_W));
                                    __m128d v_fX1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_X0d, _mm_mul_pd(v_M0, v_x1)), v_W)));
                                    __m128d v_fY1 = _mm_max_pd(v_intmin, _mm_min_pd(v_intmax, _mm_mul_pd(_mm_add_pd(v_Y0d, _mm_mul_pd(v_M3, v_x1)), v_W)));
                                    v_x1 = _mm_add_pd(v_x1, v_2);

                                    v_X3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fX0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fX1))));
                                    v_Y3 = _mm_castps_si128(_mm_movelh_ps(_mm_castsi128_ps(_mm_cvtpd_epi32(v_fY0)),
                                                                          _mm_castsi128_ps(_mm_cvtpd_epi32(v_fY1))));
                                }

                                // store alpha
                                __m128i v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y0, v_itsi1), INTER_BITS),
                                                                 _mm_and_si128(v_X0, v_itsi1));
                                __m128i v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y1, v_itsi1), INTER_BITS),
                                                                 _mm_and_si128(v_X1, v_itsi1));
                                _mm_storeu_si128((__m128i *)(alpha + x1), _mm_packs_epi32(v_alpha0, v_alpha1));

                                v_alpha0 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y2, v_itsi1), INTER_BITS),
                                                         _mm_and_si128(v_X2, v_itsi1));
                                v_alpha1 = _mm_add_epi32(_mm_slli_epi32(_mm_and_si128(v_Y3, v_itsi1), INTER_BITS),
                                                         _mm_and_si128(v_X3, v_itsi1));
                                _mm_storeu_si128((__m128i *)(alpha + x1 + 8), _mm_packs_epi32(v_alpha0, v_alpha1));

                                // convert to 16s
                                v_X0 = _mm_packs_epi32(_mm_srai_epi32(v_X0, INTER_BITS), _mm_srai_epi32(v_X1, INTER_BITS));
                                v_X1 = _mm_packs_epi32(_mm_srai_epi32(v_X2, INTER_BITS), _mm_srai_epi32(v_X3, INTER_BITS));
                                v_Y0 = _mm_packs_epi32(_mm_srai_epi32(v_Y0, INTER_BITS), _mm_srai_epi32(v_Y1, INTER_BITS));
                                v_Y1 = _mm_packs_epi32(_mm_srai_epi32(v_Y2, INTER_BITS), _mm_srai_epi32(v_Y3, INTER_BITS));

                                _mm_interleave_epi16(v_X0, v_X1, v_Y0, v_Y1);

                                _mm_storeu_si128((__m128i *)(xy + x1 * 2), v_X0);
                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_X1);
                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_Y0);
                                _mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_Y1);
                            }
                        }
                        #endif

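                        /* Scalar tail (and non-SSE fallback): coordinates are computed in
                           fixed point with INTER_BITS fractional bits; the integer parts go
                           into the xy map, while the fractional parts of X and Y are packed
                           into a single interpolation-table index stored in alpha. */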
                        for( ; x1 < bw; x1++ )
                        {
                            double W = W0 + M[6]*x1;
                            W = W ? INTER_TAB_SIZE/W : 0;
                            double fX = std::max((double)INT_MIN, std::min((double)INT_MAX, (X0 + M[0]*x1)*W));
                            double fY = std::max((double)INT_MIN, std::min((double)INT_MAX, (Y0 + M[3]*x1)*W));
                            int X = saturate_cast<int>(fX);
                            int Y = saturate_cast<int>(fY);

                            xy[x1*2] = saturate_cast<short>(X >> INTER_BITS);
                            xy[x1*2+1] = saturate_cast<short>(Y >> INTER_BITS);
                            alpha[x1] = (short)((Y & (INTER_TAB_SIZE-1))*INTER_TAB_SIZE +
                                                (X & (INTER_TAB_SIZE-1)));
                        }
                    }
                }

                if( interpolation == INTER_NEAREST )
                    remap( src, dpart, _XY, Mat(), interpolation, borderType, borderValue );
                else
                {
                    Mat _matA(bh, bw, CV_16U, A);
                    remap( src, dpart, _XY, _matA, interpolation, borderType, borderValue );
                }
            }
        }
    }

private:
    Mat src;
    Mat dst;
    double* M;
    int interpolation, borderType;
    Scalar borderValue;
};


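/* Note: the trailing "&& 0" in this preprocessor condition (and in the matching
   block inside cv::warpPerspective below) deliberately disables the IPP
   warpPerspective path; the code is retained for reference. */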
#if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
class IPPWarpPerspectiveInvoker :
    public ParallelLoopBody
{
public:
    IPPWarpPerspectiveInvoker(Mat &_src, Mat &_dst, double (&_coeffs)[3][3], int &_interpolation,
                              int &_borderType, const Scalar &_borderValue, ippiWarpPerspectiveFunc _func, bool *_ok) :
        ParallelLoopBody(), src(_src), dst(_dst), mode(_interpolation), coeffs(_coeffs),
        borderType(_borderType), borderValue(_borderValue), func(_func), ok(_ok)
    {
        *ok = true;
    }

    virtual void operator() (const Range& range) const
    {
        IppiSize srcsize = {src.cols, src.rows};
        IppiRect srcroi = {0, 0, src.cols, src.rows};
        IppiRect dstroi = {0, range.start, dst.cols, range.end - range.start};
        int cnn = src.channels();

        if( borderType == BORDER_CONSTANT )
        {
            IppiSize setSize = {dst.cols, range.end - range.start};
            void *dataPointer = dst.ptr(range.start);
            if( !IPPSet( borderValue, dataPointer, (int)dst.step[0], setSize, cnn, src.depth() ) )
            {
                *ok = false;
                return;
            }
        }

        IppStatus status = func(src.ptr(), srcsize, (int)src.step[0], srcroi, dst.ptr(), (int)dst.step[0], dstroi, coeffs, mode);
        if (status != ippStsNoErr)
            *ok = false;
        else
        {
            CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
        }
    }
private:
    Mat &src;
    Mat &dst;
    int mode;
    double (&coeffs)[3][3];
    int borderType;
    const Scalar borderValue;
    ippiWarpPerspectiveFunc func;
    bool *ok;

    const IPPWarpPerspectiveInvoker& operator= (const IPPWarpPerspectiveInvoker&);
};
#endif
}

void cv::warpPerspective( InputArray _src, OutputArray _dst, InputArray _M0,
                          Size dsize, int flags, int borderType, const Scalar& borderValue )
{
    CV_Assert( _src.total() > 0 );

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_warpTransform(_src, _dst, _M0, dsize, flags, borderType, borderValue,
                                 OCL_OP_PERSPECTIVE))

    Mat src = _src.getMat(), M0 = _M0.getMat();
    _dst.create( dsize.area() == 0 ? src.size() : dsize, src.type() );
    Mat dst = _dst.getMat();

    if( dst.data == src.data )
        src = src.clone();

    double M[9];
    Mat matM(3, 3, CV_64F, M);
    int interpolation = flags & INTER_MAX;
    if( interpolation == INTER_AREA )
        interpolation = INTER_LINEAR;

    CV_Assert( (M0.type() == CV_32F || M0.type() == CV_64F) && M0.rows == 3 && M0.cols == 3 );
    M0.convertTo(matM, matM.type());

#ifdef HAVE_TEGRA_OPTIMIZATION
    if( tegra::useTegra() && tegra::warpPerspective(src, dst, M, flags, borderType, borderValue) )
        return;
#endif

#if defined (HAVE_IPP) && IPP_VERSION_MAJOR * 100 + IPP_VERSION_MINOR >= 801 && 0
    CV_IPP_CHECK()
    {
        int type = src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
        if( (depth == CV_8U || depth == CV_16U || depth == CV_32F) &&
           (cn == 1 || cn == 3 || cn == 4) &&
           ( borderType == cv::BORDER_TRANSPARENT || borderType == cv::BORDER_CONSTANT ) &&
           (interpolation == INTER_NEAREST || interpolation == INTER_LINEAR || interpolation == INTER_CUBIC))
        {
            ippiWarpPerspectiveFunc ippFunc = 0;
            if ((flags & WARP_INVERSE_MAP) != 0)
            {
                ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C1R :
                type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C3R :
                type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_8u_C4R :
                type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C1R :
                type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C3R :
                type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_16u_C4R :
                type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C1R :
                type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C3R :
                type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspectiveBack_32f_C4R : 0;
            }
            else
            {
                ippFunc = type == CV_8UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C1R :
                type == CV_8UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C3R :
                type == CV_8UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_8u_C4R :
                type == CV_16UC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C1R :
                type == CV_16UC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C3R :
                type == CV_16UC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_16u_C4R :
                type == CV_32FC1 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C1R :
                type == CV_32FC3 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C3R :
                type == CV_32FC4 ? (ippiWarpPerspectiveFunc)ippiWarpPerspective_32f_C4R : 0;
            }
            int mode =
            interpolation == INTER_NEAREST ? IPPI_INTER_NN :
            interpolation == INTER_LINEAR ? IPPI_INTER_LINEAR :
            interpolation == INTER_CUBIC ? IPPI_INTER_CUBIC : 0;
            CV_Assert(mode && ippFunc);

            double coeffs[3][3];
            for( int i = 0; i < 3; i++ )
                for( int j = 0; j < 3; j++ )
                    coeffs[i][j] = matM.at<double>(i, j);

            bool ok;
            Range range(0, dst.rows);
            IPPWarpPerspectiveInvoker invoker(src, dst, coeffs, mode, borderType, borderValue, ippFunc, &ok);
            parallel_for_(range, invoker, dst.total()/(double)(1<<16));
            if( ok )
            {
                CV_IMPL_ADD(CV_IMPL_IPP|CV_IMPL_MT);
                return;
            }
            setIppErrorStatus();
        }
    }
#endif

    if( !(flags & WARP_INVERSE_MAP) )
        invert(matM, matM);

    Range range(0, dst.rows);
    WarpPerspectiveInvoker invoker(src, dst, M, interpolation, borderType, borderValue);
    parallel_for_(range, invoker, dst.total()/(double)(1<<16));
}
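
/* Usage sketch (illustrative only, not part of the library; `img` stands for
   an assumed 8-bit BGR input image):

       cv::Point2f srcQuad[4] = { cv::Point2f(0,0), cv::Point2f(639,0),
                                  cv::Point2f(639,479), cv::Point2f(0,479) };
       cv::Point2f dstQuad[4] = { cv::Point2f(40,20), cv::Point2f(600,10),
                                  cv::Point2f(630,470), cv::Point2f(10,450) };
       cv::Mat H = cv::getPerspectiveTransform(srcQuad, dstQuad);
       cv::Mat warped;
       cv::warpPerspective(img, warped, H, img.size(), cv::INTER_LINEAR,
                           cv::BORDER_CONSTANT, cv::Scalar());
*/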

cv::Mat cv::getRotationMatrix2D( Point2f center, double angle, double scale )
{
    angle *= CV_PI/180;
    double alpha = cos(angle)*scale;
    double beta = sin(angle)*scale;

    Mat M(2, 3, CV_64F);
    double* m = M.ptr<double>();

    m[0] = alpha;
    m[1] = beta;
    m[2] = (1-alpha)*center.x - beta*center.y;
    m[3] = -beta;
    m[4] = alpha;
    m[5] = beta*center.x + (1-alpha)*center.y;

    return M;
}
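
/* Usage sketch (illustrative only; `img` is an assumed input image): rotate
   by 30 degrees about the image center at unit scale.

       cv::Point2f c(img.cols*0.5f, img.rows*0.5f);
       cv::Mat R = cv::getRotationMatrix2D(c, 30.0, 1.0);
       cv::Mat rotated;
       cv::warpAffine(img, rotated, R, img.size());
*/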

/* Calculates the coefficients of the perspective transformation
 * that maps (xi,yi) to (ui,vi), (i=1,2,3,4):
 *
 *      c00*xi + c01*yi + c02
 * ui = ---------------------
 *      c20*xi + c21*yi + c22
 *
 *      c10*xi + c11*yi + c12
 * vi = ---------------------
 *      c20*xi + c21*yi + c22
 *
 * The coefficients are calculated by solving the linear system:
 * / x0 y0  1  0  0  0 -x0*u0 -y0*u0 \ /c00\ /u0\
 * | x1 y1  1  0  0  0 -x1*u1 -y1*u1 | |c01| |u1|
 * | x2 y2  1  0  0  0 -x2*u2 -y2*u2 | |c02| |u2|
 * | x3 y3  1  0  0  0 -x3*u3 -y3*u3 |.|c10|=|u3|,
 * |  0  0  0 x0 y0  1 -x0*v0 -y0*v0 | |c11| |v0|
 * |  0  0  0 x1 y1  1 -x1*v1 -y1*v1 | |c12| |v1|
 * |  0  0  0 x2 y2  1 -x2*v2 -y2*v2 | |c20| |v2|
 * \  0  0  0 x3 y3  1 -x3*v3 -y3*v3 / \c21/ \v3/
 *
 * where:
 *   cij - matrix coefficients, c22 = 1
 */
cv::Mat cv::getPerspectiveTransform( const Point2f src[], const Point2f dst[] )
{
    Mat M(3, 3, CV_64F), X(8, 1, CV_64F, M.ptr());
    double a[8][8], b[8];
    Mat A(8, 8, CV_64F, a), B(8, 1, CV_64F, b);

    for( int i = 0; i < 4; ++i )
    {
        a[i][0] = a[i+4][3] = src[i].x;
        a[i][1] = a[i+4][4] = src[i].y;
        a[i][2] = a[i+4][5] = 1;
        a[i][3] = a[i][4] = a[i][5] =
        a[i+4][0] = a[i+4][1] = a[i+4][2] = 0;
        a[i][6] = -src[i].x*dst[i].x;
        a[i][7] = -src[i].y*dst[i].x;
        a[i+4][6] = -src[i].x*dst[i].y;
        a[i+4][7] = -src[i].y*dst[i].y;
        b[i] = dst[i].x;
        b[i+4] = dst[i].y;
    }

    solve( A, B, X, DECOMP_SVD );
    M.ptr<double>()[8] = 1.;

    return M;
}
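
/* Usage sketch (illustrative only): recover the homography mapping one quad
   onto another, then push further points through it with
   cv::perspectiveTransform.

       cv::Point2f s[4] = { cv::Point2f(0,0), cv::Point2f(1,0),
                            cv::Point2f(1,1), cv::Point2f(0,1) };
       cv::Point2f d[4] = { cv::Point2f(0,0), cv::Point2f(2,0),
                            cv::Point2f(2,2), cv::Point2f(0,2) };
       cv::Mat H = cv::getPerspectiveTransform(s, d);   // here: a 2x scaling
       std::vector<cv::Point2f> in(1, cv::Point2f(0.5f, 0.5f)), out;
       cv::perspectiveTransform(in, out, H);            // out[0] == (1, 1)
*/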

/* Calculates the coefficients of the affine transformation
 * that maps (xi,yi) to (ui,vi), (i=1,2,3):
 *
 * ui = c00*xi + c01*yi + c02
 *
 * vi = c10*xi + c11*yi + c12
 *
 * The coefficients are calculated by solving the linear system:
 * / x0 y0  1  0  0  0 \ /c00\ /u0\
 * | x1 y1  1  0  0  0 | |c01| |u1|
 * | x2 y2  1  0  0  0 | |c02| |u2|
 * |  0  0  0 x0 y0  1 | |c10| |v0|
 * |  0  0  0 x1 y1  1 | |c11| |v1|
 * \  0  0  0 x2 y2  1 / |c12| |v2|
 *
 * where:
 *   cij - matrix coefficients
 */

cv::Mat cv::getAffineTransform( const Point2f src[], const Point2f dst[] )
{
    Mat M(2, 3, CV_64F), X(6, 1, CV_64F, M.ptr());
    double a[6*6], b[6];
    Mat A(6, 6, CV_64F, a), B(6, 1, CV_64F, b);

    for( int i = 0; i < 3; i++ )
    {
        int j = i*12;
        int k = i*12+6;
        a[j] = a[k+3] = src[i].x;
        a[j+1] = a[k+4] = src[i].y;
        a[j+2] = a[k+5] = 1;
        a[j+3] = a[j+4] = a[j+5] = 0;
        a[k] = a[k+1] = a[k+2] = 0;
        b[i*2] = dst[i].x;
        b[i*2+1] = dst[i].y;
    }

    solve( A, B, X );
    return M;
}
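
/* Usage sketch (illustrative only): three point correspondences fully
   determine an affine transform.

       cv::Point2f s[3] = { cv::Point2f(0,0), cv::Point2f(1,0), cv::Point2f(0,1) };
       cv::Point2f d[3] = { cv::Point2f(1,1), cv::Point2f(2,1), cv::Point2f(1,2) };
       cv::Mat A = cv::getAffineTransform(s, d);  // a pure translation by (1,1)
*/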

void cv::invertAffineTransform(InputArray _matM, OutputArray __iM)
{
    Mat matM = _matM.getMat();
    CV_Assert(matM.rows == 2 && matM.cols == 3);
    __iM.create(2, 3, matM.type());
    Mat _iM = __iM.getMat();

    if( matM.type() == CV_32F )
    {
        const float* M = matM.ptr<float>();
        float* iM = _iM.ptr<float>();
        int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));

        double D = M[0]*M[step+1] - M[1]*M[step];
        D = D != 0 ? 1./D : 0;
        double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
        double b1 = -A11*M[2] - A12*M[step+2];
        double b2 = -A21*M[2] - A22*M[step+2];

        iM[0] = (float)A11; iM[1] = (float)A12; iM[2] = (float)b1;
        iM[istep] = (float)A21; iM[istep+1] = (float)A22; iM[istep+2] = (float)b2;
    }
    else if( matM.type() == CV_64F )
    {
        const double* M = matM.ptr<double>();
        double* iM = _iM.ptr<double>();
        int step = (int)(matM.step/sizeof(M[0])), istep = (int)(_iM.step/sizeof(iM[0]));

        double D = M[0]*M[step+1] - M[1]*M[step];
        D = D != 0 ? 1./D : 0;
        double A11 = M[step+1]*D, A22 = M[0]*D, A12 = -M[1]*D, A21 = -M[step]*D;
        double b1 = -A11*M[2] - A12*M[step+2];
        double b2 = -A21*M[2] - A22*M[step+2];

        iM[0] = A11; iM[1] = A12; iM[2] = b1;
        iM[istep] = A21; iM[istep+1] = A22; iM[istep+2] = b2;
    }
    else
        CV_Error( CV_StsUnsupportedFormat, "" );
}
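
/* Usage sketch (illustrative only): an affine map composed with its inverse
   is the identity (up to floating-point rounding).

       cv::Mat A = cv::getRotationMatrix2D(cv::Point2f(10, 10), 45.0, 2.0);
       cv::Mat iA;
       cv::invertAffineTransform(A, iA);
       // warping with A and then with iA restores (approximately) the input
*/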

cv::Mat cv::getPerspectiveTransform(InputArray _src, InputArray _dst)
{
    Mat src = _src.getMat(), dst = _dst.getMat();
    CV_Assert(src.checkVector(2, CV_32F) == 4 && dst.checkVector(2, CV_32F) == 4);
    return getPerspectiveTransform((const Point2f*)src.data, (const Point2f*)dst.data);
}

cv::Mat cv::getAffineTransform(InputArray _src, InputArray _dst)
{
    Mat src = _src.getMat(), dst = _dst.getMat();
    CV_Assert(src.checkVector(2, CV_32F) == 3 && dst.checkVector(2, CV_32F) == 3);
    return getAffineTransform((const Point2f*)src.data, (const Point2f*)dst.data);
}

CV_IMPL void
cvResize( const CvArr* srcarr, CvArr* dstarr, int method )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    CV_Assert( src.type() == dst.type() );
    cv::resize( src, dst, dst.size(), (double)dst.cols/src.cols,
        (double)dst.rows/src.rows, method );
}


CV_IMPL void
cvWarpAffine( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
              int flags, CvScalar fillval )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat matrix = cv::cvarrToMat(marr);
    CV_Assert( src.type() == dst.type() );
    cv::warpAffine( src, dst, matrix, dst.size(), flags,
        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
        fillval );
}

CV_IMPL void
cvWarpPerspective( const CvArr* srcarr, CvArr* dstarr, const CvMat* marr,
                   int flags, CvScalar fillval )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);
    cv::Mat matrix = cv::cvarrToMat(marr);
    CV_Assert( src.type() == dst.type() );
    cv::warpPerspective( src, dst, matrix, dst.size(), flags,
        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
        fillval );
}

CV_IMPL void
cvRemap( const CvArr* srcarr, CvArr* dstarr,
         const CvArr* _mapx, const CvArr* _mapy,
         int flags, CvScalar fillval )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr), dst0 = dst;
    cv::Mat mapx = cv::cvarrToMat(_mapx), mapy = cv::cvarrToMat(_mapy);
    CV_Assert( src.type() == dst.type() && dst.size() == mapx.size() );
    cv::remap( src, dst, mapx, mapy, flags & cv::INTER_MAX,
        (flags & CV_WARP_FILL_OUTLIERS) ? cv::BORDER_CONSTANT : cv::BORDER_TRANSPARENT,
        fillval );
    CV_Assert( dst0.data == dst.data );
}


CV_IMPL CvMat*
cv2DRotationMatrix( CvPoint2D32f center, double angle,
                    double scale, CvMat* matrix )
{
    cv::Mat M0 = cv::cvarrToMat(matrix), M = cv::getRotationMatrix2D(center, angle, scale);
    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
}


CV_IMPL CvMat*
cvGetPerspectiveTransform( const CvPoint2D32f* src,
                           const CvPoint2D32f* dst,
                           CvMat* matrix )
{
    cv::Mat M0 = cv::cvarrToMat(matrix),
        M = cv::getPerspectiveTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
}


CV_IMPL CvMat*
cvGetAffineTransform( const CvPoint2D32f* src,
                      const CvPoint2D32f* dst,
                      CvMat* matrix )
{
    cv::Mat M0 = cv::cvarrToMat(matrix),
        M = cv::getAffineTransform((const cv::Point2f*)src, (const cv::Point2f*)dst);
    CV_Assert( M.size() == M0.size() );
    M.convertTo(M0, M0.type());
    return matrix;
}


CV_IMPL void
cvConvertMaps( const CvArr* arr1, const CvArr* arr2, CvArr* dstarr1, CvArr* dstarr2 )
{
    cv::Mat map1 = cv::cvarrToMat(arr1), map2;
    cv::Mat dstmap1 = cv::cvarrToMat(dstarr1), dstmap2;

    if( arr2 )
        map2 = cv::cvarrToMat(arr2);
    if( dstarr2 )
    {
        dstmap2 = cv::cvarrToMat(dstarr2);
        if( dstmap2.type() == CV_16SC1 )
            dstmap2 = cv::Mat(dstmap2.size(), CV_16UC1, dstmap2.ptr(), dstmap2.step);
    }

    cv::convertMaps( map1, map2, dstmap1, dstmap2, dstmap1.type(), false );
}
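
/* Usage sketch (illustrative only; `mapx`/`mapy` are assumed CV_32FC1 maps of
   equal size, `src` an assumed input image): the C++ counterpart
   cv::convertMaps produces the faster fixed-point CV_16SC2 + CV_16UC1
   representation that cv::remap consumes directly.

       cv::Mat map1, map2, dst;
       cv::convertMaps(mapx, mapy, map1, map2, CV_16SC2);
       cv::remap(src, dst, map1, map2, cv::INTER_LINEAR);
*/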

/****************************************************************************************\
*                                   Log-Polar Transform                                  *
\****************************************************************************************/

/* now it is done via remap; a more correct implementation would use
   some super-sampling technique outside of the "fovea" circle */
CV_IMPL void
cvLogPolar( const CvArr* srcarr, CvArr* dstarr,
            CvPoint2D32f center, double M, int flags )
{
    cv::Ptr<CvMat> mapx, mapy;

    CvMat srcstub, *src = cvGetMat(srcarr, &srcstub);
    CvMat dststub, *dst = cvGetMat(dstarr, &dststub);
    CvSize ssize, dsize;

    if( !CV_ARE_TYPES_EQ( src, dst ))
        CV_Error( CV_StsUnmatchedFormats, "" );

    if( M <= 0 )
        CV_Error( CV_StsOutOfRange, "M should be >0" );

    ssize = cvGetMatSize(src);
    dsize = cvGetMatSize(dst);

    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));

    if( !(flags & CV_WARP_INVERSE_MAP) )
    {
        int phi, rho;
        cv::AutoBuffer<double> _exp_tab(dsize.width);
        double* exp_tab = _exp_tab;

        for( rho = 0; rho < dsize.width; rho++ )
            exp_tab[rho] = std::exp(rho/M);

        for( phi = 0; phi < dsize.height; phi++ )
        {
            double cp = cos(phi*2*CV_PI/dsize.height);
            double sp = sin(phi*2*CV_PI/dsize.height);
            float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
            float* my = (float*)(mapy->data.ptr + phi*mapy->step);

            for( rho = 0; rho < dsize.width; rho++ )
            {
                double r = exp_tab[rho];
                double x = r*cp + center.x;
                double y = r*sp + center.y;

                mx[rho] = (float)x;
                my[rho] = (float)y;
            }
        }
    }
    else
    {
        int x, y;
        CvMat bufx, bufy, bufp, bufa;
        double ascale = ssize.height/(2*CV_PI);
        cv::AutoBuffer<float> _buf(4*dsize.width);
        float* buf = _buf;

        bufx = cvMat( 1, dsize.width, CV_32F, buf );
        bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
        bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
        bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );

        for( x = 0; x < dsize.width; x++ )
            bufx.data.fl[x] = (float)x - center.x;

        for( y = 0; y < dsize.height; y++ )
        {
            float* mx = (float*)(mapx->data.ptr + y*mapx->step);
            float* my = (float*)(mapy->data.ptr + y*mapy->step);

            for( x = 0; x < dsize.width; x++ )
                bufy.data.fl[x] = (float)y - center.y;

#if 1
            cvCartToPolar( &bufx, &bufy, &bufp, &bufa );

            for( x = 0; x < dsize.width; x++ )
                bufp.data.fl[x] += 1.f;

            cvLog( &bufp, &bufp );

            for( x = 0; x < dsize.width; x++ )
            {
                double rho = bufp.data.fl[x]*M;
                double phi = bufa.data.fl[x]*ascale;

                mx[x] = (float)rho;
                my[x] = (float)phi;
            }
#else
            for( x = 0; x < dsize.width; x++ )
            {
                double xx = bufx.data.fl[x];
                double yy = bufy.data.fl[x];

                double p = log(std::sqrt(xx*xx + yy*yy) + 1.)*M;
                double a = atan2(yy,xx);
                if( a < 0 )
                    a = 2*CV_PI + a;
                a *= ascale;

                mx[x] = (float)p;
                my[x] = (float)a;
            }
#endif
        }
    }

    cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
}

void cv::logPolar( InputArray _src, OutputArray _dst,
                   Point2f center, double M, int flags )
{
    Mat src = _src.getMat();
    _dst.create( src.size(), src.type() );
    CvMat c_src = src, c_dst = _dst.getMat();
    cvLogPolar( &c_src, &c_dst, center, M, flags );
}
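
/* Usage sketch (illustrative only; `img` is an assumed input image): remap
   into log-polar space about the image center; M scales the radial axis.

       cv::Mat lp;
       cv::logPolar(img, lp, cv::Point2f(img.cols*0.5f, img.rows*0.5f),
                    40.0, cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
*/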

/****************************************************************************************
                                   Linear-Polar Transform
  J.L. Blanco, Apr 2009
 ****************************************************************************************/
CV_IMPL
void cvLinearPolar( const CvArr* srcarr, CvArr* dstarr,
                    CvPoint2D32f center, double maxRadius, int flags )
{
    cv::Ptr<CvMat> mapx, mapy;

    CvMat srcstub, *src = (CvMat*)srcarr;
    CvMat dststub, *dst = (CvMat*)dstarr;
    CvSize ssize, dsize;

    src = cvGetMat( srcarr, &srcstub, 0, 0 );
    dst = cvGetMat( dstarr, &dststub, 0, 0 );

    if( !CV_ARE_TYPES_EQ( src, dst ))
        CV_Error( CV_StsUnmatchedFormats, "" );

    ssize.width = src->cols;
    ssize.height = src->rows;
    dsize.width = dst->cols;
    dsize.height = dst->rows;

    mapx.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));
    mapy.reset(cvCreateMat( dsize.height, dsize.width, CV_32F ));

    if( !(flags & CV_WARP_INVERSE_MAP) )
    {
        int phi, rho;

        for( phi = 0; phi < dsize.height; phi++ )
        {
            double cp = cos(phi*2*CV_PI/dsize.height);
            double sp = sin(phi*2*CV_PI/dsize.height);
            float* mx = (float*)(mapx->data.ptr + phi*mapx->step);
            float* my = (float*)(mapy->data.ptr + phi*mapy->step);

            for( rho = 0; rho < dsize.width; rho++ )
            {
                double r = maxRadius*(rho+1)/dsize.width;
                double x = r*cp + center.x;
                double y = r*sp + center.y;

                mx[rho] = (float)x;
                my[rho] = (float)y;
            }
        }
    }
    else
    {
        int x, y;
        CvMat bufx, bufy, bufp, bufa;
        const double ascale = ssize.height/(2*CV_PI);
        const double pscale = ssize.width/maxRadius;

        cv::AutoBuffer<float> _buf(4*dsize.width);
        float* buf = _buf;

        bufx = cvMat( 1, dsize.width, CV_32F, buf );
        bufy = cvMat( 1, dsize.width, CV_32F, buf + dsize.width );
        bufp = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*2 );
        bufa = cvMat( 1, dsize.width, CV_32F, buf + dsize.width*3 );

        for( x = 0; x < dsize.width; x++ )
            bufx.data.fl[x] = (float)x - center.x;

        for( y = 0; y < dsize.height; y++ )
        {
            float* mx = (float*)(mapx->data.ptr + y*mapx->step);
            float* my = (float*)(mapy->data.ptr + y*mapy->step);

            for( x = 0; x < dsize.width; x++ )
                bufy.data.fl[x] = (float)y - center.y;

            cvCartToPolar( &bufx, &bufy, &bufp, &bufa, 0 );

            for( x = 0; x < dsize.width; x++ )
                bufp.data.fl[x] += 1.f;

            for( x = 0; x < dsize.width; x++ )
            {
                double rho = bufp.data.fl[x]*pscale;
                double phi = bufa.data.fl[x]*ascale;
                mx[x] = (float)rho;
                my[x] = (float)phi;
            }
        }
    }

    cvRemap( src, dst, mapx, mapy, flags, cvScalarAll(0) );
}

void cv::linearPolar( InputArray _src, OutputArray _dst,
                      Point2f center, double maxRadius, int flags )
{
    Mat src = _src.getMat();
    _dst.create( src.size(), src.type() );
    CvMat c_src = src, c_dst = _dst.getMat();
    cvLinearPolar( &c_src, &c_dst, center, maxRadius, flags );
}
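
/* Usage sketch (illustrative only; `img` is an assumed input image):
   linear-polar unwrapping, e.g. turning concentric rings into horizontal
   lines.

       cv::Mat pol;
       cv::linearPolar(img, pol, cv::Point2f(img.cols*0.5f, img.rows*0.5f),
                       img.cols*0.5, cv::INTER_LINEAR | cv::WARP_FILL_OUTLIERS);
*/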

/* End of file. */