      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                           License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2014-2015, Itseez Inc., all rights reserved.
     16 // Third party copyrights are property of their respective owners.
     17 //
     18 // Redistribution and use in source and binary forms, with or without modification,
     19 // are permitted provided that the following conditions are met:
     20 //
     21 //   * Redistribution's of source code must retain the above copyright notice,
     22 //     this list of conditions and the following disclaimer.
     23 //
     24 //   * Redistribution's in binary form must reproduce the above copyright notice,
     25 //     this list of conditions and the following disclaimer in the documentation
     26 //     and/or other materials provided with the distribution.
     27 //
     28 //   * The name of the copyright holders may not be used to endorse or promote products
     29 //     derived from this software without specific prior written permission.
     30 //
     31 // This software is provided by the copyright holders and contributors "as is" and
     32 // any express or implied warranties, including, but not limited to, the implied
     33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     34 // In no event shall the Intel Corporation or contributors be liable for any direct,
     35 // indirect, incidental, special, exemplary, or consequential damages
     36 // (including, but not limited to, procurement of substitute goods or services;
     37 // loss of use, data, or profits; or business interruption) however caused
     38 // and on any theory of liability, whether in contract, strict liability,
     39 // or tort (including negligence or otherwise) arising in any way out of
     40 // the use of this software, even if advised of the possibility of such damage.
     41 //
     42 //M*/
     43 
     44 #include "precomp.hpp"
     45 #include "opencl_kernels_imgproc.hpp"
     46 
     47 namespace cv
     48 {
     49 
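// The pyramid functions in this file use the separable 5-tap binomial ("Gaussian")
// kernel (1 4 6 4 1)/16 in each direction. pyrDown_ accumulates unnormalized sums
// and divides by 16*16 = 256 at the end (rounding delta 128, shift 8), while
// pyrUp_ spreads each source pixel over two output rows/columns and divides by
// 8*8 = 64 (delta 32, shift 6). The helpers below implement the final
// normalization/cast and the optional SIMD fast paths.
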
     50 template<typename T, int shift> struct FixPtCast
     51 {
     52     typedef int type1;
     53     typedef T rtype;
     54     rtype operator ()(type1 arg) const { return (T)((arg + (1 << (shift-1))) >> shift); }
     55 };
     56 
     57 template<typename T, int shift> struct FltCast
     58 {
     59     typedef T type1;
     60     typedef T rtype;
     61     rtype operator ()(type1 arg) const { return arg*(T)(1./(1 << shift)); }
     62 };
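// The two cast functors above convert accumulated sums back to the output type:
// FixPtCast does a rounded right shift, (arg + (1 << (shift-1))) >> shift, e.g.
// FixPtCast<uchar, 8>()(640) == (640 + 128) >> 8 == 3, i.e. 640/256 rounded to
// nearest, while FltCast simply scales by 1/(1 << shift) in floating point.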
     63 
     64 template<typename T1, typename T2> struct PyrDownNoVec
     65 {
     66     int operator()(T1**, T2*, int, int) const { return 0; }
     67 };
     68 
     69 template<typename T1, typename T2> struct PyrUpNoVec
     70 {
     71     int operator()(T1**, T2**, int, int) const { return 0; }
     72 };
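// The *NoVec functors are the scalar fallbacks: they report 0 pixels processed,
// so the plain C++ loops in pyrDown_/pyrUp_ handle the whole row. The SIMD
// specializations below return how many leading pixels they already wrote and
// the scalar loop finishes the remainder.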
     73 
     74 #if CV_SSE2
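// SSE2/SSE4.1 implementations of the vectorized vertical pass. Each functor
// receives pointers to 5 (pyrDown) or 3 (pyrUp) rows of horizontally filtered
// intermediate sums, combines them with the binomial weights, writes normalized
// results and returns the number of pixels handled; the runtime
// checkHardwareSupport() guards keep the scalar path available on older CPUs.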
     75 
     76 struct PyrDownVec_32s8u
     77 {
     78     int operator()(int** src, uchar* dst, int, int width) const
     79     {
     80         if( !checkHardwareSupport(CV_CPU_SSE2) )
     81             return 0;
     82 
     83         int x = 0;
     84         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
     85         __m128i delta = _mm_set1_epi16(128);
     86 
     87         for( ; x <= width - 16; x += 16 )
     88         {
     89             __m128i r0, r1, r2, r3, r4, t0, t1;
     90             r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)),
     91                                  _mm_load_si128((const __m128i*)(row0 + x + 4)));
     92             r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)),
     93                                  _mm_load_si128((const __m128i*)(row1 + x + 4)));
     94             r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)),
     95                                  _mm_load_si128((const __m128i*)(row2 + x + 4)));
     96             r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)),
     97                                  _mm_load_si128((const __m128i*)(row3 + x + 4)));
     98             r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)),
     99                                  _mm_load_si128((const __m128i*)(row4 + x + 4)));
    100             r0 = _mm_add_epi16(r0, r4);
    101             r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
    102             r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
    103             t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
    104             r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)),
    105                                  _mm_load_si128((const __m128i*)(row0 + x + 12)));
    106             r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)),
    107                                  _mm_load_si128((const __m128i*)(row1 + x + 12)));
    108             r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)),
    109                                  _mm_load_si128((const __m128i*)(row2 + x + 12)));
    110             r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)),
    111                                  _mm_load_si128((const __m128i*)(row3 + x + 12)));
    112             r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)),
    113                                  _mm_load_si128((const __m128i*)(row4 + x + 12)));
    114             r0 = _mm_add_epi16(r0, r4);
    115             r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
    116             r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
    117             t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
    118             t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8);
    119             t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8);
    120             _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1));
    121         }
    122 
    123         for( ; x <= width - 4; x += 4 )
    124         {
    125             __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128();
    126             r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z);
    127             r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z);
    128             r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z);
    129             r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z);
    130             r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z);
    131             r0 = _mm_add_epi16(r0, r4);
    132             r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2);
    133             r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2));
    134             r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2));
    135             r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8);
    136             *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0));
    137         }
    138 
    139         return x;
    140     }
    141 };
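// Overflow note for PyrDownVec_32s8u above: with 8-bit input each horizontally
// filtered value is at most 255*16 = 4080, so it packs into a signed 16-bit lane
// without saturation, and the weighted vertical sum plus the rounding delta is
// at most 255*256 + 128 = 65408. That exceeds the signed 16-bit range but still
// fits an unsigned 16-bit lane, so the wrap-around 16-bit additions followed by
// the logical shift (_mm_srli_epi16) produce the exact result.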
    142 
    143 struct PyrDownVec_32f
    144 {
    145     int operator()(float** src, float* dst, int, int width) const
    146     {
    147         if( !checkHardwareSupport(CV_CPU_SSE) )
    148             return 0;
    149 
    150         int x = 0;
    151         const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
    152         __m128 _4 = _mm_set1_ps(4.f), _scale = _mm_set1_ps(1.f/256);
    153         for( ; x <= width - 8; x += 8 )
    154         {
    155             __m128 r0, r1, r2, r3, r4, t0, t1;
    156             r0 = _mm_load_ps(row0 + x);
    157             r1 = _mm_load_ps(row1 + x);
    158             r2 = _mm_load_ps(row2 + x);
    159             r3 = _mm_load_ps(row3 + x);
    160             r4 = _mm_load_ps(row4 + x);
    161             r0 = _mm_add_ps(r0, r4);
    162             r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
    163             r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
    164             t0 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));
    165 
    166             r0 = _mm_load_ps(row0 + x + 4);
    167             r1 = _mm_load_ps(row1 + x + 4);
    168             r2 = _mm_load_ps(row2 + x + 4);
    169             r3 = _mm_load_ps(row3 + x + 4);
    170             r4 = _mm_load_ps(row4 + x + 4);
    171             r0 = _mm_add_ps(r0, r4);
    172             r1 = _mm_add_ps(_mm_add_ps(r1, r3), r2);
    173             r0 = _mm_add_ps(r0, _mm_add_ps(r2, r2));
    174             t1 = _mm_add_ps(r0, _mm_mul_ps(r1, _4));
    175 
    176             t0 = _mm_mul_ps(t0, _scale);
    177             t1 = _mm_mul_ps(t1, _scale);
    178 
    179             _mm_storeu_ps(dst + x, t0);
    180             _mm_storeu_ps(dst + x + 4, t1);
    181         }
    182 
    183         return x;
    184     }
    185 };
    186 
    187 #if CV_SSE4_1
    188 
    189 struct PyrDownVec_32s16u
    190 {
    191     PyrDownVec_32s16u()
    192     {
    193         haveSSE = checkHardwareSupport(CV_CPU_SSE4_1);
    194     }
    195 
    196     int operator()(int** src, ushort* dst, int, int width) const
    197     {
    198         int x = 0;
    199 
    200         if (!haveSSE)
    201             return x;
    202 
    203         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
    204         __m128i v_delta = _mm_set1_epi32(128);
    205 
    206         for( ; x <= width - 8; x += 8 )
    207         {
    208             __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
    209                     v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
    210             __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
    211                     v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
    212             __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
    213                     v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
    214             __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
    215                     v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
    216             __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
    217                     v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
    218 
    219             v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
    220             v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
    221 
    222             v_r10 = _mm_slli_epi32(v_r10, 2);
    223             __m128i v_dst0 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
    224 
    225             v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
    226             v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
    227             v_r11 = _mm_slli_epi32(v_r11, 2);
    228             __m128i v_dst1 = _mm_srli_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
    229 
    230             _mm_storeu_si128((__m128i *)(dst + x), _mm_packus_epi32(v_dst0, v_dst1));
    231         }
    232 
    233         return x;
    234     }
    235 
    236     bool haveSSE;
    237 };
    238 
    239 #else
    240 
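// Without SSE4.1 there is no _mm_packus_epi32 (pack to unsigned 16-bit with
// saturation), so the unsigned 16-bit output case falls back to the scalar path.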
    241 typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
    242 
    243 #endif // CV_SSE4_1
    244 
    245 struct PyrDownVec_32s16s
    246 {
    247     PyrDownVec_32s16s()
    248     {
    249         haveSSE = checkHardwareSupport(CV_CPU_SSE2);
    250     }
    251 
    252     int operator()(int** src, short* dst, int, int width) const
    253     {
    254         int x = 0;
    255 
    256         if (!haveSSE)
    257             return x;
    258 
    259         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
    260         __m128i v_delta = _mm_set1_epi32(128);
    261 
    262         for( ; x <= width - 8; x += 8 )
    263         {
    264             __m128i v_r00 = _mm_loadu_si128((__m128i const *)(row0 + x)),
    265                     v_r01 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
    266             __m128i v_r10 = _mm_loadu_si128((__m128i const *)(row1 + x)),
    267                     v_r11 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
    268             __m128i v_r20 = _mm_loadu_si128((__m128i const *)(row2 + x)),
    269                     v_r21 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
    270             __m128i v_r30 = _mm_loadu_si128((__m128i const *)(row3 + x)),
    271                     v_r31 = _mm_loadu_si128((__m128i const *)(row3 + x + 4));
    272             __m128i v_r40 = _mm_loadu_si128((__m128i const *)(row4 + x)),
    273                     v_r41 = _mm_loadu_si128((__m128i const *)(row4 + x + 4));
    274 
    275             v_r00 = _mm_add_epi32(_mm_add_epi32(v_r00, v_r40), _mm_add_epi32(v_r20, v_r20));
    276             v_r10 = _mm_add_epi32(_mm_add_epi32(v_r10, v_r20), v_r30);
    277 
    278             v_r10 = _mm_slli_epi32(v_r10, 2);
    279             __m128i v_dst0 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r00, v_r10), v_delta), 8);
    280 
    281             v_r01 = _mm_add_epi32(_mm_add_epi32(v_r01, v_r41), _mm_add_epi32(v_r21, v_r21));
    282             v_r11 = _mm_add_epi32(_mm_add_epi32(v_r11, v_r21), v_r31);
    283             v_r11 = _mm_slli_epi32(v_r11, 2);
    284             __m128i v_dst1 = _mm_srai_epi32(_mm_add_epi32(_mm_add_epi32(v_r01, v_r11), v_delta), 8);
    285 
    286             _mm_storeu_si128((__m128i *)(dst + x), _mm_packs_epi32(v_dst0, v_dst1));
    287         }
    288 
    289         return x;
    290     }
    291 
    292     bool haveSSE;
    293 };
    294 
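// The PyrUpVec_* functors below write two destination rows per call: dst[0]
// receives the 1-6-1 vertical combination of the three buffered rows and dst[1]
// receives 4*(row1 + row2); both are normalized by adding 32 and shifting right
// by 6, i.e. dividing the doubly filtered sum by 64.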
    295 struct PyrUpVec_32s8u
    296 {
    297     int operator()(int** src, uchar** dst, int, int width) const
    298     {
    299         int x = 0;
    300 
    301         if (!checkHardwareSupport(CV_CPU_SSE2))
    302             return x;
    303 
    304         uchar *dst0 = dst[0], *dst1 = dst[1];
    305         const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
    306         __m128i v_delta = _mm_set1_epi16(32), v_zero = _mm_setzero_si128();
    307 
    308         for( ; x <= width - 16; x += 16 )
    309         {
    310             __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
    311                                            _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
    312             __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
    313                                            _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
    314             __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
    315                                            _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
    316 
    317             __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
    318             __m128i v_dst00 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
    319             __m128i v_dst10 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
    320 
    321             v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x + 8)),
    322                                    _mm_loadu_si128((__m128i const *)(row0 + x + 12)));
    323             v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x + 8)),
    324                                    _mm_loadu_si128((__m128i const *)(row1 + x + 12)));
    325             v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x + 8)),
    326                                    _mm_loadu_si128((__m128i const *)(row2 + x + 12)));
    327 
    328             v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
    329             __m128i v_dst01 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
    330             __m128i v_dst11 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
    331 
    332             _mm_storeu_si128((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst00, v_delta), 6),
    333                                                                      _mm_srli_epi16(_mm_adds_epu16(v_dst01, v_delta), 6)));
    334             _mm_storeu_si128((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst10, v_delta), 6),
    335                                                                      _mm_srli_epi16(_mm_adds_epu16(v_dst11, v_delta), 6)));
    336         }
    337 
    338         for( ; x <= width - 8; x += 8 )
    339         {
    340             __m128i v_r0 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row0 + x)),
    341                                            _mm_loadu_si128((__m128i const *)(row0 + x + 4)));
    342             __m128i v_r1 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row1 + x)),
    343                                            _mm_loadu_si128((__m128i const *)(row1 + x + 4)));
    344             __m128i v_r2 = _mm_packs_epi32(_mm_loadu_si128((__m128i const *)(row2 + x)),
    345                                            _mm_loadu_si128((__m128i const *)(row2 + x + 4)));
    346 
    347             __m128i v_2r1 = _mm_adds_epu16(v_r1, v_r1), v_4r1 = _mm_adds_epu16(v_2r1, v_2r1);
    348             __m128i v_dst0 = _mm_adds_epu16(_mm_adds_epu16(v_r0, v_r2), _mm_adds_epu16(v_2r1, v_4r1));
    349             __m128i v_dst1 = _mm_slli_epi16(_mm_adds_epu16(v_r1, v_r2), 2);
    350 
    351             _mm_storel_epi64((__m128i *)(dst0 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst0, v_delta), 6), v_zero));
    352             _mm_storel_epi64((__m128i *)(dst1 + x), _mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(v_dst1, v_delta), 6), v_zero));
    353         }
    354 
    355         return x;
    356     }
    357 };
    358 
    359 struct PyrUpVec_32s16s
    360 {
    361     int operator()(int** src, short** dst, int, int width) const
    362     {
    363         int x = 0;
    364 
    365         if (!checkHardwareSupport(CV_CPU_SSE2))
    366             return x;
    367 
    368         short *dst0 = dst[0], *dst1 = dst[1];
    369         const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
    370         __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
    371 
    372         for( ; x <= width - 8; x += 8 )
    373         {
    374             __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
    375                     v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
    376                     v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
    377             __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
    378             __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
    379             __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
    380 
    381             v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
    382             v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
    383             v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
    384             v_2r1 = _mm_slli_epi32(v_r1, 1);
    385             v_4r1 = _mm_slli_epi32(v_r1, 2);
    386             __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
    387             __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
    388 
    389             _mm_storeu_si128((__m128i *)(dst0 + x),
    390                 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
    391                                 _mm_srai_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
    392             _mm_storeu_si128((__m128i *)(dst1 + x),
    393                 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
    394                                 _mm_srai_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
    395         }
    396 
    397         for( ; x <= width - 4; x += 4 )
    398         {
    399             __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
    400                     v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
    401                     v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
    402             __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
    403 
    404             __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
    405             __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
    406 
    407             _mm_storel_epi64((__m128i *)(dst0 + x),
    408                 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
    409             _mm_storel_epi64((__m128i *)(dst1 + x),
    410                 _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
    411         }
    412 
    413         return x;
    414     }
    415 };
    416 
    417 #if CV_SSE4_1
    418 
    419 struct PyrUpVec_32s16u
    420 {
    421     int operator()(int** src, ushort** dst, int, int width) const
    422     {
    423         int x = 0;
    424 
    425         if (!checkHardwareSupport(CV_CPU_SSE4_1))
    426             return x;
    427 
    428         ushort *dst0 = dst[0], *dst1 = dst[1];
    429         const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
    430         __m128i v_delta = _mm_set1_epi32(32), v_zero = _mm_setzero_si128();
    431 
    432         for( ; x <= width - 8; x += 8 )
    433         {
    434             __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
    435                     v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
    436                     v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
    437             __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
    438             __m128i v_dst00 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
    439             __m128i v_dst10 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
    440 
    441             v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x + 4));
    442             v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x + 4));
    443             v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x + 4));
    444             v_2r1 = _mm_slli_epi32(v_r1, 1);
    445             v_4r1 = _mm_slli_epi32(v_r1, 2);
    446             __m128i v_dst01 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
    447             __m128i v_dst11 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
    448 
    449             _mm_storeu_si128((__m128i *)(dst0 + x),
    450                 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst00, v_delta), 6),
    451                                  _mm_srli_epi32(_mm_add_epi32(v_dst01, v_delta), 6)));
    452             _mm_storeu_si128((__m128i *)(dst1 + x),
    453                 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst10, v_delta), 6),
    454                                  _mm_srli_epi32(_mm_add_epi32(v_dst11, v_delta), 6)));
    455         }
    456 
    457         for( ; x <= width - 4; x += 4 )
    458         {
    459             __m128i v_r0 = _mm_loadu_si128((__m128i const *)(row0 + x)),
    460                     v_r1 = _mm_loadu_si128((__m128i const *)(row1 + x)),
    461                     v_r2 = _mm_loadu_si128((__m128i const *)(row2 + x));
    462             __m128i v_2r1 = _mm_slli_epi32(v_r1, 1), v_4r1 = _mm_slli_epi32(v_r1, 2);
    463 
    464             __m128i v_dst0 = _mm_add_epi32(_mm_add_epi32(v_r0, v_r2), _mm_add_epi32(v_2r1, v_4r1));
    465             __m128i v_dst1 = _mm_slli_epi32(_mm_add_epi32(v_r1, v_r2), 2);
    466 
    467             _mm_storel_epi64((__m128i *)(dst0 + x),
    468                 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst0, v_delta), 6), v_zero));
    469             _mm_storel_epi64((__m128i *)(dst1 + x),
    470                 _mm_packus_epi32(_mm_srli_epi32(_mm_add_epi32(v_dst1, v_delta), 6), v_zero));
    471         }
    472 
    473         return x;
    474     }
    475 };
    476 
    477 #else
    478 
    479 typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
    480 
    481 #endif // CV_SSE4_1
    482 
    483 struct PyrUpVec_32f
    484 {
    485     int operator()(float** src, float** dst, int, int width) const
    486     {
    487         int x = 0;
    488 
    489         if (!checkHardwareSupport(CV_CPU_SSE2))
    490             return x;
    491 
    492         const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
    493         float *dst0 = dst[0], *dst1 = dst[1];
    494         __m128 v_6 = _mm_set1_ps(6.0f), v_scale = _mm_set1_ps(1.f/64.0f),
    495                v_scale4 = _mm_mul_ps(v_scale, _mm_set1_ps(4.0f));
    496 
    497         for( ; x <= width - 8; x += 8 )
    498         {
    499             __m128 v_r0 = _mm_loadu_ps(row0 + x);
    500             __m128 v_r1 = _mm_loadu_ps(row1 + x);
    501             __m128 v_r2 = _mm_loadu_ps(row2 + x);
    502 
    503             _mm_storeu_ps(dst1 + x, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
    504             _mm_storeu_ps(dst0 + x, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
    505 
    506             v_r0 = _mm_loadu_ps(row0 + x + 4);
    507             v_r1 = _mm_loadu_ps(row1 + x + 4);
    508             v_r2 = _mm_loadu_ps(row2 + x + 4);
    509 
    510             _mm_storeu_ps(dst1 + x + 4, _mm_mul_ps(v_scale4, _mm_add_ps(v_r1, v_r2)));
    511             _mm_storeu_ps(dst0 + x + 4, _mm_mul_ps(v_scale, _mm_add_ps(_mm_add_ps(v_r0, _mm_mul_ps(v_6, v_r1)), v_r2)));
    512         }
    513 
    514         return x;
    515     }
    516 };
    517 
    518 #elif CV_NEON
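// NEON counterparts of the SSE implementations above; the arithmetic and row
// conventions are identical, only the intrinsics differ.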
    519 
    520 struct PyrDownVec_32s8u
    521 {
    522     int operator()(int** src, uchar* dst, int, int width) const
    523     {
    524         int x = 0;
    525         const unsigned int *row0 = (unsigned int*)src[0], *row1 = (unsigned int*)src[1],
    526                            *row2 = (unsigned int*)src[2], *row3 = (unsigned int*)src[3],
    527                            *row4 = (unsigned int*)src[4];
    528         uint16x8_t v_delta = vdupq_n_u16(128);
    529 
    530         for( ; x <= width - 16; x += 16 )
    531         {
    532             uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
    533             uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
    534             uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
    535             uint16x8_t v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x)), vqmovn_u32(vld1q_u32(row3 + x + 4)));
    536             uint16x8_t v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x)), vqmovn_u32(vld1q_u32(row4 + x + 4)));
    537 
    538             v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
    539             v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
    540             uint16x8_t v_dst0 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
    541 
    542             v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
    543             v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
    544             v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
    545             v_r3 = vcombine_u16(vqmovn_u32(vld1q_u32(row3 + x + 8)), vqmovn_u32(vld1q_u32(row3 + x + 12)));
    546             v_r4 = vcombine_u16(vqmovn_u32(vld1q_u32(row4 + x + 8)), vqmovn_u32(vld1q_u32(row4 + x + 12)));
    547 
    548             v_r0 = vaddq_u16(vaddq_u16(v_r0, v_r4), vaddq_u16(v_r2, v_r2));
    549             v_r1 = vaddq_u16(vaddq_u16(v_r1, v_r2), v_r3);
    550             uint16x8_t v_dst1 = vaddq_u16(v_r0, vshlq_n_u16(v_r1, 2));
    551 
    552             vst1q_u8(dst + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 8)),
    553                                           vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 8))));
    554         }
    555 
    556         return x;
    557     }
    558 };
    559 
    560 struct PyrDownVec_32s16u
    561 {
    562     int operator()(int** src, ushort* dst, int, int width) const
    563     {
    564         int x = 0;
    565         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
    566         int32x4_t v_delta = vdupq_n_s32(128);
    567 
    568         for( ; x <= width - 8; x += 8 )
    569         {
    570             int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
    571             int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
    572             int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
    573             int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
    574             int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
    575 
    576             v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
    577             v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
    578 
    579             v_r10 = vshlq_n_s32(v_r10, 2);
    580             int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
    581 
    582             v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
    583             v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
    584             v_r11 = vshlq_n_s32(v_r11, 2);
    585             int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
    586 
    587             vst1q_u16(dst + x, vcombine_u16(vqmovun_s32(v_dst0), vqmovun_s32(v_dst1)));
    588         }
    589 
    590         return x;
    591     }
    592 };
    593 
    594 struct PyrDownVec_32s16s
    595 {
    596     int operator()(int** src, short* dst, int, int width) const
    597     {
    598         int x = 0;
    599         const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
    600         int32x4_t v_delta = vdupq_n_s32(128);
    601 
    602         for( ; x <= width - 8; x += 8 )
    603         {
    604             int32x4_t v_r00 = vld1q_s32(row0 + x), v_r01 = vld1q_s32(row0 + x + 4);
    605             int32x4_t v_r10 = vld1q_s32(row1 + x), v_r11 = vld1q_s32(row1 + x + 4);
    606             int32x4_t v_r20 = vld1q_s32(row2 + x), v_r21 = vld1q_s32(row2 + x + 4);
    607             int32x4_t v_r30 = vld1q_s32(row3 + x), v_r31 = vld1q_s32(row3 + x + 4);
    608             int32x4_t v_r40 = vld1q_s32(row4 + x), v_r41 = vld1q_s32(row4 + x + 4);
    609 
    610             v_r00 = vaddq_s32(vaddq_s32(v_r00, v_r40), vaddq_s32(v_r20, v_r20));
    611             v_r10 = vaddq_s32(vaddq_s32(v_r10, v_r20), v_r30);
    612             v_r10 = vshlq_n_s32(v_r10, 2);
    613             int32x4_t v_dst0 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r00, v_r10), v_delta), 8);
    614 
    615             v_r01 = vaddq_s32(vaddq_s32(v_r01, v_r41), vaddq_s32(v_r21, v_r21));
    616             v_r11 = vaddq_s32(vaddq_s32(v_r11, v_r21), v_r31);
    617             v_r11 = vshlq_n_s32(v_r11, 2);
    618             int32x4_t v_dst1 = vshrq_n_s32(vaddq_s32(vaddq_s32(v_r01, v_r11), v_delta), 8);
    619 
    620             vst1q_s16(dst + x, vcombine_s16(vqmovn_s32(v_dst0), vqmovn_s32(v_dst1)));
    621         }
    622 
    623         return x;
    624     }
    625 };
    626 
    627 struct PyrDownVec_32f
    628 {
    629     int operator()(float** src, float* dst, int, int width) const
    630     {
    631         int x = 0;
    632         const float *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4];
    633         float32x4_t v_4 = vdupq_n_f32(4.0f), v_scale = vdupq_n_f32(1.f/256.0f);
    634 
    635         for( ; x <= width - 8; x += 8 )
    636         {
    637             float32x4_t v_r0 = vld1q_f32(row0 + x);
    638             float32x4_t v_r1 = vld1q_f32(row1 + x);
    639             float32x4_t v_r2 = vld1q_f32(row2 + x);
    640             float32x4_t v_r3 = vld1q_f32(row3 + x);
    641             float32x4_t v_r4 = vld1q_f32(row4 + x);
    642 
    643             v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
    644             v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
    645             vst1q_f32(dst + x, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
    646 
    647             v_r0 = vld1q_f32(row0 + x + 4);
    648             v_r1 = vld1q_f32(row1 + x + 4);
    649             v_r2 = vld1q_f32(row2 + x + 4);
    650             v_r3 = vld1q_f32(row3 + x + 4);
    651             v_r4 = vld1q_f32(row4 + x + 4);
    652 
    653             v_r0 = vaddq_f32(vaddq_f32(v_r0, v_r4), vaddq_f32(v_r2, v_r2));
    654             v_r1 = vaddq_f32(vaddq_f32(v_r1, v_r2), v_r3);
    655             vst1q_f32(dst + x + 4, vmulq_f32(vmlaq_f32(v_r0, v_4, v_r1), v_scale));
    656         }
    657 
    658         return x;
    659     }
    660 };
    661 
    662 struct PyrUpVec_32s8u
    663 {
    664     int operator()(int** src, uchar** dst, int, int width) const
    665     {
    666         int x = 0;
    667         uchar *dst0 = dst[0], *dst1 = dst[1];
    668         const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
    669         uint16x8_t v_delta = vdupq_n_u16(32);
    670 
    671         for( ; x <= width - 16; x += 16 )
    672         {
    673             uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
    674             uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
    675             uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
    676 
    677             uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
    678             uint16x8_t v_dst00 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
    679             uint16x8_t v_dst10 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
    680 
    681             v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x + 8)), vqmovn_u32(vld1q_u32(row0 + x + 12)));
    682             v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x + 8)), vqmovn_u32(vld1q_u32(row1 + x + 12)));
    683             v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x + 8)), vqmovn_u32(vld1q_u32(row2 + x + 12)));
    684 
    685             v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
    686             uint16x8_t v_dst01 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
    687             uint16x8_t v_dst11 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
    688 
    689             vst1q_u8(dst0 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst00, v_delta), 6)),
    690                                            vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst01, v_delta), 6))));
    691             vst1q_u8(dst1 + x, vcombine_u8(vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst10, v_delta), 6)),
    692                                            vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst11, v_delta), 6))));
    693         }
    694 
    695         for( ; x <= width - 8; x += 8 )
    696         {
    697             uint16x8_t v_r0 = vcombine_u16(vqmovn_u32(vld1q_u32(row0 + x)), vqmovn_u32(vld1q_u32(row0 + x + 4)));
    698             uint16x8_t v_r1 = vcombine_u16(vqmovn_u32(vld1q_u32(row1 + x)), vqmovn_u32(vld1q_u32(row1 + x + 4)));
    699             uint16x8_t v_r2 = vcombine_u16(vqmovn_u32(vld1q_u32(row2 + x)), vqmovn_u32(vld1q_u32(row2 + x + 4)));
    700 
    701             uint16x8_t v_2r1 = vaddq_u16(v_r1, v_r1), v_4r1 = vaddq_u16(v_2r1, v_2r1);
    702             uint16x8_t v_dst0 = vaddq_u16(vaddq_u16(v_r0, v_r2), vaddq_u16(v_2r1, v_4r1));
    703             uint16x8_t v_dst1 = vshlq_n_u16(vaddq_u16(v_r1, v_r2), 2);
    704 
    705             vst1_u8(dst0 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst0, v_delta), 6)));
    706             vst1_u8(dst1 + x, vqmovn_u16(vshrq_n_u16(vaddq_u16(v_dst1, v_delta), 6)));
    707         }
    708 
    709         return x;
    710     }
    711 };
    712 
    713 struct PyrUpVec_32s16u
    714 {
    715     int operator()(int** src, ushort** dst, int, int width) const
    716     {
    717         int x = 0;
    718         ushort *dst0 = dst[0], *dst1 = dst[1];
    719         const uint *row0 = (uint *)src[0], *row1 = (uint *)src[1], *row2 = (uint *)src[2];
    720         uint32x4_t v_delta = vdupq_n_u32(32);
    721 
    722         for( ; x <= width - 8; x += 8 )
    723         {
    724             uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
    725             uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
    726             uint32x4_t v_dst00 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
    727             uint32x4_t v_dst10 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
    728 
    729             v_r0 = vld1q_u32(row0 + x + 4);
    730             v_r1 = vld1q_u32(row1 + x + 4);
    731             v_r2 = vld1q_u32(row2 + x + 4);
    732             v_2r1 = vshlq_n_u32(v_r1, 1);
    733             v_4r1 = vshlq_n_u32(v_r1, 2);
    734             uint32x4_t v_dst01 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
    735             uint32x4_t v_dst11 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
    736 
    737             vst1q_u16(dst0 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst00, v_delta), 6)),
    738                                              vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst01, v_delta), 6))));
    739             vst1q_u16(dst1 + x, vcombine_u16(vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst10, v_delta), 6)),
    740                                              vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst11, v_delta), 6))));
    741         }
    742 
    743         for( ; x <= width - 4; x += 4 )
    744         {
    745             uint32x4_t v_r0 = vld1q_u32(row0 + x), v_r1 = vld1q_u32(row1 + x), v_r2 = vld1q_u32(row2 + x);
    746             uint32x4_t v_2r1 = vshlq_n_u32(v_r1, 1), v_4r1 = vshlq_n_u32(v_r1, 2);
    747 
    748             uint32x4_t v_dst0 = vaddq_u32(vaddq_u32(v_r0, v_r2), vaddq_u32(v_2r1, v_4r1));
    749             uint32x4_t v_dst1 = vshlq_n_u32(vaddq_u32(v_r1, v_r2), 2);
    750 
    751             vst1_u16(dst0 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst0, v_delta), 6)));
    752             vst1_u16(dst1 + x, vmovn_u32(vshrq_n_u32(vaddq_u32(v_dst1, v_delta), 6)));
    753         }
    754 
    755         return x;
    756     }
    757 };
    758 
    759 struct PyrUpVec_32s16s
    760 {
    761     int operator()(int** src, short** dst, int, int width) const
    762     {
    763         int x = 0;
    764         short *dst0 = dst[0], *dst1 = dst[1];
    765         const int *row0 = src[0], *row1 = src[1], *row2 = src[2];
    766         int32x4_t v_delta = vdupq_n_s32(32);
    767 
    768         for( ; x <= width - 8; x += 8 )
    769         {
    770             int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
    771             int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
    772             int32x4_t v_dst00 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
    773             int32x4_t v_dst10 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
    774 
    775             v_r0 = vld1q_s32(row0 + x + 4);
    776             v_r1 = vld1q_s32(row1 + x + 4);
    777             v_r2 = vld1q_s32(row2 + x + 4);
    778             v_2r1 = vshlq_n_s32(v_r1, 1);
    779             v_4r1 = vshlq_n_s32(v_r1, 2);
    780             int32x4_t v_dst01 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
    781             int32x4_t v_dst11 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
    782 
    783             vst1q_s16(dst0 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst00, v_delta), 6)),
    784                                              vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst01, v_delta), 6))));
    785             vst1q_s16(dst1 + x, vcombine_s16(vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst10, v_delta), 6)),
    786                                              vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst11, v_delta), 6))));
    787         }
    788 
    789         for( ; x <= width - 4; x += 4 )
    790         {
    791             int32x4_t v_r0 = vld1q_s32(row0 + x), v_r1 = vld1q_s32(row1 + x), v_r2 = vld1q_s32(row2 + x);
    792             int32x4_t v_2r1 = vshlq_n_s32(v_r1, 1), v_4r1 = vshlq_n_s32(v_r1, 2);
    793 
    794             int32x4_t v_dst0 = vaddq_s32(vaddq_s32(v_r0, v_r2), vaddq_s32(v_2r1, v_4r1));
    795             int32x4_t v_dst1 = vshlq_n_s32(vaddq_s32(v_r1, v_r2), 2);
    796 
    797             vst1_s16(dst0 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst0, v_delta), 6)));
    798             vst1_s16(dst1 + x, vqmovn_s32(vshrq_n_s32(vaddq_s32(v_dst1, v_delta), 6)));
    799         }
    800 
    801         return x;
    802     }
    803 };
    804 
    805 struct PyrUpVec_32f
    806 {
    807     int operator()(float** src, float** dst, int, int width) const
    808     {
    809         int x = 0;
    810         const float *row0 = src[0], *row1 = src[1], *row2 = src[2];
    811         float *dst0 = dst[0], *dst1 = dst[1];
    812         float32x4_t v_6 = vdupq_n_f32(6.0f), v_scale = vdupq_n_f32(1.f/64.0f), v_scale4 = vmulq_n_f32(v_scale, 4.0f);
    813 
    814         for( ; x <= width - 8; x += 8 )
    815         {
    816             float32x4_t v_r0 = vld1q_f32(row0 + x);
    817             float32x4_t v_r1 = vld1q_f32(row1 + x);
    818             float32x4_t v_r2 = vld1q_f32(row2 + x);
    819 
    820             vst1q_f32(dst1 + x, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
    821             vst1q_f32(dst0 + x, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
    822 
    823             v_r0 = vld1q_f32(row0 + x + 4);
    824             v_r1 = vld1q_f32(row1 + x + 4);
    825             v_r2 = vld1q_f32(row2 + x + 4);
    826 
    827             vst1q_f32(dst1 + x + 4, vmulq_f32(v_scale4, vaddq_f32(v_r1, v_r2)));
    828             vst1q_f32(dst0 + x + 4, vmulq_f32(v_scale, vaddq_f32(vmlaq_f32(v_r0, v_6, v_r1), v_r2)));
    829         }
    830 
    831         return x;
    832     }
    833 };
    834 
    835 #else
    836 
    837 typedef PyrDownNoVec<int, uchar> PyrDownVec_32s8u;
    838 typedef PyrDownNoVec<int, ushort> PyrDownVec_32s16u;
    839 typedef PyrDownNoVec<int, short> PyrDownVec_32s16s;
    840 typedef PyrDownNoVec<float, float> PyrDownVec_32f;
    841 
    842 typedef PyrUpNoVec<int, uchar> PyrUpVec_32s8u;
    843 typedef PyrUpNoVec<int, short> PyrUpVec_32s16s;
    844 typedef PyrUpNoVec<int, ushort> PyrUpVec_32s16u;
    845 typedef PyrUpNoVec<float, float> PyrUpVec_32f;
    846 
    847 #endif
    848 
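// pyrDown_ implements the separable 5x5 filter in two passes. For every output
// row it keeps PD_SZ = 5 horizontally filtered source rows in a ring buffer
// (each source row is filtered only once as y advances), then applies the
// vertical 1-4-6-4-1 combination and the final cast. tabL/tabR hold
// border-interpolated column indices for the left/right margins, and tabM maps
// output columns to the even source columns in between.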
    849 template<class CastOp, class VecOp> void
    850 pyrDown_( const Mat& _src, Mat& _dst, int borderType )
    851 {
    852     const int PD_SZ = 5;
    853     typedef typename CastOp::type1 WT;
    854     typedef typename CastOp::rtype T;
    855 
    856     CV_Assert( !_src.empty() );
    857     Size ssize = _src.size(), dsize = _dst.size();
    858     int cn = _src.channels();
    859     int bufstep = (int)alignSize(dsize.width*cn, 16);
    860     AutoBuffer<WT> _buf(bufstep*PD_SZ + 16);
    861     WT* buf = alignPtr((WT*)_buf, 16);
    862     int tabL[CV_CN_MAX*(PD_SZ+2)], tabR[CV_CN_MAX*(PD_SZ+2)];
    863     AutoBuffer<int> _tabM(dsize.width*cn);
    864     int* tabM = _tabM;
    865     WT* rows[PD_SZ];
    866     CastOp castOp;
    867     VecOp vecOp;
    868 
    869     CV_Assert( ssize.width > 0 && ssize.height > 0 &&
    870                std::abs(dsize.width*2 - ssize.width) <= 2 &&
    871                std::abs(dsize.height*2 - ssize.height) <= 2 );
    872     int k, x, sy0 = -PD_SZ/2, sy = sy0, width0 = std::min((ssize.width-PD_SZ/2-1)/2 + 1, dsize.width);
    873 
    874     for( x = 0; x <= PD_SZ+1; x++ )
    875     {
    876         int sx0 = borderInterpolate(x - PD_SZ/2, ssize.width, borderType)*cn;
    877         int sx1 = borderInterpolate(x + width0*2 - PD_SZ/2, ssize.width, borderType)*cn;
    878         for( k = 0; k < cn; k++ )
    879         {
    880             tabL[x*cn + k] = sx0 + k;
    881             tabR[x*cn + k] = sx1 + k;
    882         }
    883     }
    884 
    885     ssize.width *= cn;
    886     dsize.width *= cn;
    887     width0 *= cn;
    888 
    889     for( x = 0; x < dsize.width; x++ )
    890         tabM[x] = (x/cn)*2*cn + x % cn;
    891 
    892     for( int y = 0; y < dsize.height; y++ )
    893     {
    894         T* dst = _dst.ptr<T>(y);
    895         WT *row0, *row1, *row2, *row3, *row4;
    896 
    897         // fill the ring buffer (horizontal convolution and decimation)
    898         for( ; sy <= y*2 + 2; sy++ )
    899         {
    900             WT* row = buf + ((sy - sy0) % PD_SZ)*bufstep;
    901             int _sy = borderInterpolate(sy, ssize.height, borderType);
    902             const T* src = _src.ptr<T>(_sy);
    903             int limit = cn;
    904             const int* tab = tabL;
    905 
    906             for( x = 0;;)
    907             {
    908                 for( ; x < limit; x++ )
    909                 {
    910                     row[x] = src[tab[x+cn*2]]*6 + (src[tab[x+cn]] + src[tab[x+cn*3]])*4 +
    911                         src[tab[x]] + src[tab[x+cn*4]];
    912                 }
    913 
    914                 if( x == dsize.width )
    915                     break;
    916 
    917                 if( cn == 1 )
    918                 {
    919                     for( ; x < width0; x++ )
    920                         row[x] = src[x*2]*6 + (src[x*2 - 1] + src[x*2 + 1])*4 +
    921                             src[x*2 - 2] + src[x*2 + 2];
    922                 }
    923                 else if( cn == 3 )
    924                 {
    925                     for( ; x < width0; x += 3 )
    926                     {
    927                         const T* s = src + x*2;
    928                         WT t0 = s[0]*6 + (s[-3] + s[3])*4 + s[-6] + s[6];
    929                         WT t1 = s[1]*6 + (s[-2] + s[4])*4 + s[-5] + s[7];
    930                         WT t2 = s[2]*6 + (s[-1] + s[5])*4 + s[-4] + s[8];
    931                         row[x] = t0; row[x+1] = t1; row[x+2] = t2;
    932                     }
    933                 }
    934                 else if( cn == 4 )
    935                 {
    936                     for( ; x < width0; x += 4 )
    937                     {
    938                         const T* s = src + x*2;
    939                         WT t0 = s[0]*6 + (s[-4] + s[4])*4 + s[-8] + s[8];
    940                         WT t1 = s[1]*6 + (s[-3] + s[5])*4 + s[-7] + s[9];
    941                         row[x] = t0; row[x+1] = t1;
    942                         t0 = s[2]*6 + (s[-2] + s[6])*4 + s[-6] + s[10];
    943                         t1 = s[3]*6 + (s[-1] + s[7])*4 + s[-5] + s[11];
    944                         row[x+2] = t0; row[x+3] = t1;
    945                     }
    946                 }
    947                 else
    948                 {
    949                     for( ; x < width0; x++ )
    950                     {
    951                         int sx = tabM[x];
    952                         row[x] = src[sx]*6 + (src[sx - cn] + src[sx + cn])*4 +
    953                             src[sx - cn*2] + src[sx + cn*2];
    954                     }
    955                 }
    956 
    957                 limit = dsize.width;
    958                 tab = tabR - x;
    959             }
    960         }
    961 
    962         // do vertical convolution and decimation and write the result to the destination image
    963         for( k = 0; k < PD_SZ; k++ )
    964             rows[k] = buf + ((y*2 - PD_SZ/2 + k - sy0) % PD_SZ)*bufstep;
    965         row0 = rows[0]; row1 = rows[1]; row2 = rows[2]; row3 = rows[3]; row4 = rows[4];
    966 
    967         x = vecOp(rows, dst, (int)_dst.step, dsize.width);
    968         for( ; x < dsize.width; x++ )
    969             dst[x] = castOp(row2[x]*6 + (row1[x] + row3[x])*4 + row0[x] + row4[x]);
    970     }
    971 }
    972 
    973 
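// pyrUp_ doubles the image size with the same separable kernel: each buffered
// row stores two horizontally interpolated values per source pixel (the 1-6-1
// combination at even columns and 4*(neighbor sum) at odd columns), and the
// vertical pass emits two destination rows per source row. Only PU_SZ = 3 rows
// are kept in the ring buffer.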
    974 template<class CastOp, class VecOp> void
    975 pyrUp_( const Mat& _src, Mat& _dst, int)
    976 {
    977     const int PU_SZ = 3;
    978     typedef typename CastOp::type1 WT;
    979     typedef typename CastOp::rtype T;
    980 
    981     Size ssize = _src.size(), dsize = _dst.size();
    982     int cn = _src.channels();
    983     int bufstep = (int)alignSize((dsize.width+1)*cn, 16);
    984     AutoBuffer<WT> _buf(bufstep*PU_SZ + 16);
    985     WT* buf = alignPtr((WT*)_buf, 16);
    986     AutoBuffer<int> _dtab(ssize.width*cn);
    987     int* dtab = _dtab;
    988     WT* rows[PU_SZ];
    989     T* dsts[2];
    990     CastOp castOp;
    991     VecOp vecOp;
    992 
    993     CV_Assert( std::abs(dsize.width - ssize.width*2) == dsize.width % 2 &&
    994                std::abs(dsize.height - ssize.height*2) == dsize.height % 2);
    995     int k, x, sy0 = -PU_SZ/2, sy = sy0;
    996 
    997     ssize.width *= cn;
    998     dsize.width *= cn;
    999 
   1000     for( x = 0; x < ssize.width; x++ )
   1001         dtab[x] = (x/cn)*2*cn + x % cn;
   1002 
   1003     for( int y = 0; y < ssize.height; y++ )
   1004     {
   1005         T* dst0 = _dst.ptr<T>(y*2);
   1006         T* dst1 = _dst.ptr<T>(std::min(y*2+1, dsize.height-1));
   1007         WT *row0, *row1, *row2;
   1008 
    1009         // fill the ring buffer (horizontal convolution and upsampling)
   1010         for( ; sy <= y + 1; sy++ )
   1011         {
   1012             WT* row = buf + ((sy - sy0) % PU_SZ)*bufstep;
   1013             int _sy = borderInterpolate(sy*2, dsize.height, BORDER_REFLECT_101)/2;
   1014             const T* src = _src.ptr<T>(_sy);
   1015 
   1016             if( ssize.width == cn )
   1017             {
   1018                 for( x = 0; x < cn; x++ )
   1019                     row[x] = row[x + cn] = src[x]*8;
   1020                 continue;
   1021             }
   1022 
   1023             for( x = 0; x < cn; x++ )
   1024             {
   1025                 int dx = dtab[x];
   1026                 WT t0 = src[x]*6 + src[x + cn]*2;
   1027                 WT t1 = (src[x] + src[x + cn])*4;
   1028                 row[dx] = t0; row[dx + cn] = t1;
   1029                 dx = dtab[ssize.width - cn + x];
   1030                 int sx = ssize.width - cn + x;
   1031                 t0 = src[sx - cn] + src[sx]*7;
   1032                 t1 = src[sx]*8;
   1033                 row[dx] = t0; row[dx + cn] = t1;
   1034             }
   1035 
   1036             for( x = cn; x < ssize.width - cn; x++ )
   1037             {
   1038                 int dx = dtab[x];
   1039                 WT t0 = src[x-cn] + src[x]*6 + src[x+cn];
   1040                 WT t1 = (src[x] + src[x+cn])*4;
   1041                 row[dx] = t0;
   1042                 row[dx+cn] = t1;
   1043             }
   1044         }
   1045 
    1046         // do vertical convolution and upsampling and write the result to the destination image
   1047         for( k = 0; k < PU_SZ; k++ )
   1048             rows[k] = buf + ((y - PU_SZ/2 + k - sy0) % PU_SZ)*bufstep;
   1049         row0 = rows[0]; row1 = rows[1]; row2 = rows[2];
   1050         dsts[0] = dst0; dsts[1] = dst1;
   1051 
   1052         x = vecOp(rows, dsts, (int)_dst.step, dsize.width);
   1053         for( ; x < dsize.width; x++ )
   1054         {
   1055             T t1 = castOp((row1[x] + row2[x])*4);
   1056             T t0 = castOp(row0[x] + row1[x]*6 + row2[x]);
   1057             dst1[x] = t1; dst0[x] = t0;
   1058         }
   1059     }
   1060 }
   1061 
   1062 typedef void (*PyrFunc)(const Mat&, Mat&, int);
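// PyrFunc is the signature the per-depth dispatch in cv::pyrDown/cv::pyrUp
// (further down in this file) binds to. Illustrative sketch of how the templates
// above are instantiated, with shift 8 for pyrDown_ and 6 for pyrUp_ matching
// the 1/256 and 1/64 normalizations (not the literal dispatch table):
//
//     PyrFunc func = 0;
//     if( depth == CV_8U )
//         func = pyrDown_< FixPtCast<uchar, 8>, PyrDownVec_32s8u >;
//     else if( depth == CV_32F )
//         func = pyrDown_< FltCast<float, 8>, PyrDownVec_32f >;
//     // ... other depths similarly ...
//     func( src, dst, borderType );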
   1063 
   1064 #ifdef HAVE_OPENCL
   1065 
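// OpenCL versions: ocl_pyrDown/ocl_pyrUp try to run the corresponding kernels
// from opencl_kernels_imgproc.hpp and return false when the format, border mode
// or device capabilities are unsupported, in which case the CV_OCL_RUN callers
// fall back to the CPU implementation above.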
   1066 static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
   1067 {
   1068     int type = _src.type(), depth = CV_MAT_DEPTH(type), cn = CV_MAT_CN(type);
   1069 
   1070     bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
   1071     if (cn > 4 || (depth == CV_64F && !doubleSupport))
   1072         return false;
   1073 
   1074     Size ssize = _src.size();
   1075     Size dsize = _dsz.area() == 0 ? Size((ssize.width + 1) / 2, (ssize.height + 1) / 2) : _dsz;
   1076     if (dsize.height < 2 || dsize.width < 2)
   1077         return false;
   1078 
   1079     CV_Assert( ssize.width > 0 && ssize.height > 0 &&
   1080             std::abs(dsize.width*2 - ssize.width) <= 2 &&
   1081             std::abs(dsize.height*2 - ssize.height) <= 2 );
   1082 
   1083     UMat src = _src.getUMat();
   1084     _dst.create( dsize, src.type() );
   1085     UMat dst = _dst.getUMat();
   1086 
   1087     int float_depth = depth == CV_64F ? CV_64F : CV_32F;
   1088     const int local_size = 256;
   1089     int kercn = 1;
   1090     if (depth == CV_8U && float_depth == CV_32F && cn == 1 && ocl::Device::getDefault().isIntel())
   1091         kercn = 4;
   1092     const char * const borderMap[] = { "BORDER_CONSTANT", "BORDER_REPLICATE", "BORDER_REFLECT", "BORDER_WRAP",
   1093                                        "BORDER_REFLECT_101" };
   1094     char cvt[2][50];
   1095     String buildOptions = format(
   1096             "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
   1097             "-D T1=%s -D cn=%d -D kercn=%d -D fdepth=%d -D %s -D LOCAL_SIZE=%d",
   1098             ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, cn)),
   1099             ocl::convertTypeStr(float_depth, depth, cn, cvt[0]),
   1100             ocl::convertTypeStr(depth, float_depth, cn, cvt[1]),
   1101             doubleSupport ? " -D DOUBLE_SUPPORT" : "", ocl::typeToStr(depth),
   1102             cn, kercn, float_depth, borderMap[borderType], local_size
   1103     );
   1104     ocl::Kernel k("pyrDown", ocl::imgproc::pyr_down_oclsrc, buildOptions);
   1105     if (k.empty())
   1106         return false;
   1107 
   1108     k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
   1109 
   1110     size_t localThreads[2]  = { local_size/kercn, 1 };
   1111     size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, (dst.rows + 1) / 2 };
   1112     return k.run(2, globalThreads, localThreads, false);
   1113 }
   1114 
static bool ocl_pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType)
{
    int type = _src.type(), depth = CV_MAT_DEPTH(type), channels = CV_MAT_CN(type);

    if (channels > 4 || borderType != BORDER_DEFAULT)
        return false;

    bool doubleSupport = ocl::Device::getDefault().doubleFPConfig() > 0;
    if (depth == CV_64F && !doubleSupport)
        return false;

    Size ssize = _src.size();
    if ((_dsz.area() != 0) && (_dsz != Size(ssize.width * 2, ssize.height * 2)))
        return false;

    UMat src = _src.getUMat();
    Size dsize = Size(ssize.width * 2, ssize.height * 2);
    _dst.create( dsize, src.type() );
    UMat dst = _dst.getUMat();

    int float_depth = depth == CV_64F ? CV_64F : CV_32F;
    const int local_size = 16;
    char cvt[2][50];
    String buildOptions = format(
            "-D T=%s -D FT=%s -D convertToT=%s -D convertToFT=%s%s "
            "-D T1=%s -D cn=%d -D LOCAL_SIZE=%d",
            ocl::typeToStr(type), ocl::typeToStr(CV_MAKETYPE(float_depth, channels)),
            ocl::convertTypeStr(float_depth, depth, channels, cvt[0]),
            ocl::convertTypeStr(depth, float_depth, channels, cvt[1]),
            doubleSupport ? " -D DOUBLE_SUPPORT" : "",
            ocl::typeToStr(depth), channels, local_size
    );
    size_t globalThreads[2] = { dst.cols, dst.rows };
    size_t localThreads[2] = { local_size, local_size };
    ocl::Kernel k;
    if (ocl::Device::getDefault().isIntel() && channels == 1)
    {
        k.create("pyrUp_unrolled", ocl::imgproc::pyr_up_oclsrc, buildOptions);
        globalThreads[0] = dst.cols/2; globalThreads[1] = dst.rows/2;
    }
    else
        k.create("pyrUp", ocl::imgproc::pyr_up_oclsrc, buildOptions);

    if (k.empty())
        return false;

    k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));
    return k.run(2, globalThreads, localThreads, false);
}

#endif

}

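// cv::pyrDown: smooths the image with the 5x5 Gaussian kernel and drops every
// second row and column. The implementation tries, in this order, the OpenCL
// path, the Tegra-optimized path and the (currently disabled) IPP path before
// falling back to the generic pyrDown_ template.
//
// Illustrative usage (a sketch, not part of this file):
//   cv::Mat half;
//   cv::pyrDown(img, half);   // half is ((img.cols+1)/2) x ((img.rows+1)/2)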
void cv::pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_Assert(borderType != BORDER_CONSTANT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrDown(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size((src.cols + 1)/2, (src.rows + 1)/2) : _dsz;
    _dst.create( dsz, src.type() );
    Mat dst = _dst.getMat();
    int depth = src.depth();

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrDown(src, dst))
        return;
#endif

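// Note: the "&& 0" below keeps this IPP branch permanently compiled out; the
// code appears to be kept only for reference.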
#if IPP_VERSION_X100 >= 801 && 0
    CV_IPP_CHECK()
    {
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size((src.cols + 1)/2, (src.rows + 1)/2))
        {
            typedef IppStatus (CV_STDCALL * ippiPyrDown)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
            int type = src.type();
            CV_SUPPRESS_DEPRECATED_START
            ippiPyrDown pyrDownFunc = type == CV_8UC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C1R :
                                      type == CV_8UC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_8u_C3R :
                                      type == CV_32FC1 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C1R :
                                      type == CV_32FC3 ? (ippiPyrDown) ippiPyrDown_Gauss5x5_32f_C3R : 0;
            CV_SUPPRESS_DEPRECATED_END

            if (pyrDownFunc)
            {
                int bufferSize;
                IppiSize srcRoi = { src.cols, src.rows };
                IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
                CV_SUPPRESS_DEPRECATED_START
                IppStatus ok = ippiPyrDownGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
                CV_SUPPRESS_DEPRECATED_END
                if (ok >= 0)
                {
                    Ipp8u* buffer = ippsMalloc_8u(bufferSize);
                    ok = pyrDownFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
                    ippsFree(buffer);

                    if (ok >= 0)
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

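    // Dispatch on depth. For integer types the 5x5 Gaussian weights (outer
    // product of 1 4 6 4 1) sum to 256, so sums are accumulated in int and
    // scaled back with a rounding shift of 8 (FixPtCast<..., 8>); float and
    // double use FltCast, i.e. a plain multiply by 1/256.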
    PyrFunc func = 0;
    if( depth == CV_8U )
        func = pyrDown_<FixPtCast<uchar, 8>, PyrDownVec_32s8u>;
    else if( depth == CV_16S )
        func = pyrDown_<FixPtCast<short, 8>, PyrDownVec_32s16s >;
    else if( depth == CV_16U )
        func = pyrDown_<FixPtCast<ushort, 8>, PyrDownVec_32s16u >;
    else if( depth == CV_32F )
        func = pyrDown_<FltCast<float, 8>, PyrDownVec_32f>;
    else if( depth == CV_64F )
        func = pyrDown_<FltCast<double, 8>, PyrDownNoVec<double, double> >;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    func( src, dst, borderType );
}

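// cv::pyrUp: upsamples to twice the size by interleaving zero rows/columns and
// convolving with the same 5x5 Gaussian kernel scaled to preserve brightness.
// Only BORDER_DEFAULT is accepted; the OpenCL and Tegra paths are tried before
// the generic pyrUp_ template.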
void cv::pyrUp( InputArray _src, OutputArray _dst, const Size& _dsz, int borderType )
{
    CV_Assert(borderType == BORDER_DEFAULT);

    CV_OCL_RUN(_src.dims() <= 2 && _dst.isUMat(),
               ocl_pyrUp(_src, _dst, _dsz, borderType))

    Mat src = _src.getMat();
    Size dsz = _dsz.area() == 0 ? Size(src.cols*2, src.rows*2) : _dsz;
    _dst.create( dsz, src.type() );
    Mat dst = _dst.getMat();
    int depth = src.depth();

#ifdef HAVE_TEGRA_OPTIMIZATION
    if(borderType == BORDER_DEFAULT && tegra::useTegra() && tegra::pyrUp(src, dst))
        return;
#endif

#if IPP_VERSION_X100 >= 801 && 0
    CV_IPP_CHECK()
    {
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated) && dsz == Size(src.cols*2, src.rows*2))
        {
            typedef IppStatus (CV_STDCALL * ippiPyrUp)(const void* pSrc, int srcStep, void* pDst, int dstStep, IppiSize srcRoi, Ipp8u* buffer);
            int type = src.type();
            CV_SUPPRESS_DEPRECATED_START
            ippiPyrUp pyrUpFunc = type == CV_8UC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C1R :
                                  type == CV_8UC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_8u_C3R :
                                  type == CV_32FC1 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C1R :
                                  type == CV_32FC3 ? (ippiPyrUp) ippiPyrUp_Gauss5x5_32f_C3R : 0;
            CV_SUPPRESS_DEPRECATED_END

            if (pyrUpFunc)
            {
                int bufferSize;
                IppiSize srcRoi = { src.cols, src.rows };
                IppDataType dataType = depth == CV_8U ? ipp8u : ipp32f;
                CV_SUPPRESS_DEPRECATED_START
                IppStatus ok = ippiPyrUpGetBufSize_Gauss5x5(srcRoi.width, dataType, src.channels(), &bufferSize);
                CV_SUPPRESS_DEPRECATED_END
                if (ok >= 0)
                {
                    Ipp8u* buffer = ippsMalloc_8u(bufferSize);
                    ok = pyrUpFunc(src.data, (int) src.step, dst.data, (int) dst.step, srcRoi, buffer);
                    ippsFree(buffer);

                    if (ok >= 0)
                    {
                        CV_IMPL_ADD(CV_IMPL_IPP);
                        return;
                    }
                    setIppErrorStatus();
                }
            }
        }
    }
#endif

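    // Dispatch on depth. After zero-insertion each output pixel is covered by
    // only a subset of the 5x5 taps, and for every phase those weights sum to
    // 64, hence the rounding shift of 6 (or a multiply by 1/64 for float and
    // double types).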
    PyrFunc func = 0;
    if( depth == CV_8U )
        func = pyrUp_<FixPtCast<uchar, 6>, PyrUpVec_32s8u >;
    else if( depth == CV_16S )
        func = pyrUp_<FixPtCast<short, 6>, PyrUpVec_32s16s >;
    else if( depth == CV_16U )
        func = pyrUp_<FixPtCast<ushort, 6>, PyrUpVec_32s16u >;
    else if( depth == CV_32F )
        func = pyrUp_<FltCast<float, 6>, PyrUpVec_32f >;
    else if( depth == CV_64F )
        func = pyrUp_<FltCast<double, 6>, PyrUpNoVec<double, double> >;
    else
        CV_Error( CV_StsUnsupportedFormat, "" );

    func( src, dst, borderType );
}

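// cv::buildPyramid: fills dst[0..maxlevel] with a Gaussian pyramid, where
// dst[0] shares data with the source and each further level is the pyrDown of
// the previous one. A UMat vector output can keep the whole chain on the
// OpenCL path.
//
// Illustrative usage (a sketch, not part of this file):
//   std::vector<cv::Mat> pyr;
//   cv::buildPyramid(img, pyr, 4);   // pyr.size() == 5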
void cv::buildPyramid( InputArray _src, OutputArrayOfArrays _dst, int maxlevel, int borderType )
{
    CV_Assert(borderType != BORDER_CONSTANT);

    if (_src.dims() <= 2 && _dst.isUMatVector())
    {
        UMat src = _src.getUMat();
        _dst.create( maxlevel + 1, 1, 0 );
        _dst.getUMatRef(0) = src;
        for( int i = 1; i <= maxlevel; i++ )
            pyrDown( _dst.getUMatRef(i-1), _dst.getUMatRef(i), Size(), borderType );
        return;
    }

    Mat src = _src.getMat();
    _dst.create( maxlevel + 1, 1, 0 );
    _dst.getMatRef(0) = src;

    int i=1;

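// The IPP block below is likewise compiled out ("&& 0"). If it were enabled,
// it would advance 'i' past the levels it managed to compute, so the plain
// loop at the end of the function only fills in whatever levels remain.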
#if IPP_VERSION_X100 >= 801 && 0
    CV_IPP_CHECK()
    {
        bool isolated = (borderType & BORDER_ISOLATED) != 0;
        int borderTypeNI = borderType & ~BORDER_ISOLATED;
        if (borderTypeNI == BORDER_DEFAULT && (!src.isSubmatrix() || isolated))
        {
            typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownInitAlloc)(void** ppState, IppiSize srcRoi, Ipp32f rate, void* pKernel, int kerSize, int mode);
            typedef IppStatus (CV_STDCALL * ippiPyramidLayerDown)(void* pSrc, int srcStep, IppiSize srcRoiSize, void* pDst, int dstStep, IppiSize dstRoiSize, void* pState);
            typedef IppStatus (CV_STDCALL * ippiPyramidLayerDownFree)(void* pState);

            int type = src.type();
            int depth = src.depth();
            ippiPyramidLayerDownInitAlloc pyrInitAllocFunc = 0;
            ippiPyramidLayerDown pyrDownFunc = 0;
            ippiPyramidLayerDownFree pyrFreeFunc = 0;

            if (type == CV_8UC1)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C1R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C1R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C1R;
            }
            else if (type == CV_8UC3)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_8u_C3R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_8u_C3R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_8u_C3R;
            }
            else if (type == CV_32FC1)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C1R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C1R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C1R;
            }
            else if (type == CV_32FC3)
            {
                pyrInitAllocFunc = (ippiPyramidLayerDownInitAlloc) ippiPyramidLayerDownInitAlloc_32f_C3R;
                pyrDownFunc = (ippiPyramidLayerDown) ippiPyramidLayerDown_32f_C3R;
                pyrFreeFunc = (ippiPyramidLayerDownFree) ippiPyramidLayerDownFree_32f_C3R;
            }

            if (pyrInitAllocFunc && pyrDownFunc && pyrFreeFunc)
            {
                float rate = 2.f;
                IppiSize srcRoi = { src.cols, src.rows };
                IppiPyramid *gPyr;
                IppStatus ok = ippiPyramidInitAlloc(&gPyr, maxlevel + 1, srcRoi, rate);

                Ipp16s iKernel[5] = { 1, 4, 6, 4, 1 };
                Ipp32f fKernel[5] = { 1.f, 4.f, 6.f, 4.f, 1.f };
                void* kernel = depth >= CV_32F ? (void*) fKernel : (void*) iKernel;

                if (ok >= 0) ok = pyrInitAllocFunc((void**) &(gPyr->pState), srcRoi, rate, kernel, 5, IPPI_INTER_LINEAR);
                if (ok >= 0)
                {
                    gPyr->pImage[0] = src.data;
                    gPyr->pStep[0] = (int) src.step;
                    gPyr->pRoi[0] = srcRoi;
                    for( ; i <= maxlevel; i++ )
                    {
                        IppiSize dstRoi;
                        ok = ippiGetPyramidDownROI(gPyr->pRoi[i-1], &dstRoi, rate);
                        Mat& dst = _dst.getMatRef(i);
                        dst.create(Size(dstRoi.width, dstRoi.height), type);
                        gPyr->pImage[i] = dst.data;
                        gPyr->pStep[i] = (int) dst.step;
                        gPyr->pRoi[i] = dstRoi;

                        if (ok >= 0) ok = pyrDownFunc(gPyr->pImage[i-1], gPyr->pStep[i-1], gPyr->pRoi[i-1],
                                                      gPyr->pImage[i], gPyr->pStep[i], gPyr->pRoi[i], gPyr->pState);

                        if (ok < 0)
                        {
                            setIppErrorStatus();
                            break;
                        }
                        else
                        {
                            CV_IMPL_ADD(CV_IMPL_IPP);
                        }
                    }
                    pyrFreeFunc(gPyr->pState);
                }
                else
                    setIppErrorStatus();

                ippiPyramidFree(gPyr);
            }
        }
    }
#endif
    for( ; i <= maxlevel; i++ )
        pyrDown( _dst.getMatRef(i-1), _dst.getMatRef(i), Size(), borderType );
}

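// Legacy C API wrappers: cvPyrDown/cvPyrUp accept only the 5x5 Gaussian filter
// and forward to the C++ implementations, taking the output size from dstarr.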
CV_IMPL void cvPyrDown( const void* srcarr, void* dstarr, int _filter )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
    cv::pyrDown( src, dst, dst.size() );
}

CV_IMPL void cvPyrUp( const void* srcarr, void* dstarr, int _filter )
{
    cv::Mat src = cv::cvarrToMat(srcarr), dst = cv::cvarrToMat(dstarr);

    CV_Assert( _filter == CV_GAUSSIAN_5x5 && src.type() == dst.type());
    cv::pyrUp( src, dst, dst.size() );
}


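// cvReleasePyramid: releases each of the extra_layers+1 matrices created by
// cvCreatePyramid (headers over external data are released without freeing
// that data) and then frees the array itself.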
CV_IMPL void
cvReleasePyramid( CvMat*** _pyramid, int extra_layers )
{
    if( !_pyramid )
        CV_Error( CV_StsNullPtr, "" );

    if( *_pyramid )
        for( int i = 0; i <= extra_layers; i++ )
            cvReleaseMat( &(*_pyramid)[i] );

    cvFree( _pyramid );
}


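// cvCreatePyramid: allocates extra_layers+1 CvMat entries. Layer 0 is a header
// over the source data; the remaining layers are either headers into the user
// supplied buffer (bufarr) or freshly allocated matrices, with sizes taken
// from layer_sizes or derived by repeatedly scaling by 'rate'. If 'calc' is
// non-zero, the layers are filled with cvPyrDown.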
CV_IMPL CvMat**
cvCreatePyramid( const CvArr* srcarr, int extra_layers, double rate,
                 const CvSize* layer_sizes, CvArr* bufarr,
                 int calc, int filter )
{
    const float eps = 0.1f;
    uchar* ptr = 0;

    CvMat stub, *src = cvGetMat( srcarr, &stub );

    if( extra_layers < 0 )
        CV_Error( CV_StsOutOfRange, "The number of extra layers must be non-negative" );

    int i, layer_step, elem_size = CV_ELEM_SIZE(src->type);
    CvSize layer_size, size = cvGetMatSize(src);

    if( bufarr )
    {
        CvMat bstub, *buf;
        int bufsize = 0;

        buf = cvGetMat( bufarr, &bstub );
        bufsize = buf->rows*buf->cols*CV_ELEM_SIZE(buf->type);
        layer_size = size;
        for( i = 1; i <= extra_layers; i++ )
        {
            if( !layer_sizes )
            {
                layer_size.width = cvRound(layer_size.width*rate+eps);
                layer_size.height = cvRound(layer_size.height*rate+eps);
            }
            else
                layer_size = layer_sizes[i-1];
            layer_step = layer_size.width*elem_size;
            bufsize -= layer_step*layer_size.height;
        }

        if( bufsize < 0 )
            CV_Error( CV_StsOutOfRange, "The buffer is too small to fit the pyramid" );
        ptr = buf->data.ptr;
    }

    CvMat** pyramid = (CvMat**)cvAlloc( (extra_layers+1)*sizeof(pyramid[0]) );
    memset( pyramid, 0, (extra_layers+1)*sizeof(pyramid[0]) );

    pyramid[0] = cvCreateMatHeader( size.height, size.width, src->type );
    cvSetData( pyramid[0], src->data.ptr, src->step );
    layer_size = size;

    for( i = 1; i <= extra_layers; i++ )
    {
        if( !layer_sizes )
        {
            layer_size.width = cvRound(layer_size.width*rate + eps);
            layer_size.height = cvRound(layer_size.height*rate + eps);
        }
        else
            layer_size = layer_sizes[i];  // note: the buffer-size loop above indexes layer_sizes[i-1]

        if( bufarr )
        {
            pyramid[i] = cvCreateMatHeader( layer_size.height, layer_size.width, src->type );
            layer_step = layer_size.width*elem_size;
            cvSetData( pyramid[i], ptr, layer_step );
            ptr += layer_step*layer_size.height;
        }
        else
            pyramid[i] = cvCreateMat( layer_size.height, layer_size.width, src->type );

        if( calc )
            cvPyrDown( pyramid[i-1], pyramid[i], filter );
            //cvResize( pyramid[i-1], pyramid[i], CV_INTER_LINEAR );
    }

    return pyramid;
}

/* End of file. */