Home | History | Annotate | Download | only in hal
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                          License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
     16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
     17 // Third party copyrights are property of their respective owners.
     18 //
     19 // Redistribution and use in source and binary forms, with or without modification,
     20 // are permitted provided that the following conditions are met:
     21 //
     22 //   * Redistribution's of source code must retain the above copyright notice,
     23 //     this list of conditions and the following disclaimer.
     24 //
     25 //   * Redistribution's in binary form must reproduce the above copyright notice,
     26 //     this list of conditions and the following disclaimer in the documentation
     27 //     and/or other materials provided with the distribution.
     28 //
     29 //   * The name of the copyright holders may not be used to endorse or promote products
     30 //     derived from this software without specific prior written permission.
     31 //
     32 // This software is provided by the copyright holders and contributors "as is" and
     33 // any express or implied warranties, including, but not limited to, the implied
     34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     35 // In no event shall the Intel Corporation or contributors be liable for any direct,
     36 // indirect, incidental, special, exemplary, or consequential damages
     37 // (including, but not limited to, procurement of substitute goods or services;
     38 // loss of use, data, or profits; or business interruption) however caused
     39 // and on any theory of liability, whether in contract, strict liability,
     40 // or tort (including negligence or otherwise) arising in any way out of
     41 // the use of this software, even if advised of the possibility of such damage.
     42 //
     43 //M*/
     44 
     45 #ifndef __OPENCV_HAL_SSE_HPP__
     46 #define __OPENCV_HAL_SSE_HPP__
     47 
     48 #define CV_SIMD128 1
     49 #define CV_SIMD128_64F 1
     50 
     51 namespace cv
     52 {
     53 
     54 struct v_uint8x16
     55 {
     56     typedef uchar lane_type;
     57     enum { nlanes = 16 };
     58 
     59     v_uint8x16() {}
     60     explicit v_uint8x16(__m128i v) : val(v) {}
     61     v_uint8x16(uchar v0, uchar v1, uchar v2, uchar v3, uchar v4, uchar v5, uchar v6, uchar v7,
     62                uchar v8, uchar v9, uchar v10, uchar v11, uchar v12, uchar v13, uchar v14, uchar v15)
     63     {
     64         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
     65                             (char)v4, (char)v5, (char)v6, (char)v7,
     66                             (char)v8, (char)v9, (char)v10, (char)v11,
     67                             (char)v12, (char)v13, (char)v14, (char)v15);
     68     }
     69     uchar get0() const
     70     {
     71         return (uchar)_mm_cvtsi128_si32(val);
     72     }
     73 
     74     __m128i val;
     75 };
     76 
     77 struct v_int8x16
     78 {
     79     typedef schar lane_type;
     80     enum { nlanes = 16 };
     81 
     82     v_int8x16() {}
     83     explicit v_int8x16(__m128i v) : val(v) {}
     84     v_int8x16(schar v0, schar v1, schar v2, schar v3, schar v4, schar v5, schar v6, schar v7,
     85               schar v8, schar v9, schar v10, schar v11, schar v12, schar v13, schar v14, schar v15)
     86     {
     87         val = _mm_setr_epi8((char)v0, (char)v1, (char)v2, (char)v3,
     88                             (char)v4, (char)v5, (char)v6, (char)v7,
     89                             (char)v8, (char)v9, (char)v10, (char)v11,
     90                             (char)v12, (char)v13, (char)v14, (char)v15);
     91     }
     92     schar get0() const
     93     {
     94         return (schar)_mm_cvtsi128_si32(val);
     95     }
     96 
     97     __m128i val;
     98 };
     99 
    100 struct v_uint16x8
    101 {
    102     typedef ushort lane_type;
    103     enum { nlanes = 8 };
    104 
    105     v_uint16x8() {}
    106     explicit v_uint16x8(__m128i v) : val(v) {}
    107     v_uint16x8(ushort v0, ushort v1, ushort v2, ushort v3, ushort v4, ushort v5, ushort v6, ushort v7)
    108     {
    109         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
    110                              (short)v4, (short)v5, (short)v6, (short)v7);
    111     }
    112     ushort get0() const
    113     {
    114         return (ushort)_mm_cvtsi128_si32(val);
    115     }
    116 
    117     __m128i val;
    118 };
    119 
    120 struct v_int16x8
    121 {
    122     typedef short lane_type;
    123     enum { nlanes = 8 };
    124 
    125     v_int16x8() {}
    126     explicit v_int16x8(__m128i v) : val(v) {}
    127     v_int16x8(short v0, short v1, short v2, short v3, short v4, short v5, short v6, short v7)
    128     {
    129         val = _mm_setr_epi16((short)v0, (short)v1, (short)v2, (short)v3,
    130                              (short)v4, (short)v5, (short)v6, (short)v7);
    131     }
    132     short get0() const
    133     {
    134         return (short)_mm_cvtsi128_si32(val);
    135     }
    136     __m128i val;
    137 };
    138 
    139 struct v_uint32x4
    140 {
    141     typedef unsigned lane_type;
    142     enum { nlanes = 4 };
    143 
    144     v_uint32x4() {}
    145     explicit v_uint32x4(__m128i v) : val(v) {}
    146     v_uint32x4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
    147     {
    148         val = _mm_setr_epi32((int)v0, (int)v1, (int)v2, (int)v3);
    149     }
    150     unsigned get0() const
    151     {
    152         return (unsigned)_mm_cvtsi128_si32(val);
    153     }
    154     __m128i val;
    155 };
    156 
    157 struct v_int32x4
    158 {
    159     typedef int lane_type;
    160     enum { nlanes = 4 };
    161 
    162     v_int32x4() {}
    163     explicit v_int32x4(__m128i v) : val(v) {}
    164     v_int32x4(int v0, int v1, int v2, int v3)
    165     {
    166         val = _mm_setr_epi32(v0, v1, v2, v3);
    167     }
    168     int get0() const
    169     {
    170         return _mm_cvtsi128_si32(val);
    171     }
    172     __m128i val;
    173 };
    174 
    175 struct v_float32x4
    176 {
    177     typedef float lane_type;
    178     enum { nlanes = 4 };
    179 
    180     v_float32x4() {}
    181     explicit v_float32x4(__m128 v) : val(v) {}
    182     v_float32x4(float v0, float v1, float v2, float v3)
    183     {
    184         val = _mm_setr_ps(v0, v1, v2, v3);
    185     }
    186     float get0() const
    187     {
    188         return _mm_cvtss_f32(val);
    189     }
    190     __m128 val;
    191 };
    192 
    193 struct v_uint64x2
    194 {
    195     typedef uint64 lane_type;
    196     enum { nlanes = 2 };
    197 
    198     v_uint64x2() {}
    199     explicit v_uint64x2(__m128i v) : val(v) {}
    200     v_uint64x2(uint64 v0, uint64 v1)
    201     {
    202         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    203     }
    204     uint64 get0() const
    205     {
    206         int a = _mm_cvtsi128_si32(val);
    207         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
    208         return (unsigned)a | ((uint64)(unsigned)b << 32);
    209     }
    210     __m128i val;
    211 };
    212 
    213 struct v_int64x2
    214 {
    215     typedef int64 lane_type;
    216     enum { nlanes = 2 };
    217 
    218     v_int64x2() {}
    219     explicit v_int64x2(__m128i v) : val(v) {}
    220     v_int64x2(int64 v0, int64 v1)
    221     {
    222         val = _mm_setr_epi32((int)v0, (int)(v0 >> 32), (int)v1, (int)(v1 >> 32));
    223     }
    224     int64 get0() const
    225     {
    226         int a = _mm_cvtsi128_si32(val);
    227         int b = _mm_cvtsi128_si32(_mm_srli_epi64(val, 32));
    228         return (int64)((unsigned)a | ((uint64)(unsigned)b << 32));
    229     }
    230     __m128i val;
    231 };
    232 
    233 struct v_float64x2
    234 {
    235     typedef double lane_type;
    236     enum { nlanes = 2 };
    237 
    238     v_float64x2() {}
    239     explicit v_float64x2(__m128d v) : val(v) {}
    240     v_float64x2(double v0, double v1)
    241     {
    242         val = _mm_setr_pd(v0, v1);
    243     }
    244     double get0() const
    245     {
    246         return _mm_cvtsd_f64(val);
    247     }
    248     __m128d val;
    249 };
    250 
    251 #define OPENCV_HAL_IMPL_SSE_INITVEC(_Tpvec, _Tp, suffix, zsuffix, ssuffix, _Tps, cast) \
    252 inline _Tpvec v_setzero_##suffix() { return _Tpvec(_mm_setzero_##zsuffix()); } \
    253 inline _Tpvec v_setall_##suffix(_Tp v) { return _Tpvec(_mm_set1_##ssuffix((_Tps)v)); } \
    254 template<typename _Tpvec0> inline _Tpvec v_reinterpret_as_##suffix(const _Tpvec0& a) \
    255 { return _Tpvec(cast(a.val)); }
    256 
    257 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint8x16, uchar, u8, si128, epi8, char, OPENCV_HAL_NOP)
    258 OPENCV_HAL_IMPL_SSE_INITVEC(v_int8x16, schar, s8, si128, epi8, char, OPENCV_HAL_NOP)
    259 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint16x8, ushort, u16, si128, epi16, short, OPENCV_HAL_NOP)
    260 OPENCV_HAL_IMPL_SSE_INITVEC(v_int16x8, short, s16, si128, epi16, short, OPENCV_HAL_NOP)
    261 OPENCV_HAL_IMPL_SSE_INITVEC(v_uint32x4, unsigned, u32, si128, epi32, int, OPENCV_HAL_NOP)
    262 OPENCV_HAL_IMPL_SSE_INITVEC(v_int32x4, int, s32, si128, epi32, int, OPENCV_HAL_NOP)
    263 OPENCV_HAL_IMPL_SSE_INITVEC(v_float32x4, float, f32, ps, ps, float, _mm_castsi128_ps)
    264 OPENCV_HAL_IMPL_SSE_INITVEC(v_float64x2, double, f64, pd, pd, double, _mm_castsi128_pd)
    265 
    266 inline v_uint64x2 v_setzero_u64() { return v_uint64x2(_mm_setzero_si128()); }
    267 inline v_int64x2 v_setzero_s64() { return v_int64x2(_mm_setzero_si128()); }
    268 inline v_uint64x2 v_setall_u64(uint64 val) { return v_uint64x2(val, val); }
    269 inline v_int64x2 v_setall_s64(int64 val) { return v_int64x2(val, val); }
    270 
    271 template<typename _Tpvec> inline
    272 v_uint64x2 v_reinterpret_as_u64(const _Tpvec& a) { return v_uint64x2(a.val); }
    273 template<typename _Tpvec> inline
    274 v_int64x2 v_reinterpret_as_s64(const _Tpvec& a) { return v_int64x2(a.val); }
    275 inline v_float32x4 v_reinterpret_as_f32(const v_uint64x2& a)
    276 { return v_float32x4(_mm_castsi128_ps(a.val)); }
    277 inline v_float32x4 v_reinterpret_as_f32(const v_int64x2& a)
    278 { return v_float32x4(_mm_castsi128_ps(a.val)); }
    279 inline v_float64x2 v_reinterpret_as_f64(const v_uint64x2& a)
    280 { return v_float64x2(_mm_castsi128_pd(a.val)); }
    281 inline v_float64x2 v_reinterpret_as_f64(const v_int64x2& a)
    282 { return v_float64x2(_mm_castsi128_pd(a.val)); }
    283 
    284 #define OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(_Tpvec, suffix) \
    285 inline _Tpvec v_reinterpret_as_##suffix(const v_float32x4& a) \
    286 { return _Tpvec(_mm_castps_si128(a.val)); } \
    287 inline _Tpvec v_reinterpret_as_##suffix(const v_float64x2& a) \
    288 { return _Tpvec(_mm_castpd_si128(a.val)); }
    289 
    290 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint8x16, u8)
    291 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int8x16, s8)
    292 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint16x8, u16)
    293 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int16x8, s16)
    294 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint32x4, u32)
    295 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int32x4, s32)
    296 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_uint64x2, u64)
    297 OPENCV_HAL_IMPL_SSE_INIT_FROM_FLT(v_int64x2, s64)
    298 
    299 //////////////// PACK ///////////////
    300 inline v_uint8x16 v_pack(const v_uint16x8& a, const v_uint16x8& b)
    301 {
    302     __m128i delta = _mm_set1_epi16(255);
    303     return v_uint8x16(_mm_packus_epi16(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta)),
    304                                        _mm_subs_epu16(b.val, _mm_subs_epu16(b.val, delta))));
    305 }
    306 
    307 inline void v_pack_store(uchar* ptr, const v_uint16x8& a)
    308 {
    309     __m128i delta = _mm_set1_epi16(255);
    310     __m128i a1 = _mm_subs_epu16(a.val, _mm_subs_epu16(a.val, delta));
    311     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
    312 }
    313 
    314 inline v_uint8x16 v_pack_u(const v_int16x8& a, const v_int16x8& b)
    315 { return v_uint8x16(_mm_packus_epi16(a.val, b.val)); }
    316 
    317 inline void v_pack_u_store(uchar* ptr, const v_int16x8& a)
    318 { _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a.val, a.val)); }
    319 
    320 template<int n> inline
    321 v_uint8x16 v_rshr_pack(const v_uint16x8& a, const v_uint16x8& b)
    322 {
    323     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    324     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    325     return v_uint8x16(_mm_packus_epi16(_mm_srli_epi16(_mm_adds_epu16(a.val, delta), n),
    326                                        _mm_srli_epi16(_mm_adds_epu16(b.val, delta), n)));
    327 }
    328 
    329 template<int n> inline
    330 void v_rshr_pack_store(uchar* ptr, const v_uint16x8& a)
    331 {
    332     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    333     __m128i a1 = _mm_srli_epi16(_mm_adds_epu16(a.val, delta), n);
    334     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
    335 }
    336 
    337 template<int n> inline
    338 v_uint8x16 v_rshr_pack_u(const v_int16x8& a, const v_int16x8& b)
    339 {
    340     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    341     return v_uint8x16(_mm_packus_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
    342                                        _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
    343 }
    344 
    345 template<int n> inline
    346 void v_rshr_pack_u_store(uchar* ptr, const v_int16x8& a)
    347 {
    348     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    349     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    350     _mm_storel_epi64((__m128i*)ptr, _mm_packus_epi16(a1, a1));
    351 }
    352 
    353 inline v_int8x16 v_pack(const v_int16x8& a, const v_int16x8& b)
    354 { return v_int8x16(_mm_packs_epi16(a.val, b.val)); }
    355 
    356 inline void v_pack_store(schar* ptr, v_int16x8& a)
    357 { _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a.val, a.val)); }
    358 
    359 template<int n> inline
    360 v_int8x16 v_rshr_pack(const v_int16x8& a, const v_int16x8& b)
    361 {
    362     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    363     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    364     return v_int8x16(_mm_packs_epi16(_mm_srai_epi16(_mm_adds_epi16(a.val, delta), n),
    365                                      _mm_srai_epi16(_mm_adds_epi16(b.val, delta), n)));
    366 }
    367 template<int n> inline
    368 void v_rshr_pack_store(schar* ptr, const v_int16x8& a)
    369 {
    370     // we assume that n > 0, and so the shifted 16-bit values can be treated as signed numbers.
    371     __m128i delta = _mm_set1_epi16((short)(1 << (n-1)));
    372     __m128i a1 = _mm_srai_epi16(_mm_adds_epi16(a.val, delta), n);
    373     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi16(a1, a1));
    374 }
    375 
    376 
    377 // bit-wise "mask ? a : b"
    378 inline __m128i v_select_si128(__m128i mask, __m128i a, __m128i b)
    379 {
    380     return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(a, b), mask));
    381 }
    382 
    383 inline v_uint16x8 v_pack(const v_uint32x4& a, const v_uint32x4& b)
    384 {
    385     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    386     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    387     __m128i b1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, b.val), maxval32, b.val), delta32);
    388     __m128i r = _mm_packs_epi32(a1, b1);
    389     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
    390 }
    391 
    392 inline void v_pack_store(ushort* ptr, const v_uint32x4& a)
    393 {
    394     __m128i z = _mm_setzero_si128(), maxval32 = _mm_set1_epi32(65535), delta32 = _mm_set1_epi32(32768);
    395     __m128i a1 = _mm_sub_epi32(v_select_si128(_mm_cmpgt_epi32(z, a.val), maxval32, a.val), delta32);
    396     __m128i r = _mm_packs_epi32(a1, a1);
    397     _mm_storel_epi64((__m128i*)ptr, _mm_sub_epi16(r, _mm_set1_epi16(-32768)));
    398 }
    399 
    400 template<int n> inline
    401 v_uint16x8 v_rshr_pack(const v_uint32x4& a, const v_uint32x4& b)
    402 {
    403     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    404     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    405     __m128i b1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(b.val, delta), n), delta32);
    406     return v_uint16x8(_mm_sub_epi16(_mm_packs_epi32(a1, b1), _mm_set1_epi16(-32768)));
    407 }
    408 
    409 template<int n> inline
    410 void v_rshr_pack_store(ushort* ptr, const v_uint32x4& a)
    411 {
    412     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    413     __m128i a1 = _mm_sub_epi32(_mm_srli_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    414     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    415     _mm_storel_epi64((__m128i*)ptr, a2);
    416 }
    417 
    418 inline v_uint16x8 v_pack_u(const v_int32x4& a, const v_int32x4& b)
    419 {
    420     __m128i delta32 = _mm_set1_epi32(32768);
    421     __m128i r = _mm_packs_epi32(_mm_sub_epi32(a.val, delta32), _mm_sub_epi32(b.val, delta32));
    422     return v_uint16x8(_mm_sub_epi16(r, _mm_set1_epi16(-32768)));
    423 }
    424 
    425 inline void v_pack_u_store(ushort* ptr, const v_int32x4& a)
    426 {
    427     __m128i delta32 = _mm_set1_epi32(32768);
    428     __m128i a1 = _mm_sub_epi32(a.val, delta32);
    429     __m128i r = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    430     _mm_storel_epi64((__m128i*)ptr, r);
    431 }
    432 
    433 template<int n> inline
    434 void v_rshr_pack_u_store(ushort* ptr, const v_int32x4& a)
    435 {
    436     __m128i delta = _mm_set1_epi32(1 << (n-1)), delta32 = _mm_set1_epi32(32768);
    437     __m128i a1 = _mm_sub_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n), delta32);
    438     __m128i a2 = _mm_sub_epi16(_mm_packs_epi32(a1, a1), _mm_set1_epi16(-32768));
    439     _mm_storel_epi64((__m128i*)ptr, a2);
    440 }
    441 
    442 inline v_int16x8 v_pack(const v_int32x4& a, const v_int32x4& b)
    443 { return v_int16x8(_mm_packs_epi32(a.val, b.val)); }
    444 
    445 inline void v_pack_store(short* ptr, const v_int32x4& a)
    446 {
    447     _mm_storel_epi64((__m128i*)ptr, _mm_packs_epi32(a.val, a.val));
    448 }
    449 
    450 template<int n> inline
    451 v_int16x8 v_rshr_pack(const v_int32x4& a, const v_int32x4& b)
    452 {
    453     __m128i delta = _mm_set1_epi32(1 << (n-1));
    454     return v_int16x8(_mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(a.val, delta), n),
    455                                      _mm_srai_epi32(_mm_add_epi32(b.val, delta), n)));
    456 }
    457 
    458 template<int n> inline
    459 void v_rshr_pack_store(short* ptr, const v_int32x4& a)
    460 {
    461     __m128i delta = _mm_set1_epi32(1 << (n-1));
    462     __m128i a1 = _mm_srai_epi32(_mm_add_epi32(a.val, delta), n);
    463     _mm_storel_epi64((__m128i*)ptr, a1);
    464 }
    465 
    466 
    467 // [a0 0 | b0 0]  [a1 0 | b1 0]
    468 inline v_uint32x4 v_pack(const v_uint64x2& a, const v_uint64x2& b)
    469 {
    470     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    471     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    472     return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
    473 }
    474 
    475 inline void v_pack_store(unsigned* ptr, const v_uint64x2& a)
    476 {
    477     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    478     _mm_storel_epi64((__m128i*)ptr, a1);
    479 }
    480 
    481 // [a0 0 | b0 0]  [a1 0 | b1 0]
    482 inline v_int32x4 v_pack(const v_int64x2& a, const v_int64x2& b)
    483 {
    484     __m128i v0 = _mm_unpacklo_epi32(a.val, b.val); // a0 a1 0 0
    485     __m128i v1 = _mm_unpackhi_epi32(a.val, b.val); // b0 b1 0 0
    486     return v_int32x4(_mm_unpacklo_epi64(v0, v1));
    487 }
    488 
    489 inline void v_pack_store(int* ptr, const v_int64x2& a)
    490 {
    491     __m128i a1 = _mm_shuffle_epi32(a.val, _MM_SHUFFLE(0, 2, 2, 0));
    492     _mm_storel_epi64((__m128i*)ptr, a1);
    493 }
    494 
    495 template<int n> inline
    496 v_uint32x4 v_rshr_pack(const v_uint64x2& a, const v_uint64x2& b)
    497 {
    498     uint64 delta = (uint64)1 << (n-1);
    499     v_uint64x2 delta2(delta, delta);
    500     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    501     __m128i b1 = _mm_srli_epi64(_mm_add_epi64(b.val, delta2.val), n);
    502     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    503     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    504     return v_uint32x4(_mm_unpacklo_epi64(v0, v1));
    505 }
    506 
    507 template<int n> inline
    508 void v_rshr_pack_store(unsigned* ptr, const v_uint64x2& a)
    509 {
    510     uint64 delta = (uint64)1 << (n-1);
    511     v_uint64x2 delta2(delta, delta);
    512     __m128i a1 = _mm_srli_epi64(_mm_add_epi64(a.val, delta2.val), n);
    513     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    514     _mm_storel_epi64((__m128i*)ptr, a2);
    515 }
    516 
    517 inline __m128i v_sign_epi64(__m128i a)
    518 {
    519     return _mm_shuffle_epi32(_mm_srai_epi32(a, 31), _MM_SHUFFLE(3, 3, 1, 1)); // x m0 | x m1
    520 }
    521 
    522 inline __m128i v_srai_epi64(__m128i a, int imm)
    523 {
    524     __m128i smask = v_sign_epi64(a);
    525     return _mm_xor_si128(_mm_srli_epi64(_mm_xor_si128(a, smask), imm), smask);
    526 }
    527 
    528 template<int n> inline
    529 v_int32x4 v_rshr_pack(const v_int64x2& a, const v_int64x2& b)
    530 {
    531     int64 delta = (int64)1 << (n-1);
    532     v_int64x2 delta2(delta, delta);
    533     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    534     __m128i b1 = v_srai_epi64(_mm_add_epi64(b.val, delta2.val), n);
    535     __m128i v0 = _mm_unpacklo_epi32(a1, b1); // a0 a1 0 0
    536     __m128i v1 = _mm_unpackhi_epi32(a1, b1); // b0 b1 0 0
    537     return v_int32x4(_mm_unpacklo_epi64(v0, v1));
    538 }
    539 
    540 template<int n> inline
    541 void v_rshr_pack_store(int* ptr, const v_int64x2& a)
    542 {
    543     int64 delta = (int64)1 << (n-1);
    544     v_int64x2 delta2(delta, delta);
    545     __m128i a1 = v_srai_epi64(_mm_add_epi64(a.val, delta2.val), n);
    546     __m128i a2 = _mm_shuffle_epi32(a1, _MM_SHUFFLE(0, 2, 2, 0));
    547     _mm_storel_epi64((__m128i*)ptr, a2);
    548 }
    549 
    550 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
    551                             const v_float32x4& m1, const v_float32x4& m2,
    552                             const v_float32x4& m3)
    553 {
    554     __m128 v0 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(0, 0, 0, 0)), m0.val);
    555     __m128 v1 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(1, 1, 1, 1)), m1.val);
    556     __m128 v2 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(2, 2, 2, 2)), m2.val);
    557     __m128 v3 = _mm_mul_ps(_mm_shuffle_ps(v.val, v.val, _MM_SHUFFLE(3, 3, 3, 3)), m3.val);
    558 
    559     return v_float32x4(_mm_add_ps(_mm_add_ps(v0, v1), _mm_add_ps(v2, v3)));
    560 }
    561 
    562 
    563 #define OPENCV_HAL_IMPL_SSE_BIN_OP(bin_op, _Tpvec, intrin) \
    564     inline _Tpvec operator bin_op (const _Tpvec& a, const _Tpvec& b) \
    565     { \
    566         return _Tpvec(intrin(a.val, b.val)); \
    567     } \
    568     inline _Tpvec& operator bin_op##= (_Tpvec& a, const _Tpvec& b) \
    569     { \
    570         a.val = intrin(a.val, b.val); \
    571         return a; \
    572     }
    573 
    574 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint8x16, _mm_adds_epu8)
    575 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint8x16, _mm_subs_epu8)
    576 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int8x16, _mm_adds_epi8)
    577 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int8x16, _mm_subs_epi8)
    578 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint16x8, _mm_adds_epu16)
    579 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint16x8, _mm_subs_epu16)
    580 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_uint16x8, _mm_mullo_epi16)
    581 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int16x8, _mm_adds_epi16)
    582 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int16x8, _mm_subs_epi16)
    583 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_int16x8, _mm_mullo_epi16)
    584 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint32x4, _mm_add_epi32)
    585 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint32x4, _mm_sub_epi32)
    586 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int32x4, _mm_add_epi32)
    587 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int32x4, _mm_sub_epi32)
    588 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float32x4, _mm_add_ps)
    589 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float32x4, _mm_sub_ps)
    590 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float32x4, _mm_mul_ps)
    591 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float32x4, _mm_div_ps)
    592 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_float64x2, _mm_add_pd)
    593 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_float64x2, _mm_sub_pd)
    594 OPENCV_HAL_IMPL_SSE_BIN_OP(*, v_float64x2, _mm_mul_pd)
    595 OPENCV_HAL_IMPL_SSE_BIN_OP(/, v_float64x2, _mm_div_pd)
    596 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_uint64x2, _mm_add_epi64)
    597 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_uint64x2, _mm_sub_epi64)
    598 OPENCV_HAL_IMPL_SSE_BIN_OP(+, v_int64x2, _mm_add_epi64)
    599 OPENCV_HAL_IMPL_SSE_BIN_OP(-, v_int64x2, _mm_sub_epi64)
    600 
    601 inline v_uint32x4 operator * (const v_uint32x4& a, const v_uint32x4& b)
    602 {
    603     __m128i c0 = _mm_mul_epu32(a.val, b.val);
    604     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    605     __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    606     __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    607     return v_uint32x4(_mm_unpacklo_epi64(d0, d1));
    608 }
    609 inline v_int32x4 operator * (const v_int32x4& a, const v_int32x4& b)
    610 {
    611     __m128i c0 = _mm_mul_epu32(a.val, b.val);
    612     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    613     __m128i d0 = _mm_unpacklo_epi32(c0, c1);
    614     __m128i d1 = _mm_unpackhi_epi32(c0, c1);
    615     return v_int32x4(_mm_unpacklo_epi64(d0, d1));
    616 }
    617 inline v_uint32x4& operator *= (v_uint32x4& a, const v_uint32x4& b)
    618 {
    619     a = a * b;
    620     return a;
    621 }
    622 inline v_int32x4& operator *= (v_int32x4& a, const v_int32x4& b)
    623 {
    624     a = a * b;
    625     return a;
    626 }
    627 
    628 inline void v_mul_expand(const v_int16x8& a, const v_int16x8& b,
    629                          v_int32x4& c, v_int32x4& d)
    630 {
    631     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    632     __m128i v1 = _mm_mulhi_epi16(a.val, b.val);
    633     c.val = _mm_unpacklo_epi32(v0, v1);
    634     d.val = _mm_unpackhi_epi32(v0, v1);
    635 }
    636 
    637 inline void v_mul_expand(const v_uint16x8& a, const v_uint16x8& b,
    638                          v_uint32x4& c, v_uint32x4& d)
    639 {
    640     __m128i v0 = _mm_mullo_epi16(a.val, b.val);
    641     __m128i v1 = _mm_mulhi_epu16(a.val, b.val);
    642     c.val = _mm_unpacklo_epi32(v0, v1);
    643     d.val = _mm_unpackhi_epi32(v0, v1);
    644 }
    645 
    646 inline void v_mul_expand(const v_uint32x4& a, const v_uint32x4& b,
    647                          v_uint64x2& c, v_uint64x2& d)
    648 {
    649     __m128i c0 = _mm_mul_epu32(a.val, b.val);
    650     __m128i c1 = _mm_mul_epu32(_mm_srli_epi64(a.val, 32), _mm_srli_epi64(b.val, 32));
    651     c.val = _mm_unpacklo_epi64(c0, c1);
    652     d.val = _mm_unpackhi_epi64(c0, c1);
    653 }
    654 
    655 inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
    656 {
    657     return v_int32x4(_mm_madd_epi16(a.val, b.val));
    658 }
    659 
    660 #define OPENCV_HAL_IMPL_SSE_LOGIC_OP(_Tpvec, suffix, not_const) \
    661     OPENCV_HAL_IMPL_SSE_BIN_OP(&, _Tpvec, _mm_and_##suffix) \
    662     OPENCV_HAL_IMPL_SSE_BIN_OP(|, _Tpvec, _mm_or_##suffix) \
    663     OPENCV_HAL_IMPL_SSE_BIN_OP(^, _Tpvec, _mm_xor_##suffix) \
    664     inline _Tpvec operator ~ (const _Tpvec& a) \
    665     { \
    666         return _Tpvec(_mm_xor_##suffix(a.val, not_const)); \
    667     }
    668 
    669 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint8x16, si128, _mm_set1_epi32(-1))
    670 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int8x16, si128, _mm_set1_epi32(-1))
    671 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint16x8, si128, _mm_set1_epi32(-1))
    672 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int16x8, si128, _mm_set1_epi32(-1))
    673 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint32x4, si128, _mm_set1_epi32(-1))
    674 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int32x4, si128, _mm_set1_epi32(-1))
    675 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_uint64x2, si128, _mm_set1_epi32(-1))
    676 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_int64x2, si128, _mm_set1_epi32(-1))
    677 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float32x4, ps, _mm_castsi128_ps(_mm_set1_epi32(-1)))
    678 OPENCV_HAL_IMPL_SSE_LOGIC_OP(v_float64x2, pd, _mm_castsi128_pd(_mm_set1_epi32(-1)))
    679 
    680 inline v_float32x4 v_sqrt(const v_float32x4& x)
    681 { return v_float32x4(_mm_sqrt_ps(x.val)); }
    682 
    683 inline v_float32x4 v_invsqrt(const v_float32x4& x)
    684 {
    685     static const __m128 _0_5 = _mm_set1_ps(0.5f), _1_5 = _mm_set1_ps(1.5f);
    686     __m128 t = x.val;
    687     __m128 h = _mm_mul_ps(t, _0_5);
    688     t = _mm_rsqrt_ps(t);
    689     t = _mm_mul_ps(t, _mm_sub_ps(_1_5, _mm_mul_ps(_mm_mul_ps(t, t), h)));
    690     return v_float32x4(t);
    691 }
    692 
    693 inline v_float64x2 v_sqrt(const v_float64x2& x)
    694 { return v_float64x2(_mm_sqrt_pd(x.val)); }
    695 
    696 inline v_float64x2 v_invsqrt(const v_float64x2& x)
    697 {
    698     static const __m128d v_1 = _mm_set1_pd(1.);
    699     return v_float64x2(_mm_div_pd(v_1, _mm_sqrt_pd(x.val)));
    700 }
    701 
    702 inline v_float32x4 v_abs(const v_float32x4& x)
    703 { return v_float32x4(_mm_and_ps(x.val, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)))); }
    704 inline v_float64x2 v_abs(const v_float64x2& x)
    705 {
    706     return v_float64x2(_mm_and_pd(x.val,
    707         _mm_castsi128_pd(_mm_srli_epi64(_mm_set1_epi32(-1), 1))));
    708 }
    709 
    710 // TODO: exp, log, sin, cos
    711 
    712 #define OPENCV_HAL_IMPL_SSE_BIN_FUNC(_Tpvec, func, intrin) \
    713 inline _Tpvec func(const _Tpvec& a, const _Tpvec& b) \
    714 { \
    715     return _Tpvec(intrin(a.val, b.val)); \
    716 }
    717 
    718 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_min, _mm_min_epu8)
    719 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_max, _mm_max_epu8)
    720 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_min, _mm_min_epi16)
    721 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_max, _mm_max_epi16)
    722 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_min, _mm_min_ps)
    723 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float32x4, v_max, _mm_max_ps)
    724 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_min, _mm_min_pd)
    725 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_float64x2, v_max, _mm_max_pd)
    726 
    727 inline v_int8x16 v_min(const v_int8x16& a, const v_int8x16& b)
    728 {
    729     __m128i delta = _mm_set1_epi8((char)-128);
    730     return v_int8x16(_mm_xor_si128(delta, _mm_min_epu8(_mm_xor_si128(a.val, delta),
    731                                                        _mm_xor_si128(b.val, delta))));
    732 }
    733 inline v_int8x16 v_max(const v_int8x16& a, const v_int8x16& b)
    734 {
    735     __m128i delta = _mm_set1_epi8((char)-128);
    736     return v_int8x16(_mm_xor_si128(delta, _mm_max_epu8(_mm_xor_si128(a.val, delta),
    737                                                        _mm_xor_si128(b.val, delta))));
    738 }
    739 inline v_uint16x8 v_min(const v_uint16x8& a, const v_uint16x8& b)
    740 {
    741     return v_uint16x8(_mm_subs_epu16(a.val, _mm_subs_epu16(a.val, b.val)));
    742 }
    743 inline v_uint16x8 v_max(const v_uint16x8& a, const v_uint16x8& b)
    744 {
    745     return v_uint16x8(_mm_adds_epu16(_mm_subs_epu16(a.val, b.val), b.val));
    746 }
    747 inline v_uint32x4 v_min(const v_uint32x4& a, const v_uint32x4& b)
    748 {
    749     __m128i delta = _mm_set1_epi32((int)0x80000000);
    750     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    751     return v_uint32x4(v_select_si128(mask, b.val, a.val));
    752 }
    753 inline v_uint32x4 v_max(const v_uint32x4& a, const v_uint32x4& b)
    754 {
    755     __m128i delta = _mm_set1_epi32((int)0x80000000);
    756     __m128i mask = _mm_cmpgt_epi32(_mm_xor_si128(a.val, delta), _mm_xor_si128(b.val, delta));
    757     return v_uint32x4(v_select_si128(mask, a.val, b.val));
    758 }
    759 inline v_int32x4 v_min(const v_int32x4& a, const v_int32x4& b)
    760 {
    761     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), b.val, a.val));
    762 }
    763 inline v_int32x4 v_max(const v_int32x4& a, const v_int32x4& b)
    764 {
    765     return v_int32x4(v_select_si128(_mm_cmpgt_epi32(a.val, b.val), a.val, b.val));
    766 }
    767 
    768 #define OPENCV_HAL_IMPL_SSE_INT_CMP_OP(_Tpuvec, _Tpsvec, suffix, sbit) \
    769 inline _Tpuvec operator == (const _Tpuvec& a, const _Tpuvec& b) \
    770 { return _Tpuvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
    771 inline _Tpuvec operator != (const _Tpuvec& a, const _Tpuvec& b) \
    772 { \
    773     __m128i not_mask = _mm_set1_epi32(-1); \
    774     return _Tpuvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
    775 } \
    776 inline _Tpsvec operator == (const _Tpsvec& a, const _Tpsvec& b) \
    777 { return _Tpsvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
    778 inline _Tpsvec operator != (const _Tpsvec& a, const _Tpsvec& b) \
    779 { \
    780     __m128i not_mask = _mm_set1_epi32(-1); \
    781     return _Tpsvec(_mm_xor_si128(_mm_cmpeq_##suffix(a.val, b.val), not_mask)); \
    782 } \
    783 inline _Tpuvec operator < (const _Tpuvec& a, const _Tpuvec& b) \
    784 { \
    785     __m128i smask = _mm_set1_##suffix(sbit); \
    786     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask))); \
    787 } \
    788 inline _Tpuvec operator > (const _Tpuvec& a, const _Tpuvec& b) \
    789 { \
    790     __m128i smask = _mm_set1_##suffix(sbit); \
    791     return _Tpuvec(_mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask))); \
    792 } \
    793 inline _Tpuvec operator <= (const _Tpuvec& a, const _Tpuvec& b) \
    794 { \
    795     __m128i smask = _mm_set1_##suffix(sbit); \
    796     __m128i not_mask = _mm_set1_epi32(-1); \
    797     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(a.val, smask), _mm_xor_si128(b.val, smask)); \
    798     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
    799 } \
    800 inline _Tpuvec operator >= (const _Tpuvec& a, const _Tpuvec& b) \
    801 { \
    802     __m128i smask = _mm_set1_##suffix(sbit); \
    803     __m128i not_mask = _mm_set1_epi32(-1); \
    804     __m128i res = _mm_cmpgt_##suffix(_mm_xor_si128(b.val, smask), _mm_xor_si128(a.val, smask)); \
    805     return _Tpuvec(_mm_xor_si128(res, not_mask)); \
    806 } \
    807 inline _Tpsvec operator < (const _Tpsvec& a, const _Tpsvec& b) \
    808 { \
    809     return _Tpsvec(_mm_cmpgt_##suffix(b.val, a.val)); \
    810 } \
    811 inline _Tpsvec operator > (const _Tpsvec& a, const _Tpsvec& b) \
    812 { \
    813     return _Tpsvec(_mm_cmpgt_##suffix(a.val, b.val)); \
    814 } \
    815 inline _Tpsvec operator <= (const _Tpsvec& a, const _Tpsvec& b) \
    816 { \
    817     __m128i not_mask = _mm_set1_epi32(-1); \
    818     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(a.val, b.val), not_mask)); \
    819 } \
    820 inline _Tpsvec operator >= (const _Tpsvec& a, const _Tpsvec& b) \
    821 { \
    822     __m128i not_mask = _mm_set1_epi32(-1); \
    823     return _Tpsvec(_mm_xor_si128(_mm_cmpgt_##suffix(b.val, a.val), not_mask)); \
    824 }
    825 
    826 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint8x16, v_int8x16, epi8, (char)-128)
    827 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint16x8, v_int16x8, epi16, (short)-32768)
    828 OPENCV_HAL_IMPL_SSE_INT_CMP_OP(v_uint32x4, v_int32x4, epi32, (int)0x80000000)
    829 
    830 #define OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(_Tpvec, suffix) \
    831 inline _Tpvec operator == (const _Tpvec& a, const _Tpvec& b) \
    832 { return _Tpvec(_mm_cmpeq_##suffix(a.val, b.val)); } \
    833 inline _Tpvec operator != (const _Tpvec& a, const _Tpvec& b) \
    834 { return _Tpvec(_mm_cmpneq_##suffix(a.val, b.val)); } \
    835 inline _Tpvec operator < (const _Tpvec& a, const _Tpvec& b) \
    836 { return _Tpvec(_mm_cmplt_##suffix(a.val, b.val)); } \
    837 inline _Tpvec operator > (const _Tpvec& a, const _Tpvec& b) \
    838 { return _Tpvec(_mm_cmpgt_##suffix(a.val, b.val)); } \
    839 inline _Tpvec operator <= (const _Tpvec& a, const _Tpvec& b) \
    840 { return _Tpvec(_mm_cmple_##suffix(a.val, b.val)); } \
    841 inline _Tpvec operator >= (const _Tpvec& a, const _Tpvec& b) \
    842 { return _Tpvec(_mm_cmpge_##suffix(a.val, b.val)); }
    843 
    844 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float32x4, ps)
    845 OPENCV_HAL_IMPL_SSE_FLT_CMP_OP(v_float64x2, pd)
    846 
    847 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_add_wrap, _mm_add_epi8)
    848 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_add_wrap, _mm_add_epi8)
    849 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_add_wrap, _mm_add_epi16)
    850 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_add_wrap, _mm_add_epi16)
    851 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint8x16, v_sub_wrap, _mm_sub_epi8)
    852 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int8x16, v_sub_wrap, _mm_sub_epi8)
    853 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_uint16x8, v_sub_wrap, _mm_sub_epi16)
    854 OPENCV_HAL_IMPL_SSE_BIN_FUNC(v_int16x8, v_sub_wrap, _mm_sub_epi16)
    855 
    856 #define OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(_Tpuvec, _Tpsvec, bits, smask32) \
    857 inline _Tpuvec v_absdiff(const _Tpuvec& a, const _Tpuvec& b) \
    858 { \
    859     return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a.val, b.val), _mm_subs_epu##bits(b.val, a.val))); \
    860 } \
    861 inline _Tpuvec v_absdiff(const _Tpsvec& a, const _Tpsvec& b) \
    862 { \
    863     __m128i smask = _mm_set1_epi32(smask32); \
    864     __m128i a1 = _mm_xor_si128(a.val, smask); \
    865     __m128i b1 = _mm_xor_si128(b.val, smask); \
    866     return _Tpuvec(_mm_add_epi##bits(_mm_subs_epu##bits(a1, b1), _mm_subs_epu##bits(b1, a1))); \
    867 }
    868 
    869 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint8x16, v_int8x16, 8, (int)0x80808080)
    870 OPENCV_HAL_IMPL_SSE_ABSDIFF_8_16(v_uint16x8, v_int16x8, 16, (int)0x80008000)
    871 
    872 #define OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(_Tpvec, _Tp, _Tpreg, suffix, absmask_vec) \
    873 inline _Tpvec v_absdiff(const _Tpvec& a, const _Tpvec& b) \
    874 { \
    875     _Tpreg absmask = _mm_castsi128_##suffix(absmask_vec); \
    876     return _Tpvec(_mm_and_##suffix(_mm_sub_##suffix(a.val, b.val), absmask)); \
    877 } \
    878 inline _Tpvec v_magnitude(const _Tpvec& a, const _Tpvec& b) \
    879 { \
    880     _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    881     return _Tpvec(_mm_sqrt_##suffix(res)); \
    882 } \
    883 inline _Tpvec v_sqr_magnitude(const _Tpvec& a, const _Tpvec& b) \
    884 { \
    885     _Tpreg res = _mm_add_##suffix(_mm_mul_##suffix(a.val, a.val), _mm_mul_##suffix(b.val, b.val)); \
    886     return _Tpvec(res); \
    887 } \
    888 inline _Tpvec v_muladd(const _Tpvec& a, const _Tpvec& b, const _Tpvec& c) \
    889 { \
    890     return _Tpvec(_mm_add_##suffix(_mm_mul_##suffix(a.val, b.val), c.val)); \
    891 }
    892 
    893 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float32x4, float, __m128, ps, _mm_set1_epi32((int)0x7fffffff))
    894 OPENCV_HAL_IMPL_SSE_MISC_FLT_OP(v_float64x2, double, __m128d, pd, _mm_srli_epi64(_mm_set1_epi32(-1), 1))
    895 
    896 #define OPENCV_HAL_IMPL_SSE_SHIFT_OP(_Tpuvec, _Tpsvec, suffix, srai) \
    897 inline _Tpuvec operator << (const _Tpuvec& a, int imm) \
    898 { \
    899     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
    900 } \
    901 inline _Tpsvec operator << (const _Tpsvec& a, int imm) \
    902 { \
    903     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
    904 } \
    905 inline _Tpuvec operator >> (const _Tpuvec& a, int imm) \
    906 { \
    907     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
    908 } \
    909 inline _Tpsvec operator >> (const _Tpsvec& a, int imm) \
    910 { \
    911     return _Tpsvec(srai(a.val, imm)); \
    912 } \
    913 template<int imm> \
    914 inline _Tpuvec v_shl(const _Tpuvec& a) \
    915 { \
    916     return _Tpuvec(_mm_slli_##suffix(a.val, imm)); \
    917 } \
    918 template<int imm> \
    919 inline _Tpsvec v_shl(const _Tpsvec& a) \
    920 { \
    921     return _Tpsvec(_mm_slli_##suffix(a.val, imm)); \
    922 } \
    923 template<int imm> \
    924 inline _Tpuvec v_shr(const _Tpuvec& a) \
    925 { \
    926     return _Tpuvec(_mm_srli_##suffix(a.val, imm)); \
    927 } \
    928 template<int imm> \
    929 inline _Tpsvec v_shr(const _Tpsvec& a) \
    930 { \
    931     return _Tpsvec(srai(a.val, imm)); \
    932 }
    933 
    934 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint16x8, v_int16x8, epi16, _mm_srai_epi16)
    935 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint32x4, v_int32x4, epi32, _mm_srai_epi32)
    936 OPENCV_HAL_IMPL_SSE_SHIFT_OP(v_uint64x2, v_int64x2, epi64, v_srai_epi64)
    937 
    938 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(_Tpvec, _Tp) \
    939 inline _Tpvec v_load(const _Tp* ptr) \
    940 { return _Tpvec(_mm_loadu_si128((const __m128i*)ptr)); } \
    941 inline _Tpvec v_load_aligned(const _Tp* ptr) \
    942 { return _Tpvec(_mm_load_si128((const __m128i*)ptr)); } \
    943 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    944 { \
    945     return _Tpvec(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
    946                                      _mm_loadl_epi64((const __m128i*)ptr1))); \
    947 } \
    948 inline void v_store(_Tp* ptr, const _Tpvec& a) \
    949 { _mm_storeu_si128((__m128i*)ptr, a.val); } \
    950 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    951 { _mm_store_si128((__m128i*)ptr, a.val); } \
    952 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    953 { _mm_storel_epi64((__m128i*)ptr, a.val); } \
    954 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    955 { _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a.val, a.val)); }
    956 
    957 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint8x16, uchar)
    958 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int8x16, schar)
    959 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint16x8, ushort)
    960 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int16x8, short)
    961 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint32x4, unsigned)
    962 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int32x4, int)
    963 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_uint64x2, uint64)
    964 OPENCV_HAL_IMPL_SSE_LOADSTORE_INT_OP(v_int64x2, int64)
    965 
    966 #define OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(_Tpvec, _Tp, suffix) \
    967 inline _Tpvec v_load(const _Tp* ptr) \
    968 { return _Tpvec(_mm_loadu_##suffix(ptr)); } \
    969 inline _Tpvec v_load_aligned(const _Tp* ptr) \
    970 { return _Tpvec(_mm_load_##suffix(ptr)); } \
    971 inline _Tpvec v_load_halves(const _Tp* ptr0, const _Tp* ptr1) \
    972 { \
    973     return _Tpvec(_mm_castsi128_##suffix( \
    974         _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)ptr0), \
    975                            _mm_loadl_epi64((const __m128i*)ptr1)))); \
    976 } \
    977 inline void v_store(_Tp* ptr, const _Tpvec& a) \
    978 { _mm_storeu_##suffix(ptr, a.val); } \
    979 inline void v_store_aligned(_Tp* ptr, const _Tpvec& a) \
    980 { _mm_store_##suffix(ptr, a.val); } \
    981 inline void v_store_low(_Tp* ptr, const _Tpvec& a) \
    982 { _mm_storel_epi64((__m128i*)ptr, _mm_cast##suffix##_si128(a.val)); } \
    983 inline void v_store_high(_Tp* ptr, const _Tpvec& a) \
    984 { \
    985     __m128i a1 = _mm_cast##suffix##_si128(a.val); \
    986     _mm_storel_epi64((__m128i*)ptr, _mm_unpackhi_epi64(a1, a1)); \
    987 }
    988 
    989 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float32x4, float, ps)
    990 OPENCV_HAL_IMPL_SSE_LOADSTORE_FLT_OP(v_float64x2, double, pd)
    991 
    992 #define OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(_Tpvec, scalartype, func, scalar_func) \
    993 inline scalartype v_reduce_##func(const _Tpvec& a) \
    994 { \
    995     scalartype CV_DECL_ALIGNED(16) buf[4]; \
    996     v_store_aligned(buf, a); \
    997     scalartype s0 = scalar_func(buf[0], buf[1]); \
    998     scalartype s1 = scalar_func(buf[2], buf[3]); \
    999     return scalar_func(s0, s1); \
   1000 }
   1001 
   1002 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, sum, OPENCV_HAL_ADD)
   1003 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, max, std::max)
   1004 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_uint32x4, unsigned, min, std::min)
   1005 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, sum, OPENCV_HAL_ADD)
   1006 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, max, std::max)
   1007 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_int32x4, int, min, std::min)
   1008 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, sum, OPENCV_HAL_ADD)
   1009 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, max, std::max)
   1010 OPENCV_HAL_IMPL_SSE_REDUCE_OP_4(v_float32x4, float, min, std::min)
   1011 
   1012 #define OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(_Tpvec, suffix, pack_op, and_op, signmask, allmask) \
   1013 inline int v_signmask(const _Tpvec& a) \
   1014 { \
   1015     return and_op(_mm_movemask_##suffix(pack_op(a.val)), signmask); \
   1016 } \
   1017 inline bool v_check_all(const _Tpvec& a) \
   1018 { return and_op(_mm_movemask_##suffix(a.val), allmask) == allmask; } \
   1019 inline bool v_check_any(const _Tpvec& a) \
   1020 { return and_op(_mm_movemask_##suffix(a.val), allmask) != 0; }
   1021 
   1022 #define OPENCV_HAL_PACKS(a) _mm_packs_epi16(a, a)
   1023 inline __m128i v_packq_epi32(__m128i a)
   1024 {
   1025     __m128i b = _mm_packs_epi32(a, a);
   1026     return _mm_packs_epi16(b, b);
   1027 }
   1028 
   1029 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
   1030 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 65535, 65535)
   1031 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
   1032 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int16x8, epi8, OPENCV_HAL_PACKS, OPENCV_HAL_AND, 255, (int)0xaaaa)
   1033 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_uint32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
   1034 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND, 15, (int)0x8888)
   1035 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
   1036 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
   1037 
   1038 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, suffix) \
   1039 inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \
   1040 { \
   1041     return _Tpvec(_mm_xor_##suffix(b.val, _mm_and_##suffix(_mm_xor_##suffix(b.val, a.val), mask.val))); \
   1042 }
   1043 
   1044 OPENCV_HAL_IMPL_SSE_SELECT(v_uint8x16, si128)
   1045 OPENCV_HAL_IMPL_SSE_SELECT(v_int8x16, si128)
   1046 OPENCV_HAL_IMPL_SSE_SELECT(v_uint16x8, si128)
   1047 OPENCV_HAL_IMPL_SSE_SELECT(v_int16x8, si128)
   1048 OPENCV_HAL_IMPL_SSE_SELECT(v_uint32x4, si128)
   1049 OPENCV_HAL_IMPL_SSE_SELECT(v_int32x4, si128)
   1050 OPENCV_HAL_IMPL_SSE_SELECT(v_uint64x2, si128)
   1051 OPENCV_HAL_IMPL_SSE_SELECT(v_int64x2, si128)
   1052 OPENCV_HAL_IMPL_SSE_SELECT(v_float32x4, ps)
   1053 OPENCV_HAL_IMPL_SSE_SELECT(v_float64x2, pd)
   1054 
   1055 #define OPENCV_HAL_IMPL_SSE_EXPAND(_Tpuvec, _Tpwuvec, _Tpu, _Tpsvec, _Tpwsvec, _Tps, suffix, wsuffix, shift) \
   1056 inline void v_expand(const _Tpuvec& a, _Tpwuvec& b0, _Tpwuvec& b1) \
   1057 { \
   1058     __m128i z = _mm_setzero_si128(); \
   1059     b0.val = _mm_unpacklo_##suffix(a.val, z); \
   1060     b1.val = _mm_unpackhi_##suffix(a.val, z); \
   1061 } \
   1062 inline _Tpwuvec v_load_expand(const _Tpu* ptr) \
   1063 { \
   1064     __m128i z = _mm_setzero_si128(); \
   1065     return _Tpwuvec(_mm_unpacklo_##suffix(_mm_loadl_epi64((const __m128i*)ptr), z)); \
   1066 } \
   1067 inline void v_expand(const _Tpsvec& a, _Tpwsvec& b0, _Tpwsvec& b1) \
   1068 { \
   1069     b0.val = _mm_srai_##wsuffix(_mm_unpacklo_##suffix(a.val, a.val), shift); \
   1070     b1.val = _mm_srai_##wsuffix(_mm_unpackhi_##suffix(a.val, a.val), shift); \
   1071 } \
   1072 inline _Tpwsvec v_load_expand(const _Tps* ptr) \
   1073 { \
   1074     __m128i a = _mm_loadl_epi64((const __m128i*)ptr); \
   1075     return _Tpwsvec(_mm_srai_##wsuffix(_mm_unpacklo_##suffix(a, a), shift)); \
   1076 }
   1077 
   1078 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint8x16, v_uint16x8, uchar, v_int8x16, v_int16x8, schar, epi8, epi16, 8)
   1079 OPENCV_HAL_IMPL_SSE_EXPAND(v_uint16x8, v_uint32x4, ushort, v_int16x8, v_int32x4, short, epi16, epi32, 16)
   1080 
   1081 inline void v_expand(const v_uint32x4& a, v_uint64x2& b0, v_uint64x2& b1)
   1082 {
   1083     __m128i z = _mm_setzero_si128();
   1084     b0.val = _mm_unpacklo_epi32(a.val, z);
   1085     b1.val = _mm_unpackhi_epi32(a.val, z);
   1086 }
   1087 inline v_uint64x2 v_load_expand(const unsigned* ptr)
   1088 {
   1089     __m128i z = _mm_setzero_si128();
   1090     return v_uint64x2(_mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i*)ptr), z));
   1091 }
   1092 inline void v_expand(const v_int32x4& a, v_int64x2& b0, v_int64x2& b1)
   1093 {
   1094     __m128i s = _mm_srai_epi32(a.val, 31);
   1095     b0.val = _mm_unpacklo_epi32(a.val, s);
   1096     b1.val = _mm_unpackhi_epi32(a.val, s);
   1097 }
   1098 inline v_int64x2 v_load_expand(const int* ptr)
   1099 {
   1100     __m128i a = _mm_loadl_epi64((const __m128i*)ptr);
   1101     __m128i s = _mm_srai_epi32(a, 31);
   1102     return v_int64x2(_mm_unpacklo_epi32(a, s));
   1103 }
   1104 
   1105 inline v_uint32x4 v_load_expand_q(const uchar* ptr)
   1106 {
   1107     __m128i z = _mm_setzero_si128();
   1108     __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
   1109     return v_uint32x4(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z));
   1110 }
   1111 
   1112 inline v_int32x4 v_load_expand_q(const schar* ptr)
   1113 {
   1114     __m128i a = _mm_cvtsi32_si128(*(const int*)ptr);
   1115     a = _mm_unpacklo_epi8(a, a);
   1116     a = _mm_unpacklo_epi8(a, a);
   1117     return v_int32x4(_mm_srai_epi32(a, 24));
   1118 }
   1119 
   1120 #define OPENCV_HAL_IMPL_SSE_UNPACKS(_Tpvec, suffix, cast_from, cast_to) \
   1121 inline void v_zip(const _Tpvec& a0, const _Tpvec& a1, _Tpvec& b0, _Tpvec& b1) \
   1122 { \
   1123     b0.val = _mm_unpacklo_##suffix(a0.val, a1.val); \
   1124     b1.val = _mm_unpackhi_##suffix(a0.val, a1.val); \
   1125 } \
   1126 inline _Tpvec v_combine_low(const _Tpvec& a, const _Tpvec& b) \
   1127 { \
   1128     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
   1129     return _Tpvec(cast_to(_mm_unpacklo_epi64(a1, b1))); \
   1130 } \
   1131 inline _Tpvec v_combine_high(const _Tpvec& a, const _Tpvec& b) \
   1132 { \
   1133     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
   1134     return _Tpvec(cast_to(_mm_unpackhi_epi64(a1, b1))); \
   1135 } \
   1136 inline void v_recombine(const _Tpvec& a, const _Tpvec& b, _Tpvec& c, _Tpvec& d) \
   1137 { \
   1138     __m128i a1 = cast_from(a.val), b1 = cast_from(b.val); \
   1139     c.val = cast_to(_mm_unpacklo_epi64(a1, b1)); \
   1140     d.val = cast_to(_mm_unpackhi_epi64(a1, b1)); \
   1141 }
   1142 
   1143 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
   1144 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int8x16, epi8, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
   1145 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
   1146 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int16x8, epi16, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
   1147 OPENCV_HAL_IMPL_SSE_UNPACKS(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
   1148 OPENCV_HAL_IMPL_SSE_UNPACKS(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
   1149 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)
   1150 OPENCV_HAL_IMPL_SSE_UNPACKS(v_float64x2, pd, _mm_castpd_si128, _mm_castsi128_pd)
   1151 
   1152 inline v_int32x4 v_round(const v_float32x4& a)
   1153 { return v_int32x4(_mm_cvtps_epi32(a.val)); }
   1154 
   1155 inline v_int32x4 v_floor(const v_float32x4& a)
   1156 {
   1157     __m128i a1 = _mm_cvtps_epi32(a.val);
   1158     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(_mm_cvtepi32_ps(a1), a.val));
   1159     return v_int32x4(_mm_add_epi32(a1, mask));
   1160 }
   1161 
   1162 inline v_int32x4 v_ceil(const v_float32x4& a)
   1163 {
   1164     __m128i a1 = _mm_cvtps_epi32(a.val);
   1165     __m128i mask = _mm_castps_si128(_mm_cmpgt_ps(a.val, _mm_cvtepi32_ps(a1)));
   1166     return v_int32x4(_mm_sub_epi32(a1, mask));
   1167 }
   1168 
   1169 inline v_int32x4 v_trunc(const v_float32x4& a)
   1170 { return v_int32x4(_mm_cvttps_epi32(a.val)); }
   1171 
   1172 inline v_int32x4 v_round(const v_float64x2& a)
   1173 { return v_int32x4(_mm_cvtpd_epi32(a.val)); }
   1174 
   1175 inline v_int32x4 v_floor(const v_float64x2& a)
   1176 {
   1177     __m128i a1 = _mm_cvtpd_epi32(a.val);
   1178     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(_mm_cvtepi32_pd(a1), a.val));
   1179     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
   1180     return v_int32x4(_mm_add_epi32(a1, mask));
   1181 }
   1182 
   1183 inline v_int32x4 v_ceil(const v_float64x2& a)
   1184 {
   1185     __m128i a1 = _mm_cvtpd_epi32(a.val);
   1186     __m128i mask = _mm_castpd_si128(_mm_cmpgt_pd(a.val, _mm_cvtepi32_pd(a1)));
   1187     mask = _mm_srli_si128(_mm_slli_si128(mask, 4), 8); // m0 m0 m1 m1 => m0 m1 0 0
   1188     return v_int32x4(_mm_sub_epi32(a1, mask));
   1189 }
   1190 
   1191 inline v_int32x4 v_trunc(const v_float64x2& a)
   1192 { return v_int32x4(_mm_cvttpd_epi32(a.val)); }
   1193 
#define OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(_Tpvec, suffix, cast_from, cast_to) \
inline void v_transpose4x4(const _Tpvec& a0, const _Tpvec& a1, \
                           const _Tpvec& a2, const _Tpvec& a3, \
                           _Tpvec& b0, _Tpvec& b1, \
                           _Tpvec& b2, _Tpvec& b3) \
{ \
    __m128i t0 = cast_from(_mm_unpacklo_##suffix(a0.val, a1.val)); \
    __m128i t1 = cast_from(_mm_unpacklo_##suffix(a2.val, a3.val)); \
    __m128i t2 = cast_from(_mm_unpackhi_##suffix(a0.val, a1.val)); \
    __m128i t3 = cast_from(_mm_unpackhi_##suffix(a2.val, a3.val)); \
\
    b0.val = cast_to(_mm_unpacklo_epi64(t0, t1)); \
    b1.val = cast_to(_mm_unpackhi_epi64(t0, t1)); \
    b2.val = cast_to(_mm_unpacklo_epi64(t2, t3)); \
    b3.val = cast_to(_mm_unpackhi_epi64(t2, t3)); \
}

OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_uint32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_int32x4, epi32, OPENCV_HAL_NOP, OPENCV_HAL_NOP)
OPENCV_HAL_IMPL_SSE_TRANSPOSE4x4(v_float32x4, ps, _mm_castps_si128, _mm_castsi128_ps)

// adapted from sse_utils.hpp
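// 3-channel deinterleave: SSE2 has no single shuffle for a stride-3 gather,
// so each pass below interleaves the low half of one intermediate with the
// high 64-bit half of another; after a few such passes the elements of each
// channel end up contiguous in a, b and c.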
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 16));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 32));

    __m128i t10 = _mm_unpacklo_epi8(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi8(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi8(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi8(t11, _mm_unpackhi_epi64(t12, t12));

    __m128i t30 = _mm_unpacklo_epi8(t20, _mm_unpackhi_epi64(t21, t21));
    __m128i t31 = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t20, t20), t22);
    __m128i t32 = _mm_unpacklo_epi8(t21, _mm_unpackhi_epi64(t22, t22));

    a.val = _mm_unpacklo_epi8(t30, _mm_unpackhi_epi64(t31, t31));
    b.val = _mm_unpacklo_epi8(_mm_unpackhi_epi64(t30, t30), t32);
    c.val = _mm_unpacklo_epi8(t31, _mm_unpackhi_epi64(t32, t32));
}

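// 4-channel deinterleave: effectively a 16x4 byte-matrix transpose done with
// successive unpack passes; the lane comments track where each element
// travels after every pass.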
inline void v_load_deinterleave(const uchar* ptr, v_uint8x16& a, v_uint8x16& b, v_uint8x16& c, v_uint8x16& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1 ...
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 32)); // a8 b8 c8 d8 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 48)); // a12 b12 c12 d12 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 a8 b0 b8 ...
    __m128i v1 = _mm_unpackhi_epi8(u0, u2); // a2 a10 b2 b10 ...
    __m128i v2 = _mm_unpacklo_epi8(u1, u3); // a4 a12 b4 b12 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a6 a14 b6 b14 ...

    u0 = _mm_unpacklo_epi8(v0, v2); // a0 a4 a8 a12 ...
    u1 = _mm_unpacklo_epi8(v1, v3); // a2 a6 a10 a14 ...
    u2 = _mm_unpackhi_epi8(v0, v2); // a1 a5 a9 a13 ...
    u3 = _mm_unpackhi_epi8(v1, v3); // a3 a7 a11 a15 ...

    v0 = _mm_unpacklo_epi8(u0, u1); // a0 a2 a4 a6 ...
    v1 = _mm_unpacklo_epi8(u2, u3); // a1 a3 a5 a7 ...
    v2 = _mm_unpackhi_epi8(u0, u1); // c0 c2 c4 c6 ...
    v3 = _mm_unpackhi_epi8(u2, u3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi8(v0, v1);
    b.val = _mm_unpackhi_epi8(v0, v1);
    c.val = _mm_unpacklo_epi8(v2, v3);
    d.val = _mm_unpackhi_epi8(v2, v3);
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 8));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 16));

    __m128i t10 = _mm_unpacklo_epi16(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi16(t01, _mm_unpackhi_epi64(t02, t02));

    __m128i t20 = _mm_unpacklo_epi16(t10, _mm_unpackhi_epi64(t11, t11));
    __m128i t21 = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t10, t10), t12);
    __m128i t22 = _mm_unpacklo_epi16(t11, _mm_unpackhi_epi64(t12, t12));

    a.val = _mm_unpacklo_epi16(t20, _mm_unpackhi_epi64(t21, t21));
    b.val = _mm_unpacklo_epi16(_mm_unpackhi_epi64(t20, t20), t22);
    c.val = _mm_unpacklo_epi16(t21, _mm_unpackhi_epi64(t22, t22));
}

inline void v_load_deinterleave(const ushort* ptr, v_uint16x8& a, v_uint16x8& b, v_uint16x8& c, v_uint16x8& d)
{
    __m128i u0 = _mm_loadu_si128((const __m128i*)ptr); // a0 b0 c0 d0 a1 b1 c1 d1
    __m128i u1 = _mm_loadu_si128((const __m128i*)(ptr + 8)); // a2 b2 c2 d2 ...
    __m128i u2 = _mm_loadu_si128((const __m128i*)(ptr + 16)); // a4 b4 c4 d4 ...
    __m128i u3 = _mm_loadu_si128((const __m128i*)(ptr + 24)); // a6 b6 c6 d6 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 a4 b0 b4 ...
    __m128i v1 = _mm_unpackhi_epi16(u0, u2); // a1 a5 b1 b5 ...
    __m128i v2 = _mm_unpacklo_epi16(u1, u3); // a2 a6 b2 b6 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a3 a7 b3 b7 ...

    u0 = _mm_unpacklo_epi16(v0, v2); // a0 a2 a4 a6 ...
    u1 = _mm_unpacklo_epi16(v1, v3); // a1 a3 a5 a7 ...
    u2 = _mm_unpackhi_epi16(v0, v2); // c0 c2 c4 c6 ...
    u3 = _mm_unpackhi_epi16(v1, v3); // c1 c3 c5 c7 ...

    a.val = _mm_unpacklo_epi16(u0, u1);
    b.val = _mm_unpackhi_epi16(u0, u1);
    c.val = _mm_unpacklo_epi16(u2, u3);
    d.val = _mm_unpackhi_epi16(u2, u3);
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c)
{
    __m128i t00 = _mm_loadu_si128((const __m128i*)ptr);
    __m128i t01 = _mm_loadu_si128((const __m128i*)(ptr + 4));
    __m128i t02 = _mm_loadu_si128((const __m128i*)(ptr + 8));

    __m128i t10 = _mm_unpacklo_epi32(t00, _mm_unpackhi_epi64(t01, t01));
    __m128i t11 = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t00, t00), t02);
    __m128i t12 = _mm_unpacklo_epi32(t01, _mm_unpackhi_epi64(t02, t02));

    a.val = _mm_unpacklo_epi32(t10, _mm_unpackhi_epi64(t11, t11));
    b.val = _mm_unpacklo_epi32(_mm_unpackhi_epi64(t10, t10), t12);
    c.val = _mm_unpacklo_epi32(t11, _mm_unpackhi_epi64(t12, t12));
}

inline void v_load_deinterleave(const unsigned* ptr, v_uint32x4& a, v_uint32x4& b, v_uint32x4& c, v_uint32x4& d)
{
    v_uint32x4 u0(_mm_loadu_si128((const __m128i*)ptr));        // a0 b0 c0 d0
    v_uint32x4 u1(_mm_loadu_si128((const __m128i*)(ptr + 4)));  // a1 b1 c1 d1
    v_uint32x4 u2(_mm_loadu_si128((const __m128i*)(ptr + 8)));  // a2 b2 c2 d2
    v_uint32x4 u3(_mm_loadu_si128((const __m128i*)(ptr + 12))); // a3 b3 c3 d3

    v_transpose4x4(u0, u1, u2, u3, a, b, c, d);
}

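// 3-channel interleave (store): a and b are interleaved directly, c is
// interleaved with zeros to form a,b,c,0 quadruples, and the subsequent
// 64-bit and byte shifts squeeze the zero lanes back out so that the three
// output registers hold 48 consecutive interleaved bytes.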
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi8(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi8(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi8(c.val, z);
    __m128i c1 = _mm_unpackhi_epi8(c.val, z);

    __m128i p00 = _mm_unpacklo_epi16(ab0, c0);
    __m128i p01 = _mm_unpackhi_epi16(ab0, c0);
    __m128i p02 = _mm_unpacklo_epi16(ab1, c1);
    __m128i p03 = _mm_unpackhi_epi16(ab1, c1);

    __m128i p10 = _mm_unpacklo_epi32(p00, p01);
    __m128i p11 = _mm_unpackhi_epi32(p00, p01);
    __m128i p12 = _mm_unpacklo_epi32(p02, p03);
    __m128i p13 = _mm_unpackhi_epi32(p02, p03);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 1);
    p22 = _mm_slli_si128(p22, 1);

    __m128i p30 = _mm_slli_epi64(_mm_unpacklo_epi32(p20, p21), 8);
    __m128i p31 = _mm_srli_epi64(_mm_unpackhi_epi32(p20, p21), 8);
    __m128i p32 = _mm_slli_epi64(_mm_unpacklo_epi32(p22, p23), 8);
    __m128i p33 = _mm_srli_epi64(_mm_unpackhi_epi32(p22, p23), 8);

    __m128i p40 = _mm_unpacklo_epi64(p30, p31);
    __m128i p41 = _mm_unpackhi_epi64(p30, p31);
    __m128i p42 = _mm_unpacklo_epi64(p32, p33);
    __m128i p43 = _mm_unpackhi_epi64(p32, p33);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p40, 2), _mm_slli_si128(p41, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p41, 6), _mm_slli_si128(p42, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p42, 10), _mm_slli_si128(p43, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 32), v2);
}

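// 4-channel interleave (store): the inverse of the 4-channel deinterleave
// above: two unpack passes rebuild a,b,c,d quadruples, and the results are
// stored in the order v0, v2, v1, v3, covering bytes 0-15, 16-31, 32-47
// and 48-63.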
inline void v_store_interleave( uchar* ptr, const v_uint8x16& a, const v_uint8x16& b,
                                const v_uint8x16& c, const v_uint8x16& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi8(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi8(a.val, c.val); // a8 c8 a9 c9 ...
    __m128i u2 = _mm_unpacklo_epi8(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi8(b.val, d.val); // b8 d8 b9 d9 ...

    __m128i v0 = _mm_unpacklo_epi8(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi8(u1, u3); // a8 b8 c8 d8 ...
    __m128i v2 = _mm_unpackhi_epi8(u0, u2); // a4 b4 c4 d4 ...
    __m128i v3 = _mm_unpackhi_epi8(u1, u3); // a12 b12 c12 d12 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
    _mm_storeu_si128((__m128i*)(ptr + 32), v1);
    _mm_storeu_si128((__m128i*)(ptr + 48), v3);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a,
                                const v_uint16x8& b,
                                const v_uint16x8& c )
{
    __m128i z = _mm_setzero_si128();
    __m128i ab0 = _mm_unpacklo_epi16(a.val, b.val);
    __m128i ab1 = _mm_unpackhi_epi16(a.val, b.val);
    __m128i c0 = _mm_unpacklo_epi16(c.val, z);
    __m128i c1 = _mm_unpackhi_epi16(c.val, z);

    __m128i p10 = _mm_unpacklo_epi32(ab0, c0);
    __m128i p11 = _mm_unpackhi_epi32(ab0, c0);
    __m128i p12 = _mm_unpacklo_epi32(ab1, c1);
    __m128i p13 = _mm_unpackhi_epi32(ab1, c1);

    __m128i p20 = _mm_unpacklo_epi64(p10, p11);
    __m128i p21 = _mm_unpackhi_epi64(p10, p11);
    __m128i p22 = _mm_unpacklo_epi64(p12, p13);
    __m128i p23 = _mm_unpackhi_epi64(p12, p13);

    p20 = _mm_slli_si128(p20, 2);
    p22 = _mm_slli_si128(p22, 2);

    __m128i p30 = _mm_unpacklo_epi64(p20, p21);
    __m128i p31 = _mm_unpackhi_epi64(p20, p21);
    __m128i p32 = _mm_unpacklo_epi64(p22, p23);
    __m128i p33 = _mm_unpackhi_epi64(p22, p23);

    __m128i v0 = _mm_or_si128(_mm_srli_si128(p30, 2), _mm_slli_si128(p31, 10));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(p31, 6), _mm_slli_si128(p32, 6));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(p32, 10), _mm_slli_si128(p33, 2));

    _mm_storeu_si128((__m128i*)(ptr), v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v1);
    _mm_storeu_si128((__m128i*)(ptr + 16), v2);
}

inline void v_store_interleave( ushort* ptr, const v_uint16x8& a, const v_uint16x8& b,
                                const v_uint16x8& c, const v_uint16x8& d)
{
    // a0 a1 a2 a3 ....
    // b0 b1 b2 b3 ....
    // c0 c1 c2 c3 ....
    // d0 d1 d2 d3 ....
    __m128i u0 = _mm_unpacklo_epi16(a.val, c.val); // a0 c0 a1 c1 ...
    __m128i u1 = _mm_unpackhi_epi16(a.val, c.val); // a4 c4 a5 c5 ...
    __m128i u2 = _mm_unpacklo_epi16(b.val, d.val); // b0 d0 b1 d1 ...
    __m128i u3 = _mm_unpackhi_epi16(b.val, d.val); // b4 d4 b5 d5 ...

    __m128i v0 = _mm_unpacklo_epi16(u0, u2); // a0 b0 c0 d0 ...
    __m128i v1 = _mm_unpacklo_epi16(u1, u3); // a4 b4 c4 d4 ...
    __m128i v2 = _mm_unpackhi_epi16(u0, u2); // a2 b2 c2 d2 ...
    __m128i v3 = _mm_unpackhi_epi16(u1, u3); // a6 b6 c6 d6 ...

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
    _mm_storeu_si128((__m128i*)(ptr + 16), v1);
    _mm_storeu_si128((__m128i*)(ptr + 24), v3);
}

inline void v_store_interleave( unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                                const v_uint32x4& c )
{
    v_uint32x4 z = v_setzero_u32(), u0, u1, u2, u3;
    v_transpose4x4(a, b, c, z, u0, u1, u2, u3);

    __m128i v0 = _mm_or_si128(u0.val, _mm_slli_si128(u1.val, 12));
    __m128i v1 = _mm_or_si128(_mm_srli_si128(u1.val, 4), _mm_slli_si128(u2.val, 8));
    __m128i v2 = _mm_or_si128(_mm_srli_si128(u2.val, 8), _mm_slli_si128(u3.val, 4));

    _mm_storeu_si128((__m128i*)ptr, v0);
    _mm_storeu_si128((__m128i*)(ptr + 4), v1);
    _mm_storeu_si128((__m128i*)(ptr + 8), v2);
}

inline void v_store_interleave(unsigned* ptr, const v_uint32x4& a, const v_uint32x4& b,
                               const v_uint32x4& c, const v_uint32x4& d)
{
    v_uint32x4 t0, t1, t2, t3;
    v_transpose4x4(a, b, c, d, t0, t1, t2, t3);
    v_store(ptr, t0);
    v_store(ptr + 4, t1);
    v_store(ptr + 8, t2);
    v_store(ptr + 12, t3);
}

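// Signed and floating-point element types reuse the unsigned implementations
// above: the macro reinterprets the inputs (bit-exact casts), calls the
// unsigned overload, and reinterprets the results back.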
#define OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(_Tpvec, _Tp, suffix, _Tpuvec, _Tpu, usuffix) \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0 ) \
{ \
    _Tpuvec a1, b1, c1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
} \
inline void v_load_deinterleave( const _Tp* ptr, _Tpvec& a0, \
                                 _Tpvec& b0, _Tpvec& c0, _Tpvec& d0 ) \
{ \
    _Tpuvec a1, b1, c1, d1; \
    v_load_deinterleave((const _Tpu*)ptr, a1, b1, c1, d1); \
    a0 = v_reinterpret_as_##suffix(a1); \
    b0 = v_reinterpret_as_##suffix(b1); \
    c0 = v_reinterpret_as_##suffix(c1); \
    d0 = v_reinterpret_as_##suffix(d1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, \
                               const _Tpvec& b0, const _Tpvec& c0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1); \
} \
inline void v_store_interleave( _Tp* ptr, const _Tpvec& a0, const _Tpvec& b0, \
                               const _Tpvec& c0, const _Tpvec& d0 ) \
{ \
    _Tpuvec a1 = v_reinterpret_as_##usuffix(a0); \
    _Tpuvec b1 = v_reinterpret_as_##usuffix(b0); \
    _Tpuvec c1 = v_reinterpret_as_##usuffix(c0); \
    _Tpuvec d1 = v_reinterpret_as_##usuffix(d0); \
    v_store_interleave((_Tpu*)ptr, a1, b1, c1, d1); \
}

OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int8x16, schar, s8, v_uint8x16, uchar, u8)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int16x8, short, s16, v_uint16x8, ushort, u16)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_int32x4, int, s32, v_uint32x4, unsigned, u32)
OPENCV_HAL_IMPL_SSE_LOADSTORE_INTERLEAVE(v_float32x4, float, f32, v_uint32x4, unsigned, u32)
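
// Usage sketch (illustrative only, not part of this header): split a packed
// BGR row into channels with v_load_deinterleave and write it back reordered
// with v_store_interleave. Assumes n is a multiple of 16 and that both
// buffers hold at least 3*n bytes.
//
//   void bgr2rgb_row(const uchar* src, uchar* dst, int n)
//   {
//       for( int i = 0; i < n; i += 16 )
//       {
//           v_uint8x16 b, g, r;
//           v_load_deinterleave(src + 3*i, b, g, r);
//           v_store_interleave(dst + 3*i, r, g, b);
//       }
//   }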

inline v_float32x4 v_cvt_f32(const v_int32x4& a)
{
    return v_float32x4(_mm_cvtepi32_ps(a.val));
}

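// The float64x2 conversions below touch only two lanes: _mm_cvtpd_ps writes
// the two converted floats to the low half of the result and zeroes the
// upper half, while _mm_cvtps_pd and _mm_cvtepi32_pd read only the low two
// lanes of their source.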
inline v_float32x4 v_cvt_f32(const v_float64x2& a)
{
    return v_float32x4(_mm_cvtpd_ps(a.val));
}

inline v_float64x2 v_cvt_f64(const v_int32x4& a)
{
    return v_float64x2(_mm_cvtepi32_pd(a.val));
}

inline v_float64x2 v_cvt_f64(const v_float32x4& a)
{
    return v_float64x2(_mm_cvtps_pd(a.val));
}

}

#endif