Home | History | Annotate | Download | only in hal
      1 /*M///////////////////////////////////////////////////////////////////////////////////////
      2 //
      3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      4 //
      5 //  By downloading, copying, installing or using the software you agree to this license.
      6 //  If you do not agree to this license, do not download, install,
      7 //  copy or use the software.
      8 //
      9 //
     10 //                          License Agreement
     11 //                For Open Source Computer Vision Library
     12 //
     13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
     14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
     15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
     16 // Copyright (C) 2015, Itseez Inc., all rights reserved.
     17 // Third party copyrights are property of their respective owners.
     18 //
     19 // Redistribution and use in source and binary forms, with or without modification,
     20 // are permitted provided that the following conditions are met:
     21 //
     22 //   * Redistribution's of source code must retain the above copyright notice,
     23 //     this list of conditions and the following disclaimer.
     24 //
     25 //   * Redistribution's in binary form must reproduce the above copyright notice,
     26 //     this list of conditions and the following disclaimer in the documentation
     27 //     and/or other materials provided with the distribution.
     28 //
     29 //   * The name of the copyright holders may not be used to endorse or promote products
     30 //     derived from this software without specific prior written permission.
     31 //
     32 // This software is provided by the copyright holders and contributors "as is" and
     33 // any express or implied warranties, including, but not limited to, the implied
     34 // warranties of merchantability and fitness for a particular purpose are disclaimed.
     35 // In no event shall the Intel Corporation or contributors be liable for any direct,
     36 // indirect, incidental, special, exemplary, or consequential damages
     37 // (including, but not limited to, procurement of substitute goods or services;
     38 // loss of use, data, or profits; or business interruption) however caused
     39 // and on any theory of liability, whether in contract, strict liability,
     40 // or tort (including negligence or otherwise) arising in any way out of
     41 // the use of this software, even if advised of the possibility of such damage.
     42 //
     43 //M*/
     44 
     45 #ifndef __OPENCV_HAL_INTRIN_CPP_HPP__
     46 #define __OPENCV_HAL_INTRIN_CPP_HPP__
     47 
     48 namespace cv
     49 {
     50 
     51 template<typename _Tp, int n> struct v_reg
     52 {
     53     typedef _Tp lane_type;
     54     typedef v_reg<typename V_TypeTraits<_Tp>::int_type, n> int_vec;
     55     typedef v_reg<typename V_TypeTraits<_Tp>::abs_type, n> abs_vec;
     56     enum { nlanes = n };
     57 
     58     explicit v_reg(const _Tp* ptr) { for( int i = 0; i < n; i++ ) s[i] = ptr[i]; }
     59     v_reg(_Tp s0, _Tp s1) { s[0] = s0; s[1] = s1; }
     60     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3) { s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3; }
     61     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
     62            _Tp s4, _Tp s5, _Tp s6, _Tp s7)
     63     {
     64         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
     65         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
     66     }
     67     v_reg(_Tp s0, _Tp s1, _Tp s2, _Tp s3,
     68            _Tp s4, _Tp s5, _Tp s6, _Tp s7,
     69            _Tp s8, _Tp s9, _Tp s10, _Tp s11,
     70            _Tp s12, _Tp s13, _Tp s14, _Tp s15)
     71     {
     72         s[0] = s0; s[1] = s1; s[2] = s2; s[3] = s3;
     73         s[4] = s4; s[5] = s5; s[6] = s6; s[7] = s7;
     74         s[8] = s8; s[9] = s9; s[10] = s10; s[11] = s11;
     75         s[12] = s12; s[13] = s13; s[14] = s14; s[15] = s15;
     76     }
     77 
     78     v_reg() {}
     79     v_reg(const v_reg<_Tp, n> & r)
     80     {
     81         for( int i = 0; i < n; i++ )
     82             s[i] = r.s[i];
     83     }
     84 
     85     _Tp get(const int i) const { return s[i]; }
     86     _Tp get0() const { return s[0]; }
     87     v_reg<_Tp, n> high() const
     88     {
     89         v_reg<_Tp, n> c;
     90         int i;
     91         for( i = 0; i < n/2; i++ )
     92         {
     93             c.s[i] = s[i+(n/2)];
     94             c.s[i+(n/2)] = 0;
     95         }
     96         return c;
     97     }
     98 
     99     static v_reg<_Tp, n> zero()
    100     {
    101         v_reg<_Tp, n> c;
    102         for( int i = 0; i < n; i++ )
    103             c.s[i] = (_Tp)0;
    104         return c;
    105     }
    106 
    107     static v_reg<_Tp, n> all(_Tp s)
    108     {
    109         v_reg<_Tp, n> c;
    110         for( int i = 0; i < n; i++ )
    111             c.s[i] = s;
    112         return c;
    113     }
    114 
    115     template<typename _Tp2, int n2> v_reg<_Tp2, n2> reinterpret_as() const
    116     {
    117         size_t bytes = std::min(sizeof(_Tp2)*n2, sizeof(_Tp)*n);
    118         v_reg<_Tp2, n2> c;
    119         memcpy(&c.s[0], &s[0], bytes);
    120         return c;
    121     }
    122 
    123     _Tp s[n];
    124 };
    125 
    126 #define OPENCV_HAL_IMPL_BIN_OP(bin_op) \
    127 template<typename _Tp, int n> inline v_reg<_Tp, n> \
    128     operator bin_op (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    129 { \
    130     v_reg<_Tp, n> c; \
    131     for( int i = 0; i < n; i++ ) \
    132         c.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    133     return c; \
    134 } \
    135 template<typename _Tp, int n> inline v_reg<_Tp, n>& \
    136     operator bin_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    137 { \
    138     for( int i = 0; i < n; i++ ) \
    139         a.s[i] = saturate_cast<_Tp>(a.s[i] bin_op b.s[i]); \
    140     return a; \
    141 }
    142 
    143 OPENCV_HAL_IMPL_BIN_OP(+)
    144 OPENCV_HAL_IMPL_BIN_OP(-)
    145 OPENCV_HAL_IMPL_BIN_OP(*)
    146 OPENCV_HAL_IMPL_BIN_OP(/)
    147 
    148 #define OPENCV_HAL_IMPL_BIT_OP(bit_op) \
    149 template<typename _Tp, int n> inline v_reg<_Tp, n> operator bit_op \
    150     (const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    151 { \
    152     v_reg<_Tp, n> c; \
    153     typedef typename V_TypeTraits<_Tp>::int_type itype; \
    154     for( int i = 0; i < n; i++ ) \
    155         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
    156                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    157     return c; \
    158 } \
    159 template<typename _Tp, int n> inline v_reg<_Tp, n>& operator \
    160     bit_op##= (v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    161 { \
    162     typedef typename V_TypeTraits<_Tp>::int_type itype; \
    163     for( int i = 0; i < n; i++ ) \
    164         a.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) bit_op \
    165                                                         V_TypeTraits<_Tp>::reinterpret_int(b.s[i]))); \
    166     return a; \
    167 }
    168 
    169 OPENCV_HAL_IMPL_BIT_OP(&)
    170 OPENCV_HAL_IMPL_BIT_OP(|)
    171 OPENCV_HAL_IMPL_BIT_OP(^)
    172 
    173 template<typename _Tp, int n> inline v_reg<_Tp, n> operator ~ (const v_reg<_Tp, n>& a)
    174 {
    175     v_reg<_Tp, n> c;
    176     for( int i = 0; i < n; i++ )
    177         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int(~V_TypeTraits<_Tp>::reinterpret_int(a.s[i]));
    178         return c;
    179 }
    180 
    181 #define OPENCV_HAL_IMPL_MATH_FUNC(func, cfunc, _Tp2) \
    182 template<typename _Tp, int n> inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a) \
    183 { \
    184     v_reg<_Tp2, n> c; \
    185     for( int i = 0; i < n; i++ ) \
    186         c.s[i] = cfunc(a.s[i]); \
    187     return c; \
    188 }
    189 
    190 OPENCV_HAL_IMPL_MATH_FUNC(v_sqrt, std::sqrt, _Tp)
    191 OPENCV_HAL_IMPL_MATH_FUNC(v_sin, std::sin, _Tp)
    192 OPENCV_HAL_IMPL_MATH_FUNC(v_cos, std::cos, _Tp)
    193 OPENCV_HAL_IMPL_MATH_FUNC(v_exp, std::exp, _Tp)
    194 OPENCV_HAL_IMPL_MATH_FUNC(v_log, std::log, _Tp)
    195 OPENCV_HAL_IMPL_MATH_FUNC(v_abs, (typename V_TypeTraits<_Tp>::abs_type)std::abs,
    196                           typename V_TypeTraits<_Tp>::abs_type)
    197 OPENCV_HAL_IMPL_MATH_FUNC(v_round, cvRound, int)
    198 OPENCV_HAL_IMPL_MATH_FUNC(v_floor, cvFloor, int)
    199 OPENCV_HAL_IMPL_MATH_FUNC(v_ceil, cvCeil, int)
    200 OPENCV_HAL_IMPL_MATH_FUNC(v_trunc, int, int)
    201 
    202 #define OPENCV_HAL_IMPL_MINMAX_FUNC(func, hfunc, cfunc) \
    203 template<typename _Tp, int n> inline v_reg<_Tp, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    204 { \
    205     v_reg<_Tp, n> c; \
    206     for( int i = 0; i < n; i++ ) \
    207         c.s[i] = cfunc(a.s[i], b.s[i]); \
    208     return c; \
    209 } \
    210 template<typename _Tp, int n> inline _Tp hfunc(const v_reg<_Tp, n>& a) \
    211 { \
    212     _Tp c = a.s[0]; \
    213     for( int i = 1; i < n; i++ ) \
    214         c = cfunc(c, a.s[i]); \
    215     return c; \
    216 }
    217 
    218 OPENCV_HAL_IMPL_MINMAX_FUNC(v_min, v_reduce_min, std::min)
    219 OPENCV_HAL_IMPL_MINMAX_FUNC(v_max, v_reduce_max, std::max)
    220 
    221 template<typename _Tp, int n>
    222 inline void v_minmax( const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
    223                       v_reg<_Tp, n>& minval, v_reg<_Tp, n>& maxval )
    224 {
    225     for( int i = 0; i < n; i++ )
    226     {
    227         minval.s[i] = std::min(a.s[i], b.s[i]);
    228         maxval.s[i] = std::max(a.s[i], b.s[i]);
    229     }
    230 }
    231 
    232 
    233 #define OPENCV_HAL_IMPL_CMP_OP(cmp_op) \
    234 template<typename _Tp, int n> \
    235 inline v_reg<_Tp, n> operator cmp_op(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    236 { \
    237     typedef typename V_TypeTraits<_Tp>::int_type itype; \
    238     v_reg<_Tp, n> c; \
    239     for( int i = 0; i < n; i++ ) \
    240         c.s[i] = V_TypeTraits<_Tp>::reinterpret_from_int((itype)-(int)(a.s[i] cmp_op b.s[i])); \
    241     return c; \
    242 }
    243 
    244 OPENCV_HAL_IMPL_CMP_OP(<)
    245 OPENCV_HAL_IMPL_CMP_OP(>)
    246 OPENCV_HAL_IMPL_CMP_OP(<=)
    247 OPENCV_HAL_IMPL_CMP_OP(>=)
    248 OPENCV_HAL_IMPL_CMP_OP(==)
    249 OPENCV_HAL_IMPL_CMP_OP(!=)
    250 
    251 #define OPENCV_HAL_IMPL_ADD_SUB_OP(func, bin_op, cast_op, _Tp2) \
    252 template<typename _Tp, int n> \
    253 inline v_reg<_Tp2, n> func(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b) \
    254 { \
    255     typedef _Tp2 rtype; \
    256     v_reg<rtype, n> c; \
    257     for( int i = 0; i < n; i++ ) \
    258         c.s[i] = cast_op(a.s[i] bin_op b.s[i]); \
    259     return c; \
    260 }
    261 
    262 OPENCV_HAL_IMPL_ADD_SUB_OP(v_add_wrap, +, (_Tp), _Tp)
    263 OPENCV_HAL_IMPL_ADD_SUB_OP(v_sub_wrap, -, (_Tp), _Tp)
    264 OPENCV_HAL_IMPL_ADD_SUB_OP(v_absdiff, -, (rtype)std::abs, typename V_TypeTraits<_Tp>::abs_type)
    265 
    266 template<typename _Tp, int n>
    267 inline v_reg<_Tp, n> v_invsqrt(const v_reg<_Tp, n>& a)
    268 {
    269     v_reg<_Tp, n> c;
    270     for( int i = 0; i < n; i++ )
    271         c.s[i] = 1.f/std::sqrt(a.s[i]);
    272     return c;
    273 }
    274 
    275 template<typename _Tp, int n>
    276 inline v_reg<_Tp, n> v_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
    277 {
    278     v_reg<_Tp, n> c;
    279     for( int i = 0; i < n; i++ )
    280         c.s[i] = std::sqrt(a.s[i]*a.s[i] + b.s[i]*b.s[i]);
    281     return c;
    282 }
    283 
    284 
    285 template<typename _Tp, int n>
    286 inline v_reg<_Tp, n> v_sqr_magnitude(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
    287 {
    288     v_reg<_Tp, n> c;
    289     for( int i = 0; i < n; i++ )
    290         c.s[i] = a.s[i]*a.s[i] + b.s[i]*b.s[i];
    291     return c;
    292 }
    293 
    294 template<typename _Tp, int n>
    295 inline v_reg<_Tp, n> v_muladd(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
    296                               const v_reg<_Tp, n>& c)
    297 {
    298     v_reg<_Tp, n> d;
    299     for( int i = 0; i < n; i++ )
    300         d.s[i] = a.s[i]*b.s[i] + c.s[i];
    301     return d;
    302 }
    303 
    304 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>
    305     v_dotprod(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
    306 {
    307     typedef typename V_TypeTraits<_Tp>::w_type w_type;
    308     v_reg<w_type, n/2> c;
    309     for( int i = 0; i < (n/2); i++ )
    310         c.s[i] = (w_type)a.s[i*2]*b.s[i*2] + (w_type)a.s[i*2+1]*b.s[i*2+1];
    311     return c;
    312 }
    313 
    314 template<typename _Tp, int n> inline void v_mul_expand(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
    315                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c,
    316                                                        v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& d)
    317 {
    318     typedef typename V_TypeTraits<_Tp>::w_type w_type;
    319     for( int i = 0; i < (n/2); i++ )
    320     {
    321         c.s[i] = (w_type)a.s[i]*b.s[i]*2;
    322         d.s[i] = (w_type)a.s[i+(n/2)]*b.s[i+(n/2)];
    323     }
    324 }
    325 
    326 template<typename _Tp, int n> inline void v_hsum(const v_reg<_Tp, n>& a,
    327                                                  v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& c)
    328 {
    329     typedef typename V_TypeTraits<_Tp>::w_type w_type;
    330     for( int i = 0; i < (n/2); i++ )
    331     {
    332         c.s[i] = (w_type)a.s[i*2] + a.s[i*2+1];
    333     }
    334 }
    335 
    336 #define OPENCV_HAL_IMPL_SHIFT_OP(shift_op) \
    337 template<typename _Tp, int n> inline v_reg<_Tp, n> operator shift_op(const v_reg<_Tp, n>& a, int imm) \
    338 { \
    339     v_reg<_Tp, n> c; \
    340     for( int i = 0; i < n; i++ ) \
    341         c.s[i] = (_Tp)(a.s[i] shift_op imm); \
    342     return c; \
    343 }
    344 
    345 OPENCV_HAL_IMPL_SHIFT_OP(<<)
    346 OPENCV_HAL_IMPL_SHIFT_OP(>>)
    347 
    348 template<typename _Tp, int n> inline typename V_TypeTraits<_Tp>::sum_type v_reduce_sum(const v_reg<_Tp, n>& a)
    349 {
    350     typename V_TypeTraits<_Tp>::sum_type c = a.s[0];
    351     for( int i = 1; i < n; i++ )
    352         c += a.s[i];
    353     return c;
    354 }
    355 
    356 template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
    357 {
    358     int mask = 0;
    359     for( int i = 0; i < n; i++ )
    360         mask |= (V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0) << i;
    361     return mask;
    362 }
    363 
    364 template<typename _Tp, int n> inline bool v_check_all(const v_reg<_Tp, n>& a)
    365 {
    366     for( int i = 0; i < n; i++ )
    367         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) >= 0 )
    368             return false;
    369     return true;
    370 }
    371 
    372 template<typename _Tp, int n> inline bool v_check_any(const v_reg<_Tp, n>& a)
    373 {
    374     for( int i = 0; i < n; i++ )
    375         if( V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0 )
    376             return true;
    377     return false;
    378 }
    379 
    380 template<typename _Tp, int n> inline v_reg<_Tp, n> v_select(const v_reg<_Tp, n>& mask,
    381                                                            const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
    382 {
    383     v_reg<_Tp, n> c;
    384     for( int i = 0; i < n; i++ )
    385         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(mask.s[i]) < 0 ? b.s[i] : a.s[i];
    386     return c;
    387 }
    388 
    389 template<typename _Tp, int n> inline void v_expand(const v_reg<_Tp, n>& a,
    390                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b0,
    391                             v_reg<typename V_TypeTraits<_Tp>::w_type, n/2>& b1)
    392 {
    393     for( int i = 0; i < (n/2); i++ )
    394     {
    395         b0.s[i] = a.s[i];
    396         b1.s[i] = a.s[i+(n/2)];
    397     }
    398 }
    399 
    400 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::int_type, n>
    401     v_reinterpret_as_int(const v_reg<_Tp, n>& a)
    402 {
    403     v_reg<typename V_TypeTraits<_Tp>::int_type, n> c;
    404     for( int i = 0; i < n; i++ )
    405         c.s[i] = V_TypeTraits<_Tp>::reinterpret_int(a.s[i]);
    406     return c;
    407 }
    408 
    409 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::uint_type, n>
    410     v_reinterpret_as_uint(const v_reg<_Tp, n>& a)
    411 {
    412     v_reg<typename V_TypeTraits<_Tp>::uint_type, n> c;
    413     for( int i = 0; i < n; i++ )
    414         c.s[i] = V_TypeTraits<_Tp>::reinterpret_uint(a.s[i]);
    415     return c;
    416 }
    417 
    418 template<typename _Tp, int n> inline void v_zip( const v_reg<_Tp, n>& a0, const v_reg<_Tp, n>& a1,
    419                                                v_reg<_Tp, n>& b0, v_reg<_Tp, n>& b1 )
    420 {
    421     int i;
    422     for( i = 0; i < n/2; i++ )
    423     {
    424         b0.s[i*2] = a0.s[i];
    425         b0.s[i*2+1] = a1.s[i];
    426     }
    427     for( ; i < n; i++ )
    428     {
    429         b1.s[i*2-n] = a0.s[i];
    430         b1.s[i*2-n+1] = a1.s[i];
    431     }
    432 }
    433 
    434 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load(const _Tp* ptr)
    435 {
    436     return v_reg<_Tp, n>(ptr);
    437 }
    438 
    439 template<typename _Tp, int n> inline v_reg<_Tp, n> v_load_aligned(const _Tp* ptr)
    440 {
    441     return v_reg<_Tp, n>(ptr);
    442 }
    443 
    444 template<typename _Tp, int n> inline void v_load_halves(const _Tp* loptr, const _Tp* hiptr)
    445 {
    446     v_reg<_Tp, n> c;
    447     for( int i = 0; i < n/2; i++ )
    448     {
    449         c.s[i] = loptr[i];
    450         c.s[i+n/2] = hiptr[i];
    451     }
    452     return c;
    453 }
    454 
    455 template<typename _Tp, int n> inline v_reg<typename V_TypeTraits<_Tp>::w_type, n> v_load_expand(const _Tp* ptr)
    456 {
    457     typedef typename V_TypeTraits<_Tp>::w_type w_type;
    458     v_reg<w_type, n> c;
    459     for( int i = 0; i < n; i++ )
    460     {
    461         c.s[i] = ptr[i];
    462     }
    463     return c;
    464 }
    465 
    466 template<typename _Tp, int n> inline v_reg<typename
    467     V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type, n> v_load_expand_q(const _Tp* ptr)
    468 {
    469     typedef typename V_TypeTraits<typename V_TypeTraits<_Tp>::w_type>::w_type w_type;
    470     v_reg<w_type, n> c;
    471     for( int i = 0; i < n; i++ )
    472     {
    473         c.s[i] = ptr[i];
    474     }
    475     return c;
    476 }
    477 
    478 template<typename _Tp, int n> inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
    479                                                             v_reg<_Tp, n>& b, v_reg<_Tp, n>& c)
    480 {
    481     int i, i3;
    482     for( i = i3 = 0; i < n; i++, i3 += 3 )
    483     {
    484         a.s[i] = ptr[i3];
    485         b.s[i] = ptr[i3+1];
    486         c.s[i] = ptr[i3+2];
    487     }
    488 }
    489 
    490 template<typename _Tp, int n>
    491 inline void v_load_deinterleave(const _Tp* ptr, v_reg<_Tp, n>& a,
    492                                 v_reg<_Tp, n>& b, v_reg<_Tp, n>& c,
    493                                 v_reg<_Tp, n>& d)
    494 {
    495     int i, i4;
    496     for( i = i4 = 0; i < n; i++, i4 += 4 )
    497     {
    498         a.s[i] = ptr[i4];
    499         b.s[i] = ptr[i4+1];
    500         c.s[i] = ptr[i4+2];
    501         d.s[i] = ptr[i4+3];
    502     }
    503 }
    504 
    505 template<typename _Tp, int n>
    506 inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
    507                                 const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c)
    508 {
    509     int i, i3;
    510     for( i = i3 = 0; i < n; i++, i3 += 3 )
    511     {
    512         ptr[i3] = a.s[i];
    513         ptr[i3+1] = b.s[i];
    514         ptr[i3+2] = c.s[i];
    515     }
    516 }
    517 
    518 template<typename _Tp, int n> inline void v_store_interleave( _Tp* ptr, const v_reg<_Tp, n>& a,
    519                                                             const v_reg<_Tp, n>& b, const v_reg<_Tp, n>& c,
    520                                                             const v_reg<_Tp, n>& d)
    521 {
    522     int i, i4;
    523     for( i = i4 = 0; i < n; i++, i4 += 4 )
    524     {
    525         ptr[i4] = a.s[i];
    526         ptr[i4+1] = b.s[i];
    527         ptr[i4+2] = c.s[i];
    528         ptr[i4+3] = d.s[i];
    529     }
    530 }
    531 
    532 template<typename _Tp, int n>
    533 inline void v_store(_Tp* ptr, const v_reg<_Tp, n>& a)
    534 {
    535     for( int i = 0; i < n; i++ )
    536         ptr[i] = a.s[i];
    537 }
    538 
    539 template<typename _Tp, int n>
    540 inline void v_store_low(_Tp* ptr, const v_reg<_Tp, n>& a)
    541 {
    542     for( int i = 0; i < (n/2); i++ )
    543         ptr[i] = a.s[i];
    544 }
    545 
    546 template<typename _Tp, int n>
    547 inline void v_store_high(_Tp* ptr, const v_reg<_Tp, n>& a)
    548 {
    549     for( int i = 0; i < (n/2); i++ )
    550         ptr[i] = a.s[i+(n/2)];
    551 }
    552 
    553 template<typename _Tp, int n>
    554 inline void v_store_aligned(_Tp* ptr, const v_reg<_Tp, n>& a)
    555 {
    556     for( int i = 0; i < n; i++ )
    557         ptr[i] = a.s[i];
    558 }
    559 
    560 template<typename _Tp, int n>
    561 inline v_reg<_Tp, n> v_combine_low(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
    562 {
    563     v_reg<_Tp, n> c;
    564     for( int i = 0; i < (n/2); i++ )
    565     {
    566         c.s[i] = a.s[i];
    567         c.s[i+(n/2)] = b.s[i];
    568     }
    569 }
    570 
    571 template<typename _Tp, int n>
    572 inline v_reg<_Tp, n> v_combine_high(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b)
    573 {
    574     v_reg<_Tp, n> c;
    575     for( int i = 0; i < (n/2); i++ )
    576     {
    577         c.s[i] = a.s[i+(n/2)];
    578         c.s[i+(n/2)] = b.s[i+(n/2)];
    579     }
    580 }
    581 
    582 template<typename _Tp, int n>
    583 inline void v_recombine(const v_reg<_Tp, n>& a, const v_reg<_Tp, n>& b,
    584                         v_reg<_Tp, n>& low, v_reg<_Tp, n>& high)
    585 {
    586     for( int i = 0; i < (n/2); i++ )
    587     {
    588         low.s[i] = a.s[i];
    589         low.s[i+(n/2)] = b.s[i];
    590         high.s[i] = a.s[i+(n/2)];
    591         high.s[i+(n/2)] = b.s[i+(n/2)];
    592     }
    593 }
    594 
    595 template<int n> inline v_reg<int, n> v_round(const v_reg<float, n>& a)
    596 {
    597     v_reg<int, n> c;
    598     for( int i = 0; i < n; i++ )
    599         c.s[i] = cvRound(a.s[i]);
    600     return c;
    601 }
    602 
    603 template<int n> inline v_reg<int, n> v_floor(const v_reg<float, n>& a)
    604 {
    605     v_reg<int, n> c;
    606     for( int i = 0; i < n; i++ )
    607         c.s[i] = cvFloor(a.s[i]);
    608     return c;
    609 }
    610 
    611 template<int n> inline v_reg<int, n> v_ceil(const v_reg<float, n>& a)
    612 {
    613     v_reg<int, n> c;
    614     for( int i = 0; i < n; i++ )
    615         c.s[i] = cvCeil(a.s[i]);
    616     return c;
    617 }
    618 
    619 template<int n> inline v_reg<int, n> v_trunc(const v_reg<float, n>& a)
    620 {
    621     v_reg<int, n> c;
    622     for( int i = 0; i < n; i++ )
    623         c.s[i] = (int)(a.s[i]);
    624     return c;
    625 }
    626 
    627 template<int n> inline v_reg<int, n*2> v_round(const v_reg<double, n>& a)
    628 {
    629     v_reg<int, n*2> c;
    630     for( int i = 0; i < n; i++ )
    631     {
    632         c.s[i] = cvRound(a.s[i]);
    633         c.s[i+n] = 0;
    634     }
    635     return c;
    636 }
    637 
    638 template<int n> inline v_reg<int, n*2> v_floor(const v_reg<double, n>& a)
    639 {
    640     v_reg<int, n> c;
    641     for( int i = 0; i < n; i++ )
    642     {
    643         c.s[i] = cvFloor(a.s[i]);
    644         c.s[i+n] = 0;
    645     }
    646     return c;
    647 }
    648 
    649 template<int n> inline v_reg<int, n*2> v_ceil(const v_reg<double, n>& a)
    650 {
    651     v_reg<int, n> c;
    652     for( int i = 0; i < n; i++ )
    653     {
    654         c.s[i] = cvCeil(a.s[i]);
    655         c.s[i+n] = 0;
    656     }
    657     return c;
    658 }
    659 
    660 template<int n> inline v_reg<int, n*2> v_trunc(const v_reg<double, n>& a)
    661 {
    662     v_reg<int, n> c;
    663     for( int i = 0; i < n; i++ )
    664     {
    665         c.s[i] = cvCeil(a.s[i]);
    666         c.s[i+n] = 0;
    667     }
    668     return c;
    669 }
    670 
    671 template<int n> inline v_reg<float, n> v_cvt_f32(const v_reg<int, n>& a)
    672 {
    673     v_reg<float, n> c;
    674     for( int i = 0; i < n; i++ )
    675         c.s[i] = (float)a.s[i];
    676     return c;
    677 }
    678 
    679 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<int, n*2>& a)
    680 {
    681     v_reg<double, n> c;
    682     for( int i = 0; i < n; i++ )
    683         c.s[i] = (double)a.s[i];
    684     return c;
    685 }
    686 
    687 template<int n> inline v_reg<double, n> v_cvt_f64(const v_reg<float, n*2>& a)
    688 {
    689     v_reg<double, n> c;
    690     for( int i = 0; i < n; i++ )
    691         c.s[i] = (double)a.s[i];
    692     return c;
    693 }
    694 
    695 template<typename _Tp>
    696 inline void v_transpose4x4( v_reg<_Tp, 4>& a0, const v_reg<_Tp, 4>& a1,
    697                             const v_reg<_Tp, 4>& a2, const v_reg<_Tp, 4>& a3,
    698                             v_reg<_Tp, 4>& b0, v_reg<_Tp, 4>& b1,
    699                             v_reg<_Tp, 4>& b2, v_reg<_Tp, 4>& b3 )
    700 {
    701     b0 = v_reg<_Tp, 4>(a0.s[0], a1.s[0], a2.s[0], a3.s[0]);
    702     b1 = v_reg<_Tp, 4>(a0.s[1], a1.s[1], a2.s[1], a3.s[1]);
    703     b2 = v_reg<_Tp, 4>(a0.s[2], a1.s[2], a2.s[2], a3.s[2]);
    704     b3 = v_reg<_Tp, 4>(a0.s[3], a1.s[3], a2.s[3], a3.s[3]);
    705 }
    706 
    707 typedef v_reg<uchar, 16> v_uint8x16;
    708 typedef v_reg<schar, 16> v_int8x16;
    709 typedef v_reg<ushort, 8> v_uint16x8;
    710 typedef v_reg<short, 8> v_int16x8;
    711 typedef v_reg<unsigned, 4> v_uint32x4;
    712 typedef v_reg<int, 4> v_int32x4;
    713 typedef v_reg<float, 4> v_float32x4;
    714 typedef v_reg<float, 8> v_float32x8;
    715 typedef v_reg<double, 2> v_float64x2;
    716 typedef v_reg<uint64, 2> v_uint64x2;
    717 typedef v_reg<int64, 2> v_int64x2;
    718 
    719 #define OPENCV_HAL_IMPL_C_INIT(_Tpvec, _Tp, suffix) \
    720 inline _Tpvec v_setzero_##suffix() { return _Tpvec::zero(); } \
    721 inline _Tpvec v_setall_##suffix(_Tp val) { return _Tpvec::all(val); } \
    722 template<typename _Tp0, int n0> inline _Tpvec \
    723     v_reinterpret_as_##suffix(const v_reg<_Tp0, n0>& a) \
    724 { return a.template reinterpret_as<_Tp, _Tpvec::nlanes>(a); }
    725 
    726 OPENCV_HAL_IMPL_C_INIT(v_uint8x16, uchar, u8)
    727 OPENCV_HAL_IMPL_C_INIT(v_int8x16, schar, s8)
    728 OPENCV_HAL_IMPL_C_INIT(v_uint16x8, ushort, u16)
    729 OPENCV_HAL_IMPL_C_INIT(v_int16x8, short, s16)
    730 OPENCV_HAL_IMPL_C_INIT(v_uint32x4, unsigned, u32)
    731 OPENCV_HAL_IMPL_C_INIT(v_int32x4, int, s32)
    732 OPENCV_HAL_IMPL_C_INIT(v_float32x4, float, f32)
    733 OPENCV_HAL_IMPL_C_INIT(v_float64x2, double, f64)
    734 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, uint64, u64)
    735 OPENCV_HAL_IMPL_C_INIT(v_uint64x2, int64, s64)
    736 
    737 #define OPENCV_HAL_IMPL_C_SHIFT(_Tpvec, _Tp) \
    738 template<int n> inline _Tpvec v_shl(const _Tpvec& a) \
    739 { return a << n; } \
    740 template<int n> inline _Tpvec v_shr(const _Tpvec& a) \
    741 { return a >> n; } \
    742 template<int n> inline _Tpvec v_rshr(const _Tpvec& a) \
    743 { \
    744     _Tpvec c; \
    745     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    746         c.s[i] = (_Tp)((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
    747     return c; \
    748 }
    749 
    750 OPENCV_HAL_IMPL_C_SHIFT(v_uint16x8, ushort)
    751 OPENCV_HAL_IMPL_C_SHIFT(v_int16x8, short)
    752 OPENCV_HAL_IMPL_C_SHIFT(v_uint32x4, unsigned)
    753 OPENCV_HAL_IMPL_C_SHIFT(v_int32x4, int)
    754 OPENCV_HAL_IMPL_C_SHIFT(v_uint64x2, uint64)
    755 OPENCV_HAL_IMPL_C_SHIFT(v_int64x2, int64)
    756 
    757 
    758 #define OPENCV_HAL_IMPL_C_PACK(_Tpvec, _Tp, _Tpnvec, _Tpn, pack_suffix) \
    759 inline _Tpnvec v_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
    760 { \
    761     _Tpnvec c; \
    762     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    763     { \
    764         c.s[i] = saturate_cast<_Tpn>(a.s[i]); \
    765         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>(b.s[i]); \
    766     } \
    767     return c; \
    768 } \
    769 template<int n> inline _Tpnvec v_rshr_##pack_suffix(const _Tpvec& a, const _Tpvec& b) \
    770 { \
    771     _Tpnvec c; \
    772     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    773     { \
    774         c.s[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
    775         c.s[i+_Tpvec::nlanes] = saturate_cast<_Tpn>((b.s[i] + ((_Tp)1 << (n - 1))) >> n); \
    776     } \
    777     return c; \
    778 } \
    779 inline void v_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
    780 { \
    781     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    782         ptr[i] = saturate_cast<_Tpn>(a.s[i]); \
    783 } \
    784 template<int n> inline void v_rshr_##pack_suffix##_store(_Tpn* ptr, const _Tpvec& a) \
    785 { \
    786     for( int i = 0; i < _Tpvec::nlanes; i++ ) \
    787         ptr[i] = saturate_cast<_Tpn>((a.s[i] + ((_Tp)1 << (n - 1))) >> n); \
    788 }
    789 
    790 OPENCV_HAL_IMPL_C_PACK(v_uint16x8, ushort, v_uint8x16, uchar, pack)
    791 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_int8x16, schar, pack)
    792 OPENCV_HAL_IMPL_C_PACK(v_int16x8, short, v_uint8x16, uchar, pack_u)
    793 OPENCV_HAL_IMPL_C_PACK(v_uint32x4, unsigned, v_uint16x8, ushort, pack)
    794 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_int16x8, short, pack)
    795 OPENCV_HAL_IMPL_C_PACK(v_int32x4, int, v_uint16x8, ushort, pack_u)
    796 OPENCV_HAL_IMPL_C_PACK(v_uint64x2, uint64, v_uint32x4, unsigned, pack)
    797 OPENCV_HAL_IMPL_C_PACK(v_int64x2, int64, v_int32x4, int, pack)
    798 
    799 inline v_float32x4 v_matmul(const v_float32x4& v, const v_float32x4& m0,
    800                             const v_float32x4& m1, const v_float32x4& m2,
    801                             const v_float32x4& m3)
    802 {
    803     return v_float32x4(v.s[0]*m0.s[0] + v.s[1]*m1.s[0] + v.s[2]*m2.s[0] + v.s[3]*m3.s[0],
    804                        v.s[0]*m0.s[1] + v.s[1]*m1.s[1] + v.s[2]*m2.s[1] + v.s[3]*m3.s[1],
    805                        v.s[0]*m0.s[2] + v.s[1]*m1.s[2] + v.s[2]*m2.s[2] + v.s[3]*m3.s[2],
    806                        v.s[0]*m0.s[3] + v.s[1]*m1.s[3] + v.s[2]*m2.s[3] + v.s[3]*m3.s[3]);
    807 }
    808 
    809 }
    810 
    811 #endif
    812