      1 //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina (at) intel.com
      2 
      3 //*** Copyright (C) 2012-2014 Intel Corporation.  All rights reserved.
      4 
      5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      6 
      7 //By downloading, copying, installing or using the software you agree to this license.
      8 //If you do not agree to this license, do not download, install, copy or use the software.
      9 
     10 //                              License Agreement
     11 
     12 //Permission to use, copy, modify, and/or distribute this software for any
     13 //purpose with or without fee is hereby granted, provided that the above
     14 //copyright notice and this permission notice appear in all copies.
     15 
     16 //THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
     17 //REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
     18 //AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
     19 //INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
     20 //LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
     21 //OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
     22 //PERFORMANCE OF THIS SOFTWARE.
     23 
     24 //*****************************************************************************************
     25 // This file is intended to simplify ARM->IA32 porting
     26 // It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
     27 // and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
     28 // MMX instruction set is not used due to performance overhead and the necessity to use the
     29 // EMMS instruction (_mm_empty())for mmx-x87 floating point switching
     30 //*****************************************************************************************
     31 
      32 //!!!!!!!  To use this file in your project that uses ARM NEON intrinsics just keep arm_neon.h included and compile it as usual.
      33 //!!!!!!!  Please pay attention to USE_SSE4 below - you need to define it for the newest Intel platforms to get
      34 //!!!!!!!  greater performance. It can be done with the -msse4.2 compiler switch.
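//!!!!!!!  Illustrative command lines only (adjust the file names and flags to your actual build):
//!!!!!!!      gcc -O2 -msse4.2 -c my_neon_code.c     (gcc/clang: -msse4.2 defines __SSE4_2__, so USE_SSE4 is set automatically below)
//!!!!!!!      cl /O2 /DUSE_SSE4 /c my_neon_code.c    (MSVC: define USE_SSE4 explicitly)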
     35 
     36 #ifndef NEON2SSE_H
     37 #define NEON2SSE_H
     38 
     39 #ifndef USE_SSE4
     40 #if defined(__SSE4_2__)
     41     #define USE_SSE4
     42 #endif
     43 #endif
     44 
     45 #include <xmmintrin.h>     //SSE
     46 #include <emmintrin.h>     //SSE2
     47 #include <pmmintrin.h>     //SSE3
     48 #include <tmmintrin.h>     //SSSE3
     49 #ifdef USE_SSE4
     50 #include <smmintrin.h> //SSE4.1
     51 #include <nmmintrin.h> //SSE4.2
     52 #endif
     53 
     54 
     55 //***************  functions and data attributes, compiler dependent  *********************************
     56 //***********************************************************************************
     57 #ifdef __GNUC__
     58 #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
     59 #define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
     60 #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     61 #if _GCC_VERSION <  40500
     62     #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
     63 #else
     64     #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
     65 #endif
     66 #if defined(__x86_64__)
     67     #define _NEON2SSE_64BIT  __x86_64__
     68 #endif
     69 #else
     70 #define _NEON2SSE_ALIGN_16  __declspec(align(16))
     71 #define _NEON2SSE_INLINE __inline
     72 #if defined(_MSC_VER)|| defined (__INTEL_COMPILER)
     73     #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
     74 #if defined(_M_X64)
     75         #define _NEON2SSE_64BIT  _M_X64
     76 #endif
     77 #else
     78     #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
     79 #endif
     80 #endif
     81 
     82 #if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
     83     #define _NEON2SSE_64BIT_SSE4
     84 #endif
     85 
     86 /*********************************************************************************************************************/
     87 //    data types conversion
     88 /*********************************************************************************************************************/
     89 #if defined(_MSC_VER) && (_MSC_VER < 1300)
     90     typedef signed char int8_t;
     91     typedef unsigned char uint8_t;
     92     typedef signed short int16_t;
     93     typedef unsigned short uint16_t;
     94     typedef signed int int32_t;
     95     typedef unsigned int uint32_t;
     96     typedef signed long long int64_t;
     97     typedef unsigned long long uint64_t;
     98 #elif defined(_MSC_VER)
     99     typedef signed __int8 int8_t;
    100     typedef unsigned __int8 uint8_t;
    101     typedef signed __int16 int16_t;
    102     typedef unsigned __int16 uint16_t;
    103     typedef signed __int32 int32_t;
    104     typedef unsigned __int32 uint32_t;
    105 
    106     typedef signed long long int64_t;
    107     typedef unsigned long long uint64_t;
    108 #else
    109 #include <stdint.h>
    110 #include <limits.h>
    111 #endif
    112 
    113 typedef union   __m64_128 {
    114     uint64_t m64_u64[1];
    115     float m64_f32[2];
    116     int8_t m64_i8[8];
    117     int16_t m64_i16[4];
    118     int32_t m64_i32[2];
    119     int64_t m64_i64[1];
    120     uint8_t m64_u8[8];
    121     uint16_t m64_u16[4];
    122     uint32_t m64_u32[2];
    123 } __m64_128;
    124 
    125 typedef __m64_128 int8x8_t;
    126 typedef __m64_128 uint8x8_t;
    127 typedef __m64_128 int16x4_t;
    128 typedef __m64_128 uint16x4_t;
    129 typedef __m64_128 int32x2_t;
    130 typedef __m64_128 uint32x2_t;
    131 typedef __m64_128 int64x1_t;
    132 typedef __m64_128 uint64x1_t;
    133 typedef __m64_128 poly8x8_t;
    134 typedef __m64_128 poly16x4_t;
    135 
    136 typedef __m64_128 float32x2_t;
    137 typedef __m128 float32x4_t;
    138 
    139 typedef __m128 float16x4_t; //not supported by IA, for compatibility
    140 typedef __m128 float16x8_t; //not supported by IA, for compatibility
    141 
    142 typedef __m128i int8x16_t;
    143 typedef __m128i int16x8_t;
    144 typedef __m128i int32x4_t;
    145 typedef __m128i int64x2_t;
    146 typedef __m128i uint8x16_t;
    147 typedef __m128i uint16x8_t;
    148 typedef __m128i uint32x4_t;
    149 typedef __m128i uint64x2_t;
    150 typedef __m128i poly8x16_t;
    151 typedef __m128i poly16x8_t;
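
//For illustration only (not part of the original header): the 64-bit "D-register" NEON types are emulated
//with the __m64_128 union above, so individual lanes are reached through its members, while the 128-bit
//"Q-register" types are plain __m128 / __m128i values. The function names below are hypothetical.
#if 0 //usage sketch, never compiled
static int32_t example_lowest_lane(int32x2_t d_reg)
{
    return d_reg.m64_i32[0]; //lane 0 of an emulated 64-bit vector
}
static float32x4_t example_pass_through(float32x4_t q_reg)
{
    return q_reg; //128-bit vectors are native SSE registers
}
#endif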
    152 
    153 #if defined(_MSC_VER)
    154     #define SINT_MIN     (-2147483647 - 1) /* min signed int value */
    155     #define SINT_MAX       2147483647 /* max signed int value */
    156 #else
    157     #define SINT_MIN     INT_MIN /* min signed int value */
    158     #define SINT_MAX     INT_MAX /* max signed int value */
    159 #endif
    160 
    161 typedef   float float32_t;
    162 typedef   float __fp16;
    163 
    164 typedef  uint8_t poly8_t;
    165 typedef  uint16_t poly16_t;
    166 
    167 
     168 //MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128-family types as function arguments, resulting in
     169 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need a special trick for functions that use these types
    170 struct int8x16x2_t {
    171     int8x16_t val[2];
    172 };
    173 struct int16x8x2_t {
    174     int16x8_t val[2];
    175 };
    176 struct int32x4x2_t {
    177     int32x4_t val[2];
    178 };
    179 struct int64x2x2_t {
    180     int64x2_t val[2];
    181 };
     182 //Unfortunately we are unable to merge two 64-bit halves into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
    183 struct int8x8x2_t {
    184     int8x8_t val[2];
    185 };
    186 struct int16x4x2_t {
    187     int16x4_t val[2];
    188 };
    189 struct int32x2x2_t {
    190     int32x2_t val[2];
    191 };
    192 struct int64x1x2_t {
    193     int64x1_t val[2];
    194 };
    195 
    196 typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
    197 typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
    198 typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
    199 typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
    200 
    201 typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
    202 typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
    203 typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
    204 typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
    205 
     206 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
    207 typedef struct int8x16x2_t uint8x16x2_t;
    208 typedef struct int16x8x2_t uint16x8x2_t;
    209 typedef struct int32x4x2_t uint32x4x2_t;
    210 typedef struct int64x2x2_t uint64x2x2_t;
    211 typedef struct int8x16x2_t poly8x16x2_t;
    212 typedef struct int16x8x2_t poly16x8x2_t;
    213 
    214 typedef struct int8x8x2_t uint8x8x2_t;
    215 typedef struct int16x4x2_t uint16x4x2_t;
    216 typedef struct int32x2x2_t uint32x2x2_t;
    217 typedef struct int64x1x2_t uint64x1x2_t;
    218 typedef struct int8x8x2_t poly8x8x2_t;
    219 typedef struct int16x4x2_t poly16x4x2_t;
    220 
    221 //float
    222 struct float32x4x2_t {
    223     float32x4_t val[2];
    224 };
    225 struct float16x8x2_t {
    226     float16x8_t val[2];
    227 };
    228 struct float32x2x2_t {
    229     float32x2_t val[2];
    230 };
    231 
    232 typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
    233 typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
    234 typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
    235 typedef  float16x8x2_t float16x4x2_t;
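
//An illustrative sketch (not part of the original header) of how the vector-array types above are used:
//the val[] member keeps the NEON-style access; _mm_add_epi32 is plain SSE2 declared by the includes at the
//top of this file, and the function name is hypothetical.
#if 0 //usage sketch, never compiled
static int32x4_t example_sum_pair(int32x4x2_t pair)
{
    return _mm_add_epi32(pair.val[0], pair.val[1]);
}
#endif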
    236 
    237 //4
    238 struct int8x16x4_t {
    239     int8x16_t val[4];
    240 };
    241 struct int16x8x4_t {
    242     int16x8_t val[4];
    243 };
    244 struct int32x4x4_t {
    245     int32x4_t val[4];
    246 };
    247 struct int64x2x4_t {
    248     int64x2_t val[4];
    249 };
    250 
    251 struct int8x8x4_t {
    252     int8x8_t val[4];
    253 };
    254 struct int16x4x4_t {
    255     int16x4_t val[4];
    256 };
    257 struct int32x2x4_t {
    258     int32x2_t val[4];
    259 };
    260 struct int64x1x4_t {
    261     int64x1_t val[4];
    262 };
    263 
    264 typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
    265 typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
    266 typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
    267 typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
    268 
    269 typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
    270 typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
    271 typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
    272 typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
    273 
     274 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
    275 typedef struct int8x8x4_t uint8x8x4_t;
    276 typedef struct int16x4x4_t uint16x4x4_t;
    277 typedef struct int32x2x4_t uint32x2x4_t;
    278 typedef struct int64x1x4_t uint64x1x4_t;
    279 typedef struct int8x8x4_t poly8x8x4_t;
    280 typedef struct int16x4x4_t poly16x4x4_t;
    281 
    282 typedef struct int8x16x4_t uint8x16x4_t;
    283 typedef struct int16x8x4_t uint16x8x4_t;
    284 typedef struct int32x4x4_t uint32x4x4_t;
    285 typedef struct int64x2x4_t uint64x2x4_t;
    286 typedef struct int8x16x4_t poly8x16x4_t;
    287 typedef struct int16x8x4_t poly16x8x4_t;
    288 
    289 struct float32x4x4_t {
    290     float32x4_t val[4];
    291 };
    292 struct float16x8x4_t {
    293     float16x8_t val[4];
    294 };
    295 struct float32x2x4_t {
    296     float32x2_t val[4];
    297 };
    298 
    299 typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
    300 typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
    301 typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
    302 typedef  float16x8x4_t float16x4x4_t;
    303 
    304 //3
    305 struct int16x8x3_t {
    306     int16x8_t val[3];
    307 };
    308 struct int32x4x3_t {
    309     int32x4_t val[3];
    310 };
    311 struct int64x2x3_t {
    312     int64x2_t val[3];
    313 };
    314 struct int8x16x3_t {
    315     int8x16_t val[3];
    316 };
    317 
    318 struct int16x4x3_t {
    319     int16x4_t val[3];
    320 };
    321 struct int32x2x3_t {
    322     int32x2_t val[3];
    323 };
    324 struct int64x1x3_t {
    325     int64x1_t val[3];
    326 };
    327 struct int8x8x3_t {
    328     int8x8_t val[3];
    329 };
    330 typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
    331 typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
    332 typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
    333 typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
    334 
    335 typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
    336 typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
    337 typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
    338 typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
    339 
    340 
     341 /* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
    342 typedef struct int8x16x3_t uint8x16x3_t;
    343 typedef struct int16x8x3_t uint16x8x3_t;
    344 typedef struct int32x4x3_t uint32x4x3_t;
    345 typedef struct int64x2x3_t uint64x2x3_t;
    346 typedef struct int8x16x3_t poly8x16x3_t;
    347 typedef struct int16x8x3_t poly16x8x3_t;
    348 typedef struct  int8x8x3_t uint8x8x3_t;
    349 typedef struct  int16x4x3_t uint16x4x3_t;
    350 typedef struct  int32x2x3_t uint32x2x3_t;
    351 typedef struct  int64x1x3_t uint64x1x3_t;
    352 typedef struct  int8x8x3_t poly8x8x3_t;
    353 typedef struct  int16x4x3_t poly16x4x3_t;
    354 
    355 //float
    356 struct float32x4x3_t {
    357     float32x4_t val[3];
    358 };
    359 struct float32x2x3_t {
    360     float32x2_t val[3];
    361 };
    362 struct float16x8x3_t {
    363     float16x8_t val[3];
    364 };
    365 
    366 typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
    367 typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
    368 typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
    369 typedef  float16x8x3_t float16x4x3_t;
    370 
    371 
    372 //****************************************************************************
    373 //****** Porting auxiliary macros ********************************************
    374 
    375 //** floating point related macros **
    376 #define _M128i(a) _mm_castps_si128(a)
    377 #define _M128(a) _mm_castsi128_ps(a)
     378 //here the most performance-effective implementation depends on the compiler and on whether the build is 32- or 64-bit
    379 #if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
    380 
    381         #define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
    382         #define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
    383         #define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
    384 #else
     385    //for 32-bit gcc and Microsoft compiler builds
    386     #define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
    387     #define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
    388     #define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
    389 #endif
    390 #define _pM128(a) _mm_castsi128_ps(_pM128i(a))
    391 
    392 #define return64(a)  _M64(res64,a); return res64;
    393 #define return64f(a)  _M64f(res64,a); return res64;
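
//An illustrative sketch (not part of the original header) of how the helpers above are typically used inside
//the intrinsic implementations: the emulated 64-bit operands are widened to 128-bit SSE registers with
//_pM128i, and the low 64 bits of the result are stored back through the local res64 variable that return64
//expects. The function name is hypothetical; _mm_add_epi8 is plain SSE2.
#if 0 //usage sketch, never compiled
static int8x8_t example_add_d_registers(int8x8_t a, int8x8_t b)
{
    int8x8_t res64;
    __m128i res = _mm_add_epi8(_pM128i(a), _pM128i(b));
    return64(res); //expands to: _M64(res64, res); return res64;
}
#endif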
    394 
    395 #define _Ui64(a) (*(uint64_t*)&(a))
    396 #define _UNSIGNED_T(a) u ## a
    397 
    398 #define _SIGNBIT64 ((uint64_t)1 << 63)
    399 #define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
    400 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
    401 
    402 #define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
    403 #define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
    404 
    405 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    406 #define __constrange(min,max)  const
    407 #define __transfersize(size)
    408 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
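//Note: in the declarations below __constrange(min,max) marks an argument that must be an integer constant in
//[min,max] (e.g. the last argument of vmul_lane_s16 must be a constant in 0..3) and __transfersize(n) marks a
//pointer through which n elements are transferred; on x86 they expand to a plain const qualifier and to nothing.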
    409 
    410 
    411 //*************************************************************************
    412 //*************************************************************************
     413 //*********  Function declarations as declared in the original arm_neon.h  *****
    414 //*************************************************************************
    415 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
    416 int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
    417 int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
    418 int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
    419 int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
    420 float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
    421 uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
    422 uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
    423 uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
    424 uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
    425 int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
    426 int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
    427 int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
    428 int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
    429 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
    430 uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
    431 uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
    432 uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
    433 uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
    434 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
    435 int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
    436 int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
    437 int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
    438 uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
    439 uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
    440 uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
     441 //Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i], Va and Vr have lanes twice as wide as Vb
    442 int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
    443 int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
    444 int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
    445 uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
    446 uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
    447 uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
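//Example: vaddl_u8 widens before adding, so lanes holding 200 and 100 sum to 300 in a 16-bit result lane
//instead of wrapping to 44 as a plain 8-bit add would; vaddw_u8 similarly adds the widened 8-bit lanes of b to the 16-bit lanes of a.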
    448 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
    449 int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
    450 int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
    451 int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
    452 uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
    453 uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
    454 uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
    455 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
    456 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
    457 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
    458 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
    459 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
    460 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
    461 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
    462 int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
    463 int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
    464 int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
    465 uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
    466 uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
    467 uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
    468 int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
    469 int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
    470 int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
    471 uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
    472 uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
    473 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
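//Example: for 8-bit lanes holding 5 and 6, vhadd_u8 gives (5+6)>>1 = 5, while the rounding form vrhadd_u8
//gives (5+6+1)>>1 = 6; the intermediate sum cannot overflow because it is halved back to the original lane width.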
    474 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
    475 int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
    476 int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
    477 int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
    478 int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
    479 uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
    480 uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
    481 uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
    482 uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
    483 int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
    484 int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
    485 int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
    486 int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
    487 uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
    488 uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
    489 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
    490 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
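//Example: for lanes holding 100 and 100, vqadd_s8 saturates to 127 instead of wrapping to -56; for lanes
//holding 200 and 100, vqadd_u8 saturates to 255 instead of wrapping to 44.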
     491 //Vector add high half: vaddhn -> Vr[i]:=high half of (Va[i]+Vb[i]), narrowed to half the original lane width
    492 int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
    493 int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
    494 int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
    495 uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
    496 uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
    497 uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
    498 //Vector rounding add high half: vraddhn
    499 int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
    500 int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
    501 int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
    502 uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
    503 uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
    504 uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
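//Example: vaddhn_s16 keeps only the upper 8 bits of each 16-bit sum, i.e. Vr[i] = (Va[i]+Vb[i]) >> 8 narrowed
//to 8 bits; the rounding form vraddhn_s16 adds 0x80 before taking the upper half.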
    505 //Multiplication
    506 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
    507 int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
    508 int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
    509 int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
    510 float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
    511 uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
    512 uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
    513 uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
    514 poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
    515 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
    516 int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
    517 int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
    518 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
    519 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
    520 uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
    521 uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
    522 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
    523 //multiply lane
    524 int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
    525 int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
    526 float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
    527 uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
    528 uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
    529 int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
    530 int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
    531 float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
    532 uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
    533 uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
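//Note: in the *_lane_* forms above every lane of a is multiplied by the single lane c of b,
//i.e. vmul_lane_s16(a, b, 2) computes Vr[i] = a[i] * b[2] for all i.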
    534 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
    535 int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
    536 int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
    537 int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
    538 float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
    539 uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
    540 uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
    541 uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
    542 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
    543 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
    544 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
    545 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
    546 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
    547 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
    548 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
    549 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
    550 int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
    551 int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
    552 int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
    553 uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
    554 uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
    555 uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
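//Example: vmlal_u8 widens the products, so with 8-bit inputs a 16-bit accumulator lane can hold
//1000 + 200*100 = 21000 without overflow - the usual building block of integer dot products.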
    556 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
    557 int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
    558 int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
    559 int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
    560 float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
    561 uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
    562 uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
    563 uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
    564 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
    565 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
    566 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
    567 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
    568 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
    569 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
    570 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
    571 //Vector multiply subtract long
    572 int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
    573 int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
    574 int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
    575 uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
    576 uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
    577 uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
    578 //Vector saturating doubling multiply high
    579 int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
    580 int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
    581 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
    582 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
    583 //Vector saturating rounding doubling multiply high
    584 int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
    585 int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
    586 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
    587 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
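//Note: vqdmulh_s16 computes Vr[i] = sat((2*Va[i]*Vb[i]) >> 16), i.e. the saturated high half of the doubled
//product (a Q15 fixed-point multiply); vqrdmulh additionally adds 0x8000 before the shift for rounding.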
    588 //Vector saturating doubling multiply accumulate long
    589 int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
    590 int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
    591 //Vector saturating doubling multiply subtract long
    592 int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
    593 int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
    594 //Vector long multiply
    595 int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
    596 int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
    597 int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
    598 uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
    599 uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
    600 uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
    601 poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
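//Example: vmull_u8 lanes holding 200 and 200 produce 40000 in the 16-bit result, a value a non-widening
//8-bit multiply could not represent; vmull_p8 performs a carry-less (polynomial) multiplication of the 8-bit lanes.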
    602 //Vector saturating doubling long multiply
    603 int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
    604 int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
    605 //Subtraction
    606 //Vector subtract
    607 int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
    608 int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
    609 int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
    610 int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
    611 float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
    612 uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
    613 uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
    614 uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
    615 uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
    616 int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
    617 int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
    618 int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
    619 int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
    620 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
    621 uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
    622 uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
    623 uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
    624 uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
     625 //Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i], result lanes are twice as wide as the operands
    626 int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
    627 int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
    628 int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
    629 uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
    630 uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
    631 uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
     632 //Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i], Va and Vr have lanes twice as wide as Vb
    633 int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
    634 int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
    635 int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
    636 uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
    637 uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
    638 uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
    639 //Vector saturating subtract
    640 int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
    641 int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
    642 int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
    643 int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
    644 uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
    645 uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
    646 uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
    647 uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
    648 int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
    649 int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
    650 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
    651 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
    652 uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
    653 uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
    654 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
    655 uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
    656 //Vector halving subtract
    657 int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
    658 int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
    659 int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
    660 uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
    661 uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
    662 uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
    663 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
    664 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
    665 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
    666 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
    667 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
    668 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
    669 //Vector subtract high half
    670 int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
    671 int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
    672 int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
    673 uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
    674 uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
    675 uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
    676 //Vector rounding subtract high half
    677 int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
    678 int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
    679 int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
    680 uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
    681 uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
    682 uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
    683 //Comparison
    684 //Vector compare equal
    685 uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
    686 uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
    687 uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
    688 uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
    689 uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
    690 uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
    691 uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
    692 uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
    693 uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
    694 uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
    695 uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
    696 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
    697 uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
    698 uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
    699 uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
    700 uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
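//Note: every comparison intrinsic in this section returns a mask vector - each result lane is set to all ones
//(e.g. 0xFFFFFFFF for 32-bit lanes) when the condition holds and to all zeros otherwise.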
    701 //Vector compare greater-than or equal
    702 uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
    703 uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
    704 uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
    705 uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
    706 uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
    707 uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
    708 uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
    709 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
    710 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
    711 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
    712 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
    713 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
    714 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
    715 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
    716 //Vector compare less-than or equal
    717 uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
    718 uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
    719 uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
    720 uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
    721 uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
    722 uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
    723 uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
    724 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
    725 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
    726 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
    727 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
    728 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
    729 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
    730 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
    731 //Vector compare greater-than
    732 uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
    733 uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
    734 uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
    735 uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
    736 uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
    737 uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
    738 uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
    739 uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
    740 uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
    741 uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
    742 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
    743 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
    744 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
    745 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
    746 //Vector compare less-than
    747 uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
    748 uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
    749 uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
    750 uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
    751 uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
    752 uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
    753 uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
    754 uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
    755 uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
    756 uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
    757 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
    758 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
    759 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
    760 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
    761 //Vector compare absolute greater-than or equal
    762 uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
    763 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
    764 //Vector compare absolute less-than or equal
    765 uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
    766 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
    767 //Vector compare absolute greater-than
    768 uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
    769 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
    770 //Vector compare absolute less-than
    771 uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
    772 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
    773 //Vector test bits
    774 uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
    775 uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
    776 uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
    777 uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
    778 uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
    779 uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
    780 uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
    781 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
    782 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
    783 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
    784 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
    785 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
    786 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
    787 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
    788 //Absolute difference
    789 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
    790 int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
    791 int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
    792 int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
    793 uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
    794 uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
    795 uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
    796 float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
    797 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
    798 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
    799 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
    800 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
    801 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
    802 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
    803 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
    804 //Absolute difference - long
    805 int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
    806 int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
    807 int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
    808 uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
    809 uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
    810 uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
    811 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
    812 int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
    813 int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
    814 int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
    815 uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
    816 uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
    817 uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
    818 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
    819 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
    820 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
    821 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
    822 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
    823 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
    824 //Absolute difference and accumulate - long
    825 int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
    826 int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
    827 int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
    828 uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
    829 uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
    830 uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
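//Note: the accumulating forms are typical sum-of-absolute-differences building blocks, e.g. vabal_u8 adds each
//8-bit |Vb[i]-Vc[i]| into the corresponding 16-bit lane of Va.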
    831 //Max/Min
    832 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
    833 int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
    834 int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
    835 int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
    836 uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
    837 uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
    838 uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
    839 float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
    840 int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
    841 int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
    842 int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
    843 uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
    844 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
    845 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
    846 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
    847 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
    848 int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
    849 int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
    850 int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
    851 uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
    852 uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
    853 uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
    854 float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
    855 int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
    856 int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
    857 int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
    858 uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
    859 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
    860 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
    861 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
    862 //Pairwise addition
    863 //Pairwise add
    864 int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
    865 int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
    866 int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
    867 uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
    868 uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
    869 uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
    870 float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
    871 //Long pairwise add
    872 int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
    873 int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
    874 int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
    875 uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
    876 uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
    877 uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
    878 int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
    879 int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
    880 int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
    881 uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
    882 uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
    883 uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
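//Example: vpaddl_u8 applied to the 8-bit lanes {1,2,3,4,5,6,7,8} yields the 16-bit lanes {3,7,11,15} - adjacent
//pairs are summed into lanes of twice the width.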
    884 //Long pairwise add and accumulate
    885 int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
    886 int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
    887 int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
    888 uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
    889 uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
    890 uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
    891 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
    892 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
    893 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
    894 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
    895 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
    896 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
    897 //Folding maximum vpmax -> takes maximum of adjacent pairs
    898 int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
    899 int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
    900 int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
    901 uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
    902 uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
    903 uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
    904 float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
    905 //Folding minimum vpmin -> takes minimum of adjacent pairs
    906 int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
    907 int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
    908 int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
    909 uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
    910 uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
    911 uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
    912 float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
    913 //Reciprocal/Sqrt
    914 float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
    915 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
    916 float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
    917 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
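//Note: these are the Newton-Raphson step helpers: per lane vrecps returns 2 - a*b and vrsqrts returns
//(3 - a*b)/2, refining the reciprocal and reciprocal-square-root estimates respectively.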
    918 //Shifts by signed variable
    919 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
    920 int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
    921 int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
    922 int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
    923 int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
    924 uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
    925 uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
    926 uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
    927 uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
    928 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
    929 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
    930 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
    931 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
    932 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
    933 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
    934 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
    935 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
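//  Illustrative sketch: the per-lane counts in 'b' are signed, so one intrinsic covers both
//  directions: r[i] = (b[i] >= 0) ? a[i] << b[i] : a[i] >> -b[i]. A lane count of 3 multiplies
//  that lane by 8; a count of -2 divides it by 4.
//      uint32x2_t r = vshl_u32(a, b);   // 'a' unsigned data, 'b' signed per-lane shift counts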
    936 //Vector saturating shift left: (negative values shift right)
    937 int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
    938 int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
    939 int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
    940 int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
    941 uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
    942 uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
    943 uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
    944 uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
    945 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
    946 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
    947 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
    948 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
    949 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
    950 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
    951 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
    952 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
    953 //Vector rounding shift left: (negative values shift right)
    954 int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
    955 int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
    956 int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
    957 int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
    958 uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
    959 uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
    960 uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
    961 uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
    962 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
    963 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
    964 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
    965 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
    966 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
    967 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
    968 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
    969 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
    970 //Vector saturating rounding shift left: (negative values shift right)
    971 int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
    972 int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
    973 int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
    974 int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
    975 uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
    976 uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
    977 uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
    978 uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
    979 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
    980 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
    981 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
    982 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
    983 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
    984 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
    985 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
    986 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
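//  Worked contrast of the variable-shift variants above for int8 lanes (illustrative, not part
//  of the original header):
//      a = 100, b =  1:  vshl_s8 wraps to -56, while vqshl_s8 saturates to 127.
//      a = 101, b = -1:  vshl_s8 truncates to 50, while vrshl_s8 rounds to 51.
//      vqrshl combines both behaviours: rounding on right shifts, saturation on left shifts.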
    987 //Shifts by a constant
    988 //Vector shift right by constant
    989 int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
    990 int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
    991 int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
    992 int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
    993 uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
    994 uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
    995 uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
    996 uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
    997 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
    998 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
    999 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
   1000 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
   1001 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
   1002 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
   1003 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
   1004 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
   1005 //Vector shift left by constant
   1006 int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   1007 int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   1008 int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   1009 int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   1010 uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   1011 uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   1012 uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   1013 uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   1014 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   1015 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   1016 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   1017 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   1018 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   1019 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   1020 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   1021 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   1022 //Vector rounding shift right by constant
   1023 int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
   1024 int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
   1025 int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
   1026 int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
   1027 uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
   1028 uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
   1029 uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
   1030 uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
   1031 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
   1032 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
   1033 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
   1034 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
   1035 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
   1036 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
   1037 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
   1038 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
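//  Illustrative sketch: vrshr_n rounds to nearest, i.e. r[i] = (a[i] + (1 << (n-1))) >> n,
//  whereas the plain vshr_n above truncates. A rounding divide of a hypothetical u16 'sum'
//  vector by 16:
//      uint16x8_t avg = vrshrq_n_u16(sum, 4);   // (sum + 8) >> 4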
   1039 //Vector shift right by constant and accumulate
   1040 int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
   1041 int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
   1042 int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
   1043 int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
   1044 uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
   1045 uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
   1046 uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
   1047 uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
   1048 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
   1049 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
   1050 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
   1051 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
   1052 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
   1053 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
   1054 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
   1055 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
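//  Illustrative sketch: vsra_n fuses the shift and the add, r[i] = a[i] + (b[i] >> n), which is
//  handy for accumulating scaled contributions ('acc' and 'x' are hypothetical u16 vectors):
//      acc = vsraq_n_u16(acc, x, 2);   // acc[i] += x[i] / 4 (truncating)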
   1056 //Vector rounding shift right by constant and accumulate
   1057 int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
   1058 int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
   1059 int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
   1060 int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
   1061 uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
   1062 uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
   1063 uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
   1064 uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
   1065 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
   1066 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
   1067 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
   1068 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
   1069 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
   1070 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
   1071 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
   1072 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
   1073 //Vector saturating shift left by constant
   1074 int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
   1075 int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
   1076 int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
   1077 int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
   1078 uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
   1079 uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
   1080 uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
   1081 uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
   1082 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
   1083 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
   1084 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
   1085 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
   1086 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
   1087 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
   1088 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
   1089 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
   1090 //Vector signed->unsigned saturating shift left by constant
   1091 uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
   1092 uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
   1093 uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
   1094 uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
   1095 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
   1096 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
   1097 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
   1098 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
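//  Illustrative sketch: with a shift count of 0, vqshlu_n simply clamps signed data to the
//  unsigned range, mapping negative s8 lanes to 0 and passing 0..127 through unchanged:
//      uint8x8_t u = vqshlu_n_s8(s, 0);   // 's' is a hypothetical int8x8_t input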
   1099 //Vector narrowing shift right by constant
   1100 int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   1101 int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   1102 int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   1103 uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   1104 uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   1105 uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   1106 //Vector signed->unsigned narrowing saturating shift right by constant
   1107 uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
   1108 uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
   1109 uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
   1110 //Vector signed->unsigned rounding narrowing saturating shift right by constant
   1111 uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
   1112 uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
   1113 uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
   1114 //Vector narrowing saturating shift right by constant
   1115 int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
   1116 int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
   1117 int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
   1118 uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
   1119 uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
   1120 uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
   1121 //Vector rounding narrowing shift right by constant
   1122 int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   1123 int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   1124 int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   1125 uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   1126 uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   1127 uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   1128 //Vector rounding narrowing saturating shift right by constant
   1129 int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
   1130 int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
   1131 int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
   1132 uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
   1133 uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
   1134 uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
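//  Illustrative sketch: the narrowing shifts are the usual way back down after a widening
//  multiply. Assuming 'a' and 'b' are uint8x8_t pixel vectors, a rounded (a*b)/256 looks like:
//      uint16x8_t prod = vmull_u8(a, b);          // widen: 8 x (u8*u8) -> u16
//      uint8x8_t  out  = vrshrn_n_u16(prod, 8);   // (prod + 128) >> 8, narrowed back to u8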
   1135 //Vector widening shift left by constant
   1136 int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
   1137 int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
   1138 int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
   1139 uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
   1140 uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
   1141 uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
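//  Illustrative sketch: vshll_n widens and pre-scales in one step, e.g. promoting hypothetical
//  u8 'pixels' to a Q4 fixed-point u16 working format:
//      uint16x8_t wide = vshll_n_u8(pixels, 4);   // wide[i] = pixels[i] << 4, 16-bit lanes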
   1142 //Shifts with insert
   1143 //Vector shift right and insert
   1144 int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   1145 int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   1146 int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   1147 int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   1148 uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   1149 uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   1150 uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   1151 uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   1152 poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   1153 poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   1154 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   1155 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   1156 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   1157 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   1158 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   1159 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   1160 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   1161 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   1162 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   1163 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   1164 //Vector shift left and insert
   1165 int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   1166 int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   1167 int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   1168 int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   1169 uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   1170 uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   1171 uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   1172 uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   1173 poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   1174 poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   1175 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   1176 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   1177 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   1178 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   1179 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   1180 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   1181 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   1182 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   1183 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   1184 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
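//  Illustrative sketch: the insert shifts merge bit-fields without a separate mask/OR step.
//  Packing two per-lane nibbles 'lo' and 'hi' (hypothetical uint8x8_t values) into one byte:
//      uint8x8_t packed = vsli_n_u8(lo, hi, 4);   // packed[i] = (hi[i] << 4) | (lo[i] & 0x0F)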
   1185 //Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
   1186 //Load a single vector from memory
   1187 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   1188 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   1189 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   1190 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1191 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   1192 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   1193 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   1194 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1195 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
   1196 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   1197 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   1198 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   1199 uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
   1200 uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
   1201 uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
   1202 uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   1203 int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
   1204 int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
   1205 int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
   1206 int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   1207 float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
   1208 float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
   1209 poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
   1210 poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
   1211 //Load a single lane from memory
   1212 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
   1213 uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   1214 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   1215 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
   1216 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
   1217 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
   1218 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
   1219 float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
   1220 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   1221 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
   1222 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
   1223 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   1224 uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
   1225 uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1226 uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
   1227 uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
    1228 int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   1229 int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1230 int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
    1231 float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1232 float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   1233 int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
   1234 poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
   1235 poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1236 //Load all lanes of vector with same value from memory
   1237 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1238 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1239 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1240 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   1241 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1242 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1243 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1244 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   1245 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   1246 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1247 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1248 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1249 uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1250 uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1251 uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1252 uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   1253 int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1254 int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1255 int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1256 int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   1257 float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   1258 float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1259 poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1260 poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
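//  Illustrative sketch: the _dup loads broadcast one scalar to every lane, typically a filter
//  coefficient ('k', 'x' and 'acc' are hypothetical):
//      int16x8_t coef = vld1q_dup_s16(&k);   // all 8 lanes = k
//      acc = vmlaq_s16(acc, x, coef);        // acc[i] += x[i] * k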
   1261 //Store a single vector or lane. Stores all lanes or a single lane of a vector.
   1262 //Store a single vector into memory
   1263 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
   1264 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
   1265 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
   1266 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
   1267 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
   1268 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
   1269 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
   1270 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
   1271 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
   1272 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
   1273 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
   1274 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
   1275 void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
   1276 void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
   1277 void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
   1278 void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
   1279 void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
   1280 void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
   1281 void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
   1282 void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
   1283 void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
   1284 void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
   1285 void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
   1286 void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
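//  Illustrative sketch: vld1/vst1 bracket a typical streaming kernel ('src', 'dst', 'n' and
//  'bias' are hypothetical):
//      for (i = 0; i + 16 <= n; i += 16) {
//          uint8x16_t v = vld1q_u8(src + i);   // load 16 bytes
//          v = vqaddq_u8(v, bias);             // any per-lane processing
//          vst1q_u8(dst + i, v);               // store 16 bytes
//      }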
   1287 //Store a lane of a vector into memory
   1288 //Loads of an N-element structure
   1289 //Load N-element structure from memory
   1290 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   1291 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   1292 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   1293 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   1294 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   1295 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   1296 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
   1297 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   1298 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   1299 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   1300 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   1301 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   1302 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   1303 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1304 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   1305 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   1306 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   1307 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1308 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
   1309 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   1310 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   1311 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   1312 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   1313 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1314 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   1315 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   1316 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1317 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   1318 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1319 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   1320 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   1321 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1322 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   1323 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1324 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   1325 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1326 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   1327 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1328 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   1329 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1330 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1331 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   1332 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   1333 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1334 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   1335 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1336 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   1337 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   1338 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1339 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   1340 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1341 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   1342 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   1343 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1344 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   1345 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   1346 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   1347 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1348 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   1349 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   1350 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   1351 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1352 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   1353 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   1354 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   1355 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
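//  Illustrative sketch: the structure loads de-interleave on the fly, e.g. splitting packed
//  RGB into separate channel vectors ('src' is a hypothetical pointer to 24 packed bytes):
//      uint8x8x3_t rgb = vld3_u8(src);   // rgb.val[0] = 8 R bytes, val[1] = G, val[2] = B
//      uint8x8_t   g   = rgb.val[1];     // each channel can now be processed independently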
   1356 //Load all lanes of N-element structure with same value from memory
   1357 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   1358 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1359 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   1360 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1361 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   1362 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1363 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   1364 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1365 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1366 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   1367 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   1368 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1369 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   1370 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1371 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   1372 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1373 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   1374 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1375 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   1376 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1377 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1378 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   1379 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   1380 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1381 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   1382 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1383 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   1384 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1385 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   1386 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1387 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   1388 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1389 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1390 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   1391 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   1392 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1393 //Load a single lane of N-element structure from memory
    1394 //The functions below take a pointer to the source structure (hence the "_ptr" suffix) instead of passing it by value, to work around MSVC error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
   1395 uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1396 uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
   1397 int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1398 int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
   1399 float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1400 float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
   1401 poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1402 uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
   1403 uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   1404 uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   1405 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
   1406 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
   1407 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
   1408 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   1409 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   1410 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
   1411 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   1412 uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1413 uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   1414 int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1415 int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   1416 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1417 float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   1418 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1419 uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   1420 uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1421 uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   1422 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   1423 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1424 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   1425 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1426 float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   1427 poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   1428 poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1429 uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1430 uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1431 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1432 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1433 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1434 float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1435 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1436 uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1437 uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1438 uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1439 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1440 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1441 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1442 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1443 float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1444 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1445 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
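//  Note (assumption about this port, not ARM documentation): the usual NEON spellings such as
//  vld2q_lane_u16(ptr, src, lane) are expected to be provided as macros that forward to the
//  *_ptr variants above by passing &src, so existing NEON code should not need changes, e.g.:
//      v = vld2q_lane_u16(ptr, v, 3);   // would expand to vld2q_lane_u16_ptr(ptr, &v, 3)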
   1446 //Store N-element structure to memory
   1447 void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
   1448 void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1449 void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
   1450 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
   1451 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1452 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
   1453 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1454 void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
   1455 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
   1456 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1457 void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
   1458 void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   1459 void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
   1460 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
   1461 void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
   1462 void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   1463 void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
   1464 void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0]
   1465 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   1466 void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
   1467 void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
   1468 void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   1469 void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
   1470 void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1471 void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
   1472 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
   1473 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1474 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
   1475 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1476 void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
   1477 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
   1478 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1479 void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
   1480 void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   1481 void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
   1482 void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
   1483 void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
   1484 void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   1485 void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
   1486 void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0]
   1487 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   1488 void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0]
   1489 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0]
   1490 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   1491 void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
   1492 void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1493 void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
   1494 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
   1495 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1496 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
   1497 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1498 void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
   1499 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
   1500 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1501 void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
   1502 void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1503 void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
   1504 void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
   1505 void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
   1506 void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1507 void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
   1508 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0]
   1509 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1510 void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0]
   1511 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0]
   1512 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1513 //Store a single lane of N-element structure to memory
   1514 void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1515 void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
   1516 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1517 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
   1518 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1519 void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
   1520 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1521 void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
   1522 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1523 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
   1524 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
   1525 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1526 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
   1527 void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1528 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
   1529 void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
   1530 void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1531 void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1532 void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
   1533 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1534 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
   1535 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1536 void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   1537 void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1538 void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
   1539 void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1540 void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1541 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
   1542 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1543 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1544 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1545 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1546 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
   1547 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1548 void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1549 void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1550 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1551 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1552 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1553 void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1554 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1555 void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1556 void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1557 void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1558 void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
   1559 void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1560 void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1561 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1562 void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1563 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1564 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
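         //Usage sketch (illustrative only, not part of this header; 'src' and 'dst' are assumed uint16_t pointers,
         //and the usual arm_neon.h call forms backed by the *_ptr declarations above are used):
         //    uint16x4x2_t pair = vld2_u16(src);   // loads and de-interleaves 8 uint16_t values
         //    vst2_lane_u16(dst, pair, 2);         // writes only pair.val[0][2] and pair.val[1][2] to dst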
    1565 //Extract lanes from a vector and put them into a register. These intrinsics extract a single lane (element) from a vector.
   1566 uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   1567 uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
   1568 uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   1569 int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
   1570 int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
   1571 int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   1572 poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   1573 poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
   1574 float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   1575 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   1576 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
   1577 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   1578 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
   1579 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
   1580 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   1581 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   1582 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
   1583 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   1584 int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   1585 uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   1586 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
   1587 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
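         //Usage sketch (illustrative only): reading one element out of a vector into a scalar.
         //    int32x2_t v = vdup_n_s32(7);
         //    int32_t   x = vget_lane_s32(v, 1);   // x == 7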
    1588 //Set a single lane of a vector to a literal value. These intrinsics set a single lane (element) within a vector and leave the other lanes unchanged.
   1589 uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   1590 uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   1591 uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   1592 int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   1593 int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   1594 int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   1595 poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   1596 poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   1597 float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   1598 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   1599 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   1600 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   1601 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   1602 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   1603 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   1604 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   1605 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   1606 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   1607 int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   1608 uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   1609 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
   1610 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
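         //Usage sketch (illustrative only): overwriting a single element while keeping the others.
         //    float32x2_t v = vdup_n_f32(1.0f);    // {1.0f, 1.0f}
         //    v = vset_lane_f32(2.5f, v, 1);       // {1.0f, 2.5f}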
   1611 //Initialize a vector from a literal bit pattern.
   1612 int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
   1613 int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
   1614 int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
   1615 float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
   1616 float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
   1617 uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
   1618 uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
   1619 uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
   1620 uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
   1621 poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
   1622 poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
   1623 int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
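         //Usage sketch (illustrative only): the 64-bit literal is taken little-endian, lowest byte into lane 0.
         //    uint8x8_t v = vcreate_u8(0x0807060504030201ULL);   // lanes {1,2,3,4,5,6,7,8}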
    1624 //Set all lanes to the same value
    1625 //Load all lanes of the vector with the same literal value
   1626 uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
   1627 uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
   1628 uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
   1629 int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
   1630 int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
   1631 int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
   1632 poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
   1633 poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
   1634 float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
   1635 uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
   1636 uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
   1637 uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
   1638 int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
   1639 int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
   1640 int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
   1641 poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
   1642 poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
   1643 float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
   1644 int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
   1645 uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
   1646 int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
   1647 uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
   1648 uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
   1649 uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
   1650 uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
   1651 int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
   1652 int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
   1653 int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
   1654 poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
   1655 poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
   1656 float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
   1657 uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
   1658 uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
   1659 uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
   1660 int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
   1661 int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
   1662 int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
   1663 poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
   1664 poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
   1665 float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
   1666 int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
   1667 uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
   1668 int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
   1669 uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
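         //Usage sketch (illustrative only): vmov_n_* and vdup_n_* are synonyms; both splat one scalar to every lane.
         //    uint16x8_t ones = vdupq_n_u16(1);    // all eight lanes == 1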
    1670 //Load all lanes of the vector with the value of a single lane of another vector
   1671 uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   1672 uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   1673 uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   1674 int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   1675 int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   1676 int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   1677 poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   1678 poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   1679 float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   1680 uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   1681 uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   1682 uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   1683 int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   1684 int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   1685 int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   1686 poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   1687 poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   1688 float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   1689 int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   1690 uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   1691 int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
   1692 uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
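         //Usage sketch (illustrative only): broadcasting one chosen lane of a 64-bit vector into a 128-bit vector.
         //    int16x4_t d = vset_lane_s16(5, vdup_n_s16(9), 3);   // {9,9,9,5}
         //    int16x8_t q = vdupq_lane_s16(d, 3);                 // all eight lanes == 5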
    1693 //Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
   1694 int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
   1695 int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
   1696 int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
   1697 int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
   1698 float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
   1699 float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
   1700 uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
   1701 uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
   1702 uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
   1703 uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
   1704 poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
   1705 poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
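         //Usage sketch (illustrative only): the 'low' argument becomes lanes 0..7, 'high' becomes lanes 8..15.
         //    uint8x16_t q = vcombine_u8(vdup_n_u8(1), vdup_n_u8(2));   // lanes 0..7 == 1, lanes 8..15 == 2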
    1706 //Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors.
   1707 int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
   1708 int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
   1709 int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
   1710 int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
   1711 float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
   1712 float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
   1713 uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
   1714 uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
   1715 uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
   1716 uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
   1717 poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
   1718 poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
   1719 int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
   1720 int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
   1721 int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
   1722 int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
   1723 float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
   1724 float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
   1725 uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
   1726 uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
   1727 uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
   1728 uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
   1729 poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
   1730 poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
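         //Usage sketch (illustrative only): vget_low/vget_high are the inverse of vcombine.
         //    uint32x4_t q  = vdupq_n_u32(3);
         //    uint32x2_t lo = vget_low_u32(q);     // lanes 0..1 of q
         //    uint32x2_t hi = vget_high_u32(q);    // lanes 2..3 of q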
   1731 //Converting vectors. These intrinsics are used to convert vectors.
   1732 //Convert from float
   1733 int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
   1734 uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
   1735 int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
   1736 uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
   1737 int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
   1738 uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
   1739 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
   1740 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
   1741 //Convert to float
   1742 float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
   1743 float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
   1744 float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
   1745 float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
   1746 float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
   1747 float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
   1748 float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
   1749 float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
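         //Usage sketch (illustrative only): the _n_ variants treat the integer as fixed-point with 'b' fractional
         //bits, so the conversion divides (or multiplies) by 2^b.
         //    float32x2_t f = vcvt_n_f32_s32(vdup_n_s32(3), 2);   // both lanes == 0.75f (3 / 2^2)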
   1750 //Convert between floats
   1751 float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
   1752 float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
   1753 //Vector narrow integer
   1754 int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
   1755 int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
   1756 int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
   1757 uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
   1758 uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
   1759 uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
   1760 //Vector long move
   1761 int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
   1762 int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
   1763 int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
   1764 uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
   1765 uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
   1766 uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
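         //Usage sketch (illustrative only): vmovn truncates each lane, vmovl widens it (sign- or zero-extending).
         //    int16x8_t w = vmovl_s8(vdup_n_s8(-1));   // eight lanes of (int16_t)-1
         //    int8x8_t  n = vmovn_s16(w);              // back to eight lanes of (int8_t)-1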
   1767 //Vector saturating narrow integer
   1768 int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
   1769 int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
   1770 int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
   1771 uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
   1772 uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
   1773 uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
   1774 //Vector saturating narrow integer signed->unsigned
   1775 uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
   1776 uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
   1777 uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
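         //Usage sketch (illustrative only): saturation clamps values that do not fit the narrower type.
         //    int16x8_t w = vdupq_n_s16(300);
         //    int8x8_t  a = vqmovn_s16(w);     // every lane == 127 (signed saturation)
         //    uint8x8_t b = vqmovun_s16(w);    // every lane == 255 (signed-to-unsigned saturation)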
    1778 //Table lookup
   1779 uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   1780 int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
   1781 poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   1782 uint8x8_t vtbl2_u8_ptr(uint8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   1783 int8x8_t vtbl2_s8_ptr(int8x8x2_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   1784 poly8x8_t vtbl2_p8_ptr(poly8x8x2_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   1785 uint8x8_t vtbl3_u8_ptr(uint8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   1786 int8x8_t vtbl3_s8_ptr(int8x8x3_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   1787 poly8x8_t vtbl3_p8_ptr(poly8x8x3_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   1788 uint8x8_t vtbl4_u8_ptr(uint8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   1789 int8x8_t vtbl4_s8_ptr(int8x8x4_t *a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   1790 poly8x8_t vtbl4_p8_ptr(poly8x8x4_t *a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
    1791 //Extended table lookup intrinsics
   1792 uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   1793 int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
   1794 poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   1795 uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   1796 int8x8_t vtbx2_s8_ptr(int8x8_t a, int8x8x2_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   1797 poly8x8_t vtbx2_p8_ptr(poly8x8_t a, poly8x8x2_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   1798 uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   1799 int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   1800 poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   1801 uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   1802 int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t *b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   1803 poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t *b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
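         //Usage sketch (illustrative only): vtbl1 selects bytes of the table by index, out-of-range indices give 0;
         //vtbx1 keeps the corresponding byte of its first argument instead.
         //    uint8x8_t tbl = vcreate_u8(0x0706050403020100ULL);   // table bytes 0..7
         //    uint8x8_t r   = vtbl1_u8(tbl, vdup_n_u8(7));         // every lane == 7
         //    uint8x8_t x   = vtbx1_u8(tbl, tbl, vdup_n_u8(200));  // indices out of range, so x == tbl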
   1804 //Operations with a scalar value
   1805 //Vector multiply accumulate with scalar
   1806 int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
   1807 int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
   1808 uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
   1809 uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
   1810 float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
   1811 int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
   1812 int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
   1813 uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
   1814 uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
   1815 float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
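         //Usage sketch (illustrative only): the lane forms reuse one element of 'v' for every lane of 'b'.
         //    float32x2_t acc = vdup_n_f32(1.0f);
         //    float32x2_t r   = vmla_lane_f32(acc, vdup_n_f32(2.0f), vdup_n_f32(3.0f), 0);   // lanes == 1 + 2*3 == 7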
   1816 //Vector widening multiply accumulate with scalar
   1817 int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
   1818 int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
   1819 uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
   1820 uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
   1821 //Vector widening saturating doubling multiply accumulate with scalar
   1822 int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
   1823 int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
   1824 //Vector multiply subtract with scalar
   1825 int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
   1826 int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
   1827 uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
   1828 uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
   1829 float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
   1830 int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
   1831 int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
   1832 uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
   1833 uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
   1834 float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
   1835 //Vector widening multiply subtract with scalar
   1836 int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
   1837 int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
   1838 uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
   1839 uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
   1840 //Vector widening saturating doubling multiply subtract with scalar
   1841 int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
   1842 int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
   1843 //Vector multiply by scalar
   1844 int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
   1845 int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
   1846 float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
   1847 uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
   1848 uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
   1849 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
   1850 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
   1851 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
   1852 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
   1853 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
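         //Usage sketch (illustrative only): the _n_ forms multiply every lane by the same scalar.
         //    int16x4_t r = vmul_n_s16(vdup_n_s16(3), 5);   // all four lanes == 15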
   1854 //Vector long multiply with scalar
   1855 int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
   1856 int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
   1857 uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
   1858 uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
   1859 //Vector long multiply by scalar
   1860 int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
   1861 int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
   1862 uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
   1863 uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
   1864 //Vector saturating doubling long multiply with scalar
   1865 int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
   1866 int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
   1867 //Vector saturating doubling long multiply by scalar
   1868 int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
   1869 int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
   1870 //Vector saturating doubling multiply high with scalar
   1871 int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
   1872 int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
   1873 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
   1874 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
   1875 //Vector saturating doubling multiply high by scalar
   1876 int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
   1877 int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
   1878 int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
   1879 int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
   1880 //Vector saturating rounding doubling multiply high with scalar
   1881 int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
   1882 int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
   1883 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
   1884 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
   1885 //Vector rounding saturating doubling multiply high by scalar
   1886 int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
   1887 int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
   1888 int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
   1889 int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
   1890 //Vector multiply accumulate with scalar
   1891 int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
   1892 int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
   1893 uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
   1894 uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
   1895 float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
   1896 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
   1897 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
   1898 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
   1899 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
   1900 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
   1901 //Vector widening multiply accumulate with scalar
   1902 int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
   1903 int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
   1904 uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
   1905 uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
   1906 //Vector widening saturating doubling multiply accumulate with scalar
   1907 int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
   1908 int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
   1909 //Vector multiply subtract with scalar
   1910 int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
   1911 int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
   1912 uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
   1913 uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
   1914 float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
   1915 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
   1916 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
   1917 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
   1918 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
   1919 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
   1920 //Vector widening multiply subtract with scalar
   1921 int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
   1922 int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
   1923 uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
   1924 uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
   1925 //Vector widening saturating doubling multiply subtract with scalar
   1926 int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
   1927 int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
   1928 //Vector extract
   1929 int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   1930 uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   1931 poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   1932 int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   1933 uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   1934 poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   1935 int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   1936 uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   1937 int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   1938 uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   1939 float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   1940 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   1941 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   1942 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   1943 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   1944 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   1945 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   1946 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   1947 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   1948 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
   1949 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
    1950 float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
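         //Usage sketch (illustrative only): VEXT concatenates a:b and extracts a full vector starting at lane 'c' of a.
         //    uint8x8_t a = vdup_n_u8(1), b = vdup_n_u8(2);
         //    uint8x8_t r = vext_u8(a, b, 3);   // lanes 0..4 come from a (== 1), lanes 5..7 from b (== 2)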
   1951 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
   1952 int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
   1953 int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
   1954 int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
   1955 uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
   1956 uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
   1957 uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
   1958 poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
   1959 poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
   1960 float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
   1961 int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
   1962 int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
   1963 int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
   1964 uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
   1965 uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
   1966 uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
   1967 poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
   1968 poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
   1969 float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
   1970 int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
   1971 int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
   1972 uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
   1973 uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
   1974 poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
   1975 poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
   1976 int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
   1977 int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
   1978 uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
   1979 uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
   1980 poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
   1981 poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
   1982 int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
   1983 uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
   1984 poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
   1985 int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
   1986 uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
   1987 poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
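         //Usage sketch (illustrative only): VREV64.16 reverses the 16-bit lanes inside each 64-bit group.
         //    uint16x4_t v = vcreate_u16(0x0004000300020001ULL);   // {1,2,3,4}
         //    uint16x4_t r = vrev64_u16(v);                        // {4,3,2,1}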
   1988 //Other single operand arithmetic
   1989 //Absolute: Vd[i] = |Va[i]|
   1990 int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
   1991 int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
   1992 int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
   1993 float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
   1994 int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
   1995 int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
   1996 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
   1997 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
   1998 //Saturating absolute: Vd[i] = sat(|Va[i]|)
   1999 int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
   2000 int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
   2001 int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
   2002 int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
   2003 int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
   2004 int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
   2005 //Negate: Vd[i] = - Va[i]
    2006 int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
    2007 int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
    2008 int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
    2009 float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
    2010 int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
    2011 int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
    2012 int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
    2013 float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
    2014 //Saturating negate: Vd[i] = sat(-Va[i])
    2015 int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
    2016 int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
    2017 int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
    2018 int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
    2019 int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
    2020 int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
   2021 //Count leading sign bits
   2022 int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
   2023 int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
   2024 int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
   2025 int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
   2026 int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
   2027 int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
   2028 //Count leading zeros
   2029 int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
   2030 int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
   2031 int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
   2032 uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
   2033 uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
   2034 uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
   2035 int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
   2036 int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
   2037 int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
   2038 uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
   2039 uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
   2040 uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
   2041 //Count number of set bits
   2042 uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
   2043 int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
   2044 poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
   2045 uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
   2046 int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
   2047 poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
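         //Usage sketch (illustrative only): per-byte population count.
         //    uint8x8_t r = vcnt_u8(vdup_n_u8(0x0F));   // every lane == 4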
   2048 //Reciprocal estimate
   2049 float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
   2050 uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
   2051 float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
   2052 uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
   2053 //Reciprocal square root estimate
   2054 float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
   2055 uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
   2056 float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
   2057 uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
   2058 //Logical operations
   2059 //Bitwise not
   2060 int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
   2061 int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
   2062 int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
   2063 uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
   2064 uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
   2065 uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
   2066 poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
   2067 int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
   2068 int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
   2069 int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
   2070 uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
   2071 uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
   2072 uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
   2073 poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
   2074 //Bitwise and
   2075 int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
   2076 int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
   2077 int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
   2078 int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
   2079 uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
   2080 uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
   2081 uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
   2082 uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
   2083 int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
   2084 int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
   2085 int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
   2086 int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
   2087 uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
   2088 uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
   2089 uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
   2090 uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
   2091 //Bitwise or
   2092 int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
   2093 int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
   2094 int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
   2095 int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
   2096 uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
   2097 uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
   2098 uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
   2099 uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
   2100 int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
   2101 int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
   2102 int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
   2103 int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
   2104 uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
   2105 uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
   2106 uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
   2107 uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
   2108 //Bitwise exclusive or (EOR or XOR)
   2109 int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
   2110 int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
   2111 int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
   2112 int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
   2113 uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
   2114 uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
   2115 uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
   2116 uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
   2117 int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
   2118 int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
   2119 int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
   2120 int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
   2121 uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
   2122 uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
   2123 uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
   2124 uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
   2125 //Bit Clear
   2126 int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
   2127 int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
   2128 int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
   2129 int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
   2130 uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
   2131 uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
   2132 uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
   2133 uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
   2134 int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
   2135 int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
   2136 int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
   2137 int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
   2138 uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
   2139 uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
   2140 uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
   2141 uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
   2142 //Bitwise OR complement
   2143 int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
   2144 int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
   2145 int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
   2146 int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
   2147 uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
   2148 uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
   2149 uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
   2150 uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
   2151 int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
   2152 int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
   2153 int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
   2154 int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
   2155 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
   2156 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
   2157 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
   2158 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
   2159 //Bitwise Select
   2160 int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
   2161 int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
   2162 int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
   2163 int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
   2164 uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
   2165 uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
   2166 uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
   2167 uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
   2168 float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
   2169 poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
   2170 poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
   2171 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
   2172 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
   2173 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
   2174 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
   2175 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
   2176 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
   2177 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
   2178 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
   2179 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
   2180 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
   2181 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
   2182 //Transposition operations
   2183 //Transpose elements
   2184 int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
   2185 int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
   2186 int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
   2187 uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
   2188 uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
   2189 uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
   2190 float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
   2191 poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
   2192 poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
   2193 int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
   2194 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
   2195 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
   2196 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
   2197 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
   2198 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
   2199 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
   2200 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
   2201 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
   2202 //Interleave elements
   2203 int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
   2204 int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
   2205 int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
   2206 uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
   2207 uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
   2208 uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
   2209 float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
   2210 poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
   2211 poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
   2212 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
   2213 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
   2214 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
   2215 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
   2216 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
   2217 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
   2218 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
   2219 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
   2220 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
   2221 //De-Interleave elements
   2222 int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
   2223 int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
   2224 int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
   2225 uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
   2226 uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
   2227 uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
   2228 float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
   2229 poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
   2230 poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
   2231 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
   2232 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
   2233 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
   2234 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
   2235 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
   2236 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
   2237 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
   2238 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
   2239 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
   2240 
   2241 
   2242 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    2243 // the following macros solve the problem of the "immediate parameter required" restriction of some x86 intrinsics. While it is not a must for a release build,
    2244 //a debug build needs them for the code to compile at all, unless the "Intrinsic parameter must be an immediate value" error is our goal
   2245 //
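//For example, a helper that selects a lane at run time can be written against the wrapper
//instead of the raw intrinsic (an illustrative sketch only - get_lane16 is not part of this header):
//    static int get_lane16(__m128i v, int lane)
//    {
//        return _MM_EXTRACT_EPI16(v, lane); //expands to a switch over all 8 lanes when the
//    }                                      //compiler insists on a true immediate argument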
    2246 #if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined (__INTEL_COMPILER) )&& defined NDEBUG     //release builds with VS2012 or later or the Intel compiler can use these intrinsics directly; debug builds and VS2010 or earlier take the switch-based workaround below
   2247 
   2248     #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
   2249 
   2250     #define _MM_EXTRACT_EPI16  _mm_extract_epi16
   2251     #define _MM_INSERT_EPI16 _mm_insert_epi16
   2252 #ifdef USE_SSE4
   2253         #define _MM_EXTRACT_EPI8  _mm_extract_epi8
   2254         #define _MM_EXTRACT_EPI32  _mm_extract_epi32
   2255         #define _MM_EXTRACT_PS  _mm_extract_ps
   2256 
   2257         #define _MM_INSERT_EPI8  _mm_insert_epi8
   2258         #define _MM_INSERT_EPI32 _mm_insert_epi32
   2259         #define _MM_INSERT_PS    _mm_insert_ps
   2260 #ifdef  _NEON2SSE_64BIT
   2261             #define _MM_INSERT_EPI64 _mm_insert_epi64
   2262             #define _MM_EXTRACT_EPI64 _mm_extract_epi64
   2263 #endif
   2264 #endif //SSE4
   2265 #else
   2266     #define _NEON2SSE_COMMA ,
   2267     #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
   2268             switch(LANE)         \
   2269         {                \
   2270         case 0:     return NAME(a b, 0); \
   2271         case 1:     return NAME(a b, 1); \
   2272         case 2:     return NAME(a b, 2); \
   2273         case 3:     return NAME(a b, 3); \
   2274         case 4:     return NAME(a b, 4); \
   2275         case 5:     return NAME(a b, 5); \
   2276         case 6:     return NAME(a b, 6); \
   2277         case 7:     return NAME(a b, 7); \
   2278         case 8:     return NAME(a b, 8); \
   2279         case 9:     return NAME(a b, 9); \
   2280         case 10:    return NAME(a b, 10); \
   2281         case 11:    return NAME(a b, 11); \
   2282         case 12:    return NAME(a b, 12); \
   2283         case 13:    return NAME(a b, 13); \
   2284         case 14:    return NAME(a b, 14); \
   2285         case 15:    return NAME(a b, 15); \
   2286         default:    return NAME(a b, 0); \
   2287         }
   2288 
   2289     #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
   2290             switch(LANE)              \
   2291         {                          \
   2292         case 0:  return NAME(vec p,0); \
   2293         case 1:  return NAME(vec p,1); \
   2294         case 2:  return NAME(vec p,2); \
   2295         case 3:  return NAME(vec p,3); \
   2296         case 4:  return NAME(vec p,4); \
   2297         case 5:  return NAME(vec p,5); \
   2298         case 6:  return NAME(vec p,6); \
   2299         case 7:  return NAME(vec p,7); \
   2300         default: return NAME(vec p,0); \
   2301         }
   2302 
   2303     #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
   2304             switch(LANE)              \
   2305         {                          \
   2306         case case0:  return NAME(vec p,case0); \
   2307         case case1:  return NAME(vec p,case1); \
   2308         case case2:  return NAME(vec p,case2); \
   2309         case case3:  return NAME(vec p,case3); \
   2310         default:     return NAME(vec p,case0); \
   2311         }
   2312 
   2313     _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
   2314     {
   2315         _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
   2316     }
   2317 
   2318     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
   2319     {
   2320         _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
   2321     }
   2322 
   2323     _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
   2324     {
   2325         _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
   2326     }
   2327 
   2328 #ifdef USE_SSE4
   2329         _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
   2330         {
   2331             _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
   2332         }
   2333 
   2334         _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
   2335         {
   2336             _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
   2337         }
   2338 
   2339         _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
   2340         {
   2341             _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
   2342         }
   2343 
   2344         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
   2345         {
   2346             _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
   2347         }
   2348 
   2349         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
   2350         {
   2351             _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
   2352         }
   2353 
   2354 #ifdef  _NEON2SSE_64BIT
   2355             //the special case of functions available only for SSE4 and 64-bit build.
   2356             _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
   2357             {
   2358                 switch(LANE) {
   2359                 case 0:
   2360                     return _mm_insert_epi64(vec,  p, 0);
   2361                 case 1:
   2362                     return _mm_insert_epi64(vec,  p, 1);
   2363                 default:
   2364                     return _mm_insert_epi64(vec,  p, 0);
   2365                 }
   2366             }
   2367 
   2368             _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
   2369             {
   2370                 if (LANE ==0) return _mm_extract_epi64(val, 0);
   2371                 else return _mm_extract_epi64(val, 1);
   2372             }
   2373 #endif
   2374 
   2375         _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
   2376         {
   2377             _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
   2378         }
   2379 
   2380 #endif //USE_SSE4
   2381 
   2382 #endif     //#ifdef NDEBUG
   2383 
   2384 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2385 // Below are some helper functions used either to "emulate" SSE4 intrinsics on SSSE3-limited devices
    2386 // or to implement some specific commonly used operations that are missing in SSE
   2387 #ifdef USE_SSE4
   2388     #define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
   2389     #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
   2390     #define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
   2391 
   2392     #define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
   2393     #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
   2394     #define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
   2395 
   2396     #define _MM_MAX_EPI8  _mm_max_epi8
   2397     #define _MM_MAX_EPI32 _mm_max_epi32
   2398     #define _MM_MAX_EPU16 _mm_max_epu16
   2399     #define _MM_MAX_EPU32 _mm_max_epu32
   2400 
   2401     #define _MM_MIN_EPI8  _mm_min_epi8
   2402     #define _MM_MIN_EPI32 _mm_min_epi32
   2403     #define _MM_MIN_EPU16 _mm_min_epu16
   2404     #define _MM_MIN_EPU32 _mm_min_epu32
   2405 
   2406     #define _MM_BLENDV_EPI8 _mm_blendv_epi8
   2407     #define _MM_PACKUS_EPI32 _mm_packus_epi32
   2408     #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
   2409 
   2410     #define _MM_MULLO_EPI32 _mm_mullo_epi32
   2411     #define _MM_MUL_EPI32  _mm_mul_epi32
   2412 
   2413     #define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
   2414 #else     //no SSE4 !!!!!!
   2415     _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
   2416     {
   2417         __m128i zero = _mm_setzero_si128();
   2418         return _mm_unpacklo_epi8(a, zero);
   2419     }
   2420 
   2421     _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
   2422     {
   2423         __m128i zero = _mm_setzero_si128();
   2424         return _mm_unpacklo_epi16(a, zero);
   2425     }
   2426 
   2427     _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
   2428     {
   2429         __m128i zero = _mm_setzero_si128();
   2430         return _mm_unpacklo_epi32(a, zero);
   2431     }
   2432 
   2433     _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
   2434     {
   2435         __m128i zero = _mm_setzero_si128();
   2436         __m128i sign = _mm_cmpgt_epi8(zero, a);
   2437         return _mm_unpacklo_epi8(a, sign);
   2438     }
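    //Note on the trick above: _mm_cmpgt_epi8(zero, a) yields 0xff for negative bytes and 0x00 otherwise,
    //so interleaving a with that mask reproduces sign extension, e.g. the byte 0xfe (-2) becomes 0xfffe (-2)
    //while 0x05 becomes 0x0005.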
   2439 
   2440     _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
   2441     {
   2442         __m128i zero = _mm_setzero_si128();
   2443         __m128i sign = _mm_cmpgt_epi16(zero, a);
   2444         return _mm_unpacklo_epi16(a, sign);
   2445     }
   2446 
   2447     _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
   2448     {
   2449         __m128i zero = _mm_setzero_si128();
   2450         __m128i sign = _mm_cmpgt_epi32(zero, a);
   2451         return _mm_unpacklo_epi32(a, sign);
   2452     }
   2453 
   2454     _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
   2455     {
   2456         _NEON2SSE_ALIGN_16 int32_t tmp[4];
   2457         _mm_store_si128((__m128i*)tmp, vec);
   2458         return tmp[LANE];
   2459     }
   2460 
   2461     _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
   2462     {
   2463         _NEON2SSE_ALIGN_16 int8_t tmp[16];
   2464         _mm_store_si128((__m128i*)tmp, vec);
   2465         return (int)tmp[LANE];
   2466     }
   2467 
   2468     _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
   2469     {
   2470         _NEON2SSE_ALIGN_16 int32_t tmp[4];
   2471         _mm_store_si128((__m128i*)tmp, _M128i(vec));
   2472         return tmp[LANE];
   2473     }
   2474 
   2475     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
   2476     {
   2477         _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
   2478         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
   2479         __m128i vec_masked, p_masked;
   2480         pvec[LANE] = p;
   2481         mask[LANE] = 0x0;
   2482         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
   2483         p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
   2484         return _mm_or_si128(vec_masked, p_masked);
   2485     }
   2486 
   2487     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
   2488     {
   2489         _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
   2490         _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
   2491         __m128i vec_masked, p_masked;
   2492         pvec[LANE] = (int8_t)p;
   2493         mask[LANE] = 0x0;
   2494         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
   2495         p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
   2496         return _mm_or_si128(vec_masked, p_masked);
   2497     }
   2498 
   2499     _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
   2500     {
   2501         _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
   2502         __m128 tmp, vec_masked, p_masked;
    2503         mask[LANE >> 4] = 0x0; //LANE here is the _mm_insert_ps immediate (lane index * 16), hence the shift to get the actual lane
   2504         vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
   2505         p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
   2506         tmp = _mm_or_ps(vec_masked, p_masked);
   2507         return tmp;
   2508     }
   2509 
   2510     _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
   2511     {
   2512         __m128i cmp, resa, resb;
   2513         cmp = _mm_cmpgt_epi8 (a, b);
   2514         resa = _mm_and_si128 (cmp, a);
   2515         resb = _mm_andnot_si128 (cmp,b);
   2516         return _mm_or_si128(resa, resb);
   2517     }
   2518 
   2519     _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
   2520     {
   2521         __m128i cmp, resa, resb;
   2522         cmp = _mm_cmpgt_epi32(a, b);
   2523         resa = _mm_and_si128 (cmp, a);
   2524         resb = _mm_andnot_si128 (cmp,b);
   2525         return _mm_or_si128(resa, resb);
   2526     }
   2527 
   2528     _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
   2529     {
   2530         __m128i c8000, b_s, a_s, cmp;
   2531         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
   2532         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
   2533         b_s = _mm_sub_epi16 (b, c8000);
   2534         a_s = _mm_sub_epi16 (a, c8000);
   2535         cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
   2536         a_s = _mm_and_si128 (cmp,a);
   2537         b_s = _mm_andnot_si128 (cmp,b);
   2538         return _mm_or_si128(a_s, b_s);
   2539     }
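    //Worked example of the 0x8000 bias above: for a = 0xffff, b = 0x0001 a plain signed compare would
    //treat a as -1 and pick b; after subtracting 0x8000, a_s = 0x7fff and b_s = 0x8001 (-32767), so the
    //signed compare correctly selects a as the unsigned maximum.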
   2540 
   2541     _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
   2542     {
   2543         __m128i c80000000, b_s, a_s, cmp;
   2544         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
   2545         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
   2546         b_s = _mm_sub_epi32 (b, c80000000);
   2547         a_s = _mm_sub_epi32 (a, c80000000);
   2548         cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
   2549         a_s = _mm_and_si128 (cmp,a);
   2550         b_s = _mm_andnot_si128 (cmp,b);
   2551         return _mm_or_si128(a_s, b_s);
   2552     }
   2553 
   2554     _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
   2555     {
   2556         __m128i cmp, resa, resb;
   2557         cmp = _mm_cmpgt_epi8 (b, a);
   2558         resa = _mm_and_si128 (cmp, a);
   2559         resb = _mm_andnot_si128 (cmp,b);
   2560         return _mm_or_si128(resa, resb);
   2561     }
   2562 
   2563     _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
   2564     {
   2565         __m128i cmp, resa, resb;
   2566         cmp = _mm_cmpgt_epi32(b, a);
   2567         resa = _mm_and_si128 (cmp, a);
   2568         resb = _mm_andnot_si128 (cmp,b);
   2569         return _mm_or_si128(resa, resb);
   2570     }
   2571 
   2572     _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
   2573     {
   2574         __m128i c8000, b_s, a_s, cmp;
   2575         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
   2576         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
   2577         b_s = _mm_sub_epi16 (b, c8000);
   2578         a_s = _mm_sub_epi16 (a, c8000);
   2579         cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
   2580         a_s = _mm_and_si128 (cmp,a);
   2581         b_s = _mm_andnot_si128 (cmp,b);
   2582         return _mm_or_si128(a_s, b_s);
   2583     }
   2584 
   2585     _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
   2586     {
   2587         __m128i c80000000, b_s, a_s, cmp;
   2588         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
   2589         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
   2590         b_s = _mm_sub_epi32 (b, c80000000);
   2591         a_s = _mm_sub_epi32 (a, c80000000);
   2592         cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
   2593         a_s = _mm_and_si128 (cmp,a);
   2594         b_s = _mm_andnot_si128 (cmp,b);
   2595         return _mm_or_si128(a_s, b_s);
   2596     }
   2597 
    2598     _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8 !!!!! - please see below
    2599     {
    2600         //it assumes every mask byte is either 0xff or 0 (as in all use cases below), while the original _mm_blendv_epi8 looks only at the MSB of each mask byte.
   2601         __m128i a_masked, b_masked;
   2602         b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
   2603         a_masked = _mm_andnot_si128 (mask,a);
   2604         return _mm_or_si128(a_masked, b_masked);
   2605     }
   2606 
   2607     _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
   2608     {
   2609         _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
   2610         __m128i a16, b16, res, reshi,cmp, zero;
   2611         zero = _mm_setzero_si128();
   2612         a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
   2613         b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
   2614         res = _mm_unpacklo_epi64(a16, b16); //result without saturation
   2615         reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
    2616         cmp = _mm_cmpgt_epi16(zero, reshi); //if the original 32-bit value was negative the packed result should be zero
    2617         res = _mm_andnot_si128(cmp,res); //zero out the lanes where reshi < 0, keep the rest unchanged
    2618         cmp = _mm_cmpgt_epi16(reshi,zero); //reshi > 0 means the value exceeds the 16-bit range
    2619         return _mm_or_si128(res, cmp); //saturate such lanes to 0xffff
   2620     }
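    //For example, the 32-bit inputs 300, 70000 and -5 pack to 0x012c, 0xffff (saturated high) and
    //0x0000 (saturated low) respectively in the 16-bit result lanes.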
   2621 
   2622     _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
   2623     {
   2624         _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
   2625         __m128i a16, res, reshi,cmp, zero;
   2626         zero = _mm_setzero_si128();
   2627         a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
   2628         reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
    2629         cmp = _mm_cmpgt_epi16(zero, reshi); //if the original 32-bit value was negative the packed result should be zero
    2630         res = _mm_andnot_si128(cmp, a16); //zero out the lanes where reshi < 0, keep the rest unchanged
    2631         cmp = _mm_cmpgt_epi16(reshi,zero); //reshi > 0 means the value exceeds the 16-bit range
    2632         return _mm_or_si128(res, cmp); //saturate such lanes to 0xffff
   2633     }
   2634 
   2635 
   2636     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
   2637     {
   2638         _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
   2639         int64_t res64;
   2640         int i;
   2641         _mm_store_si128((__m128i*)atmp, a);
   2642         _mm_store_si128((__m128i*)btmp, b);
   2643         for (i = 0; i<4; i++) {
   2644             res64 = atmp[i] * btmp[i];
   2645             res[i] = (int)(res64 & 0xffffffff);
   2646         }
   2647         return _mm_load_si128((__m128i*)res);
   2648     }
   2649 
   2650     _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
   2651     {
   2652         __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
   2653         sign = _mm_xor_si128 (a, b);
    2654         sign =  _mm_srai_epi32 (sign, 31); //propagate the sign bit to all bits: all ones where the signs of a and b differ, all zeros otherwise
    2655         zero = _mm_setzero_si128();
    2656         a_neg = _mm_abs_epi32 (a); //absolute value of a
    2657         b_neg = _mm_abs_epi32 (b); //absolute value of b
   2658         mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
   2659         mul_us_neg = _mm_sub_epi64(zero, mul_us);
   2660         mul_us_neg = _mm_and_si128(sign, mul_us_neg);
   2661         mul_us = _mm_andnot_si128(sign, mul_us);
   2662         return _mm_or_si128 (mul_us, mul_us_neg);
   2663     }
   2664 
   2665     _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
   2666     {
   2667         __m128i res;
    2668         res = _mm_cmpeq_epi32 (a, b); //per 32-bit lane equality masks
    2669         return _mm_and_si128 (res, _mm_shuffle_epi32 (res, 1 | (0 << 2) | (3 << 4) | (2 << 6))); //a 64-bit lane compares equal only if both its 32-bit halves are equal
   2670     }
   2671 #endif     //SSE4
   2672 
    2673 //the special case: fallback 64-bit lane insert/extract used when the SSE4 64-bit intrinsics are not available (32-bit builds or no SSE4)
   2674 _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
   2675 {
   2676     _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
   2677     _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
   2678     __m128i vec_masked, p_masked;
   2679     pvec[LANE] = p;
   2680     mask[LANE] = 0x0;
   2681     vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
   2682     p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
   2683     return _mm_or_si128(vec_masked, p_masked);
   2684 }
   2685 
   2686 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
   2687 {
   2688     _NEON2SSE_ALIGN_16 int64_t tmp[2];
   2689     _mm_store_si128((__m128i*)tmp, val);
   2690     return tmp[LANE];
   2691 }
   2692 
   2693 #ifndef _NEON2SSE_64BIT_SSE4
   2694     #define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
   2695     #define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
   2696 #endif
   2697 
   2698 int32x4_t  vqd_s32(int32x4_t a); //Doubling saturation for signed ints
   2699 _NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
   2700 {
   2701     //Overflow happens only if a and sum have the opposite signs
   2702     __m128i c7fffffff, res, res_sat, res_xor_a;
   2703     c7fffffff = _mm_set1_epi32(0x7fffffff);
   2704     res = _mm_slli_epi32 (a, 1); // res = a*2
   2705     res_sat = _mm_srli_epi32(a, 31);
   2706     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   2707     res_xor_a = _mm_xor_si128(res, a);
    2708     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if the sign changed (overflow), all zeros otherwise
   2709     res_sat = _mm_and_si128(res_xor_a, res_sat);
   2710     res = _mm_andnot_si128(res_xor_a, res);
   2711     return _mm_or_si128(res, res_sat);
   2712 }
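//Worked example for the saturation logic above: a = 0x40000000 doubled is 0x80000000, so res^a has its
//sign bit set (overflow detected) and the lane is replaced by the saturation value (a>>31)+0x7fffffff = 0x7fffffff;
//for a = 0x12345678 no sign change occurs and the plain doubled value 0x2468acf0 is returned.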
   2713 
   2714 
   2715 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   2716 //*************************************************************************
   2717 //*************************************************************************
    2718 //*****************  Function redefinitions/implementations start here *****
   2719 //*************************************************************************
   2720 //*************************************************************************
   2721 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   2722 
    2723 /*If a unified intrinsics solution is necessary, please define your SSE intrinsic wrappers here like in the following sample:
    2724 #ifdef ARM
    2725 #define vector_addq_s32 vaddq_s32
    2726 #else //if we have IA
    2727 #define vector_addq_s32 _mm_add_epi32
    2728 #endif
   2729 
   2730 ********************************************************************************************
   2731 Functions below are organised in the following way:
   2732 
    2733 Each NEON intrinsic function is implemented in one of the following ways:
    2734 1.  a full x86 SSE equivalent exists - the x86 intrinsic simply follows the NEON prototype under the corresponding #define statement
    2735 2.  the x86 implementation needs more than one x86 intrinsic - it is shaped as an inlined C function with a return statement
    2736 3.  a reference to another NEON function that returns the same result and is implemented in x86 as above - it is shaped as a matching NEON function definition
    2737 4.  for about 5% of the functions the corresponding x86 SIMD operation is unavailable or too inefficient,
    2738 so a serial implementation is provided along with the corresponding compiler warning. If such functions are on your app's critical path,
    2739 please consider removing them from your code.
   2740 */
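//For instance (an illustrative sketch only - add3 is not part of this header):
//    int32x4_t add3(int32x4_t a, int32x4_t b, int32x4_t c)
//    {
//        return vaddq_s32(vaddq_s32(a, b), c); //option 1: each vaddq_s32 is a plain #define to _mm_add_epi32
//    }
//while an option 2 function such as vhaddq_s16 expands inline to a short SSE sequence.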
   2741 
   2742 //***********************************************************************
   2743 //************************      Vector add   *****************************
   2744 //***********************************************************************
   2745 int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
   2746 _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
   2747 {
   2748     int8x8_t res64;
   2749     return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
   2750 }
   2751 
   2752 
   2753 int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
   2754 _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
   2755 {
   2756     int16x4_t res64;
   2757     return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
   2758 }
   2759 
   2760 
   2761 int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
   2762 _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
   2763 {
   2764     int32x2_t res64;
   2765     return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
   2766 }
   2767 
   2768 
   2769 int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
   2770 _NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
   2771 {
   2772     int64x1_t res64;
   2773     res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
   2774     return res64;
   2775 }
   2776 
   2777 
   2778 float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
   2779 _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
   2780 {
   2781     __m128 res;
   2782     __m64_128 res64;
   2783     res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
   2784     _M64f(res64, res);
   2785     return res64;
   2786 }
   2787 
   2788 uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
   2789 #define vadd_u8 vadd_s8
   2790 
   2791 uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
   2792 #define vadd_u16 vadd_s16
   2793 
   2794 uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
   2795 #define vadd_u32 vadd_s32
   2796 
   2797 uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
   2798 _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
   2799 {
   2800     uint64x1_t res64;
   2801     res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
   2802     return res64;
   2803 }
   2804 
   2805 
   2806 int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
   2807 #define vaddq_s8 _mm_add_epi8
   2808 
   2809 int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
   2810 #define vaddq_s16 _mm_add_epi16
   2811 
   2812 int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
   2813 #define vaddq_s32 _mm_add_epi32
   2814 
   2815 int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
   2816 #define vaddq_s64 _mm_add_epi64
   2817 
   2818 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
   2819 #define vaddq_f32 _mm_add_ps
   2820 
   2821 uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
   2822 #define vaddq_u8 _mm_add_epi8
   2823 
   2824 uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
   2825 #define vaddq_u16 _mm_add_epi16
   2826 
   2827 uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
   2828 #define vaddq_u32 _mm_add_epi32
   2829 
   2830 uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
   2831 #define vaddq_u64 _mm_add_epi64
   2832 
   2833 //**************************** Vector long add *****************************:
   2834 //***********************************************************************
   2835 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
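//For example, vaddl_u8 applied to lanes holding 200 and 100 yields 300 in the corresponding 16-bit
//result lane - no wraparound, unlike vadd_u8 which would give 44.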
   2836 int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
   2837 _NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
   2838 {
   2839     __m128i a16, b16;
   2840     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
   2841     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   2842     return _mm_add_epi16 (a16, b16);
   2843 }
   2844 
   2845 int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
   2846 _NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
   2847 {
   2848     __m128i a32, b32;
   2849     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
   2850     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
   2851     return _mm_add_epi32 (a32, b32);
   2852 }
   2853 
   2854 int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
   2855 _NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
   2856 {
   2857     //may be not optimal
   2858     __m128i a64, b64;
   2859     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
   2860     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
   2861     return _mm_add_epi64 ( a64, b64);
   2862 }
   2863 
   2864 uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
   2865 _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
   2866 {
   2867     __m128i a16, b16;
   2868     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
   2869     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
   2870     return _mm_add_epi16 (a16, b16);
   2871 }
   2872 
    2873 uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
    2874 _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
   2875 {
   2876     __m128i a32, b32;
   2877     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
   2878     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
   2879     return _mm_add_epi32 (a32, b32);
   2880 }
   2881 
   2882 uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
   2883 _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
   2884 {
   2885     //may be not optimal
   2886     __m128i a64, b64;
   2887     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
   2888     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
   2889     return _mm_add_epi64 (a64, b64);
   2890 }
   2891 
   2892 //***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
   2893 //*************** *********************************************************************
   2894 int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
   2895 _NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
   2896 {
   2897     __m128i b16;
   2898     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   2899     return _mm_add_epi16 (a, b16);
   2900 }
   2901 
   2902 int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
   2903 _NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
   2904 {
   2905     __m128i b32;
   2906     b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
   2907     return _mm_add_epi32 (a, b32);
   2908 }
   2909 
   2910 int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
   2911 _NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
   2912 {
   2913     __m128i b64;
   2914     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
   2915     return _mm_add_epi64 (a, b64);
   2916 }
   2917 
   2918 uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
   2919 _NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
   2920 {
   2921     __m128i b16;
   2922     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
   2923     return _mm_add_epi16 (a, b16);
   2924 }
   2925 
    2926 uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
    2927 _NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
   2928 {
   2929     __m128i b32;
   2930     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
   2931     return _mm_add_epi32 (a, b32);
   2932 }
   2933 
   2934 uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
   2935 _NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
   2936 {
   2937     __m128i b64;
   2938     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
   2939     return _mm_add_epi64 (a, b64);
   2940 }
   2941 
   2942 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
   2943 //*************************************************************************************************************************
   2944 int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
   2945 _NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
   2946 {
   2947     int8x8_t res64;
    2948     return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
   2949 }
   2950 
   2951 
   2952 int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
   2953 _NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
   2954 {
   2955     int16x4_t res64;
   2956     return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
   2957 }
   2958 
   2959 
   2960 int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
   2961 _NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
   2962 {
   2963     int32x2_t res64;
   2964     return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
   2965 }
   2966 
   2967 
    2968 uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.U8 d0,d0,d0
   2969 _NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
   2970 {
   2971     uint8x8_t res64;
   2972     return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
   2973 }
   2974 
   2975 
    2976 uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.U16 d0,d0,d0
   2977 _NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
   2978 {
   2979     uint16x4_t res64;
   2980     return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
   2981 }
   2982 
   2983 
   2984 uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
   2985 _NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
   2986 {
   2987     uint32x2_t res64;
   2988     return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
   2989 }
   2990 
   2991 
   2992 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
   2993 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
   2994 {
   2995     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   2996     __m128i tmp1, tmp2;
   2997     tmp1 = _mm_and_si128(a,b);
   2998     tmp2 = _mm_xor_si128(a,b);
   2999     tmp2 = vshrq_n_s8(tmp2,1);
   3000     return _mm_add_epi8(tmp1,tmp2);
   3001 }
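//A quick check of the identity used above: for x = 5, y = 3 we have (x&y) = 1 and (x^y)>>1 = 3,
//giving 4 = (5+3)>>1, while the 9-bit intermediate sum is never formed.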
   3002 
    3003 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
   3004 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
   3005 {
   3006     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   3007     __m128i tmp1, tmp2;
   3008     tmp1 = _mm_and_si128(a,b);
   3009     tmp2 = _mm_xor_si128(a,b);
   3010     tmp2 = _mm_srai_epi16(tmp2,1);
   3011     return _mm_add_epi16(tmp1,tmp2);
   3012 }
   3013 
   3014 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
   3015 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
   3016 {
   3017     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   3018     __m128i tmp1, tmp2;
   3019     tmp1 = _mm_and_si128(a,b);
   3020     tmp2 = _mm_xor_si128(a,b);
   3021     tmp2 = _mm_srai_epi32(tmp2,1);
   3022     return _mm_add_epi32(tmp1,tmp2);
   3023 }
   3024 
   3025 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
   3026 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
   3027 {
   3028     __m128i c1, sum, res;
   3029     c1 = _mm_set1_epi8(1);
   3030     sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
   3031     res = _mm_xor_si128(a, b); //for rounding compensation
   3032     res = _mm_and_si128(res,c1); //for rounding compensation
   3033     return _mm_sub_epi8 (sum, res); //actual rounding compensation
   3034 }
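//Rounding compensation check: _mm_avg_epu8 computes (a+b+1)>>1, so for a = 1, b = 2 it returns 2;
//subtracting (a^b)&1 = 1 gives 1, the truncated halving sum (1+2)>>1.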
   3035 
    3036 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
    3037 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
   3038 {
   3039     __m128i sum, res;
   3040     sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
   3041     res = _mm_xor_si128(a, b); //for rounding compensation
   3042     res = _mm_slli_epi16 (res,15); //shift left  then back right to
   3043     res = _mm_srli_epi16 (res,15); //get 1 or zero
   3044     return _mm_sub_epi16 (sum, res); //actual rounding compensation
   3045 }
   3046 
   3047 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
   3048 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
   3049 {
   3050     //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   3051     __m128i tmp1, tmp2;
   3052     tmp1 = _mm_and_si128(a,b);
   3053     tmp2 = _mm_xor_si128(a,b);
   3054     tmp2 = _mm_srli_epi32(tmp2,1);
   3055     return _mm_add_epi32(tmp1,tmp2);
   3056 }
   3057 
   3058 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
   3059 //*****************************************************************************************************************************
   3060 int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
   3061 _NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
   3062 {
   3063     int8x8_t res64;
   3064     return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
   3065 }
   3066 
   3067 
   3068 int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
   3069 _NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
   3070 {
   3071     int16x4_t res64;
   3072     return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
   3073 }
   3074 
   3075 
   3076 int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
   3077 _NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
   3078 {
   3079     int32x2_t res64;
   3080     return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
   3081 }
   3082 
   3083 
   3084 uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
   3085 _NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
   3086 {
   3087     uint8x8_t res64;
   3088     return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
   3089 }
   3090 
   3091 
    3092 uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
   3093 _NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
   3094 {
   3095     uint16x4_t res64;
   3096     return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
   3097 }
   3098 
   3099 
   3100 uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
   3101 _NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
   3102 {
   3103     uint32x2_t res64;
   3104     return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
   3105 }
   3106 
   3107 
   3108 int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
   3109 _NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
   3110 {
   3111     //no signed average in x86 SIMD, go to unsigned
   3112     __m128i c128, au, bu, sum;
   3113     c128 = _mm_set1_epi8(0x80); //-128
   3114     au = _mm_sub_epi8(a, c128); //add 128
   3115     bu = _mm_sub_epi8(b, c128); //add 128
   3116     sum = _mm_avg_epu8(au, bu);
   3117     return _mm_add_epi8 (sum, c128); //sub 128
   3118 }
   3119 
   3120 int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
   3121 _NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
   3122 {
   3123     //no signed average in x86 SIMD, go to unsigned
   3124     __m128i cx8000, au, bu, sum;
   3125     cx8000 = _mm_set1_epi16(0x8000); // - 32768
   3126     au = _mm_sub_epi16(a, cx8000); //add 32768
   3127     bu = _mm_sub_epi16(b, cx8000); //add 32768
   3128     sum = _mm_avg_epu16(au, bu);
   3129     return _mm_add_epi16 (sum, cx8000); //sub 32768
   3130 }
   3131 
   3132 int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
   3133 _NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
   3134 {
   3135     //need to avoid overflow
   3136     __m128i a2, b2, res, sum;
   3137     a2 = _mm_srai_epi32(a,1); //a2=a/2;
   3138     b2 = _mm_srai_epi32(b,1); // b2=b/2;
   3139     res = _mm_or_si128(a,b); //for rounding
   3140     res = _mm_slli_epi32 (res,31); //shift left  then back right to
   3141     res = _mm_srli_epi32 (res,31); //get 1 or zero
   3142     sum = _mm_add_epi32(a2,b2);
   3143     return _mm_add_epi32(sum,res);
   3144 }
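//Rounding check for the code above: (a+b+1)>>1 equals (a>>1)+(b>>1) plus one extra unit whenever a or b
//is odd, and that unit is exactly (a|b)&1; e.g. a = 1, b = 2 gives 0 + 1 + 1 = 2 = (1+2+1)>>1.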
   3145 
   3146 uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
   3147 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
   3148 
    3149 uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
   3150 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
   3151 
   3152 
   3153 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
   3154 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
   3155 {
   3156     //need to avoid overflow
   3157     __m128i a2, b2, res, sum;
   3158     a2 = _mm_srli_epi32(a,1); //a2=a/2;
   3159     b2 = _mm_srli_epi32(b,1); // b2=b/2;
   3160     res = _mm_or_si128(a,b); //for rounding
   3161     res = _mm_slli_epi32 (res,31); //shift left  then back right to
   3162     res = _mm_srli_epi32 (res,31); //get 1 or zero
   3163     sum = _mm_add_epi32(a2,b2);
   3164     return _mm_add_epi32(sum,res);
   3165 }
   3166 
   3167 //****************** VQADD: Vector saturating add ************************
   3168 //************************************************************************
   3169 int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
   3170 _NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
   3171 {
   3172     int8x8_t res64;
   3173     return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
   3174 }
   3175 
   3176 
   3177 int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
   3178 _NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
   3179 {
   3180     int16x4_t res64;
   3181     return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
   3182 }
   3183 
   3184 
   3185 int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
   3186 _NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
   3187 {
   3188     int32x2_t res64;
   3189     return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
   3190 }
   3191 
   3192 
   3193 int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
   3194 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3195 {
   3196     int64x1_t res;
   3197     uint64_t a64, b64;
   3198     a64 = a.m64_u64[0];
   3199     b64 = b.m64_u64[0];
   3200     res.m64_u64[0] = a64 + b64;
   3201     a64 = (a64 >> 63) + (~_SIGNBIT64);
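    //a64 now holds the saturation value: INT64_MAX if a was non-negative, INT64_MIN if a was negative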
   3202     if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
   3203         res.m64_u64[0] = a64;
   3204     }
   3205     return res;
   3206 }
   3207 
   3208 uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
   3209 _NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
   3210 {
   3211     uint8x8_t res64;
   3212     return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
   3213 }
   3214 
   3215 
    3216 uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
   3217 _NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
   3218 {
   3219     uint16x4_t res64;
   3220     return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
   3221 }
   3222 
   3223 
   3224 uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
   3225 _NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
   3226 {
   3227     uint32x2_t res64;
   3228     return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
   3229 }
   3230 
   3231 
   3232 uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
   3233 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3234 {
   3235     _NEON2SSE_ALIGN_16 uint64_t a64, b64;
   3236     uint64x1_t res;
   3237     a64 = a.m64_u64[0];
   3238     b64 = b.m64_u64[0];
   3239     res.m64_u64[0] = a64 + b64;
   3240     if (res.m64_u64[0] < a64) {
   3241         res.m64_u64[0] = ~(uint64_t)0;
   3242     }
   3243     return res;
   3244 }
   3245 
   3246 int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
   3247 #define vqaddq_s8 _mm_adds_epi8
   3248 
   3249 int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
   3250 #define vqaddq_s16 _mm_adds_epi16
   3251 
   3252 int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
   3253 _NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
   3254 {
    3255     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
   3256     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
   3257     c7fffffff = _mm_set1_epi32(0x7fffffff);
   3258     res = _mm_add_epi32(a, b);
   3259     res_sat = _mm_srli_epi32(a, 31);
   3260     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   3261     res_xor_a = _mm_xor_si128(res, a);
   3262     b_xor_a_ = _mm_xor_si128(b, a);
   3263     res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
    3264     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if overflow occurred, all zeros otherwise
   3265     res_sat = _mm_and_si128(res_xor_a, res_sat);
   3266     res = _mm_andnot_si128(res_xor_a, res);
   3267     return _mm_or_si128(res, res_sat);
   3268 }
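//Overflow example for the code above: a = b = 0x7fffffff gives res = 0xfffffffe (negative); res^a has its
//sign bit set while b^a does not, so the lane is overwritten with the saturation value (a>>31)+0x7fffffff = 0x7fffffff.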
   3269 
   3270 int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
   3271 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3272 {
   3273     _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   3274     _mm_store_si128((__m128i*)atmp, a);
   3275     _mm_store_si128((__m128i*)btmp, b);
   3276     res[0] = atmp[0] + btmp[0];
   3277     res[1] = atmp[1] + btmp[1];
   3278 
   3279     atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
   3280     atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
   3281 
   3282     if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
   3283         res[0] = atmp[0];
   3284     }
   3285     if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
   3286         res[1] = atmp[1];
   3287     }
   3288     return _mm_load_si128((__m128i*)res);
   3289 }
   3290 
   3291 uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
   3292 #define vqaddq_u8 _mm_adds_epu8
   3293 
    3294 uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
   3295 #define vqaddq_u16 _mm_adds_epu16
   3296 
   3297 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
   3298 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
   3299 {
   3300     __m128i c80000000, cmp, subsum, suba, sum;
   3301     c80000000 = _mm_set1_epi32 (0x80000000);
   3302     sum = _mm_add_epi32 (a, b);
   3303     subsum = _mm_sub_epi32 (sum, c80000000);
   3304     suba = _mm_sub_epi32 (a, c80000000);
   3305     cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
   3306     return _mm_or_si128 (sum, cmp); //saturation
   3307 }
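//Example for the unsigned saturation above: a = 0xffffffff, b = 1 wraps to sum = 0; after the 0x80000000 bias
//the signed compare sees suba (0x7fffffff) > subsum (0x80000000), so the all-ones compare mask is ORed in and
//the lane saturates to 0xffffffff.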
   3308 
   3309 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
   3310 #ifdef USE_SSE4
   3311     _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
   3312     {
   3313         __m128i c80000000, sum, cmp, suba, subsum;
   3314         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
   3315         sum = _mm_add_epi64 (a, b);
   3316         subsum = _mm_sub_epi64 (sum, c80000000);
   3317         suba = _mm_sub_epi64 (a, c80000000);
   3318         cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
   3319         return _mm_or_si128 (sum, cmp); //saturation
   3320     }
   3321 #else
   3322     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3323     {
   3324         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   3325         _mm_store_si128((__m128i*)atmp, a);
   3326         _mm_store_si128((__m128i*)btmp, b);
   3327         res[0] = atmp[0] + btmp[0];
   3328         res[1] = atmp[1] + btmp[1];
   3329         if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
   3330         if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
   3331         return _mm_load_si128((__m128i*)(res));
   3332     }
   3333 #endif
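         // Both variants above work around the lack of an unsigned SIMD compare: subtracting the sign
         // bias 0x80000000 (or its 64-bit counterpart) from both values maps unsigned order onto signed
         // order, so "a > sum (unsigned)", i.e. "the addition wrapped", becomes an ordinary signed compare.
         // Scalar sketch of the same trick (hypothetical helper for illustration only, assumes <stdint.h>):
         #if 0  //illustrative sketch only, not compiled
         static uint32_t sat_add_u32_sketch(uint32_t a, uint32_t b)
         {
             uint32_t sum = a + b;
             int32_t suba   = (int32_t)(a   - 0x80000000u);                     //bias both values by the sign bit
             int32_t subsum = (int32_t)(sum - 0x80000000u);
             return (suba > subsum) ? 0xffffffffu : sum;                        //wrapped -> saturate to all ones
         }
         #endif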
   3334 
   3335 
   3336 //******************* Vector add high half (truncated)  ******************
   3337 //************************************************************************
   3338 int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
   3339 _NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
   3340 {
   3341     int8x8_t res64;
   3342     __m128i sum;
   3343     sum = _mm_add_epi16 (a, b);
   3344     sum = _mm_srai_epi16 (sum, 8);
   3345     sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
   3346     return64(sum);
   3347 }
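         // Per-lane reference for the "add high half" family: the addition wraps at the element width and
         // only the upper half of each sum is kept (hypothetical helper for illustration only):
         #if 0  //illustrative sketch only, not compiled
         static int8_t vaddhn_s16_lane_sketch(int16_t a, int16_t b)
         {
             int16_t sum = (int16_t)(a + b);                                    //wraps exactly like _mm_add_epi16
             return (int8_t)(sum >> 8);                                         //truncating narrow: keep the high 8 bits
         }
         #endif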
   3348 
   3349 int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
   3350 _NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
   3351 {
   3352     int16x4_t res64;
   3353     __m128i sum;
   3354     sum = _mm_add_epi32 (a, b);
   3355     sum = _mm_srai_epi32(sum, 16);
   3356     sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
   3357     return64(sum);
   3358 }
   3359 
   3360 int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
   3361 _NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
   3362 {
   3363     int32x2_t res64;
   3364     __m128i sum;
   3365     sum = _mm_add_epi64 (a, b);
   3366     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
   3367     return64(sum);
   3368 }
   3369 
   3370 uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
   3371 _NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
   3372 {
   3373     uint8x8_t res64;
   3374     __m128i sum;
   3375     sum = _mm_add_epi16 (a, b);
   3376     sum = _mm_srli_epi16 (sum, 8);
   3377     sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
   3378     return64(sum);
   3379 }
   3380 
   3381 uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
   3382 _NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
   3383 {
   3384     uint16x4_t res64;
   3385     __m128i sum;
   3386     sum = _mm_add_epi32 (a, b);
   3387     sum = _mm_srli_epi32 (sum, 16);
   3388     sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
   3389     return64(sum);
   3390 }
   3391 
   3392 uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
   3393 #define vaddhn_u64 vaddhn_s64
   3394 
   3395 //*********** Vector rounding add high half: vraddhn_<type> ******************.
   3396 //***************************************************************************
   3397 int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
   3398 _NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
   3399 {
   3400     int8x8_t res64;
   3401     __m128i sum, mask1;
   3402     sum = _mm_add_epi16 (a, b);
    3403     mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
   3404     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
   3405     sum = _mm_srai_epi16 (sum, 8); //get high half
   3406     sum = _mm_add_epi16 (sum, mask1); //actual rounding
   3407     sum = _mm_packs_epi16 (sum, sum);
   3408     return64(sum);
   3409 }
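         // The rounding above relies on the identity (sum + 0x80) >> 8 == (sum >> 8) + bit7(sum) within the
         // 8 result bits, so only bit 7 of the wrapped sum has to be isolated and added to the truncated
         // high half. Scalar sketch of one lane (hypothetical helper for illustration only):
         #if 0  //illustrative sketch only, not compiled
         static int8_t vraddhn_s16_lane_sketch(int16_t a, int16_t b)
         {
             int16_t sum = (int16_t)(a + b);                                    //wraps exactly like _mm_add_epi16
             int16_t round_bit = (int16_t)(((uint16_t)sum >> 7) & 1);           //bit 7 of the sum
             return (int8_t)((sum >> 8) + round_bit);                           //== (sum + 0x80) >> 8 in the low 8 bits
         }
         #endif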
   3410 
   3411 int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
   3412 _NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
   3413 {
    3414     //SIMD may not be optimal, a serial version may be faster
   3415     int16x4_t res64;
   3416     __m128i sum, mask1;
   3417     sum = _mm_add_epi32 (a, b);
    3418     mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
   3419     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
   3420     sum = _mm_srai_epi32 (sum, 16); //get high half
   3421     sum = _mm_add_epi32 (sum, mask1); //actual rounding
   3422     sum = _mm_packs_epi32 (sum, sum);
   3423     return64(sum);
   3424 }
   3425 
   3426 int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
   3427 _NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
   3428 {
    3429     //SIMD may not be optimal, a serial version may be faster
   3430     int32x2_t res64;
   3431     __m128i sum, mask1;
   3432     sum = _mm_add_epi64 (a, b);
    3433     mask1 = _mm_slli_epi64(sum, 32); //shift left then back right to
    3434     mask1 = _mm_srli_epi64(mask1,31); //get the 31-st bit of each sum, placed at bit 32 of the 64-bit lane
   3435     sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
   3436     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
   3437     return64(sum);
   3438 }
   3439 
   3440 uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
   3441 _NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
   3442 {
   3443     uint8x8_t res64;
   3444     __m128i sum, mask1;
   3445     sum = _mm_add_epi16 (a, b);
    3446     mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
   3447     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
   3448     sum = _mm_srai_epi16 (sum, 8); //get high half
   3449     sum = _mm_add_epi16 (sum, mask1); //actual rounding
   3450     sum = _mm_packus_epi16 (sum, sum);
   3451     return64(sum);
   3452 }
   3453 
   3454 uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
   3455 _NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
   3456 {
    3457     //SIMD may not be optimal, a serial version may be faster
   3458     uint16x4_t res64;
   3459     __m128i sum, mask1;
   3460     sum = _mm_add_epi32 (a, b);
    3461     mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
   3462     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
   3463     sum = _mm_srai_epi32 (sum, 16); //get high half
   3464     sum = _mm_add_epi32 (sum, mask1); //actual rounding
   3465     sum = _MM_PACKUS1_EPI32 (sum);
   3466     return64(sum);
   3467 }
   3468 
   3469 uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
   3470 #define vraddhn_u64 vraddhn_s64
   3471 
   3472 //**********************************************************************************
   3473 //*********             Multiplication            *************************************
   3474 //**************************************************************************************
   3475 
   3476 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
    3477 //Since the result is not widened, these functions are equivalent to "multiply low" in x86
   3478 int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
   3479 _NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
   3480 {
   3481     // no 8 bit simd multiply, need to go to 16 bits in SSE
   3482     int8x8_t res64;
   3483     __m128i a128, b128, res;
   3484     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   3485     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
   3486     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
   3487     res = _mm_mullo_epi16 (a128, b128);
   3488     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
   3489     return64(res);
   3490 }
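         // The shuffle mask above works because, after _mm_mullo_epi16, the byte NEON would return
         // (the low byte of each 16-bit product) sits at the even byte positions 0,2,...,14; gathering
         // those into the low 8 bytes yields the wrapped 8-bit products. Per-lane scalar sketch
         // (hypothetical helper for illustration only):
         #if 0  //illustrative sketch only, not compiled
         static int8_t vmul_s8_lane_sketch(int8_t a, int8_t b)
         {
             return (int8_t)((int16_t)a * (int16_t)b);                          //keep only the low 8 bits of the product
         }
         #endif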
   3491 
   3492 int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
   3493 #define vmul_s16 vmul_u16
   3494 
   3495 int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
   3496 #define vmul_s32 vmul_u32
   3497 
   3498 float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
   3499 _NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
   3500 {
   3501     float32x4_t tmp;
   3502     __m64_128 res64;
   3503     tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
   3504     _M64f(res64, tmp); //use low 64 bits
   3505     return res64;
   3506 }
   3507 
   3508 uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
   3509 _NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
   3510 {
   3511     // no 8 bit simd multiply, need to go to 16 bits in SSE
   3512     uint8x8_t res64;
   3513     __m128i mask, a128, b128, res;
   3514     mask = _mm_set1_epi16(0xff);
   3515     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
   3516     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
   3517     res = _mm_mullo_epi16 (a128, b128);
   3518     res = _mm_and_si128(res, mask); //to avoid saturation
   3519     res = _mm_packus_epi16 (res,res); //use only low 64 bits
   3520     return64(res);
   3521 }
   3522 
   3523 uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
   3524 _NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
   3525 {
   3526     uint16x4_t res64;
   3527     return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
   3528 }
   3529 
   3530 
   3531 uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
   3532 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3533 {
   3534     uint32x2_t res;
   3535     res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
   3536     res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
   3537     return res;
   3538 }
   3539 
   3540 poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
   3541 _NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
   3542 {
   3543     //may be optimized
   3544     poly8x8_t res64;
   3545     __m128i a64, b64, c1, res, tmp, bmasked;
   3546     int i;
   3547     a64 = _pM128i(a);
   3548     b64 = _pM128i(b);
   3549     c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
   3550     c1 = vshrq_n_u8(c1,7); //0x1
   3551     bmasked = _mm_and_si128(b64, c1); //0x1
   3552     res = vmulq_u8(a64, bmasked);
   3553     for(i = 1; i<8; i++) {
   3554         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
   3555         bmasked = _mm_and_si128(b64, c1); //0x1
   3556         tmp = vmulq_u8(a64, bmasked);
   3557         res = _mm_xor_si128(res, tmp);
   3558     }
   3559     return64 (res);
   3560 }
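         // poly8 multiplication is carry-less: partial products are combined with XOR rather than ADD,
         // which is why the loop above masks one bit of b at a time and XORs the partial results.
         // Scalar sketch of one lane (hypothetical helper for illustration only, assumes <stdint.h>):
         #if 0  //illustrative sketch only, not compiled
         static uint8_t pmul_p8_lane_sketch(uint8_t a, uint8_t b)
         {
             uint8_t res = 0;
             int i;
             for (i = 0; i < 8; i++) {
                 if (b & (1 << i)) res = (uint8_t)(res ^ (uint8_t)(a << i));    //XOR-accumulate shifted copies of a
             }
             return res;                                                        //low 8 bits of the carry-less product
         }
         #endif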
   3561 
   3562 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
   3563 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
   3564 {
   3565     // no 8 bit simd multiply, need to go to 16 bits
    3566     //solution may not be optimal
   3567     __m128i a16, b16, r16_1, r16_2;
   3568     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   3569     a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
   3570     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
   3571     r16_1 = _mm_mullo_epi16 (a16, b16);
   3572     //swap hi and low part of a and b to process the remaining data
   3573     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3574     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3575     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
    3576     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   3577 
   3578     r16_2 = _mm_mullo_epi16 (a16, b16);
   3579     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
   3580     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
   3581 
   3582     return _mm_unpacklo_epi64(r16_1,  r16_2);
   3583 }
   3584 
   3585 int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
   3586 #define vmulq_s16 _mm_mullo_epi16
   3587 
   3588 int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
   3589 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
   3590 
   3591 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
   3592 #define vmulq_f32 _mm_mul_ps
   3593 
   3594 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
   3595 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
   3596 {
   3597     // no 8 bit simd multiply, need to go to 16 bits
    3598     //solution may not be optimal
   3599     __m128i maskff, a16, b16, r16_1, r16_2;
   3600     maskff = _mm_set1_epi16(0xff);
   3601     a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
   3602     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
   3603     r16_1 = _mm_mullo_epi16 (a16, b16);
   3604     r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
   3605     //swap hi and low part of a and b to process the remaining data
   3606     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3607     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
    3608     a16 = _MM_CVTEPU8_EPI16 (a16); // SSE 4.1, zero extension matches the unsigned data (the masked low 8 bits are unaffected)
    3609     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
   3610 
   3611     r16_2 = _mm_mullo_epi16 (a16, b16);
   3612     r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
   3613     return _mm_packus_epi16 (r16_1,  r16_2);
   3614 }
   3615 
   3616 uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
   3617 #define vmulq_u16 _mm_mullo_epi16
   3618 
   3619 uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
   3620 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
   3621 
   3622 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
   3623 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
   3624 {
   3625     //may be optimized
   3626     __m128i c1, res, tmp, bmasked;
   3627     int i;
   3628     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
   3629     c1 = vshrq_n_u8(c1,7); //0x1
   3630     bmasked = _mm_and_si128(b, c1); //0x1
   3631     res = vmulq_u8(a, bmasked);
   3632     for(i = 1; i<8; i++) {
   3633         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
   3634         bmasked = _mm_and_si128(b, c1); //0x1
   3635         tmp = vmulq_u8(a, bmasked);
   3636         res = _mm_xor_si128(res, tmp);
   3637     }
   3638     return res;
   3639 }
   3640 
   3641 //************************* Vector long multiply ***********************************
   3642 //****************************************************************************
   3643 int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
   3644 _NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
   3645 {
   3646     //no 8 bit simd multiply, need to go to 16 bits
   3647     __m128i a16, b16;
   3648     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
   3649     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
   3650     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
   3651 }
   3652 
   3653 int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
   3654 _NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
   3655 {
   3656     #ifdef USE_SSE4
   3657         __m128i a16, b16;
   3658         a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
   3659         b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
   3660         return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
   3661     #else
   3662         __m128i low, hi, a128,b128;
   3663         a128 = _pM128i(a);
   3664         b128 = _pM128i(b);
   3665         low =  _mm_mullo_epi16(a128,b128);
   3666         hi =   _mm_mulhi_epi16(a128,b128);
   3667         return _mm_unpacklo_epi16(low,hi);
   3668     #endif
   3669 }
   3670 
   3671 int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
   3672 _NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
   3673 {
   3674     __m128i ab, ba, a128, b128;
   3675     a128 = _pM128i(a);
   3676     b128 = _pM128i(b);
   3677     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
   3678     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    3679     return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
   3680 }
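         // _MM_MUL_EPI32 (pmuldq) multiplies only the 0th and 2nd 32-bit lanes of its operands, so the two
         // unpacks above place a0,b0 and a1,b1 into exactly those lanes and the result is
         // { (int64_t)a0*b0, (int64_t)a1*b1 }. Per-lane scalar sketch (hypothetical, for illustration only):
         #if 0  //illustrative sketch only, not compiled
         static int64_t vmull_s32_lane_sketch(int32_t a, int32_t b)
         {
             return (int64_t)a * (int64_t)b;                                    //full 64-bit product, no truncation
         }
         #endif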
   3681 
   3682 uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
   3683 _NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
   3684 {
   3685     //no 8 bit simd multiply, need to go to 16 bits
   3686     __m128i a16, b16;
   3687     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
   3688     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
   3689     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
   3690 }
   3691 
    3692 uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
    3693 _NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.U16 q0,d0,d0
   3694 {
   3695     #ifdef USE_SSE4
   3696         __m128i a16, b16;
   3697         a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
   3698         b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
   3699         return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
   3700     #else
   3701         __m128i a128,b128,low, hi;
   3702         a128 = _pM128i(a);
   3703         b128 = _pM128i(b);
   3704         low =  _mm_mullo_epi16(a128,b128);
   3705         hi =   _mm_mulhi_epu16(a128,b128);
   3706         return _mm_unpacklo_epi16(low,hi);
   3707     #endif
   3708 }
   3709 
   3710 uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
   3711 _NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
   3712 {
    3713     //may not be optimal compared with a serial implementation
   3714     __m128i ab, ba, a128, b128;
   3715     a128 = _pM128i(a);
   3716     b128 = _pM128i(b);
   3717     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
   3718     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    3719     return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
   3720 }
   3721 
   3722 poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
   3723 _NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
   3724 {
   3725     //may be optimized
   3726     __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
   3727     int i;
   3728     a128 = _pM128i(a);
   3729     b128 = _pM128i(b);
   3730     c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
   3731     c1 = vshrq_n_u8(c1,7); //0x1
   3732     bmasked = _mm_and_si128(b128, c1); //0x1
   3733 
   3734     a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
   3735     bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
   3736     res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
   3737     for(i = 1; i<8; i++) {
   3738         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
   3739         bmasked = _mm_and_si128(b128, c1); //0x1
   3740         bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
   3741         tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
   3742         res = _mm_xor_si128(res, tmp);
   3743     }
   3744     return res;
   3745 }
   3746 
   3747 //****************Vector saturating doubling long multiply **************************
   3748 //*****************************************************************
   3749 int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
   3750 _NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
   3751 {
    3752     //the serial solution may be faster due to saturation
   3753     __m128i res;
   3754     res = vmull_s16(a, b);
   3755     return vqd_s32(res);
   3756 }
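         // vqdmull doubles the widened product and saturates. For 16-bit inputs the only case that can
         // overflow the 32-bit result is a == b == INT16_MIN (2 * 0x40000000). Scalar sketch of one lane
         // (hypothetical helper for illustration only, assumes <stdint.h>):
         #if 0  //illustrative sketch only, not compiled
         static int32_t vqdmull_s16_lane_sketch(int16_t a, int16_t b)
         {
             int32_t prod = (int32_t)a * (int32_t)b;                            //fits in 31 bits
             if (prod == 0x40000000) return 0x7fffffff;                         //only a == b == -32768 overflows when doubled
             return prod * 2;
         }
         #endif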
   3757 
   3758 int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
   3759 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
   3760 {
    3761     //the serial solution may be faster due to saturation
   3762     __m128i res;
   3763     res = vmull_s32(a,b);
   3764     return vqaddq_s64(res,res); //slow serial function!!!!
   3765 }
   3766 
   3767 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
   3768 //******************************************************************************************
   3769 int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
   3770 _NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
   3771 {
   3772     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
   3773     int8x8_t res64;
   3774     __m128i b128, c128, res;
   3775     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   3776     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
   3777     c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
   3778     res = _mm_mullo_epi16 (c128, b128);
   3779     res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
   3780     res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
   3781     return64(res);
   3782 }
   3783 
   3784 int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
   3785 _NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
   3786 {
   3787     int16x4_t res64;
   3788     return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
   3789 }
   3790 
   3791 
   3792 int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
   3793 _NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
   3794 {
   3795     int32x2_t res64;
   3796     __m128i res;
   3797     res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
   3798     res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
   3799     return64(res);
   3800 }
   3801 
   3802 float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
   3803 _NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
   3804 {
   3805     //fma is coming soon, but right now:
   3806     __m128 res;
   3807     __m64_128 res64;
   3808     res = _mm_mul_ps (_pM128(c), _pM128(b));
   3809     res = _mm_add_ps (_pM128(a), res);
   3810     _M64f(res64, res);
   3811     return res64;
   3812 }
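         // On FMA-capable x86 targets the multiply+add pair above could, in principle, be fused with
         // _mm_fmadd_ps from <immintrin.h> (requires the FMA instruction set, which this SSE-only header
         // does not assume). Hypothetical variant, for illustration only:
         #if 0  //illustrative sketch only, not compiled
         static float32x2_t vmla_f32_fma_sketch(float32x2_t a, float32x2_t b, float32x2_t c)
         {
             __m64_128 res64;
             _M64f(res64, _mm_fmadd_ps(_pM128(c), _pM128(b), _pM128(a)));       //c * b + a, fused
             return res64;
         }
         #endif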
   3813 
   3814 uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
   3815 _NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
   3816 {
   3817     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
   3818     uint8x8_t res64;
   3819     __m128i mask, b128, c128, res;
   3820     mask = _mm_set1_epi16(0xff);
   3821     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
   3822     c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
   3823     res = _mm_mullo_epi16 (c128, b128);
   3824     res = _mm_and_si128(res, mask); //to avoid saturation
   3825     res = _mm_packus_epi16 (res, res);
   3826     res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
   3827     return64(res);
   3828 }
   3829 
   3830 uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
   3831 #define vmla_u16 vmla_s16
   3832 
   3833 uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
   3834 #define vmla_u32 vmla_s32
   3835 
   3836 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
   3837 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
   3838 {
    3839     //solution may not be optimal
   3840     // no 8 bit simd multiply, need to go to 16 bits
   3841     __m128i b16, c16, r16_1, a_2,r16_2;
   3842     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   3843     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
   3844     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
   3845     r16_1 = _mm_mullo_epi16 (b16, c16);
   3846     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
   3847     r16_1 = _mm_add_epi8 (r16_1, a);
   3848     //swap hi and low part of a, b and c to process the remaining data
   3849     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3850     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3851     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   3852     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   3853     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
   3854 
   3855     r16_2 = _mm_mullo_epi16 (b16, c16);
   3856     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   3857     r16_2 = _mm_add_epi8(r16_2, a_2);
   3858     return _mm_unpacklo_epi64(r16_1,r16_2);
   3859 }
   3860 
   3861 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
   3862 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
   3863 {
   3864     __m128i res;
   3865     res = _mm_mullo_epi16 (c, b);
   3866     return _mm_add_epi16 (res, a);
   3867 }
   3868 
   3869 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
   3870 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
   3871 {
   3872     __m128i res;
   3873     res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
   3874     return _mm_add_epi32 (res, a);
   3875 }
   3876 
   3877 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
   3878 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
   3879 {
   3880     //fma is coming soon, but right now:
   3881     __m128 res;
   3882     res = _mm_mul_ps (c, b);
   3883     return _mm_add_ps (a, res);
   3884 }
   3885 
   3886 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
   3887 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
   3888 {
    3889     //solution may not be optimal
   3890     // no 8 bit simd multiply, need to go to 16 bits
   3891     __m128i b16, c16, r16_1, a_2, r16_2;
   3892     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   3893     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
   3894     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
   3895     r16_1 = _mm_mullo_epi16 (b16, c16);
   3896     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
   3897     r16_1 = _mm_add_epi8 (r16_1, a);
   3898     //swap hi and low part of a, b and c to process the remaining data
   3899     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3900     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3901     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   3902     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
   3903     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
   3904 
   3905     r16_2 = _mm_mullo_epi16 (b16, c16);
   3906     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   3907     r16_2 = _mm_add_epi8(r16_2, a_2);
   3908     return _mm_unpacklo_epi64(r16_1,r16_2);
   3909 }
   3910 
   3911 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
   3912 #define vmlaq_u16 vmlaq_s16
   3913 
   3914 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
   3915 #define vmlaq_u32 vmlaq_s32
   3916 
   3917 //**********************  Vector widening multiply accumulate (long multiply accumulate):
   3918 //                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
   3919 //********************************************************************************************
   3920 int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
   3921 _NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
   3922 {
   3923     int16x8_t res;
   3924     res = vmull_s8(b, c);
   3925     return _mm_add_epi16 (res, a);
   3926 }
   3927 
   3928 int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
   3929 _NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
   3930 {
    3931     //may not be optimal compared with a serial implementation
   3932     int32x4_t res;
   3933     res = vmull_s16(b,  c);
   3934     return _mm_add_epi32 (res, a);
   3935 }
   3936 
   3937 int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
   3938 _NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
   3939 {
    3940     //may not be optimal compared with a serial implementation
   3941     int64x2_t res;
   3942     res = vmull_s32( b, c);
   3943     return _mm_add_epi64 (res, a);
   3944 }
   3945 
   3946 uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
   3947 _NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
   3948 {
   3949     uint16x8_t res;
   3950     res = vmull_u8(b, c);
   3951     return _mm_add_epi16 (res, a);
   3952 }
   3953 
    3954 uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
    3955 _NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.U16 q0,d0,d0
   3956 {
    3957     //may not be optimal compared with a serial implementation
   3958     uint32x4_t res;
   3959     res = vmull_u16(b, c);
   3960     return _mm_add_epi32 (res, a);
   3961 }
   3962 
   3963 uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
   3964 _NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
   3965 {
    3966     //may not be optimal compared with a serial implementation
   3967     int64x2_t res;
   3968     res = vmull_u32( b,c);
   3969     return _mm_add_epi64 (res, a);
   3970 }
   3971 
   3972 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
   3973 //********************************************************************************************
   3974 int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
   3975 _NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
   3976 {
   3977     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
   3978     int8x8_t res64;
   3979     __m128i res;
   3980     res64 = vmul_s8(b,c);
   3981     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
   3982     return64(res);
   3983 }
   3984 
   3985 int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
   3986 _NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
   3987 {
   3988     int16x4_t res64;
   3989     return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
   3990 }
   3991 
   3992 
   3993 int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
   3994 _NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
   3995 {
   3996     int32x2_t res64;
   3997     __m128i res;
   3998     res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
   3999     res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
   4000     return64(res);
   4001 }
   4002 
   4003 float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
   4004 _NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
   4005 {
   4006     __m128 res;
   4007     __m64_128 res64;
   4008     res = _mm_mul_ps (_pM128(c), _pM128(b));
   4009     res = _mm_sub_ps (_pM128(a), res);
   4010     _M64f(res64, res);
   4011     return res64;
   4012 }
   4013 
   4014 uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
   4015 _NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
   4016 {
   4017     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
   4018     uint8x8_t res64;
   4019     __m128i res;
   4020     res64 = vmul_u8(b,c);
   4021     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
   4022     return64(res);
   4023 }
   4024 
   4025 uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
   4026 #define vmls_u16 vmls_s16
   4027 
   4028 uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
   4029 #define vmls_u32 vmls_s32
   4030 
   4031 
   4032 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
   4033 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
   4034 {
    4035     //solution may not be optimal
   4036     // no 8 bit simd multiply, need to go to 16 bits
   4037     __m128i b16, c16, r16_1, a_2, r16_2;
   4038     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   4039     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
   4040     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
   4041     r16_1 = _mm_mullo_epi16 (b16, c16);
   4042     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
   4043     r16_1 = _mm_sub_epi8 (a, r16_1);
   4044     //swap hi and low part of a, b, c to process the remaining data
   4045     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4046     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   4047     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   4048     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   4049     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
   4050 
   4051     r16_2 = _mm_mullo_epi16 (b16, c16);
   4052     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   4053     r16_2 = _mm_sub_epi8 (a_2, r16_2);
   4054     return _mm_unpacklo_epi64(r16_1,r16_2);
   4055 }
   4056 
   4057 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
   4058 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
   4059 {
   4060     __m128i res;
   4061     res = _mm_mullo_epi16 (c, b);
   4062     return _mm_sub_epi16 (a, res);
   4063 }
   4064 
   4065 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
   4066 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
   4067 {
   4068     __m128i res;
   4069     res = _MM_MULLO_EPI32 (c, b); //SSE4.1
   4070     return _mm_sub_epi32 (a, res);
   4071 }
   4072 
   4073 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
   4074 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
   4075 {
   4076     __m128 res;
   4077     res = _mm_mul_ps (c, b);
   4078     return _mm_sub_ps (a, res);
   4079 }
   4080 
   4081 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
   4082 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
   4083 {
    4084     //solution may not be optimal
   4085     // no 8 bit simd multiply, need to go to 16 bits
   4086     __m128i b16, c16, r16_1, a_2, r16_2;
   4087     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   4088     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
   4089     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
   4090     r16_1 = _mm_mullo_epi16 (b16, c16);
   4091     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
   4092     r16_1 = _mm_sub_epi8 (a, r16_1);
   4093     //swap hi and low part of a, b and c to process the remaining data
   4094     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4095     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   4096     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   4097     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
   4098     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
   4099 
   4100     r16_2 = _mm_mullo_epi16 (b16, c16);
   4101     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   4102     r16_2 = _mm_sub_epi8(a_2, r16_2);
   4103     return _mm_unpacklo_epi64(r16_1,r16_2);
   4104 }
   4105 
   4106 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
   4107 #define vmlsq_u16 vmlsq_s16
   4108 
   4109 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
   4110 #define vmlsq_u32 vmlsq_s32
   4111 
   4112 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
   4113 //*************************************************************************************************************
   4114 int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
   4115 _NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
   4116 {
   4117     int16x8_t res;
   4118     res = vmull_s8(b, c);
   4119     return _mm_sub_epi16 (a, res);
   4120 }
   4121 
   4122 int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
   4123 _NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
   4124 {
    4125     //may not be optimal compared with a serial implementation
   4126     int32x4_t res;
   4127     res = vmull_s16(b,  c);
   4128     return _mm_sub_epi32 (a, res);
   4129 }
   4130 
   4131 int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
   4132 _NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
   4133 {
    4134     //may not be optimal compared with a serial implementation
   4135     int64x2_t res;
   4136     res = vmull_s32( b,c);
   4137     return _mm_sub_epi64 (a, res);
   4138 }
   4139 
   4140 uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
   4141 _NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
   4142 {
   4143     uint16x8_t res;
   4144     res = vmull_u8(b, c);
   4145     return _mm_sub_epi16 (a, res);
   4146 }
   4147 
    4148 uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
    4149 _NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.U16 q0,d0,d0
   4150 {
    4151     //may not be optimal compared with a serial implementation
   4152     uint32x4_t res;
   4153     res = vmull_u16(b, c);
   4154     return _mm_sub_epi32 (a, res);
   4155 }
   4156 
   4157 uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
   4158 _NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
   4159 {
    4160     //may not be optimal compared with a serial implementation
   4161     int64x2_t res;
   4162     res = vmull_u32( b,c);
   4163     return _mm_sub_epi64 (a, res);
   4164 }
   4165 
   4166 //******  Vector saturating doubling multiply high **********************
   4167 //*************************************************************************
   4168 int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
   4169 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   4170 {
   4171     int16x4_t res;
   4172     int32_t a32, b32, i;
   4173     for (i = 0; i<4; i++) {
   4174         a32 = (int32_t) a.m64_i16[i];
   4175         b32 = (int32_t) b.m64_i16[i];
   4176         a32 = (a32 * b32) >> 15;
   4177         res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
   4178     }
   4179     return res;
   4180 }
   4181 
   4182 int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
    4183 _NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no 32-bit multiply-high SIMD in IA32, so some tricks are needed; a serial solution may be faster
   4184 {
    4185     //may not be optimal compared with a serial solution
   4186     int32x2_t res64;
   4187     __m128i mask;
   4188     _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4189     int64x2_t mul;
   4190     mul = vmull_s32(a,b);
   4191     mul = _mm_slli_epi64(mul,1); //double the result
   4192     //at this point start treating 2 64-bit numbers as 4 32-bit
   4193     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
   4194     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4195     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4196     return64(mul);
   4197 }
   4198 
   4199 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
   4200 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
   4201 {
   4202     __m128i res, res_lo, mask;
   4203     _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
   4204     res = _mm_mulhi_epi16 (a, b);
   4205     res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
   4206     res_lo = _mm_mullo_epi16 (a, b);
   4207     res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
   4208     res = _mm_add_epi16(res, res_lo); //combine results
   4209     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
   4210     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
   4211 }
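         // The three steps above rebuild the high half of the doubled product from the SSE multiply-high /
         // multiply-low pair via the identity (2*a*b) >> 16 == ((a*b) >> 16) * 2 + bit15(a*b). Scalar sketch
         // of one lane, including the single saturating case (hypothetical helper for illustration only):
         #if 0  //illustrative sketch only, not compiled
         static int16_t vqdmulh_s16_lane_sketch(int16_t a, int16_t b)
         {
             int32_t prod = (int32_t)a * (int32_t)b;
             int32_t hi   = (prod >> 16) * 2 + ((prod >> 15) & 1);              //== (2*prod) >> 16 without 32-bit overflow
             return (hi == 0x8000) ? (int16_t)0x7fff : (int16_t)hi;             //only a == b == -32768 saturates
         }
         #endif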
   4212 
   4213 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
   4214 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   4215 {
    4216     // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
   4217     __m128i ab, ba, mask, mul, mul1;
   4218     _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4219     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
   4220     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    4221     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
   4222     mul = _mm_slli_epi64(mul,1); //double the result
   4223     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
   4224     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    4225     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
   4226     mul1 = _mm_slli_epi64(mul1,1); //double the result
   4227     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
   4228     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
   4229     mul = _mm_unpacklo_epi64(mul, mul1);
   4230     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4231     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4232 }
   4233 
   4234 //********* Vector saturating rounding doubling multiply high ****************
   4235 //****************************************************************************
    4236 //If the _mm_mulhrs_xx functions are used, the result may differ slightly from the NEON one due to different rounding rules and order of operations
   4237 int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
   4238 _NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
   4239 {
   4240     int16x4_t res64;
   4241     return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
   4242 }
   4243 
   4244 int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
   4245 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   4246 {
    4247     //may not be optimal compared with a serial solution
   4248     int32x2_t res64;
   4249     _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4250     __m128i res_sat, mask, mask1;
   4251     int64x2_t mul;
   4252     mul = vmull_s32(a,b);
   4253     res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
   4254     mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
   4255     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
   4256     mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
   4257     //at this point start treating 2 64-bit numbers as 4 32-bit
   4258     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
   4259     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4260     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4261     return64(mul);
   4262 }
   4263 
   4264 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
   4265 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
   4266 {
   4267     __m128i mask, res;
   4268     _NEON2SSE_ALIGN_16 uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
   4269     res = _mm_mulhrs_epi16 (a, b);
   4270     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
   4271     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
   4272 }
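         // For reference, the NEON semantics of VQRDMULH are sat((2*a*b + 0x8000) >> 16), while
         // _mm_mulhrs_epi16 computes ((a*b >> 14) + 1) >> 1; the cmpeq/xor pair above turns the one
         // wrapped value (0x8000, produced only by a == b == -32768) into 0x7fff. Scalar sketch of one
         // lane (hypothetical helper for illustration only):
         #if 0  //illustrative sketch only, not compiled
         static int16_t vqrdmulh_s16_lane_sketch(int16_t a, int16_t b)
         {
             int32_t prod = (int32_t)a * (int32_t)b;
             int32_t res  = ((prod >> 14) + 1) >> 1;                            //same rounding as _mm_mulhrs_epi16
             return (res == 0x8000) ? (int16_t)0x7fff : (int16_t)res;           //saturate the single overflow case
         }
         #endif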
   4273 
   4274 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
   4275 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   4276 {
    4277     // no 32-bit multiply-high SIMD in IA32; may not be optimal compared with a serial solution for the SSSE3 target
   4278     __m128i ab, ba,  mask, mul, mul1, mask1;
   4279     _NEON2SSE_ALIGN_16 uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4280     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
   4281     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    4282     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
   4283     mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
   4284     mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
   4285     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
   4286     mul = _mm_add_epi32 (mul, mask1); //actual rounding
   4287 
   4288     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
   4289     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    4290     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
   4291     mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
   4292     mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
   4293     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
   4294     mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
   4295     //at this point start treating 2 64-bit numbers as 4 32-bit
   4296     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
   4297     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
   4298     mul = _mm_unpacklo_epi64(mul, mul1);
   4299     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4300     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4301 }
   4302 
   4303 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
   4304 //*************************************************************************************************************************
   4305 int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
   4306 _NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
   4307 {
    4308     //not an optimal SIMD solution; a serial version may be faster
   4309     __m128i res32;
   4310     res32 = vmull_s16(b,  c);
   4311     res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1);
   4312     return vqaddq_s32(res32, a); //saturation
   4313 }
   4314 
   4315 int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
   4316 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
   4317 {
   4318     __m128i res64;
   4319     res64 = vmull_s32(b,c);
   4320     res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1);
   4321     return vqaddq_s64(res64, a); //saturation
   4322 }
   4323 
   4324 //************************************************************************************
   4325 //******************  Vector subtract ***********************************************
   4326 //************************************************************************************
   4327 int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
   4328 _NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
   4329 {
   4330     int8x8_t res64;
   4331     return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
   4332 }
   4333 
   4334 
   4335 int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
   4336 _NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
   4337 {
   4338     int16x4_t res64;
   4339     return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
   4340 }
   4341 
   4342 
   4343 int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
   4344 _NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
   4345 {
   4346     int32x2_t res64;
   4347     return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
   4348 }
   4349 
   4350 
   4351 int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
   4352 _NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
   4353 {
   4354     int64x1_t res64;
   4355     res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
   4356     return res64;
   4357 }
   4358 
   4359 
   4360 float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
   4361 _NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
   4362 {
   4363     float32x2_t res;
   4364     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
   4365     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
   4366     return res;
   4367 }
   4368 
   4369 uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
   4370 #define vsub_u8 vsub_s8
   4371 
   4372 uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
   4373 #define vsub_u16 vsub_s16
   4374 
   4375 uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
   4376 #define vsub_u32 vsub_s32
   4377 
   4378 
   4379 uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
   4380 _NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
   4381 {
   4382     int64x1_t res64;
   4383     res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
   4384     return res64;
   4385 }
   4386 
   4387 
   4388 int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
   4389 #define vsubq_s8 _mm_sub_epi8
   4390 
   4391 int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
   4392 #define vsubq_s16 _mm_sub_epi16
   4393 
   4394 int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
   4395 #define vsubq_s32 _mm_sub_epi32
   4396 
   4397 int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
   4398 #define vsubq_s64 _mm_sub_epi64
   4399 
   4400 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
   4401 #define vsubq_f32 _mm_sub_ps
   4402 
   4403 uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
   4404 #define vsubq_u8 _mm_sub_epi8
   4405 
   4406 uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
   4407 #define vsubq_u16 _mm_sub_epi16
   4408 
   4409 uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
   4410 #define vsubq_u32 _mm_sub_epi32
   4411 
   4412 uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
   4413 #define vsubq_u64 _mm_sub_epi64
   4414 
   4415 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
   4416 //***********************************************************************************
   4417 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
   4418 int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
   4419 _NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
   4420 {
   4421     __m128i a16, b16;
   4422     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
   4423     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   4424     return _mm_sub_epi16 (a16, b16);
   4425 }
   4426 
   4427 int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
   4428 _NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
   4429 {
   4430     __m128i a32, b32;
   4431     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
   4432     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
   4433     return _mm_sub_epi32 (a32, b32);
   4434 }
   4435 
   4436 int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
   4437 _NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
   4438 {
    4439     //may not be optimal
   4440     __m128i a64, b64;
   4441     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
   4442     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
   4443     return _mm_sub_epi64 (a64, b64);
   4444 }
   4445 
   4446 uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
   4447 _NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
   4448 {
   4449     __m128i a16, b16;
   4450     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
   4451     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
   4452     return _mm_sub_epi16 (a16, b16);
   4453 }
   4454 
    4455 uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
    4456 _NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.U16 q0,d0,d0
   4457 {
   4458     __m128i a32, b32;
   4459     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
   4460     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
   4461     return _mm_sub_epi32 (a32, b32);
   4462 }
   4463 
   4464 uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
   4465 _NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
   4466 {
    4467     //may not be optimal
   4468     __m128i a64, b64;
   4469     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
   4470     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
   4471     return _mm_sub_epi64 (a64, b64);
   4472 }
   4473 
   4474 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
   4475 //*****************************************************************************************************
   4476 int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
   4477 _NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
   4478 {
   4479     __m128i b16;
   4480     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   4481     return _mm_sub_epi16 (a, b16);
   4482 }
   4483 
   4484 int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
   4485 _NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
   4486 {
   4487     __m128i b32;
   4488     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
   4489     return _mm_sub_epi32 (a, b32);
   4490 }
   4491 
   4492 int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
   4493 _NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
   4494 {
   4495     __m128i b64;
   4496     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
   4497     return _mm_sub_epi64 (a, b64);
   4498 }
   4499 
   4500 uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
   4501 _NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
   4502 {
   4503     __m128i b16;
   4504     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
   4505     return _mm_sub_epi16 (a, b16);
   4506 }
   4507 
   4508 uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
   4509 _NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
   4510 {
   4511     __m128i b32;
   4512     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
   4513     return _mm_sub_epi32 (a, b32);
   4514 }
   4515 
   4516 uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
   4517 _NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
   4518 {
   4519     __m128i b64;
   4520     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
   4521     return _mm_sub_epi64 (a, b64);
   4522 }
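
//vsubw_xx only widens the second operand, which makes it convenient when a wide accumulator already
//exists.  A small sketch (the helper is hypothetical, shown only to illustrate the call pattern):
_NEON2SSE_INLINE uint16x8_t neon2sse_example_vsubw_u8(uint16x8_t acc, uint8x8_t d0, uint8x8_t d1)
{
    acc = vsubw_u8(acc, d0); //acc[i] -= (uint16_t)d0[i]
    acc = vsubw_u8(acc, d1); //acc[i] -= (uint16_t)d1[i]
    return acc;
}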
   4523 
   4524 //************************Vector saturating subtract *********************************
   4525 //*************************************************************************************
   4526 int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
   4527 _NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
   4528 {
   4529     int8x8_t res64;
   4530     return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
   4531 }
   4532 
   4533 
   4534 int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
   4535 _NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
   4536 {
   4537     int16x4_t res64;
   4538     return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
   4539 }
   4540 
   4541 
   4542 int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
   4543 _NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
   4544 {
   4545     int32x2_t res64;
   4546     return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
   4547 }
   4548 
   4549 
   4550 int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
    4551 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
   4552 {
   4553     uint64x1_t res;
   4554     uint64_t a64,b64;
   4555     a64 = a.m64_u64[0];
   4556     b64 = b.m64_u64[0];
   4557     res.m64_u64[0] = a64 - b64;
   4558 
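    //the reassigned a64 below becomes the saturation value (INT64_MAX for a >= 0, INT64_MIN for a < 0);
    //its sign bit still equals the sign bit of the original a, so the usual overflow test keeps working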
   4559     a64 =  (a64 >> 63) + (~_SIGNBIT64);
   4560     if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
   4561         res.m64_u64[0] = a64;
   4562     }
   4563     return res;
   4564 }
   4565 
   4566 uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
   4567 _NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
   4568 {
   4569     uint8x8_t res64;
   4570     return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
   4571 }
   4572 
   4573 
   4574 uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
   4575 _NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
   4576 {
   4577     uint16x4_t res64;
   4578     return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
   4579 }
   4580 
   4581 
   4582 uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
   4583 _NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
   4584 {
   4585     uint32x2_t res64;
   4586     return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
   4587 }
   4588 
   4589 
   4590 uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
   4591 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   4592 {
   4593     uint64x1_t res;
   4594     uint64_t a64, b64;
   4595     a64 = _Ui64(a);
   4596     b64 = _Ui64(b);
   4597     if (a64 > b64) {
   4598         res.m64_u64[0] = a64 - b64;
   4599     } else {
   4600         res.m64_u64[0] = 0;
   4601     }
   4602     return res;
   4603 }
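
//Unsigned saturating subtraction clamps at zero instead of wrapping.  A quick illustration
//(the helper name is made up and not part of the original header):
_NEON2SSE_INLINE uint8x8_t neon2sse_example_vqsub_u8(void)
{
    uint8x8_t a, b;
    a.m64_u64[0] = 0x0000000000000001ULL; //lane 0 is 1
    b.m64_u64[0] = 0x0000000000000002ULL; //lane 0 is 2
    return vqsub_u8(a, b); //lane 0 is 0 (saturated), not 255 as a wrapping subtraction would give
}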
   4604 
   4605 int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
   4606 #define vqsubq_s8 _mm_subs_epi8
   4607 
   4608 int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
   4609 #define vqsubq_s16 _mm_subs_epi16
   4610 
   4611 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
   4612 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
   4613 {
    4614     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the result has the opposite sign to a
   4615     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
   4616     c7fffffff = _mm_set1_epi32(0x7fffffff);
   4617     res = _mm_sub_epi32(a, b);
   4618     res_sat = _mm_srli_epi32(a, 31);
   4619     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   4620     res_xor_a = _mm_xor_si128(res, a);
   4621     b_xor_a = _mm_xor_si128(b, a);
   4622     res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
    4623     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
   4624     res_sat = _mm_and_si128(res_xor_a, res_sat);
   4625     res = _mm_andnot_si128(res_xor_a, res);
   4626     return _mm_or_si128(res, res_sat);
   4627 }
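
//A quick check of the rule above (hypothetical helper, for illustration only): subtracting 1 from
//INT32_MIN must stay saturated at INT32_MIN instead of wrapping around to INT32_MAX.
_NEON2SSE_INLINE int32x4_t neon2sse_example_vqsubq_s32(void)
{
    __m128i min32 = _mm_set1_epi32(0x80000000); //INT32_MIN in every lane
    __m128i one = _mm_set1_epi32(1);
    return vqsubq_s32(min32, one); //every lane remains 0x80000000
}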
   4628 
   4629 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
    4630 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
   4631 {
   4632     _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
   4633     _NEON2SSE_ALIGN_16 uint64_t res[2];
   4634     _mm_store_si128((__m128i*)atmp, a);
   4635     _mm_store_si128((__m128i*)btmp, b);
   4636     res[0] = atmp[0] - btmp[0];
   4637     res[1] = atmp[1] - btmp[1];
   4638     if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
   4639         res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
   4640     }
   4641     if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
   4642         res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
   4643     }
   4644     return _mm_load_si128((__m128i*)res);
   4645 }
   4646 
   4647 uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
   4648 #define vqsubq_u8 _mm_subs_epu8
   4649 
   4650 uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
   4651 #define vqsubq_u16 _mm_subs_epu16
   4652 
   4653 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
   4654 _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
   4655 {
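    //min(a, b) equals b exactly in the lanes where no underflow occurs, so the mask below keeps a - b
    //in those lanes and forces the underflowing lanes to zero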
   4656     __m128i min, mask, sub;
   4657     min = _MM_MIN_EPU32(a, b); //SSE4.1
   4658     mask = _mm_cmpeq_epi32 (min,  b);
   4659     sub = _mm_sub_epi32 (a, b);
   4660     return _mm_and_si128 ( sub, mask);
   4661 }
   4662 
    4663 uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
   4664 #ifdef USE_SSE4
   4665     _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
   4666     {
   4667         __m128i c80000000, subb, suba, cmp, sub;
   4668         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
   4669         sub  = _mm_sub_epi64 (a, b);
   4670         suba = _mm_sub_epi64 (a, c80000000);
   4671         subb = _mm_sub_epi64 (b, c80000000);
   4672         cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
   4673         return _mm_and_si128 (sub, cmp); //saturation
   4674     }
   4675 #else
   4676     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   4677     {
   4678         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   4679         _mm_store_si128((__m128i*)atmp, a);
   4680         _mm_store_si128((__m128i*)btmp, b);
   4681         res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
   4682         res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
   4683         return _mm_load_si128((__m128i*)(res));
   4684     }
   4685 #endif
   4686 
   4687 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
   4688 //****************************************************************
   4689 int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
   4690 _NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
   4691 {
    4692     //no 8 bit arithmetic shift available and the 8-bit difference may overflow, so widen to 16 bits first
    4693     int8x8_t res64;
    4694     __m128i a16, b16, r16;
    4695     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
    4696     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE 4.1
    4697     r16 = _mm_sub_epi16 (a16, b16); //full-precision difference
    4698     r16 = _mm_srai_epi16 (r16, 1); //SSE2
    4699     r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
    4700     return64(r16);
   4701 }
   4702 
   4703 int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
   4704 _NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
   4705 {
   4706     int16x4_t res64;
   4707     return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
   4708 }
   4709 
   4710 
   4711 
   4712 int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
   4713 _NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
   4714 {
   4715     int32x2_t res64;
   4716     return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
   4717 }
   4718 
   4719 
   4720 uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
   4721 _NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
   4722 {
   4723     uint8x8_t res64;
   4724     return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
   4725 }
   4726 
   4727 uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
   4728 _NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
   4729 {
   4730     uint16x4_t res64;
   4731     return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
   4732 }
   4733 
   4734 uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
   4735 _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
   4736 {
   4737     uint32x2_t res64;
   4738     return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
   4739 }
   4740 
   4741 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
   4742 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
   4743 {
    4744     //need to deal with the possibility of internal overflow
   4745     __m128i c128, au,bu;
   4746     c128 = _mm_set1_epi8 (128);
   4747     au = _mm_add_epi8( a, c128);
   4748     bu = _mm_add_epi8( b, c128);
   4749     return vhsubq_u8(au,bu);
   4750 }
   4751 
   4752 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
   4753 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
   4754 {
   4755     //need to deal with the possibility of internal overflow
   4756     __m128i c8000, au,bu;
   4757     c8000 = _mm_set1_epi16(0x8000);
   4758     au = _mm_add_epi16( a, c8000);
   4759     bu = _mm_add_epi16( b, c8000);
   4760     return vhsubq_u16(au,bu);
   4761 }
   4762 
   4763 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
   4764 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
   4765 {
   4766     //need to deal with the possibility of internal overflow
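    //(a >> 1) - (b >> 1) differs from (a - b) >> 1 by one exactly when a is even and b is odd (the
    //borrow from the discarded low bits); the (~a & b) & 1 term below restores that borrow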
   4767     __m128i a2, b2,r, b_1;
   4768     a2 = _mm_srai_epi32 (a,1);
   4769     b2 = _mm_srai_epi32 (b,1);
   4770     r = _mm_sub_epi32 (a2, b2);
   4771     b_1 = _mm_andnot_si128(a, b); //!a and b
   4772     b_1 = _mm_slli_epi32 (b_1,31);
   4773     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
   4774     return _mm_sub_epi32(r,b_1);
   4775 }
   4776 
   4777 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
   4778 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
   4779 {
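    //_mm_avg_epu8 returns (a + b + 1) >> 1 without intermediate overflow, and a - ((a + b + 1) >> 1)
    //equals (a - b) >> 1 truncated, so one unsigned average plus one subtraction does the whole job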
   4780     __m128i avg;
   4781     avg = _mm_avg_epu8 (a, b);
   4782     return _mm_sub_epi8(a, avg);
   4783 }
   4784 
   4785 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
   4786 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
   4787 {
   4788     __m128i avg;
   4789     avg = _mm_avg_epu16 (a, b);
   4790     return _mm_sub_epi16(a, avg);
   4791 }
   4792 
   4793 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
   4794 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
   4795 {
   4796     //need to deal with the possibility of internal overflow
   4797     __m128i a2, b2,r, b_1;
   4798     a2 = _mm_srli_epi32 (a,1);
   4799     b2 = _mm_srli_epi32 (b,1);
   4800     r = _mm_sub_epi32 (a2, b2);
   4801     b_1 = _mm_andnot_si128(a, b); //!a and b
   4802     b_1 = _mm_slli_epi32 (b_1,31);
   4803     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
   4804     return _mm_sub_epi32(r,b_1);
   4805 }
   4806 
   4807 //******* Vector subtract high half (truncated) ** ************
   4808 //************************************************************
   4809 int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
   4810 _NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
   4811 {
   4812     int8x8_t res64;
   4813     __m128i sum, sum8;
   4814     sum = _mm_sub_epi16 (a, b);
   4815     sum8 = _mm_srai_epi16 (sum, 8);
   4816     sum8 = _mm_packs_epi16(sum8,sum8);
   4817     return64(sum8);
   4818 }
   4819 
   4820 int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
   4821 _NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
   4822 {
   4823     int16x4_t res64;
   4824     __m128i sum, sum16;
   4825     sum = _mm_sub_epi32 (a, b);
   4826     sum16 = _mm_srai_epi32 (sum, 16);
   4827     sum16 = _mm_packs_epi32(sum16,sum16);
   4828     return64(sum16);
   4829 }
   4830 
   4831 int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
   4832 _NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
   4833 {
   4834     int32x2_t res64;
   4835     __m128i sub;
   4836     sub = _mm_sub_epi64 (a, b);
   4837     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
   4838     return64(sub);
   4839 }
   4840 
   4841 uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
   4842 _NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
   4843 {
   4844     uint8x8_t res64;
   4845     __m128i sum, sum8;
   4846     sum = _mm_sub_epi16 (a, b);
   4847     sum8 = _mm_srli_epi16 (sum, 8);
   4848     sum8 =  _mm_packus_epi16(sum8,sum8);
   4849     return64(sum8);
   4850 }
   4851 
   4852 uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
   4853 _NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
   4854 {
   4855     uint16x4_t res64;
   4856     __m128i sum, sum16;
   4857     sum = _mm_sub_epi32 (a, b);
   4858     sum16 = _mm_srli_epi32 (sum, 16);
   4859     sum16 =  _MM_PACKUS1_EPI32(sum16);
   4860     return64(sum16);
   4861 }
   4862 
   4863 uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
   4864 #define vsubhn_u64 vsubhn_s64
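
//The "high half" forms return bits [15:8] (respectively [31:16] and [63:32]) of the difference.
//Illustrative sketch (the helper name is made up, not part of the original header):
_NEON2SSE_INLINE int8x8_t neon2sse_example_vsubhn_s16(void)
{
    __m128i a = _mm_set1_epi16(0x0234);
    __m128i b = _mm_setzero_si128();
    return vsubhn_s16(a, b); //a - b = 0x0234, so every byte of the 64-bit result is 0x02
}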
   4865 
   4866 //************ Vector rounding subtract high half *********************
   4867 //*********************************************************************
   4868 int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
   4869 _NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
   4870 {
   4871     int8x8_t res64;
   4872     __m128i sub, mask1;
   4873     sub = _mm_sub_epi16 (a, b);
    4874     mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
   4875     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
   4876     sub = _mm_srai_epi16 (sub, 8); //get high half
   4877     sub = _mm_add_epi16 (sub, mask1); //actual rounding
   4878     sub =  _mm_packs_epi16 (sub, sub);
   4879     return64(sub);
   4880 }
   4881 
   4882 int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
   4883 _NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
   4884 {
    4885     //SIMD may not be optimal; a serial version may be faster
   4886     int16x4_t res64;
   4887     __m128i sub, mask1;
   4888     sub = _mm_sub_epi32 (a, b);
    4889     mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
   4890     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
   4891     sub = _mm_srai_epi32 (sub, 16); //get high half
   4892     sub = _mm_add_epi32 (sub, mask1); //actual rounding
   4893     sub = _mm_packs_epi32 (sub, sub);
   4894     return64(sub);
   4895 }
   4896 
   4897 int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
   4898 _NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
   4899 {
    4900     //SIMD may not be optimal; a serial version may be faster
   4901     int32x2_t res64;
   4902     __m128i sub, mask1;
   4903     sub = _mm_sub_epi64 (a, b);
    4904     mask1 = _mm_srli_epi64(_mm_slli_epi64(sub, 32), 63); //get the 31-st bit of the difference, 1 or zero
    4905     mask1 = _mm_slli_epi64(mask1, 32); //position it as the rounding carry into the high half
    4906     sub = _mm_add_epi64 (sub, mask1); //actual high half rounding
   4907     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
   4908     return64(sub);
   4909 }
   4910 
   4911 uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
   4912 _NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
   4913 {
   4914     uint8x8_t res64;
   4915     __m128i sub, mask1;
   4916     sub = _mm_sub_epi16 (a, b);
    4917     mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
   4918     mask1 = _mm_srli_epi16(mask1, 15); //get  7-th bit 1 or zero
   4919     sub = _mm_srai_epi16 (sub, 8); //get high half
   4920     sub = _mm_add_epi16 (sub, mask1); //actual rounding
   4921     sub = _mm_packus_epi16 (sub, sub);
   4922     return64(sub);
   4923 }
   4924 
   4925 uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
   4926 _NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
   4927 {
    4928     //SIMD may not be optimal; a serial version may be faster
   4929     uint16x4_t res64;
   4930     __m128i sub, mask1;
   4931     sub = _mm_sub_epi32 (a, b);
    4932     mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
   4933     mask1 = _mm_srli_epi32(mask1,31); //get  15-th bit 1 or zero
   4934     sub = _mm_srai_epi32 (sub, 16); //get high half
   4935     sub = _mm_add_epi32 (sub, mask1); //actual rounding
   4936     sub =  _MM_PACKUS1_EPI32 (sub);
   4937     return64(sub);
   4938 }
   4939 
   4940 uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
   4941 #define vrsubhn_u64 vrsubhn_s64
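
//Rounding adds 1 << (half-width - 1) to the difference before the high half is taken, so the result
//may be one larger than the truncating vsubhn above.  Sketch (hypothetical helper, illustration only):
_NEON2SSE_INLINE int8x8_t neon2sse_example_vrsubhn_s16(void)
{
    __m128i a = _mm_set1_epi16(0x0180); //a - b = 0x0180 = 384 in every lane
    __m128i b = _mm_setzero_si128();
    return vrsubhn_s16(a, b); //(384 + 128) >> 8 = 2 per lane, while vsubhn_s16 would return 1
}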
   4942 
   4943 //*********** Vector saturating doubling multiply subtract long ********************
   4944 //************************************************************************************
   4945 int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
   4946 _NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
   4947 {
    4948     //not an optimal SIMD solution; a serial version may be faster
   4949     __m128i res32, mask;
   4950     int32x4_t res;
   4951     _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4952     res = vmull_s16(b,  c);
   4953     res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
   4954     mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
   4955     res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
   4956     return vqsubq_s32(a, res32); //saturation
   4957 }
   4958 
   4959 int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
   4960 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   4961 {
   4962     __m128i res64, mask;
   4963     int64x2_t res;
   4964     _NEON2SSE_ALIGN_16 uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
   4965     res = vmull_s32(b,  c);
   4966     res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
   4967     mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
    4968     res64 = _mm_xor_si128 (res64,  mask); //res64 saturated for 0x8000000000000000
   4969     return vqsubq_s64(a, res64); //saturation
   4970 }
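
//vqdmlsl computes a[i] - saturate(2 * b[i] * c[i]) with both the doubling and the final subtraction
//saturating.  Sketch of the corner case handled above (hypothetical helper, illustration only):
_NEON2SSE_INLINE int32x4_t neon2sse_example_vqdmlsl_s16(int32x4_t acc)
{
    int16x4_t b, c;
    b.m64_u64[0] = 0x8000800080008000ULL; //every 16-bit lane is -32768
    c.m64_u64[0] = 0x8000800080008000ULL; //every 16-bit lane is -32768
    //2 * (-32768) * (-32768) saturates to 0x7fffffff, so each lane becomes acc[i] minus 0x7fffffff, saturated
    return vqdmlsl_s16(acc, b, c);
}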
   4971 
   4972 //******************  COMPARISON ***************************************
   4973 //******************* Vector compare equal *************************************
   4974 //****************************************************************************
   4975 uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
    4976 _NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
   4977 {
   4978     int8x8_t res64;
   4979     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
   4980 }
   4981 
   4982 
   4983 uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
    4984 _NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
   4985 {
   4986     int16x4_t res64;
   4987     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
   4988 }
   4989 
   4990 
   4991 uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
    4992 _NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
   4993 {
   4994     int32x2_t res64;
   4995     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
   4996 }
   4997 
   4998 
   4999 uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
   5000 _NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
   5001 {
   5002     uint32x2_t res64;
   5003     __m128 res;
   5004     res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
   5005     return64f(res);
   5006 }
   5007 
   5008 uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
   5009 _NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
   5010 {
   5011     uint8x8_t res64;
   5012     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
   5013 }
   5014 
   5015 
   5016 uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
   5017 _NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
   5018 {
   5019     uint16x4_t res64;
   5020     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
   5021 }
   5022 
   5023 
   5024 uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
   5025 _NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
   5026 {
   5027     uint32x2_t res64;
   5028     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
   5029 }
   5030 
   5031 
   5032 uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
   5033 #define vceq_p8 vceq_u8
   5034 
   5035 
   5036 uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
   5037 #define vceqq_s8 _mm_cmpeq_epi8
   5038 
   5039 uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
   5040 #define vceqq_s16 _mm_cmpeq_epi16
   5041 
   5042 uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
   5043 #define vceqq_s32 _mm_cmpeq_epi32
   5044 
   5045 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
   5046 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
   5047 {
   5048     __m128 res;
   5049     res = _mm_cmpeq_ps(a,b);
   5050     return _M128i(res);
   5051 }
   5052 
   5053 uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
   5054 #define vceqq_u8 _mm_cmpeq_epi8
   5055 
   5056 uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
   5057 #define vceqq_u16 _mm_cmpeq_epi16
   5058 
   5059 uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
   5060 #define vceqq_u32 _mm_cmpeq_epi32
   5061 
   5062 uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
   5063 #define vceqq_p8 _mm_cmpeq_epi8
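
//All these compare intrinsics return full-width per-lane masks (all ones or all zeros), so they feed
//the usual SSE select idiom directly.  Sketch (hypothetical helper): r[i] = (a[i] == b[i]) ? x[i] : y[i]
_NEON2SSE_INLINE int32x4_t neon2sse_example_select_eq(int32x4_t a, int32x4_t b, int32x4_t x, int32x4_t y)
{
    __m128i mask = vceqq_s32(a, b);
    return _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, y));
}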
   5064 
   5065 //******************Vector compare greater-than or equal*************************
   5066 //*******************************************************************************
    5067 //IA SIMD has no greater-than-or-equal comparison for integers,
    5068 //only greater-than is available, so we need the following tricks
   5069 
   5070 uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
    5071 _NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
   5072 {
   5073     int8x8_t res64;
   5074     return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
   5075 }
   5076 
   5077 
   5078 uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
    5079 _NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
   5080 {
   5081     int16x4_t res64;
   5082     return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
   5083 }
   5084 
   5085 
   5086 uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
    5087 _NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
   5088 {
   5089     int32x2_t res64;
   5090     return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
   5091 }
   5092 
   5093 
   5094 uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
   5095 _NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
   5096 {
   5097     uint32x2_t res64;
   5098     __m128 res;
   5099     res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries
   5100     return64f(res);
   5101 }
   5102 
   5103 uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
   5104 _NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
   5105 {
   5106     uint8x8_t res64;
   5107     return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
   5108 }
   5109 
   5110 
   5111 uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
   5112 _NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
   5113 {
   5114     uint16x4_t res64;
   5115     return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
   5116 }
   5117 
   5118 
   5119 uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
   5120 _NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
   5121 {
   5122     //serial solution looks faster
   5123     uint32x2_t res64;
   5124     return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
   5125 }
   5126 
   5127 
   5128 
   5129 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
   5130 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
   5131 {
   5132     __m128i m1, m2;
   5133     m1 = _mm_cmpgt_epi8 ( a, b);
   5134     m2 = _mm_cmpeq_epi8 ( a, b);
   5135     return _mm_or_si128  ( m1, m2);
   5136 }
   5137 
   5138 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
   5139 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
   5140 {
   5141     __m128i m1, m2;
   5142     m1 = _mm_cmpgt_epi16 ( a, b);
   5143     m2 = _mm_cmpeq_epi16 ( a, b);
   5144     return _mm_or_si128   ( m1,m2);
   5145 }
   5146 
   5147 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
   5148 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
   5149 {
   5150     __m128i m1, m2;
   5151     m1 = _mm_cmpgt_epi32 (a, b);
   5152     m2 = _mm_cmpeq_epi32 (a, b);
   5153     return _mm_or_si128   (m1, m2);
   5154 }
   5155 
   5156 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
   5157 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
   5158 {
   5159     __m128 res;
    5160     res = _mm_cmpge_ps(a,b);
   5161     return *(__m128i*)&res;
   5162 }
   5163 
   5164 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
   5165 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
   5166 {
    5167     //no unsigned byte comparison, only signed is available, so we need a trick
   5168     #ifdef USE_SSE4
   5169         __m128i cmp;
   5170         cmp = _mm_max_epu8(a, b);
   5171         return _mm_cmpeq_epi8(cmp, a); //a>=b
   5172     #else
   5173         __m128i c128, as, bs, m1, m2;
   5174         c128 = _mm_set1_epi8 (128);
   5175         as = _mm_sub_epi8( a, c128);
   5176         bs = _mm_sub_epi8( b, c128);
   5177         m1 = _mm_cmpgt_epi8( as, bs);
   5178         m2 = _mm_cmpeq_epi8 (as, bs);
   5179         return _mm_or_si128 ( m1,  m2);
   5180     #endif
   5181 }
   5182 
   5183 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
   5184 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
   5185 {
    5186     //no unsigned 16-bit comparison, only signed is available, so we need a trick
   5187     #ifdef USE_SSE4
   5188         __m128i cmp;
   5189         cmp = _mm_max_epu16(a, b);
   5190         return _mm_cmpeq_epi16(cmp, a); //a>=b
   5191     #else
   5192         __m128i c8000, as, bs, m1, m2;
   5193         c8000 = _mm_set1_epi16 (0x8000);
   5194         as = _mm_sub_epi16(a,c8000);
   5195         bs = _mm_sub_epi16(b,c8000);
   5196         m1 = _mm_cmpgt_epi16(as, bs);
   5197         m2 = _mm_cmpeq_epi16 (as, bs);
   5198         return _mm_or_si128 ( m1, m2);
   5199     #endif
   5200 }
   5201 
   5202 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
   5203 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
   5204 {
    5205     //no unsigned 32-bit comparison, only signed is available, so we need a trick
   5206     #ifdef USE_SSE4
   5207         __m128i cmp;
   5208         cmp = _mm_max_epu32(a, b);
   5209         return _mm_cmpeq_epi32(cmp, a); //a>=b
   5210     #else
   5211         //serial solution may be faster
   5212         __m128i c80000000, as, bs, m1, m2;
   5213         c80000000 = _mm_set1_epi32 (0x80000000);
   5214         as = _mm_sub_epi32(a,c80000000);
   5215         bs = _mm_sub_epi32(b,c80000000);
   5216         m1 = _mm_cmpgt_epi32 (as, bs);
   5217         m2 = _mm_cmpeq_epi32 (as, bs);
   5218         return _mm_or_si128 ( m1,  m2);
   5219     #endif
   5220 }
   5221 
   5222 //**********************Vector compare less-than or equal******************************
   5223 //***************************************************************************************
    5224 //IA SIMD has no less-than-or-equal comparison for integers either, so we need similar tricks
   5225 
   5226 uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
    5227 _NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
   5228 {
   5229     int8x8_t res64;
   5230     return64(vcleq_s8(_pM128i(a), _pM128i(b)));
   5231 }
   5232 
   5233 
   5234 uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
    5235 _NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
   5236 {
   5237     int16x4_t res64;
   5238     return64(vcleq_s16(_pM128i(a), _pM128i(b)));
   5239 }
   5240 
   5241 
   5242 uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
    5243 _NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
   5244 {
   5245     int32x2_t res64;
   5246     return64(vcleq_s32(_pM128i(a), _pM128i(b)));
   5247 }
   5248 
   5249 
    5250 uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
   5251 _NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
   5252 {
   5253     uint32x2_t res64;
   5254     __m128 res;
   5255     res = _mm_cmple_ps(_pM128(a),_pM128(b));
   5256     return64f(res);
   5257 }
   5258 
   5259 uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
   5260 #define vcle_u8(a,b) vcge_u8(b,a)
   5261 
   5262 
   5263 uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
   5264 #define vcle_u16(a,b) vcge_u16(b,a)
   5265 
   5266 
   5267 uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
   5268 #define vcle_u32(a,b) vcge_u32(b,a)
   5269 
   5270 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
   5271 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
   5272 {
   5273     __m128i c1, res;
   5274     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
   5275     res = _mm_cmpgt_epi8 ( a,  b);
   5276     return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
   5277 }
   5278 
   5279 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
   5280 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
   5281 {
   5282     __m128i c1, res;
   5283     c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
   5284     res = _mm_cmpgt_epi16 ( a,  b);
   5285     return _mm_andnot_si128 (res, c1);
   5286 }
   5287 
   5288 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
   5289 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
   5290 {
   5291     __m128i c1, res;
   5292     c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
   5293     res = _mm_cmpgt_epi32 ( a,  b);
   5294     return _mm_andnot_si128 (res, c1);
   5295 }
   5296 
   5297 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
   5298 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
   5299 {
   5300     __m128 res;
   5301     res = _mm_cmple_ps(a,b);
   5302     return *(__m128i*)&res;
   5303 }
   5304 
   5305 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
   5306 #ifdef USE_SSE4
   5307     _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
   5308     {
    5309         //no unsigned byte comparison in SSE, only signed is available, so we need a trick
   5310         __m128i cmp;
   5311         cmp = _mm_min_epu8(a, b);
   5312         return _mm_cmpeq_epi8(cmp, a); //a<=b
   5313     }
   5314 #else
   5315     #define vcleq_u8(a,b) vcgeq_u8(b,a)
   5316 #endif
   5317 
   5318 
   5319 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
   5320 #ifdef USE_SSE4
   5321     _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
   5322     {
    5323         //no unsigned 16-bit comparison in SSE, only signed is available, so we need a trick
   5324         __m128i cmp;
   5325         cmp = _mm_min_epu16(a, b);
   5326         return _mm_cmpeq_epi16(cmp, a); //a<=b
   5327     }
   5328 #else
   5329     #define vcleq_u16(a,b) vcgeq_u16(b,a)
   5330 #endif
   5331 
   5332 
   5333 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
   5334 #ifdef USE_SSE4
   5335     _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
   5336     {
    5337         //no unsigned 32-bit comparison in SSE, only signed is available, so we need a trick
   5338         __m128i cmp;
   5339         cmp = _mm_min_epu32(a, b);
   5340         return _mm_cmpeq_epi32(cmp, a); //a<=b
   5341     }
   5342 #else
    5343 //this solution may not be optimal compared with the serial one
   5344     #define vcleq_u32(a,b) vcgeq_u32(b,a)
   5345 #endif
   5346 
   5347 
   5348 //****** Vector compare greater-than ******************************************
   5349 //**************************************************************************
   5350 uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
    5351 _NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
   5352 {
   5353     int8x8_t res64;
   5354     return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
   5355 }
   5356 
   5357 
   5358 uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
    5359 _NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
   5360 {
   5361     int16x4_t res64;
   5362     return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
   5363 }
   5364 
   5365 
   5366 uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
    5367 _NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
   5368 {
   5369     int32x2_t res64;
   5370     return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
   5371 }
   5372 
   5373 
   5374 uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
   5375 _NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
   5376 {
   5377     uint32x2_t res64;
   5378     __m128 res;
   5379     res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries
   5380     return64f(res);
   5381 }
   5382 
   5383 uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
   5384 _NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
   5385 {
   5386     uint8x8_t res64;
   5387     return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
   5388 }
   5389 
   5390 
   5391 uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
   5392 _NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
   5393 {
   5394     uint16x4_t res64;
   5395     return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
   5396 }
   5397 
   5398 
   5399 uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
   5400 _NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
   5401 {
   5402     uint32x2_t res64;
   5403     return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
   5404 }
   5405 
   5406 
   5407 uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
   5408 #define vcgtq_s8 _mm_cmpgt_epi8
   5409 
   5410 uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
   5411 #define vcgtq_s16 _mm_cmpgt_epi16
   5412 
   5413 uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
   5414 #define vcgtq_s32 _mm_cmpgt_epi32
   5415 
   5416 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
   5417 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
   5418 {
   5419     __m128 res;
    5420     res = _mm_cmpgt_ps(a,b);
   5421     return *(__m128i*)&res;
   5422 }
   5423 
   5424 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
   5425 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
   5426 {
    5427     //no unsigned byte comparison, only signed is available, so we need a trick
   5428     __m128i c128, as, bs;
   5429     c128 = _mm_set1_epi8 (128);
   5430     as = _mm_sub_epi8(a,c128);
   5431     bs = _mm_sub_epi8(b,c128);
   5432     return _mm_cmpgt_epi8 (as, bs);
   5433 }
   5434 
   5435 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
   5436 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
   5437 {
    5438     //no unsigned 16-bit comparison, only signed is available, so we need a trick
   5439     __m128i c8000, as, bs;
   5440     c8000 = _mm_set1_epi16 (0x8000);
   5441     as = _mm_sub_epi16(a,c8000);
   5442     bs = _mm_sub_epi16(b,c8000);
   5443     return _mm_cmpgt_epi16 ( as, bs);
   5444 }
   5445 
   5446 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
   5447 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
   5448 {
    5449     //no unsigned 32-bit comparison, only signed is available, so we need a trick
   5450     __m128i c80000000, as, bs;
   5451     c80000000 = _mm_set1_epi32 (0x80000000);
   5452     as = _mm_sub_epi32(a,c80000000);
   5453     bs = _mm_sub_epi32(b,c80000000);
   5454     return _mm_cmpgt_epi32 ( as, bs);
   5455 }
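
//Subtracting 0x80... from both operands maps unsigned order onto signed order, which is why the signed
//_mm_cmpgt_* compares above give the unsigned result.  A quick check (hypothetical helper):
_NEON2SSE_INLINE uint32x4_t neon2sse_example_vcgtq_u32(void)
{
    __m128i a = _mm_set1_epi32(0xffffffff); //4294967295 as unsigned
    __m128i b = _mm_set1_epi32(1);
    return vcgtq_u32(a, b); //all ones: 0xffffffff > 1 as unsigned, although it is -1 when viewed as signed
}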
   5456 
   5457 //********************* Vector compare less-than **************************
   5458 //*************************************************************************
   5459 uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
   5460 #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
   5461 
   5462 
   5463 uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
   5464 #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
   5465 
   5466 
   5467 uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
   5468 #define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
   5469 
   5470 
   5471 uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
   5472 #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
   5473 
   5474 uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
   5475 #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
   5476 
   5477 uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
   5478 #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
   5479 
   5480 uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
   5481 #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
   5482 
   5483 uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
   5484 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
   5485 
   5486 uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
   5487 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
   5488 
   5489 uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
   5490 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
   5491 
   5492 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
   5493 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
   5494 
   5495 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
   5496 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
   5497 
   5498 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
   5499 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
   5500 
   5501 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
   5502 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
   5503 
   5504 //*****************Vector compare absolute greater-than or equal ************
   5505 //***************************************************************************
   5506 uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
   5507 _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
   5508 {
   5509     uint32x2_t res64;
   5510     __m128i c7fffffff;
   5511     __m128 a0, b0;
   5512     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5513     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5514     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5515     a0 = _mm_cmpge_ps ( a0, b0);
   5516     return64f(a0);
   5517 }
   5518 
   5519 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
   5520 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
   5521 {
   5522     __m128i c7fffffff;
   5523     __m128 a0, b0;
   5524     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5525     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5526     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5527     a0 = _mm_cmpge_ps ( a0, b0);
   5528     return (*(__m128i*)&a0);
   5529 }
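
//The absolute compares clear the sign bits (AND with 0x7fffffff) before comparing, i.e. they compare
//|a| against |b|.  Sketch (hypothetical helper, illustration only):
_NEON2SSE_INLINE uint32x4_t neon2sse_example_vcageq_f32(void)
{
    __m128 a = _mm_set1_ps(-3.0f);
    __m128 b = _mm_set1_ps(2.0f);
    return vcageq_f32(a, b); //all ones: |-3.0| >= |2.0| even though -3.0 < 2.0
}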
   5530 
   5531 //********Vector compare absolute less-than or equal ******************
   5532 //********************************************************************
   5533 uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
   5534 _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
   5535 {
   5536     uint32x2_t res64;
   5537     __m128i c7fffffff;
   5538     __m128 a0, b0;
   5539     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5540     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5541     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5542     a0 = _mm_cmple_ps (a0, b0);
   5543     return64f(a0);
   5544 }
   5545 
   5546 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
   5547 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
   5548 {
   5549     __m128i c7fffffff;
   5550     __m128 a0, b0;
   5551     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5552     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5553     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5554     a0 = _mm_cmple_ps (a0, b0);
   5555     return (*(__m128i*)&a0);
   5556 }
   5557 
   5558 //********  Vector compare absolute greater-than    ******************
   5559 //******************************************************************
   5560 uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
   5561 _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
   5562 {
   5563     uint32x2_t res64;
   5564     __m128i c7fffffff;
   5565     __m128 a0, b0;
   5566     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5567     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5568     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5569     a0 = _mm_cmpgt_ps (a0, b0);
   5570     return64f(a0);
   5571 }
   5572 
   5573 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
   5574 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
   5575 {
   5576     __m128i c7fffffff;
   5577     __m128 a0, b0;
   5578     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5579     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5580     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5581     a0 = _mm_cmpgt_ps (a0, b0);
   5582     return (*(__m128i*)&a0);
   5583 }
   5584 
   5585 //***************Vector compare absolute less-than  ***********************
   5586 //*************************************************************************
   5587 uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
   5588 _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
   5589 {
   5590     uint32x2_t res64;
   5591     __m128i c7fffffff;
   5592     __m128 a0, b0;
   5593     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5594     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5595     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5596     a0 = _mm_cmplt_ps (a0, b0);
   5597     return64f(a0);
   5598 }
   5599 
   5600 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
   5601 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
   5602 {
   5603     __m128i c7fffffff;
   5604     __m128 a0, b0;
   5605     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5606     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5607     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5608     a0 = _mm_cmplt_ps (a0, b0);
   5609     return (*(__m128i*)&a0);
   5610 }
   5611 
   5612 //*************************Vector test bits************************************
   5613 //*****************************************************************************
   5614 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
   5615 with the corresponding element of a second vector. If the result is not zero, the
   5616 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
   5617 all zeros. */
   5618 
   5619 uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
   5620 _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
   5621 {
   5622     int8x8_t res64;
   5623     return64(vtstq_s8(_pM128i(a), _pM128i(b)));
   5624 }
   5625 
   5626 
   5627 uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
   5628 _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
   5629 {
   5630     int16x4_t res64;
   5631     return64(vtstq_s16(_pM128i(a), _pM128i(b)));
   5632 }
   5633 
   5634 
   5635 uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
   5636 _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
   5637 {
   5638     int32x2_t res64;
   5639     return64(vtstq_s32(_pM128i(a), _pM128i(b)));
   5640 }
   5641 
   5642 
   5643 uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
   5644 #define vtst_u8 vtst_s8
   5645 
   5646 uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
   5647 #define vtst_u16 vtst_s16
   5648 
   5649 uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
   5650 #define vtst_u32 vtst_s32
   5651 
   5652 
   5653 uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
   5654 #define vtst_p8 vtst_u8
   5655 
   5656 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
   5657 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
   5658 {
   5659     __m128i zero, one, res;
   5660     zero = _mm_setzero_si128 ();
   5661     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
   5662     res = _mm_and_si128 (a, b);
   5663     res =  _mm_cmpeq_epi8 (res, zero);
   5664     return _mm_xor_si128(res, one); //invert result
   5665 }
   5666 
   5667 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
   5668 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
   5669 {
   5670     __m128i zero, one, res;
   5671     zero = _mm_setzero_si128 ();
   5672     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
   5673     res = _mm_and_si128 (a, b);
   5674     res =  _mm_cmpeq_epi16 (res, zero);
   5675     return _mm_xor_si128(res, one); //invert result
   5676 }
   5677 
   5678 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
   5679 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
   5680 {
   5681     __m128i zero, one, res;
   5682     zero = _mm_setzero_si128 ();
   5683     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
   5684     res = _mm_and_si128 (a, b);
   5685     res =  _mm_cmpeq_epi32 (res, zero);
   5686     return _mm_xor_si128(res, one); //invert result
   5687 }
   5688 
   5689 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
   5690 #define vtstq_u8 vtstq_s8
   5691 
   5692 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
   5693 #define vtstq_u16 vtstq_s16
   5694 
   5695 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
   5696 #define vtstq_u32 vtstq_s32
   5697 
   5698 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
   5699 #define vtstq_p8 vtstq_u8
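
//vtst yields an all-ones lane whenever (a & b) has at least one bit set, which makes it a convenient
//"is this flag present" mask.  Sketch (hypothetical helper, illustration only):
_NEON2SSE_INLINE uint32x4_t neon2sse_example_vtstq_u32(uint32x4_t flags)
{
    __m128i bit0 = _mm_set1_epi32(1);
    return vtstq_u32(flags, bit0); //0xffffffff in the lanes where bit 0 of flags is set, 0 elsewhere
}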
   5700 
   5701 //****************** Absolute difference ********************
   5702 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
   5703 //************************************************************
   5704 int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
   5705 _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
   5706 {
   5707     int8x8_t res64;
   5708     return64(vabdq_s8(_pM128i(a), _pM128i(b)));
   5709 }
   5710 
   5711 int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
   5712 _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
   5713 {
   5714     int16x4_t res64;
   5715     return64(vabdq_s16(_pM128i(a), _pM128i(b)));
   5716 }
   5717 
   5718 int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
   5719 _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
   5720 {
   5721     int32x2_t res64;
   5722     return64(vabdq_s32(_pM128i(a), _pM128i(b)));
   5723 }
   5724 
   5725 uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
   5726 _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
   5727 {
   5728     uint8x8_t res64;
   5729     return64(vabdq_u8(_pM128i(a), _pM128i(b)));
   5730 }
   5731 
   5732 uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
   5733 _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
   5734 {
   5735     uint16x4_t res64;
   5736     return64(vabdq_u16(_pM128i(a), _pM128i(b)));
   5737 }
   5738 
   5739 uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
   5740 _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
   5741 {
   5742     uint32x2_t res64;
   5743     return64(vabdq_u32(_pM128i(a), _pM128i(b)));
   5744 }
   5745 
   5746 float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
   5747 _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
   5748 {
   5749     float32x4_t res;
   5750     __m64_128 res64;
   5751     res = vabdq_f32(_pM128(a), _pM128(b));
   5752     _M64f(res64, res);
   5753     return res64;
   5754 }
   5755 
   5756 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
   5757 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
   5758 {
   5759     __m128i res;
   5760     res = _mm_sub_epi8 (a, b);
   5761     return _mm_abs_epi8 (res);
   5762 }
   5763 
   5764 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
   5765 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
   5766 {
   5767     __m128i res;
   5768     res = _mm_sub_epi16 (a,b);
   5769     return _mm_abs_epi16 (res);
   5770 }
   5771 
   5772 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
   5773 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
   5774 {
   5775     __m128i res;
   5776     res = _mm_sub_epi32 (a,b);
   5777     return _mm_abs_epi32 (res);
   5778 }
   5779 
   5780 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
   5781 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
   5782 {
   5783     __m128i cmp, difab, difba;
   5784     cmp = vcgtq_u8(a,b);
   5785     difab = _mm_sub_epi8(a,b);
   5786     difba = _mm_sub_epi8 (b,a);
   5787     difab = _mm_and_si128(cmp, difab);
   5788     difba = _mm_andnot_si128(cmp, difba);
   5789     return _mm_or_si128(difab, difba);
   5790 }
   5791 
   5792 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
   5793 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
   5794 {
   5795     __m128i cmp, difab, difba;
   5796     cmp = vcgtq_u16(a,b);
   5797     difab = _mm_sub_epi16(a,b);
   5798     difba = _mm_sub_epi16 (b,a);
   5799     difab = _mm_and_si128(cmp, difab);
   5800     difba = _mm_andnot_si128(cmp, difba);
   5801     return _mm_or_si128(difab, difba);
   5802 }
   5803 
   5804 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
   5805 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
   5806 {
   5807     __m128i cmp, difab, difba;
   5808     cmp = vcgtq_u32(a,b);
   5809     difab = _mm_sub_epi32(a,b);
   5810     difba = _mm_sub_epi32 (b,a);
   5811     difab = _mm_and_si128(cmp, difab);
   5812     difba = _mm_andnot_si128(cmp, difba);
   5813     return _mm_or_si128(difab, difba);
   5814 }
   5815 
   5816 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
   5817 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
   5818 {
   5819     __m128i c1;
   5820     __m128 res;
   5821     c1 =  _mm_set1_epi32(0x7fffffff);
   5822     res = _mm_sub_ps (a, b);
   5823     return _mm_and_ps (res, *(__m128*)&c1);
   5824 }
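
//For floats, |a - b| is obtained by clearing the sign bit of the difference.  Sketch (hypothetical
//helper, illustration only):
_NEON2SSE_INLINE float32x4_t neon2sse_example_vabdq_f32(void)
{
    __m128 a = _mm_set1_ps(1.5f);
    __m128 b = _mm_set1_ps(4.0f);
    return vabdq_f32(a, b); //2.5f in every lane
}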
   5825 
   5826 //************  Absolute difference - long **************************
   5827 //********************************************************************
   5828 int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
   5829 _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
   5830 {
   5831     __m128i a16, b16;
   5832     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
   5833     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   5834     return vabdq_s16(a16, b16);
   5835 
   5836 }
   5837 
   5838 int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
   5839 _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
   5840 {
   5841     __m128i a32, b32;
   5842     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
   5843     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
   5844     return vabdq_s32(a32, b32);
   5845 }
   5846 
   5847 int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
   5848 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
   5849 {
   5850     //no optimal SIMD solution, serial looks faster
   5851     _NEON2SSE_ALIGN_16 int64_t res[2];
   5852     if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
   5853     else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
   5854     if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
   5855     else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
   5856     return _mm_load_si128((__m128i*)res);
   5857 }
   5858 
   5859 uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
   5860 _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
   5861 {
   5862     __m128i res;
   5863     res = vsubl_u8(a,b);
   5864     return _mm_abs_epi16(res);
   5865 }
   5866 
   5867 uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
   5868 _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
   5869 {
   5870     __m128i res;
   5871     res = vsubl_u16(a,b);
   5872     return _mm_abs_epi32(res);
   5873 }
   5874 
   5875 uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
   5876 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   5877 {
   5878     _NEON2SSE_ALIGN_16 uint64_t res[2];
   5879     if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
   5880     else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
   5881     if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
   5882     else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
   5883     return _mm_load_si128((__m128i*)res);
   5884 }
   5885 
   5886 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
   5887 //*********************************************************************************************
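//Usage sketch (illustrative only): a sum-of-absolute-differences style accumulation, assuming
//vdupq_n_u8 is declared elsewhere as in arm_neon.h; the 8-bit accumulator wraps rather than saturates:
//    uint8x16_t acc = vdupq_n_u8(0);                //hypothetical caller code
//    acc = vabaq_u8(acc, row_a, row_b);             //acc[i] += |row_a[i] - row_b[i]| per 8-bit lane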
   5888 int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
   5889 _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
   5890 {
   5891     int8x8_t res64;
   5892     return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
   5893 }
   5894 
   5895 int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
   5896 _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
   5897 {
   5898     int16x4_t res64;
   5899     return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
   5900 }
   5901 
   5902 int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
   5903 _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
   5904 {
   5905     int32x2_t res64;
   5906     return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
   5907 }
   5908 
   5909 uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
   5910 #define vaba_u8 vaba_s8
   5911 
   5912 
   5913 uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
   5914 #define vaba_u16 vaba_s16
   5915 
   5916 uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
   5917 _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
   5918 {
   5919     uint32x2_t res64;
   5920     return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
   5921 }
   5922 
   5923 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
   5924 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
   5925 {
   5926     int8x16_t sub;
   5927     sub = vabdq_s8(b, c);
   5928     return vaddq_s8( a, sub);
   5929 }
   5930 
   5931 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
   5932 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
   5933 {
   5934     int16x8_t sub;
   5935     sub = vabdq_s16(b, c);
   5936     return vaddq_s16( a, sub);
   5937 }
   5938 
   5939 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
   5940 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
   5941 {
   5942     int32x4_t sub;
   5943     sub = vabdq_s32(b, c);
   5944     return vaddq_s32( a, sub);
   5945 }
   5946 
   5947 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
   5948 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
   5949 {
   5950     uint8x16_t sub;
   5951     sub = vabdq_u8(b, c);
   5952     return vaddq_u8( a, sub);
   5953 }
   5954 
   5955 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
   5956 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
   5957 {
   5958     uint16x8_t sub;
   5959     sub = vabdq_u16(b, c);
   5960     return vaddq_u16( a, sub);
   5961 }
   5962 
   5963 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
   5964 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
   5965 {
   5966     uint32x4_t sub;
   5967     sub = vabdq_u32(b, c);
   5968     return vaddq_u32( a, sub);
   5969 }
   5970 
   5971 //************** Absolute difference and accumulate - long ********************************
   5972 //*************************************************************************************
   5973 int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
   5974 _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
   5975 {
   5976     __m128i b16, c16, res;
   5977     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   5978     c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
   5979     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
   5980     return _mm_add_epi16 (a, res);
   5981 }
   5982 
   5983 int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
   5984 _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
   5985 {
   5986     __m128i b32, c32, res;
   5987     b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
   5988     c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
   5989     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
   5990     return _mm_add_epi32 (a, res);
   5991 }
   5992 
   5993 int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
   5994 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   5995 {
   5996     __m128i res;
   5997     res = vabdl_s32(b,c);
   5998     return _mm_add_epi64(a, res);
   5999 }
   6000 
   6001 uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
   6002 _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
   6003 {
   6004     __m128i b16, c16, res;
   6005     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
   6006     c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
   6007     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
   6008     return _mm_add_epi16 (a, res);
   6009 }
   6010 
   6011 uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
   6012 _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
   6013 {
   6014     __m128i b32, c32, res;
   6015     b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
   6016     c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
   6017     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
   6018     return _mm_add_epi32 (a, res);
   6019 }
   6020 
   6021 uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
   6022 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   6023 {
   6024     __m128i res;
   6025     res = vabdl_u32(b,c);
   6026     return _mm_add_epi64(a, res);
   6027 }
   6028 
   6029 //***********************************************************************************
   6030 //****************  Maximum and minimum operations **********************************
   6031 //***********************************************************************************
   6032 //************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
   6033 //***********************************************************************************
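//Worked example (illustrative only):
//    a = {1, 7, -3, 9}, b = {4, 7, -5, 2}
//    vmaxq_s32(a, b) -> {4, 7, -3, 9}               //lanes that compare equal keep the shared value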
   6034 int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
   6035 _NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
   6036 {
   6037     int8x8_t res64;
   6038     __m128i res;
   6039     res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6040     return64(res);
   6041 }
   6042 
   6043 int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
   6044 _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
   6045 {
   6046     int16x4_t res64;
   6047     return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
   6048 }
   6049 
   6050 int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
   6051 _NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
   6052 {
   6053     int32x2_t res64;
   6054     __m128i res;
   6055     res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6056     return64(res);
   6057 }
   6058 
   6059 uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
   6060 _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
   6061 {
   6062     uint8x8_t res64;
   6063     return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
   6064 }
   6065 
   6066 
   6067 uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
   6068 _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
   6069 {
   6070     uint16x4_t res64;
   6071     return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
   6072 }
   6073 
   6074 
   6075 uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
   6076 _NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
   6077 {
   6078     uint32x2_t res64;
   6079     __m128i res;
   6080     res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
   6081     return64(res);
   6082 }
   6083 
   6084 float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
   6085 _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
   6086 {
   6087     //serial solution looks faster than  SIMD one
   6088     float32x2_t res;
   6089     res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
   6090     res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
   6091     return res;
   6092 }
   6093 
   6094 int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
   6095 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
   6096 
   6097 int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
   6098 #define vmaxq_s16 _mm_max_epi16
   6099 
   6100 int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
   6101 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
   6102 
   6103 uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
   6104 #define vmaxq_u8 _mm_max_epu8
   6105 
   6106 uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
   6107 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
   6108 
   6109 uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
   6110 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
   6111 
   6112 
   6113 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
   6114 #define vmaxq_f32 _mm_max_ps
   6115 
   6116 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
   6117 //***********************************************************************************************************
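//Worked example (illustrative only):
//    a = {1, 7, -3, 9}, b = {4, 7, -5, 2}
//    vminq_s32(a, b) -> {1, 7, -5, 2}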
   6118 int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
   6119 _NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
   6120 {
   6121     int8x8_t res64;
   6122     __m128i res;
   6123     res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6124     return64(res);
   6125 }
   6126 
   6127 int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
   6128 _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
   6129 {
   6130     int16x4_t res64;
   6131     return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
   6132 }
   6133 
   6134 
   6135 int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
   6136 _NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
   6137 {
   6138     int32x2_t res64;
   6139     __m128i res;
   6140     res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6141     return64(res);
   6142 }
   6143 
   6144 uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
   6145 _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
   6146 {
   6147     uint8x8_t res64;
   6148     return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
   6149 }
   6150 
   6151 
   6152 uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
   6153 _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
   6154 {
   6155     uint16x4_t res64;
   6156     return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
   6157 }
   6158 
   6159 
   6160 uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
   6161 _NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
   6162 {
   6163     uint32x2_t res64;
   6164     __m128i res;
   6165     res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial
   6166     return64(res);
   6167 }
   6168 
   6169 float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
   6170 _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
   6171 {
   6172     //serial solution looks faster than  SIMD one
   6173     float32x2_t res;
   6174     res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
   6175     res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
   6176     return res;
   6177 }
   6178 
   6179 int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
   6180 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1
   6181 
   6182 int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
   6183 #define vminq_s16 _mm_min_epi16
   6184 
   6185 int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
   6186 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1
   6187 
   6188 uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
   6189 #define vminq_u8 _mm_min_epu8
   6190 
   6191 uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
   6192 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1
   6193 
   6194 uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
   6195 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1
   6196 
   6197 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
   6198 #define vminq_f32 _mm_min_ps
   6199 
   6200 //*************  Pairwise addition operations. **************************************
   6201 //************************************************************************************
   6202 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
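//Worked example (illustrative only): the low half of the result comes from a, the high half from b:
//    a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}
//    vpadd_s16(a, b) -> {a0+a1, a2+a3, b0+b1, b2+b3}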
   6203 int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
   6204 _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
   6205 {
   6206     //no 8 bit hadd in IA32, need to go to 16 bit and then pack
   6207     int8x8_t res64;
   6208     __m128i a16, b16, res;
   6209     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   6210     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
   6211     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
   6212     res = _mm_hadd_epi16 (a16, b16);
   6213     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
   6214     return64(res);
   6215 }
   6216 
   6217 int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
   6218 _NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
   6219 {
   6220     int16x4_t res64;
   6221     __m128i hadd128;
   6222     hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
   6223     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6224     return64(hadd128);
   6225 }
   6226 
   6227 
   6228 int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
   6229 _NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
   6230 {
   6231     int32x2_t res64;
   6232     __m128i hadd128;
   6233     hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
   6234     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6235     return64(hadd128);
   6236 }
   6237 
   6238 
   6239 uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
   6240 _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
   6241 {
   6242     //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
   6243     uint8x8_t res64;
   6244 //  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit into signed 16-bit lanes, so it works
   6245     __m128i mask8, a16, b16, res;
   6246     mask8 = _mm_set1_epi16(0xff);
   6247     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
   6248     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
   6249     res = _mm_hadd_epi16 (a16, b16);
   6250     res = _mm_and_si128(res, mask8); //to avoid saturation
   6251     res = _mm_packus_epi16 (res,res); //use low 64 bits
   6252     return64(res);
   6253 }
   6254 
   6255 uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
   6256 _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
   6257 {
   6258     // solution may not be optimal, serial execution may be faster
   6259     // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed
   6260     uint16x4_t res64;
   6261     __m128i c32767,  cfffe, as, bs, res;
   6262     c32767 = _mm_set1_epi16 (32767);
   6263     cfffe = _mm_set1_epi16 (0xfffe);
   6264     as = _mm_sub_epi16 (_pM128i(a), c32767);
   6265     bs = _mm_sub_epi16 (_pM128i(b), c32767);
   6266     res = _mm_hadd_epi16 (as, bs);
   6267     res = _mm_add_epi16 (res, cfffe);
   6268     res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6269     return64(res);
   6270 }
   6271 
   6272 uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
   6273 _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
   6274 {
   6275     //hadd doesn't work for unsigned values
   6276     uint32x2_t res64;
   6277     __m128i ab, ab_sh, res;
   6278     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
   6279     ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
   6280     res = _mm_add_epi32(ab, ab_sh);
   6281     res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6282     return64(res);
   6283 }
   6284 
   6285 float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
   6286 _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
   6287 {
   6288     __m128 hadd128;
   6289     __m64_128 res64;
   6290     hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
   6291     hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
   6292     _M64f(res64, hadd128);
   6293     return res64;
   6294 }
   6295 
   6296 
   6297 //**************************  Long pairwise add  **********************************
   6298 //*********************************************************************************
   6299 //Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
   6300 // and places the final results in the destination vector.
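//Worked example (illustrative only): each sum is produced at twice the input width, so it cannot overflow:
//    a = {250, 10, 3, 4, 200, 100, 1, 2}            //uint8x8_t
//    vpaddl_u8(a) -> {260, 7, 300, 3}               //uint16x4_t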
   6301 
   6302 int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
   6303 _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
   6304 {
   6305     //no 8 bit hadd in IA32, need to go to 16 bit anyway
   6306     __m128i a16;
   6307     int16x4_t res64;
   6308     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
   6309     a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
   6310     return64(a16);
   6311 }
   6312 
   6313 int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
   6314 _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
   6315 {
   6316     // solution may not be optimal, serial execution may be faster
   6317     int32x2_t res64;
   6318     __m128i r32_1;
   6319     r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
   6320     r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
   6321     return64(r32_1);
   6322 }
   6323 
   6324 int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
   6325 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
   6326 {
   6327     int64x1_t res;
   6328     res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
   6329     return res;
   6330 }
   6331 
   6332 uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
   6333 _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
   6334 {
   6335     //  no 8 bit hadd in IA32, need to go to 16 bit
   6336 //  no unsigned _mm_hadd_ functions in IA32, but unsigned 8-bit values fit into signed 16-bit lanes, so it works
   6337     uint16x4_t res64;
   6338     __m128i a16;
   6339     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
   6340     a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
   6341     return64(a16);
   6342 }
   6343 
   6344 uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
   6345 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
   6346 {
   6347     //serial solution looks faster than a SIMD one
   6348     uint32x2_t res;
   6349     res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
   6350     res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
   6351     return res;
   6352 }
   6353 
   6354 uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
   6355 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
   6356 {
   6357     uint64x1_t res;
   6358     res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
   6359     return res;
   6360 }
   6361 
   6362 int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
   6363 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
   6364 {
   6365     //no 8 bit hadd in IA32, need to go to 16 bit
   6366     __m128i r16_1, r16_2;
   6367     r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
   6368     //swap hi and low part of r to process the remaining data
   6369     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   6370     r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
   6371     return _mm_hadd_epi16 (r16_1, r16_2);
   6372 }
   6373 
   6374 int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
   6375 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
   6376 {
   6377     //no widening 16-bit to 32-bit hadd in IA32, need to convert to 32 bit first
   6378     __m128i r32_1, r32_2;
   6379     r32_1 = _MM_CVTEPI16_EPI32(a);
   6380     //swap hi and low part of r to process the remaining data
   6381     r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   6382     r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
   6383     return _mm_hadd_epi32 (r32_1, r32_2);
   6384 }
   6385 
   6386 int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
   6387 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
   6388 {
   6389     _NEON2SSE_ALIGN_16 int32_t atmp[4];
   6390     _NEON2SSE_ALIGN_16 int64_t res[2];
   6391     _mm_store_si128((__m128i*)atmp, a);
   6392     res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
   6393     res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
   6394     return _mm_load_si128((__m128i*)res);
   6395 }
   6396 
   6397 uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
   6398 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
   6399 {
   6400     //no 8 bit hadd in IA32, need to go to 16 bit
   6401     __m128i r16_1, r16_2;
   6402     r16_1 = _MM_CVTEPU8_EPI16(a);
   6403     //swap hi and low part of r to process the remaining data
   6404     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   6405     r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
   6406     return _mm_hadd_epi16 (r16_1, r16_2);
   6407 }
   6408 
   6409 uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
   6410 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
   6411 {
   6412     //serial solution looks faster than a SIMD one
   6413     _NEON2SSE_ALIGN_16 uint16_t atmp[8];
   6414     _NEON2SSE_ALIGN_16 uint32_t res[4];
   6415     _mm_store_si128((__m128i*)atmp, a);
   6416     res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
   6417     res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
   6418     res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
   6419     res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
   6420     return _mm_load_si128((__m128i*)res);
   6421 }
   6422 
   6423 uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
   6424 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6425 {
   6426     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
   6427     _NEON2SSE_ALIGN_16 uint64_t res[2];
   6428     _mm_store_si128((__m128i*)atmp, a);
   6429     res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
   6430     res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
   6431     return _mm_load_si128((__m128i*)res);
   6432 }
   6433 
   6434 //************************  Long pairwise add and accumulate **************************
   6435 //****************************************************************************************
   6436 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
   6437 // and accumulates the  values of the results into the elements of the destination (wide) vector
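//Usage sketch (illustrative only): widening byte sums accumulated across loop iterations,
//assuming vdupq_n_u16 is declared elsewhere as in arm_neon.h:
//    uint16x8_t acc = vdupq_n_u16(0);               //hypothetical caller code
//    acc = vpadalq_u8(acc, bytes);                  //acc[i] += bytes[2*i] + bytes[2*i+1]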
   6438 int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
   6439 _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
   6440 {
   6441     int16x4_t res64;
   6442     return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
   6443 }
   6444 
   6445 int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
   6446 _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
   6447 {
   6448     int32x2_t res64;
   6449     return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
   6450 }
   6451 
   6452 
   6453 int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
   6454 _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
   6455 {
   6456     int64x1_t res;
   6457     res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
   6458     return res;
   6459 }
   6460 
   6461 uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
   6462 _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
   6463 {
   6464     uint16x4_t res64;
   6465     return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
   6466 }
   6467 
   6468 
   6469 uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.U16 d0,d0
   6470 _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
   6471 {
   6472     uint32x2_t res64;
   6473     return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
   6474 }
   6475 
   6476 uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
   6477 _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
   6478 {
   6479     uint64x1_t res;
   6480     res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
   6481     return res;
   6482 }
   6483 
   6484 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
   6485 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
   6486 {
   6487     int16x8_t pad;
   6488     pad = vpaddlq_s8(b);
   6489     return _mm_add_epi16 (a, pad);
   6490 }
   6491 
   6492 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
   6493 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
   6494 {
   6495     int32x4_t pad;
   6496     pad = vpaddlq_s16(b);
   6497     return _mm_add_epi32(a, pad);
   6498 }
   6499 
   6500 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
   6501 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
   6502 {
   6503     int64x2_t pad;
   6504     pad = vpaddlq_s32(b);
   6505     return _mm_add_epi64 (a, pad);
   6506 }
   6507 
   6508 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
   6509 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
   6510 {
   6511     uint16x8_t pad;
   6512     pad = vpaddlq_u8(b);
   6513     return _mm_add_epi16 (a, pad);
   6514 }
   6515 
   6516 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
   6517 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6518 {
   6519     uint32x4_t pad;
   6520     pad = vpaddlq_u16(b);
   6521     return _mm_add_epi32(a, pad);
   6522 } //no optimal SIMD solution, serial is faster
   6523 
   6524 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
   6525 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6526 {
   6527     //no optimal SIMD solution, serial is faster
   6528     uint64x2_t pad;
   6529     pad = vpaddlq_u32(b);
   6530     return _mm_add_epi64(a, pad);
   6531 } //no optimal SIMD solution, serial is faster
   6532 
   6533 //**********  Folding maximum   *************************************
   6534 //*******************************************************************
   6535 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
   6536 //and copies the larger of each pair into the corresponding element in the destination
   6537 //    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
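//Emulation idea used below (sketch): concatenate a and b in one 128-bit register, build a copy with
//every adjacent pair swapped, take the vertical maximum of the two, then keep one element per pair.
//Worked example (illustrative only):
//    a = {1, 5, 2, 2}, b = {9, 3, 0, 7}
//    vpmax_s16(a, b) -> {5, 2, 9, 7}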
   6538 int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
   6539 _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
   6540 {
   6541     int8x8_t res64;
   6542     __m128i ab, ab1, max;
   6543     _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   6544     _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6545     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
   6546     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
   6547     max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
   6548     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
   6549     return64(max); //we need 64 bits only
   6550 }
   6551 
   6552 int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
   6553 _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
   6554 {
   6555     //solution may not be optimal compared with the serial one
   6556     int16x4_t res64;
   6557     __m128i ab, ab1, max;
   6558     _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
   6559     _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6560     ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
   6561     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
   6562     max = _mm_max_epi16 (ab, ab1);
   6563     max =  _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
   6564     return64(max);
   6565 }
   6566 
   6567 int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
   6568 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6569 {
   6570     //serial solution looks faster than SIMD one
   6571     int32x2_t res;
   6572     res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
   6573     res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
   6574     return res;
   6575 }
   6576 
   6577 uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
   6578 _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
   6579 {
   6580     uint8x8_t res64;
   6581     __m128i ab, ab1, max;
   6582     _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   6583     _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6584     ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
   6585     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
   6586     max = _mm_max_epu8 (ab, ab1); // SSE4.1
   6587     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
   6588     return64(max);
   6589 }
   6590 
   6591 uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
   6592 _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.U16 d0,d0,d0
   6593 {
   6594     //solution may not be optimal compared with the serial one
   6595     uint16x4_t res64;
   6596     __m128i ab, ab1, max;
   6597     _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
   6598     _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6599     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
   6600     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask
   6601     max = _MM_MAX_EPU16 (ab, ab1);
   6602     max = _mm_shuffle_epi8 (max, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
   6603     return64(max);
   6604 }
   6605 
   6606 uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
   6607 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6608 {
   6609     //serial solution looks faster than SIMD one
   6610     uint32x2_t res;
   6611     res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
   6612     res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
   6613     return res;
   6614 } //serial solution looks faster than a SIMD one
   6615 
   6616 float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
   6617 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6618 {
   6619     //serial solution looks faster than  SIMD one
   6620     float32x2_t res;
   6621     res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
   6622     res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
   6623     return res;
   6624 }
   6625 
   6626 // ***************** Folding minimum  ****************************
   6627 // **************************************************************
   6628 //vpmin -> takes minimum of adjacent pairs
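//Worked example (illustrative only):
//    a = {1, 5, 2, 2}, b = {9, 3, 0, 7}
//    vpmin_s16(a, b) -> {1, 2, 3, 0}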
   6629 int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
   6630 _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
   6631 {
   6632     int8x8_t res64;
   6633     __m128i ab, ab1, min;
   6634     _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   6635     _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6636     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
   6637     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
   6638     min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
   6639     min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
   6640     return64(min);
   6641 }
   6642 
   6643 int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
   6644 _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
   6645 {
   6646     //solution may not be optimal compared with the serial one
   6647     int16x4_t res64;
   6648     __m128i ab, ab1, min;
   6649     _NEON2SSE_ALIGN_16 int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
   6650     _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6651     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
   6652     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
   6653     min = _mm_min_epi16 (ab, ab1);
   6654     min = _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
   6655     return64(min);
   6656 }
   6657 
   6658 int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
   6659 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6660 {
   6661     //serial solution looks faster than SIMD one
   6662     int32x2_t res;
   6663     res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
   6664     res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
   6665     return res;
   6666 }
   6667 
   6668 uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
   6669 _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
   6670 {
   6671     uint8x8_t res64;
   6672     __m128i ab, ab1, min;
   6673     _NEON2SSE_ALIGN_16 uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   6674     _NEON2SSE_ALIGN_16 uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6675     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
   6676     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
   6677     min = _mm_min_epu8 (ab, ab1); // SSE4.1
   6678     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
   6679     return64(min);
   6680 }
   6681 
   6682 uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
   6683 _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.U16 d0,d0,d0
   6684 {
   6685     //solution may not be optimal compared with the serial one
   6686     uint16x4_t res64;
   6687     __m128i ab, ab1, min;
   6688     _NEON2SSE_ALIGN_16 uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16 bit number
   6689     _NEON2SSE_ALIGN_16 uint8_t mask16_odd[16] = { 0,1, 4,5, 8,9, 12,13,  0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6690     ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
   6691     ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask
   6692     min = _MM_MIN_EPU16 (ab, ab1);
   6693     min =    _mm_shuffle_epi8 (min, *(__m128i*) mask16_odd); //remove repetitive data, use 8bit fn and the corresponding mask
   6694     return64(min);
   6695 }
   6696 
   6697 uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
   6698 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6699 {
   6700     //serial solution looks faster than SIMD one
   6701     uint32x2_t res;
   6702     res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
   6703     res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
   6704     return res;
   6705 }
   6706 
   6707 float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
   6708 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6709 {
   6710     //serial solution looks faster than SIMD one
   6711     float32x2_t res;
   6712     res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
   6713     res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
   6714     return res;
   6715 }
   6716 
   6717 //***************************************************************
   6718 //***********  Reciprocal/Sqrt ************************************
   6719 //***************************************************************
   6720 //****************** Reciprocal estimate *******************************
   6721 //the ARM NEON and x86 SIMD results may be slightly different
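//Usage sketch (illustrative only): on x86 the float estimate maps to _mm_rcp_ps, whose relative error
//is bounded by about 1.5*2^-12 and is not bit-identical to the NEON estimate:
//    float32x4_t x0 = vrecpeq_f32(a);               //x0 ~ 1/a, approximate only; refine with vrecpsq_f32 below if needed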
   6722 float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
   6723 _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
   6724 {
   6725     float32x4_t res;
   6726     __m64_128 res64;
   6727     res = _mm_rcp_ps(_pM128(a));
   6728     _M64f(res64, res);
   6729     return res64;
   6730 }
   6731 
   6732 uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
   6733 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6734 {
   6735     //Input is a fixed point number! No integer reciprocal is available in IA32
   6736     uint32x2_t res;
   6737     float resf, r;
   6738     int i, q, s;
   6739     for (i =0; i<2; i++){
   6740         if((a.m64_u32[i] & 0x80000000) == 0) {
   6741             res.m64_u32[i] = 0xffffffff;
   6742         }else{
   6743             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
   6744             q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
   6745             r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
   6746             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6747             r =  (float)s / 256.0;
   6748             res.m64_u32[i] = r * (uint32_t)(1 << 31);
   6749         }
   6750     }
   6751     return res;
   6752 }
   6753 
   6754 float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
   6755 #define vrecpeq_f32 _mm_rcp_ps
   6756 
   6757 
   6758 uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
   6759 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6760 {
   6761     //Input is a fixed point number!
   6762     //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double
   6763     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
   6764     _NEON2SSE_ALIGN_16 uint32_t res[4];
   6765    _NEON2SSE_ALIGN_16 int c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
   6766     float resf, r;
   6767     int i, q, s;
   6768   __m128i res128, mask, zero;
   6769     _mm_store_si128((__m128i*)atmp, a);
   6770     zero = _mm_setzero_si128();
   6771     for (i =0; i<4; i++){
   6772         resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
   6773         q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
   6774         r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
   6775         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6776         r =  (float)s / 256.0;
   6777         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
   6778     }
   6779     res128 = _mm_load_si128((__m128i*)res);
   6780     mask = _mm_and_si128(a, *(__m128i*)c80000000);
   6781     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
   6782     return _mm_or_si128(res128, mask);
   6783 }
   6784 
   6785 //**********Reciprocal square root estimate ****************
   6786 //**********************************************************
   6787 //no reciprocal square root for ints is available in IA32, nor an unsigned int to float4 lanes conversion, so a serial solution looks faster
   6788 //but the particular implementation of vrsqrte_u32 may vary across ARM compilers
   6789 //the ARM NEON and x86 SIMD results may be slightly different
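//Usage sketch (illustrative only): the float path maps to _mm_rsqrt_ps/_mm_rsqrt_ss (roughly 12-bit accurate):
//    float32x4_t x0 = vrsqrteq_f32(a);              //x0 ~ 1/sqrt(a), approximate only; refine with vrsqrtsq_f32 below if needed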
   6790 float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
   6791 _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
   6792 {
   6793     float32x4_t res;
   6794     __m64_128 res64;
   6795     res = _mm_rsqrt_ps(_pM128(a));
   6796     _M64f(res64, res);
   6797     return res64;
   6798 }
   6799 
   6800 uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
   6801 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6802 {
   6803     //Input is a fixed point number!
   6804     //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
   6805    uint32x2_t res;
   6806    __m128 tmp;
   6807     float r, resf, coeff;
   6808     int i, q0, s;
   6809     for (i =0; i<2; i++){
   6810         if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
   6811             res.m64_u32[i] = 0xffffffff;
   6812         }else{
   6813             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
   6814             coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
   6815             q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
   6816             r = ((float)q0 + 0.5) / coeff;
   6817             tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
   6818             _mm_store_ss(&r, tmp);
   6819             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6820             r = (float)s / 256.0;
   6821             res.m64_u32[i] = r * (((uint32_t)1) << 31);
   6822         }
   6823     }
   6824     return res;
   6825 }
   6826 
   6827 float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
   6828 #define vrsqrteq_f32 _mm_rsqrt_ps
   6829 
   6830 uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
   6831 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6832 {
   6833     //Input is a fixed point number!
   6834     //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
   6835    _NEON2SSE_ALIGN_16 uint32_t  atmp[4], res[4];
   6836    _NEON2SSE_ALIGN_16 float c1_31[4] = {(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31),(float)(((uint32_t)1) << 31), (float)(((uint32_t)1) << 31)};
   6837    _NEON2SSE_ALIGN_16 int c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
   6838   __m128 tmp;
   6839   __m128i res128, mask, zero;
   6840     float r, resf, coeff;
   6841     int i, q0, s;
   6842     _mm_store_si128((__m128i*)atmp, a);
   6843     zero = _mm_setzero_si128();
   6844     for (i =0; i<4; i++){
   6845         resf =  (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
   6846         coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
   6847         q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */
   6848         r = ((float)q0 + 0.5) / coeff;
   6849         tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
   6850         _mm_store_ss(&r, tmp);
   6851         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6852         r = (float)s / 256.0;
   6853         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
   6854     }
   6855     res128 = _mm_load_si128((__m128i*)res);
   6856     mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
   6857     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x3fffffff
   6858     return _mm_or_si128(res128, mask);
   6859 }
   6860 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
   6861 //******************************************************************************************
   6862 //******VRECPS (Vector Reciprocal Step) ***************************************************
   6863 //multiplies the elements of one vector by the corresponding elements of another vector,
   6864 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
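//Usage sketch (illustrative only): the usual Newton-Raphson refinement of a reciprocal estimate,
//assuming vmulq_f32 is declared elsewhere as in arm_neon.h; each step roughly doubles the precision:
//    float32x4_t x = vrecpeq_f32(a);
//    x = vmulq_f32(x, vrecpsq_f32(a, x));           //x = x * (2 - a*x)
//    x = vmulq_f32(x, vrecpsq_f32(a, x));           //optional second step, x ~ 1/a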
   6865 
   6866 float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
   6867 _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
   6868 {
   6869     float32x4_t res;
   6870     __m64_128 res64;
   6871     res = vrecpsq_f32(_pM128(a), _pM128(b));
   6872     _M64f(res64, res);
   6873     return res64;
   6874 }
   6875 
   6876 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
   6877 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
   6878 {
   6879     __m128 f2, mul;
   6880     f2 =  _mm_set1_ps(2.);
   6881     mul = _mm_mul_ps(a,b);
   6882     return _mm_sub_ps(f2,mul);
   6883 }
   6884 
   6885 //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
   6886 //multiplies the elements of one vector by the corresponding elements of another vector,
   6887 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
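//Usage sketch (illustrative only): the matching Newton-Raphson refinement for 1/sqrt(a),
//assuming vmulq_f32 is declared elsewhere as in arm_neon.h:
//    float32x4_t x = vrsqrteq_f32(a);
//    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));   //x = x * (3 - a*x*x) / 2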
   6888 
   6889 float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
   6890 _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
   6891 {
   6892     float32x2_t res;
   6893     res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
   6894     res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
   6895     return res;
   6896 }
   6897 
   6898 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
   6899 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
   6900 {
   6901     __m128 f3, f05, mul;
   6902     f3 =  _mm_set1_ps(3.);
   6903     f05 =  _mm_set1_ps(0.5);
   6904     mul = _mm_mul_ps(a,b);
   6905     f3 = _mm_sub_ps(f3,mul);
   6906     return _mm_mul_ps (f3, f05);
   6907 }
   6908 //********************************************************************************************
   6909 //***************************** Shifts by signed variable ***********************************
   6910 //********************************************************************************************
   6911 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
   6912 //********************************************************************************************
   6913 //Unfortunately there are no per-lane variable shifts in IA32 SIMD (only shifts by a constant or by a single scalar count), so a serial solution is needed
   6914 //helper macros; they match the ARM behavior for out-of-range (large) shift counts; an illustrative example follows the macros
   6915 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
   6916         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
   6917         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   6918         for (i = 0; i<LEN; i++) { \
   6919         if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
   6920         else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
   6921         return _mm_load_si128((__m128i*)res);
   6922 
   6923 #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
   6924         int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
   6925         for (i = 0; i<LEN; i++) { \
   6926         if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
   6927         else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
   6928         return res;
   6929 
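//Worked example of the semantics implemented by the macros above (illustrative only), 16-bit lanes:
//    a = {1, -8, 1, 4}, b = {3, -2, 16, -1}
//    vshl_s16(a, b) -> {8, -2, 0, 2}                //positive count: shift left; negative: shift right; |count| >= 16: 0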
   6930 int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
   6931 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6932 {
   6933     SERIAL_SHIFT_64(8, i, 8)
   6934 }
   6935 
   6936 int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
   6937 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6938 {
   6939     SERIAL_SHIFT_64(16, i, 4)
   6940 }
   6941 
   6942 int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
   6943 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6944 {
   6945     SERIAL_SHIFT_64(32, i, 2)
   6946 }
   6947 
   6948 int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
   6949 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6950 {
   6951     SERIAL_SHIFT_64(64, i, 1)
   6952 }
   6953 
   6954 uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
   6955 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6956 {
   6957     SERIAL_SHIFT_64(8, u, 8)
   6958 }
   6959 
   6960 uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
   6961 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6962 {
   6963     SERIAL_SHIFT_64(16, u, 4)
   6964 }
   6965 
   6966 uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
   6967 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6968 {
   6969     SERIAL_SHIFT_64(32, u, 2)
   6970 }
   6971 
   6972 uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
   6973 _NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if the SERIAL_SHIFT macro were used, special processing for large shift counts would be needed
   6974 {
   6975     SERIAL_SHIFT_64(64, u, 1)
   6976 }
   6977 
   6978 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
   6979 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6980 {
   6981     SERIAL_SHIFT(int8_t, int8_t, 16, 16)
   6982 }
   6983 
   6984 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
   6985 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6986 {
   6987     SERIAL_SHIFT(int16_t, int16_t, 8, 8)
   6988 }
   6989 
   6990 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
   6991 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6992 {
   6993     SERIAL_SHIFT(int32_t, int32_t, 4, 4)
   6994 }
   6995 
   6996 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
   6997 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6998 {
   6999     SERIAL_SHIFT(int64_t, int64_t, 2, 2)
   7000 }
   7001 
   7002 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
   7003 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7004 {
   7005     SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
   7006 }
   7007 
   7008 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
   7009 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7010 {
   7011     SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
   7012 }
   7013 
   7014 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
   7015 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7016 {
   7017     SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
   7018 }
   7019 
   7020 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
   7021 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7022 {
   7023     SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
   7024 }
   7025 
   7026 
   7027 //*********** Vector saturating shift left: (negative values shift right) **********************
   7028 //********************************************************************************************
   7029 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
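//A brief illustration (sketch only, not part of the original header, assuming the vdup_n_s8
//helper from this file) of the saturating behaviour emulated by the macros below: a left
//shift that overflows the lane saturates to the type maximum instead of wrapping.
//    int8x8_t vr = vqshl_s8(vdup_n_s8(100), vdup_n_s8(2));  //100 << 2 == 400 > 127, so each lane saturates to 127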
   7030 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
   7031         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
   7032         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
   7033         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7034         for (i = 0; i<LEN; i++) { \
   7035         if (atmp[i] ==0) res[i] = 0; \
   7036         else{ \
   7037             if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
   7038             else{ \
   7039                 if (btmp[i]>lanesize_1) { \
   7040                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7041                 }else{ \
   7042                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
   7043                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
   7044                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7045                     else res[i] = atmp[i] << btmp[i]; }}}} \
   7046         return _mm_load_si128((__m128i*)res);
   7047 
   7048 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
   7049         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
   7050         TYPE lanesize = (sizeof(TYPE) << 3); \
   7051         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7052         for (i = 0; i<LEN; i++) { \
   7053         if (atmp[i] ==0) {res[i] = 0; \
   7054         }else{ \
   7055             if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
   7056             else{ \
   7057                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
   7058                 else{ \
   7059                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
    7060                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
   7061         return _mm_load_si128((__m128i*)res);
   7062 
   7063 #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
   7064         int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
   7065         int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
   7066         for (i = 0; i<LEN; i++) { \
   7067         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
   7068         else{ \
   7069             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
   7070             else{ \
   7071                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
   7072                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
   7073                 }else{ \
   7074                     limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
   7075                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
   7076                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
   7077                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7078         return res;
   7079 
   7080 #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
   7081         int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
   7082         int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
   7083         for (i = 0; i<LEN; i++) { \
   7084         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
   7085         }else{ \
   7086             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
   7087             else{ \
   7088                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
   7089                 else{ \
   7090                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
    7091                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7092         return res;
   7093 
   7094 int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
   7095 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7096 {
   7097     SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
   7098 }
   7099 
   7100 int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
   7101 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7102 {
   7103     SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
   7104 }
   7105 
   7106 int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
   7107 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7108 {
   7109     SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
   7110 }
   7111 
   7112 int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
   7113 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7114 {
   7115     SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
   7116 }
   7117 
   7118 uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
   7119 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7120 {
   7121     SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
   7122 }
   7123 
   7124 uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
   7125 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7126 {
   7127     SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
   7128 }
   7129 
   7130 uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
   7131 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7132 {
   7133     SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
   7134 }
   7135 
   7136 uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
   7137 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7138 {
   7139     SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
   7140 }
   7141 
   7142 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
   7143 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7144 {
   7145     SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
   7146 }
   7147 
   7148 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
   7149 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7150 {
   7151     SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
   7152 }
   7153 
   7154 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
   7155 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7156 {
   7157     SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
   7158 }
   7159 
   7160 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
   7161 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7162 {
   7163     SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
   7164 }
   7165 
   7166 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
   7167 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7168 {
   7169     SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
   7170 }
   7171 
   7172 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
   7173 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7174 {
   7175     SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
   7176 }
   7177 
   7178 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
   7179 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7180 {
   7181     SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
   7182 }
   7183 
   7184 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
   7185 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7186 {
   7187     SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
   7188 }
   7189 
   7190 
   7191 //******** Vector rounding shift left: (negative values shift right) **********
   7192 //****************************************************************************
   7193 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
   7194 //rounding makes sense for right shifts only.
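//A short numeric example (illustration only, assuming vdup_n_s8 from this file) of the
//rounding performed below: the last bit shifted out is added back, i.e. round-to-nearest.
//    int8x8_t vr = vrshl_s8(vdup_n_s8(5), vdup_n_s8(-1));   //(5 >> 1) + (5 & 1) == 3, i.e. round(5/2)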
   7195 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
    7196         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
   7197         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7198         for (i = 0; i<LEN; i++) { \
   7199         if( btmp[i] >= 0) { \
   7200             if(btmp[i] >= lanesize) res[i] = 0; \
   7201             else res[i] = (atmp[i] << btmp[i]); \
   7202         }else{ \
    7203             res[i] = (btmp[i] < -lanesize) ? 0 : \
   7204                             (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
   7205                             (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
   7206         return _mm_load_si128((__m128i*)res);
   7207 
   7208 
   7209 #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
   7210         int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
   7211         for (i = 0; i<LEN; i++) { \
   7212         if( b.m64_i ## TYPE[i] >= 0) { \
   7213             if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
   7214             else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
   7215         }else{ \
    7216             res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
   7217                             (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
   7218                             (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
   7219         return res;
   7220 
   7221 
   7222 int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
   7223 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7224 {
   7225     SERIAL_ROUNDING_SHIFT_64(8,i,8)
   7226 }
   7227 
   7228 int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
   7229 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7230 {
   7231     SERIAL_ROUNDING_SHIFT_64(16,i,4)
   7232 }
   7233 
   7234 int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
   7235 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7236 {
   7237     SERIAL_ROUNDING_SHIFT_64(32,i,2)
   7238 }
   7239 
   7240 int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
   7241 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7242 {
   7243     SERIAL_ROUNDING_SHIFT_64(64,i,1)
   7244 }
   7245 
   7246 uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
   7247 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7248 {
   7249     SERIAL_ROUNDING_SHIFT_64(8,u,8)
   7250 }
   7251 
   7252 uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
   7253 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7254 {
   7255     SERIAL_ROUNDING_SHIFT_64(16,u,4)
   7256 }
   7257 
   7258 uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
   7259 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7260 {
   7261     SERIAL_ROUNDING_SHIFT_64(32,u,2)
   7262 }
   7263 
   7264 uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
   7265 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7266 {
   7267     SERIAL_ROUNDING_SHIFT_64(64,u,1)
   7268 }
   7269 
   7270 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
   7271 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7272 {
   7273     SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
   7274 }
   7275 
   7276 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
   7277 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7278 {
   7279     SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
   7280 }
   7281 
   7282 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
   7283 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7284 {
   7285     SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
   7286 }
   7287 
   7288 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
   7289 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7290 {
   7291     SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
   7292 }
   7293 
   7294 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
   7295 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7296 {
   7297     SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
   7298 }
   7299 
   7300 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
   7301 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7302 {
   7303     SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
   7304 }
   7305 
   7306 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
   7307 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7308 {
   7309     SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
   7310 }
   7311 
   7312 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
   7313 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7314 {
   7315     SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
   7316 }
   7317 
   7318 
   7319 //********** Vector saturating rounding shift left: (negative values shift right) ****************
   7320 //*************************************************************************************************
   7321 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
   7322 //Saturation happens for left shifts only while rounding makes sense for right shifts only.
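//Sketch of the combined behaviour emulated below (illustration only, assuming vdup_n_s8
//from this file): negative counts shift right with rounding, positive counts shift left
//with saturation.
//    int8x8_t vr1 = vqrshl_s8(vdup_n_s8(5),   vdup_n_s8(-1));  //rounding:   (5 >> 1) + 1 == 3
//    int8x8_t vr2 = vqrshl_s8(vdup_n_s8(100), vdup_n_s8(2));   //saturation: 100 << 2 > 127, result 127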
   7323 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
   7324         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
   7325         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
   7326         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7327         for (i = 0; i<LEN; i++) { \
   7328         if (atmp[i] ==0) res[i] = 0; \
   7329         else{ \
   7330             if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
   7331             else{ \
   7332                 if (btmp[i]>lanesize_1) { \
   7333                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7334                 }else{ \
   7335                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
   7336                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
   7337                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7338                     else res[i] = atmp[i] << btmp[i]; }}}} \
   7339         return _mm_load_si128((__m128i*)res);
   7340 
   7341 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
   7342         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
   7343         int lanesize = (sizeof(TYPE) << 3); \
   7344         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7345         for (i = 0; i<LEN; i++) { \
   7346         if (atmp[i] ==0) {res[i] = 0; \
   7347         }else{ \
   7348             if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
   7349             else{ \
   7350                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
   7351                 else{ \
   7352                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
    7353                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
   7354         return _mm_load_si128((__m128i*)res);
   7355 
   7356 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
   7357         __m64_128 res; int ## TYPE ## _t limit; int i; \
   7358         int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
   7359         for (i = 0; i<LEN; i++) { \
   7360         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
   7361         else{ \
   7362             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
   7363             else{ \
   7364                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
   7365                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
   7366                 }else{ \
   7367                     limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
   7368                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
   7369                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
   7370                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7371         return res;
   7372 
   7373 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
   7374         __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
   7375         int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
   7376         for (i = 0; i<LEN; i++) { \
   7377         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
   7378         }else{ \
   7379             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
   7380             else{ \
   7381                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
   7382                 else{ \
   7383                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
    7384                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7385         return res;
   7386 
   7387 int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
   7388 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7389 {
   7390     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
   7391 }
   7392 
   7393 int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
   7394 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7395 {
   7396     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
   7397 }
   7398 
   7399 int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
   7400 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7401 {
   7402     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
   7403 }
   7404 
   7405 int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
   7406 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7407 {
   7408     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
   7409 }
   7410 
   7411 uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
   7412 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7413 {
   7414     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
   7415 }
   7416 
   7417 uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
   7418 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7419 {
   7420     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
   7421 }
   7422 
   7423 uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
   7424 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7425 {
   7426     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
   7427 }
   7428 
   7429 uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
   7430 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7431 {
   7432     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
   7433 }
   7434 
   7435 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
   7436 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7437 {
   7438     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
   7439 }
   7440 
   7441 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
   7442 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7443 {
   7444     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
   7445 }
   7446 
   7447 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
   7448 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7449 {
   7450     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
   7451 }
   7452 
   7453 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
   7454 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7455 {
   7456     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
   7457 }
   7458 
   7459 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
   7460 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7461 {
   7462     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
   7463 }
   7464 
   7465 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
   7466 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7467 {
   7468     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
   7469 }
   7470 
   7471 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
   7472 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7473 {
   7474     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
   7475 }
   7476 
   7477 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
   7478 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7479 {
   7480     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
   7481 }
   7482 
   7483 // *********************************************************************************
   7484 // *****************************  Shifts by a constant *****************************
   7485 // *********************************************************************************
   7486 //**************** Vector shift right by constant*************************************
   7487 //************************************************************************************
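//Usage sketch (not part of the original header, assuming vdup_n_s8/vdup_n_u8 from this file):
//the shift count is an immediate in the range 1..lane size; signed variants shift in sign
//bits, unsigned variants shift in zeros.
//    int8x8_t  vs = vshr_n_s8(vdup_n_s8(-7), 2);   //arithmetic shift: each lane == -2
//    uint8x8_t vu = vshr_n_u8(vdup_n_u8(7),  2);   //logical shift:    each lane == 1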
   7488 int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
   7489 _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
   7490 {
   7491     //no 8 bit shift available, go to 16 bit
   7492     int8x8_t res64;
   7493     __m128i r;
   7494     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   7495     r = _mm_srai_epi16 (r, b); //SSE2
   7496     r = _mm_packs_epi16 (r,r); //we need 64 bits only
   7497     return64(r);
   7498 }
   7499 
   7500 int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
   7501 _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
   7502 {
   7503     int16x4_t res64;
   7504     return64(_mm_srai_epi16(_pM128i(a), b));
   7505 }
   7506 
   7507 
   7508 int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
   7509 _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
   7510 {
   7511     int32x2_t res64;
   7512     return64(_mm_srai_epi32(_pM128i(a), b));
   7513 }
   7514 
   7515 int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
   7516 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   7517 {
   7518     //no arithmetic shift for 64bit values, serial solution used
   7519     int64x1_t res;
   7520     if(b>=64) res.m64_i64[0] = 0;
   7521     else res.m64_i64[0] = (*(int64_t*)&a) >> b;
   7522     return res;
   7523 }
   7524 
   7525 uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
   7526 _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
   7527 {
   7528     //no 8 bit shift available, go to 16 bit
   7529     uint8x8_t res64;
   7530     __m128i r;
   7531     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   7532     r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
   7533     r = _mm_packus_epi16 (r,r); //we need 64 bits only
   7534     return64(r);
   7535 }
   7536 
   7537 uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
   7538 _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
   7539 {
   7540     uint16x4_t res64;
   7541     return64(_mm_srli_epi16(_pM128i(a), b));
   7542 }
   7543 
   7544 
   7545 uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
   7546 _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
   7547 {
   7548     uint32x2_t res64;
   7549     return64(_mm_srli_epi32(_pM128i(a), b));
   7550 }
   7551 
   7552 
   7553 uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
   7554 _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
   7555 {
   7556     uint64x1_t res64;
   7557     return64(_mm_srli_epi64(_pM128i(a), b));
   7558 }
   7559 
   7560 
   7561 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
   7562 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
   7563 {
   7564     //no 8 bit shift available, go to 16 bit trick
   7565     __m128i zero, mask0, a_sign, r, a_sign_mask;
   7566     _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
   7567     zero = _mm_setzero_si128();
   7568     mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
   7569     a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
   7570     r = _mm_srai_epi16 (a, b);
   7571     a_sign_mask =  _mm_and_si128 (mask0, a_sign);
   7572     r =  _mm_andnot_si128 (mask0, r);
   7573     return _mm_or_si128 (r, a_sign_mask);
   7574 }
   7575 
   7576 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
   7577 #define vshrq_n_s16 _mm_srai_epi16
   7578 
   7579 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
   7580 #define vshrq_n_s32 _mm_srai_epi32
   7581 
   7582 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
   7583 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
   7584 {
    7585     //SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
   7586     __m128i c1, signmask,a0,  res64;
   7587     _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
   7588     c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
   7589     signmask  =  _mm_slli_epi64 (c1, (64 - b));
   7590     a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
   7591     a0 = _MM_CMPEQ_EPI64 (a, a0);
   7592     signmask = _mm_and_si128(a0, signmask);
   7593     res64 = _mm_srli_epi64 (a, b);
   7594     return _mm_or_si128(res64, signmask);
   7595 }
   7596 
   7597 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
   7598 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
   7599 {
   7600     //no 8 bit shift available, need the special trick
   7601     __m128i mask0, r;
   7602     _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
   7603     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
   7604     r = _mm_srli_epi16 ( a, b);
   7605     return _mm_and_si128 (r,  mask0);
   7606 }
   7607 
   7608 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
   7609 #define vshrq_n_u16 _mm_srli_epi16
   7610 
   7611 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
   7612 #define vshrq_n_u32 _mm_srli_epi32
   7613 
   7614 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
   7615 #define vshrq_n_u64 _mm_srli_epi64
   7616 
   7617 //*************************** Vector shift left by constant *************************
   7618 //*********************************************************************************
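//Usage sketch (illustration only, assuming vdup_n_u8 from this file): the count is an
//immediate in the range 0..lane size-1, and bits shifted out of the lane are discarded.
//    uint8x8_t vl = vshl_n_u8(vdup_n_u8(0x81), 1);   //each lane: (0x81 << 1) & 0xff == 0x02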
   7619 int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   7620 _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
   7621 {
   7622     //no 8 bit shift available, go to 16 bit
   7623     int8x8_t res64;
   7624     __m128i r;
   7625     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   7626     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   7627     r = _mm_slli_epi16 (r, b); //SSE2
   7628     r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
   7629     return64(r);
   7630 }
   7631 
   7632 int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   7633 _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
   7634 {
   7635     int16x4_t res64;
   7636     return64(_mm_slli_epi16(_pM128i(a), b));
   7637 }
   7638 
   7639 
   7640 int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   7641 _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
   7642 {
   7643     int32x2_t res64;
   7644     return64(_mm_slli_epi32(_pM128i(a), b));
   7645 }
   7646 
   7647 
   7648 int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   7649 _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
   7650 {
   7651     int64x1_t res64;
   7652     return64(_mm_slli_epi64(_pM128i(a), b));
   7653 }
   7654 
   7655 
   7656 uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   7657 _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
   7658 {
   7659     //no 8 bit shift available, go to 16 bit
   7660     uint8x8_t res64;
   7661     __m128i mask8;
   7662     __m128i r;
   7663     mask8 = _mm_set1_epi16(0xff);
   7664     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   7665     r = _mm_slli_epi16 (r, b); //SSE2
   7666     r = _mm_and_si128(r, mask8); //to avoid saturation
   7667     r = _mm_packus_epi16 (r,r); //we need 64 bits only
   7668     return64(r);
   7669 }
   7670 
   7671 uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   7672 #define vshl_n_u16 vshl_n_s16
   7673 
   7674 
   7675 uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   7676 #define vshl_n_u32 vshl_n_s32
   7677 
   7678 uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   7679 #define vshl_n_u64 vshl_n_s64
   7680 
   7681 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   7682 #define vshlq_n_s8 vshlq_n_u8
   7683 
   7684 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   7685 #define vshlq_n_s16 _mm_slli_epi16
   7686 
   7687 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   7688 #define vshlq_n_s32 _mm_slli_epi32
   7689 
   7690 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   7691 #define vshlq_n_s64 _mm_slli_epi64
   7692 
   7693 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   7694 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
   7695 {
   7696     //no 8 bit shift available, need the special trick
   7697     __m128i mask0, r;
   7698     _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
   7699     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
   7700     r = _mm_slli_epi16 ( a, b);
   7701     return _mm_and_si128 (r,  mask0);
   7702 }
   7703 
   7704 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   7705 #define vshlq_n_u16 vshlq_n_s16
   7706 
   7707 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   7708 #define vshlq_n_u32 vshlq_n_s32
   7709 
   7710 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   7711 #define vshlq_n_u64 vshlq_n_s64
   7712 
   7713 //************* Vector rounding shift right by constant ******************
   7714 //*************************************************************************
   7715 //No corresponding  x86 intrinsics exist, need to do some tricks
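//A quick check of the rounding emulated below (sketch only, assuming vdup_n_u8 from this
//file): the last bit shifted out is added back, matching round-to-nearest of a/2^b.
//    uint8x8_t vr = vrshr_n_u8(vdup_n_u8(7), 2);   //(7 >> 2) + 1 == 2, i.e. round(7/4)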
   7716 int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
   7717 _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
   7718 {
   7719     //no 8 bit shift available, go to 16 bit
   7720     int8x8_t res64;
   7721     __m128i r, maskb;
   7722     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   7723     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
   7724     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
   7725     r = _mm_srai_epi16 (r, b);
   7726     r = _mm_add_epi16 (r, maskb); //actual rounding
    7727     r = _mm_packs_epi16 (r,r); //we need 64 bits only
   7728     return64(r);
   7729 }
   7730 
   7731 int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
   7732 _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
   7733 {
   7734     int16x4_t res64;
   7735     return64(vrshrq_n_s16(_pM128i(a), b));
   7736 }
   7737 
   7738 
   7739 int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
   7740 _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
   7741 {
   7742     int32x2_t res64;
   7743     return64(vrshrq_n_s32(_pM128i(a), b));
   7744 }
   7745 
   7746 
   7747 int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
   7748 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   7749 {
   7750     //serial solution is faster
   7751     int64x1_t res;
   7752     int64_t a_i64 = *( int64_t*)&a;
   7753     if(b==64) {
    7754         res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use (a_i64 & _SIGNBIT64)>>63;
   7755     } else {
   7756         int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
   7757         res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
   7758     }
   7759     return res;
   7760 }
   7761 
   7762 uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
   7763 _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
   7764 {
    7765     //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
   7766     uint8x8_t res64;
   7767     __m128i r, maskb;
   7768     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   7769     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
   7770     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
   7771     r = _mm_srli_epi16 (r, b);
   7772     r = _mm_add_epi16 (r, maskb); //actual rounding
    7773     r =  _mm_packus_epi16 (r,r); //we need 64 bits only
   7774     return64(r);
   7775 }
   7776 
   7777 uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
   7778 _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
   7779 {
   7780     uint16x4_t res64;
   7781     return64(vrshrq_n_u16(_pM128i(a), b));
   7782 }
   7783 
   7784 
   7785 uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
   7786 _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
   7787 {
   7788     uint32x2_t res64;
   7789     return64(vrshrq_n_u32(_pM128i(a), b));
   7790 }
   7791 
   7792 
   7793 uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
   7794 _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
   7795 {
   7796     uint64x1_t res64;
   7797     return64(vrshrq_n_u64(_pM128i(a), b));
   7798 }
   7799 
   7800 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
   7801 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
   7802 {
   7803     //no 8 bit shift available, go to 16 bit trick
   7804     __m128i r, mask1, maskb;
    7805     _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set to 1 in each byte
   7806     r = vshrq_n_s8 (a, b);
    7807     mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set to 1 in each byte of the 16-bit lanes, needed for rounding
   7808     maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
   7809     maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
   7810     return _mm_add_epi8(r, maskb); //actual rounding
   7811 }
   7812 
   7813 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
   7814 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
   7815 {
   7816     __m128i maskb, r;
   7817     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
   7818     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
   7819     r = _mm_srai_epi16 (a, b);
   7820     return _mm_add_epi16 (r, maskb); //actual rounding
   7821 }
   7822 
   7823 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
   7824 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
   7825 {
   7826     __m128i maskb,  r;
   7827     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
   7828     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
   7829     r = _mm_srai_epi32(a, b);
   7830     return _mm_add_epi32 (r, maskb); //actual rounding
   7831 }
   7832 
   7833 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
   7834 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
   7835 {
    7836     //the solution may not be optimal compared with a serial one
   7837     __m128i maskb;
   7838     int64x2_t r;
   7839     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
   7840     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
   7841     r = vshrq_n_s64(a, b);
   7842     return _mm_add_epi64 (r, maskb); //actual rounding
   7843 }
   7844 
   7845 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
   7846 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
   7847 {
   7848     //no 8 bit shift available, go to 16 bit trick
   7849     __m128i r, mask1, maskb;
    7850     _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set to 1 in each byte
   7851     r = vshrq_n_u8 (a, b);
    7852     mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set to 1 in each byte of the 16-bit lanes, needed for rounding
   7853     maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding
   7854     maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
   7855     return _mm_add_epi8(r, maskb); //actual rounding
   7856 }
   7857 
   7858 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
   7859 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
   7860 {
   7861     __m128i maskb, r;
   7862     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
   7863     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
   7864     r = _mm_srli_epi16 (a, b);
   7865     return _mm_add_epi16 (r, maskb); //actual rounding
   7866 }
   7867 
   7868 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
   7869 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
   7870 {
   7871     __m128i maskb,  r;
   7872     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
   7873     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
   7874     r = _mm_srli_epi32(a, b);
   7875     return _mm_add_epi32 (r, maskb); //actual rounding
   7876 }
   7877 
   7878 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
   7879 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
   7880 {
    7881     //the solution may not be optimal compared with a serial one
   7882     __m128i maskb,  r;
   7883     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
   7884     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
   7885     r = _mm_srli_epi64(a, b);
   7886     return _mm_add_epi64 (r, maskb); //actual rounding
   7887 }
   7888 
   7889 //************* Vector shift right by constant and accumulate *********
   7890 //*********************************************************************
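//Usage sketch (not part of the original header, assuming vdup_n_u8 from this file): vsra_n
//adds the shifted second operand to the first one, a common fixed-point accumulation pattern.
//    uint8x8_t acc = vsra_n_u8(vdup_n_u8(10), vdup_n_u8(12), 2);   //10 + (12 >> 2) == 13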
   7891 int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
   7892 _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
   7893 {
   7894     int8x8_t shift;
   7895     shift = vshr_n_s8(b, c);
   7896     return vadd_s8( a, shift);
   7897 }
   7898 
   7899 int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
   7900 _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
   7901 {
   7902     int16x4_t shift;
   7903     shift = vshr_n_s16( b, c);
   7904     return vadd_s16(a, shift);
   7905 }
   7906 
   7907 int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
   7908 _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
   7909 {
    7910     //may not be optimal compared with the serial execution
   7911     int32x2_t shift;
   7912     shift = vshr_n_s32(b, c);
   7913     return vadd_s32( a, shift);
   7914 }
   7915 
   7916 int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
   7917 _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
   7918 {
    7919     //may not be optimal compared with a serial solution
   7920     int64x1_t shift;
   7921     shift = vshr_n_s64(b, c);
   7922     return vadd_s64( a, shift);
   7923 }
   7924 
   7925 uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
   7926 _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
   7927 {
   7928     uint8x8_t shift;
   7929     shift = vshr_n_u8(b, c);
   7930     return vadd_u8(a, shift);
   7931 }
   7932 
   7933 uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
   7934 _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
   7935 {
   7936     uint16x4_t shift;
   7937     shift = vshr_n_u16(b, c);
   7938     return vadd_u16(a,shift);
   7939 }
   7940 
   7941 uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
   7942 _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
   7943 {
    7944     //may not be optimal compared with the serial execution
   7945     uint32x2_t shift;
   7946     shift = vshr_n_u32(b, c);
   7947     return vadd_u32( a, shift);
   7948 }
   7949 
   7950 uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
   7951 _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
   7952 {
    7953     //may not be optimal compared with the serial execution
   7954     uint64x1_t shift;
   7955     shift = vshr_n_u64(b, c);
   7956     return vadd_u64(a, shift);
   7957 }
   7958 
   7959 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
   7960 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
   7961 {
   7962     int8x16_t shift;
   7963     shift = vshrq_n_s8(b, c);
   7964     return vaddq_s8(a, shift);
   7965 }
   7966 
   7967 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
   7968 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
   7969 {
   7970     int16x8_t shift;
   7971     shift = vshrq_n_s16(b, c);
   7972     return vaddq_s16(a, shift);
   7973 }
   7974 
   7975 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
   7976 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
   7977 {
   7978     int32x4_t shift;
   7979     shift = vshrq_n_s32(b, c);
   7980     return vaddq_s32(a, shift);
   7981 }
   7982 
   7983 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
   7984 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
   7985 {
   7986     int64x2_t shift;
   7987     shift = vshrq_n_s64(b, c);
   7988     return vaddq_s64( a, shift);
   7989 }
   7990 
   7991 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
   7992 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
   7993 {
   7994     uint8x16_t shift;
   7995     shift = vshrq_n_u8(b, c);
   7996     return vaddq_u8(a, shift);
   7997 }
   7998 
   7999 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
   8000 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
   8001 {
   8002     uint16x8_t shift;
   8003     shift = vshrq_n_u16(b, c);
   8004     return vaddq_u16(a,  shift);
   8005 }
   8006 
   8007 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
   8008 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
   8009 {
   8010     uint32x4_t shift;
   8011     shift = vshrq_n_u32(b, c);
   8012     return vaddq_u32(a, shift);
   8013 }
   8014 
   8015 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
   8016 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
   8017 {
   8018     uint64x2_t shift;
   8019     shift = vshrq_n_u64(b, c);
   8020     return vaddq_u64(a, shift);
   8021 }
   8022 
   8023 //************* Vector rounding shift right by constant and accumulate ****************************
   8024 //************************************************************************************************
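//Usage sketch (illustration only, assuming vdup_n_u8 from this file): same accumulation
//as vsra_n, but the shifted operand is rounded first.
//    uint8x8_t acc = vrsra_n_u8(vdup_n_u8(10), vdup_n_u8(7), 2);   //10 + round(7/4) == 12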
   8025 int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
   8026 _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
   8027 {
   8028     int8x8_t shift;
   8029     shift = vrshr_n_s8(b, c);
   8030     return vadd_s8( a, shift);
   8031 }
   8032 
   8033 int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
   8034 _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
   8035 {
   8036     int16x4_t shift;
   8037     shift = vrshr_n_s16( b, c);
   8038     return vadd_s16(a, shift);
   8039 }
   8040 
   8041 int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
   8042 _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
   8043 {
    8044     //may not be optimal compared with the serial execution
   8045     int32x2_t shift;
   8046     shift = vrshr_n_s32(b, c);
   8047     return vadd_s32( a, shift);
   8048 }
   8049 
   8050 int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
   8051 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
   8052 {
   8053     int64x1_t shift;
   8054     shift = vrshr_n_s64(b, c);
   8055     return vadd_s64( a, shift);
   8056 }
   8057 
   8058 uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
   8059 _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
   8060 {
   8061     uint8x8_t shift;
   8062     shift = vrshr_n_u8(b, c);
   8063     return vadd_u8(a, shift);
   8064 }
   8065 
   8066 uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
   8067 _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
   8068 {
   8069     uint16x4_t shift;
   8070     shift = vrshr_n_u16(b, c);
   8071     return vadd_u16(a,shift);
   8072 }
   8073 
   8074 uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
   8075 _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
   8076 {
    //may not be optimal compared with the serial execution
   8078     uint32x2_t shift;
   8079     shift = vrshr_n_u32(b, c);
   8080     return vadd_u32( a, shift);
   8081 }
   8082 
   8083 uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
   8084 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
   8085 {
   8087     uint64x1_t shift;
   8088     shift = vrshr_n_u64(b, c);
   8089     return vadd_u64( a, shift);
   8090 }
   8091 
   8092 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
   8093 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
   8094 {
   8095     int8x16_t shift;
   8096     shift = vrshrq_n_s8(b, c);
   8097     return vaddq_s8(a, shift);
   8098 }
   8099 
   8100 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
   8101 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
   8102 {
   8103     int16x8_t shift;
   8104     shift = vrshrq_n_s16(b, c);
   8105     return vaddq_s16(a, shift);
   8106 }
   8107 
   8108 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
   8109 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
   8110 {
   8111     int32x4_t shift;
   8112     shift = vrshrq_n_s32(b, c);
   8113     return vaddq_s32(a, shift);
   8114 }
   8115 
   8116 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
   8117 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
   8118 {
   8119     int64x2_t shift;
   8120     shift = vrshrq_n_s64(b, c);
   8121     return vaddq_s64(a, shift);
   8122 }
   8123 
   8124 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
   8125 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
   8126 {
   8127     uint8x16_t shift;
   8128     shift = vrshrq_n_u8(b, c);
   8129     return vaddq_u8(a, shift);
   8130 }
   8131 
uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
_NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.U16 q0,q0,#16
   8134 {
   8135     uint16x8_t shift;
   8136     shift = vrshrq_n_u16(b, c);
   8137     return vaddq_u16(a,  shift);
   8138 }
   8139 
   8140 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
   8141 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
   8142 {
   8143     uint32x4_t shift;
   8144     shift = vrshrq_n_u32(b, c);
   8145     return vaddq_u32(a, shift);
   8146 }
   8147 
   8148 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
   8149 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
   8150 {
   8151     uint64x2_t shift;
   8152     shift = vrshrq_n_u64(b, c);
   8153     return vaddq_u64(a, shift);
   8154 }
   8155 
   8156 //**********************Vector saturating shift left by constant *****************************
   8157 //********************************************************************************************
   8158 //we don't check const ranges  assuming they are met
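//Note: "saturating" means the result is clamped to the full range of the type instead of wrapping around.
//For instance, with hypothetical lane values, vqshl_n_s8 of 100 by 1 yields 127 rather than the wrapped -56,
//and vqshl_n_u8 of 200 by 1 yields 255 rather than 144.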
   8159 int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
   8160 _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
   8161 {
   8162     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
   8163     int8x8_t res64;
   8164     __m128i a128, r128;
   8165     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   8166     r128 = _mm_slli_epi16 (a128, b);
   8167     r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
   8168     return64(r128);
   8169 }
   8170 
   8171 int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
   8172 _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
   8173 {
   8174     // go to 32 bit to get the auto saturation (in packs function)
   8175     int16x4_t res64;
   8176     __m128i a128, r128;
   8177     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
   8178     r128 = _mm_slli_epi32 (a128, b); //shift_res
   8179     r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
   8180     return64(r128);
   8181 }
   8182 
   8183 int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
   8184 _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
   8185 {
   8186     //serial execution may be faster
   8187     int32x2_t res64;
   8188     return64(vqshlq_n_s32 (_pM128i(a), b));
   8189 }
   8190 
   8191 
   8192 int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
   8193 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8194 {
   8195     // no effective SIMD solution here
   8196     int64x1_t res;
   8197     int64_t bmask;
   8198     int64_t a_i64 = *( int64_t*)&a;
    if (b == 0) return a; //no shift; the bmask below is not representable for b == 0
    bmask = ( int64_t)1 << (63 - b); //positive
   8200     if (a_i64 >= bmask) {
   8201         res.m64_i64[0] = ~(_SIGNBIT64);
   8202     } else {
   8203         res.m64_i64[0]  = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
   8204     }
   8205     return res;
   8206 }
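//The scalar check above uses bmask == 2^(63-b) (for b >= 1): a lane >= bmask would exceed INT64_MAX after the
//shift and is clamped to INT64_MAX, while a lane <= -bmask is clamped to INT64_MIN (exact when the lane equals
//-bmask, a saturation otherwise).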
   8207 
   8208 
   8209 uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
   8210 _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
   8211 {
   8212     //no 8 bit shift available in IA32 SIMD, go to 16 bit
   8213     uint8x8_t res64;
   8214     __m128i a128, r128;
   8215     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   8216     r128 = _mm_slli_epi16 (a128, b); //shift_res
   8217     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
   8218     return64(r128);
   8219 }
   8220 
uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
_NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.U16 d0,d0,#0
   8223 {
   8224     // go to 32 bit to get the auto saturation (in packus function)
   8225     uint16x4_t res64;
   8226     __m128i a128, r128;
   8227     a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
   8228     r128 = _mm_slli_epi32 (a128, b); //shift_res
   8229     r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
   8230     return64(r128);
   8231 }
   8232 
   8233 uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
   8234 _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
   8235 {
   8236     uint32x2_t res64;
   8237     return64(vqshlq_n_u32(_pM128i(a), b));
   8238 }
   8239 
   8240 uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
   8241 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8242 {
   8243     // no effective SIMD solution here
   8244     uint64x1_t res;
   8245     uint64_t bmask;
   8246     uint64_t a_i64 = *(uint64_t*)&a;
   8247     bmask = ( uint64_t)1 << (64 - b);
   8248     res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
   8249     return res;
   8250 }
   8251 
   8252 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
   8253 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
   8254 {
   8255     // go to 16 bit to get the auto saturation (in packs function)
   8256     __m128i a128, r128_1, r128_2;
   8257     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
   8258     r128_1 = _mm_slli_epi16 (a128, b);
   8259     //swap hi and low part of a128 to process the remaining data
   8260     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8261     a128 = _MM_CVTEPI8_EPI16 (a128);
   8262     r128_2 = _mm_slli_epi16 (a128, b);
   8263     return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
   8264 }
   8265 
   8266 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
   8267 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
   8268 {
   8269     // manual saturation solution looks LESS optimal than 32 bits conversion one
   8270     // go to 32 bit to get the auto saturation (in packs function)
   8271     __m128i a128, r128_1, r128_2;
   8272     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
   8273     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
   8274     //swap hi and low part of a128 to process the remaining data
   8275     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8276     a128 = _MM_CVTEPI16_EPI32 (a128);
   8277     r128_2 = _mm_slli_epi32 (a128, b);
   8278     return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
   8279 }
   8280 
   8281 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
   8282 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
   8283 {
   8284     // no 64 bit saturation option available, special tricks necessary
   8285     __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
   8286     c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
    maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) leading zeros and (31-b) ones
   8288     saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
   8289     c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
   8290     shift_res = _mm_slli_epi32 (a, b);
   8291     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
   8292     //result with positive numbers saturated
   8293     shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
   8294     //treat negative numbers
    maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) leading ones and (31-b) zeros
   8296     saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
   8297     c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
   8298     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
   8299     return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
   8300 }
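//Illustration of the masks above for the hypothetical case b == 3: maskA == 0x0fffffff, so any lane greater
//than 0x0fffffff would overflow when shifted left by 3 and is saturated to 0x7fffffff; the negative-side mask
//is 0xf0000000, so any lane below it is saturated to 0x80000000.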
   8301 
   8302 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
   8303 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8304 {
   8305     // no effective SIMD solution here
   8306     _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
   8307     int64_t bmask;
   8308     int i;
    if (b == 0) return a; //no shift; the bmask below is not representable for b == 0
    bmask = ( int64_t)1 << (63 - b); //positive
   8310     _mm_store_si128((__m128i*)atmp, a);
   8311     for (i = 0; i<2; i++) {
   8312         if (atmp[i] >= bmask) {
   8313             res[i] = ~(_SIGNBIT64);
   8314         } else {
   8315             res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
   8316         }
   8317     }
   8318     return _mm_load_si128((__m128i*)res);
   8319 }
   8320 
   8321 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
   8322 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
   8323 {
   8324     // go to 16 bit to get the auto saturation (in packs function)
   8325     __m128i a128, r128_1, r128_2;
   8326     a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
   8327     r128_1 = _mm_slli_epi16 (a128, b);
   8328     //swap hi and low part of a128 to process the remaining data
   8329     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8330     a128 = _MM_CVTEPU8_EPI16 (a128);
   8331     r128_2 = _mm_slli_epi16 (a128, b);
   8332     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
   8333 }
   8334 
uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
_NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.U16 q0,q0,#0
   8337 {
   8338     // manual saturation solution looks more optimal than 32 bits conversion one
   8339     __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
   8340     cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
   8341     c8000 = _mm_set1_epi16 (0x8000);
   8342 //no unsigned shorts comparison in SSE, only signed available, so need the trick
   8343     a_signed = _mm_sub_epi16(a, c8000); //go to signed
   8344     saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
   8345     shift_res = _mm_slli_epi16 (a, b);
   8346     return _mm_or_si128 (shift_res, saturation_mask);
   8347 }
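//The 0x8000 bias above implements an unsigned comparison with the signed _mm_cmpgt_epi16: subtracting 0x8000
//maps uint16 values onto the signed range while preserving their order, so lanes greater than (1 << (16-b)) - 1
//(i.e. lanes whose shift would overflow) are detected and forced to 0xffff by the final OR with the mask.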
   8348 
   8349 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
   8350 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
   8351 {
   8352     // manual saturation solution, no 64 bit saturation option, the serial version may be faster
   8353     __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
   8354     cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
   8355     c80000000 = _mm_set1_epi32 (0x80000000);
   8356 //no unsigned ints comparison in SSE, only signed available, so need the trick
   8357     a_signed = _mm_sub_epi32(a, c80000000); //go to signed
   8358     saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
   8359     shift_res = _mm_slli_epi32 (a, b);
   8360     return _mm_or_si128 (shift_res, saturation_mask);
   8361 }
   8362 
   8363 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
   8364 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8365 {
   8366     // no effective SIMD solution here
   8367     _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
   8368     uint64_t bmask;
   8369     int i;
   8370     bmask = ( uint64_t)1 << (64 - b);
   8371     _mm_store_si128((__m128i*)atmp, a);
   8372     for (i = 0; i<2; i++) {
   8373         res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
   8374     }
   8375     return _mm_load_si128((__m128i*)res);
   8376 }
   8377 
   8378 //**************Vector signed->unsigned saturating shift left by constant *************
   8379 //*************************************************************************************
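//Note: the VQSHLU forms take a signed input but produce an unsigned result: negative lanes saturate to 0 and
//positive overflow saturates to the unsigned maximum of the type. With hypothetical lane values, vqshlu_n_s8
//of -5 by 2 gives 0, while 70 by 2 gives 255 instead of the exact 280.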
   8380 uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
   8381 _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
   8382 {
   8383     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
   8384     uint8x8_t res64;
   8385     __m128i a128, r128;
   8386     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   8387     r128 = _mm_slli_epi16 (a128, b);
   8388     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
   8389     return64(r128);
   8390 }
   8391 
   8392 uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
   8393 _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
   8394 {
   8395     uint16x4_t res64;
   8396     __m128i a128, r128;
   8397     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
   8398     r128 = _mm_slli_epi32 (a128, b); //shift_res
   8399     r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
   8400     return64(r128);
   8401 }
   8402 
   8403 uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
_NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
{
    uint32x2_t res64;
   8407     return64( vqshluq_n_s32(_pM128i(a), b));
   8408 }
   8409 
   8410 uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
   8411 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
   8412 {
   8413     uint64x1_t res;
   8414     uint64_t limit;
    if (a.m64_i64[0] <= 0) {
        res.m64_u64[0] = 0;
    } else if (b == 0) {
        res.m64_u64[0] = (uint64_t)a.m64_i64[0]; //no shift; also avoids the out-of-range 1 << 64 below
    } else {
        limit = (uint64_t)1 << (64 - b);
        res.m64_u64[0] = (((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
    }
   8421     return res;
   8422 }
   8423 
   8424 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
   8425 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
   8426 {
   8427     __m128i a128, r128_1, r128_2;
   8428     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
   8429     r128_1 = _mm_slli_epi16 (a128, b);
   8430     //swap hi and low part of a128 to process the remaining data
   8431     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8432     a128 = _MM_CVTEPI8_EPI16 (a128);
   8433     r128_2 = _mm_slli_epi16 (a128, b);
   8434     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
   8435 }
   8436 
   8437 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
   8438 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
   8439 {
   8440     // manual saturation solution looks LESS optimal than 32 bits conversion one
   8441     __m128i a128, r128_1, r128_2;
   8442     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
   8443     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
   8444     //swap hi and low part of a128 to process the remaining data
   8445     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8446     a128 = _MM_CVTEPI16_EPI32 (a128);
   8447     r128_2 = _mm_slli_epi32 (a128, b);
   8448     return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16
   8449 }
   8450 
   8451 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
   8452 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
   8453 {
   8454     //solution may be  not optimal compared with the serial one
   8455     __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
   8456     zero = _mm_setzero_si128();
   8457     maskA = _mm_cmpeq_epi32(a, a);
   8458     maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
   8459     //saturate negative numbers to zero
    maskGT0   = _mm_cmpgt_epi32 (a, zero); //0xffffffff for positive lanes, zero otherwise (zero or negative lanes)
   8461     a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
   8462     //saturate positive to 0xffffffff
   8463     a_masked = _mm_and_si128 (a0, maskA);
   8464     a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
   8465     a_shift = _mm_slli_epi32 (a0, b);
   8466     return _mm_or_si128 (a_shift, a_masked); //actual saturation
   8467 }
   8468 
   8469 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
   8470 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
   8471 {
   8472     // no effective SIMD solution here, serial execution looks faster
   8473     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   8474     _NEON2SSE_ALIGN_16 uint64_t res[2];
   8475     uint64_t limit;
   8476     int i;
   8477     _mm_store_si128((__m128i*)atmp, a);
    for (i = 0; i<2; i++) {
        if (atmp[i] <= 0) {
            res[i] = 0;
        } else if (b == 0) {
            res[i] = (uint64_t)atmp[i]; //no shift; also avoids the out-of-range 1 << 64 below
        } else {
            limit = (uint64_t)1 << (64 - b);
            res[i] = (((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
        }
    }
   8486     return _mm_load_si128((__m128i*)res);
   8487 }
   8488 
   8489 //************** Vector narrowing  shift right by constant **************
   8490 //**********************************************************************
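//Note: these narrowing forms shift each wide lane right by b and then keep only the low half of every lane
//(plain truncation, no saturation). With a hypothetical 16-bit lane of 0x7F00 and b == 4, the shifted value
//0x07F0 is truncated to the byte 0xF0.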
   8491 int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   8492 _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
   8493 {
   8494     int8x8_t res64;
   8495     __m128i r16;
   8496     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   8497     r16  = vshrq_n_s16(a,b);
   8498     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8499     return64(r16);
   8500 }
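//The byte shuffle above gathers the low byte of every 16-bit lane (bytes 0,2,...,14) into the low 64 bits,
//which is exactly the truncating narrow; _mm_packs_epi16 cannot be used here because it would saturate.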
   8501 
   8502 int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   8503 _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
   8504 {
   8505     int16x4_t res64;
   8506     __m128i r32;
    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; //low 16 bits of each 32-bit lane
   8508     r32  = vshrq_n_s32(a,b);
   8509     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8510     return64(r32);
   8511 }
   8512 
   8513 int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   8514 _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
   8515 {
   8516     int32x2_t res64;
   8517     __m128i r64;
   8518     r64  = vshrq_n_s64(a,b);
   8519     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8520     return64(r64);
   8521 }
   8522 
   8523 uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   8524 _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
   8525 {
   8526     uint8x8_t res64;
   8527     __m128i mask, r16;
   8528     mask = _mm_set1_epi16(0xff);
   8529     r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8530     r16 = _mm_and_si128(r16, mask); //to avoid saturation
   8531     r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
   8532     return64(r16);
   8533 }
   8534 
   8535 uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   8536 _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
   8537 {
   8538     uint16x4_t res64;
   8539     __m128i mask, r32;
   8540     mask = _mm_set1_epi32(0xffff);
   8541     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8542     r32 = _mm_and_si128(r32, mask); //to avoid saturation
   8543     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8544     return64(r32);
   8545 }
   8546 
   8547 uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   8548 _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
   8549 {
   8550     uint32x2_t res64;
   8551     __m128i r64;
   8552     r64  = vshrq_n_u64(a,b);
   8553     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8554     return64(r64);
   8555 }
   8556 
   8557 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
   8558 //*********************************************************************************************
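//Note: VQSHRUN narrows a signed source to an unsigned result: negative lanes become 0 and values above the
//unsigned maximum of the narrow type saturate to that maximum. With a hypothetical 16-bit lane of 0x1FF0 and
//b == 4, the shifted value 0x1FF saturates to 0xFF.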
   8559 uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
   8560 _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
   8561 {
   8562     uint8x8_t res64;
   8563     __m128i r16;
   8564     r16  = vshrq_n_s16(a,b);
   8565     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
   8566     return64(r16);
   8567 }
   8568 
   8569 uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
   8570 _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
   8571 {
   8572     uint16x4_t res64;
   8573     __m128i r32;
   8574     r32  = vshrq_n_s32(a,b);
   8575     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
   8576     return64(r32);
   8577 }
   8578 
   8579 uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
   8580 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
   8581 {
   8582     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   8583     uint32x2_t res;
   8584     int64_t res64;
   8585     _mm_store_si128((__m128i*)atmp, a);
   8586     if (atmp[0] < 0) {
   8587         res.m64_u32[0] = 0;
   8588     } else {
   8589         res64 = (atmp[0] >> b);
   8590         res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
   8591     }
   8592     if (atmp[1] < 0) {
   8593         res.m64_u32[1] = 0;
   8594     } else {
   8595         res64 = (atmp[1] >> b);
   8596         res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
   8597     }
   8598     return res;
   8599 }
   8600 
   8601 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
   8602 uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
   8603 _NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
   8604 {
   8605     //solution may be not optimal compared with the serial one
   8606     __m128i r16;
   8607     uint8x8_t res64;
   8608     r16 = vrshrq_n_s16(a,b);
   8609     r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
   8610     return64(r16);
   8611 }
   8612 
   8613 uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
   8614 _NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
   8615 {
   8616     //solution may be not optimal compared with the serial one
   8617     __m128i r32;
   8618     uint16x4_t res64;
   8619     r32 = vrshrq_n_s32(a,b);
   8620     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
   8621     return64(r32);
   8622 }
   8623 
   8624 uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
   8625 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
   8626 {
   8627     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   8628     uint32x2_t res;
   8629     int64_t res64;
   8630     _mm_store_si128((__m128i*)atmp, a);
   8631     if (atmp[0] < 0) {
   8632         res.m64_u32[0] = 0;
   8633     } else {
   8634         res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
   8635         res.m64_u32[0] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
   8636     }
   8637     if (atmp[1] < 0) {
   8638         res.m64_u32[1] = 0;
   8639     } else {
        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
   8641         res.m64_u32[1] = (res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64;
   8642     }
   8643     return res;
   8644 }
   8645 
   8646 //***** Vector narrowing saturating shift right by constant ******
   8647 //*****************************************************************
   8648 int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
   8649 _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
   8650 {
   8651     int8x8_t res64;
   8652     __m128i r16;
   8653     r16  = vshrq_n_s16(a,b);
   8654     r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8655     return64(r16);
   8656 }
   8657 
   8658 int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
   8659 _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
   8660 {
   8661     int16x4_t res64;
   8662     __m128i r32;
   8663     r32  = vshrq_n_s32(a,b);
   8664     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
   8665     return64(r32);
   8666 }
   8667 
   8668 int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
   8669 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   8670 {
   8671     //no optimal SIMD solution found
   8672     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
   8673     int32x2_t res;
   8674     _mm_store_si128((__m128i*)atmp, a);
   8675     res64[0] = (atmp[0] >> b);
   8676     res64[1] = (atmp[1] >> b);
   8677     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
   8678     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
   8679     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
   8680     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
   8681     res.m64_i32[0] = (int32_t)res64[0];
   8682     res.m64_i32[1] = (int32_t)res64[1];
   8683     return res;
   8684 }
   8685 
uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.U16 d0,q0,#8
   8688 {
   8689     uint8x8_t res64;
   8690     __m128i r16;
   8691     r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8692     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8693     return64(r16);
   8694 }
   8695 
   8696 uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
   8697 _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
   8698 {
   8699     uint16x4_t res64;
   8700     __m128i r32;
    r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8702     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8703     return64(r32);
   8704 }
   8705 
   8706 uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
   8707 _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
   8708 {
   8709     //serial solution may be faster
   8710     uint32x2_t res64;
   8711     __m128i r64, res_hi, zero;
   8712     zero = _mm_setzero_si128();
   8713     r64  = vshrq_n_u64(a,b);
   8714     res_hi = _mm_srli_epi64(r64,  32);
   8715     res_hi = _mm_cmpgt_epi32(res_hi, zero);
   8716     r64 = _mm_or_si128(r64, res_hi);
   8717     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8718     return64(r64);
   8719 }
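//In the u64->u32 case above the high 32 bits of each shifted lane are compared with zero: any nonzero high part
//turns the low half of that lane into all ones via the OR (the saturation to 0xffffffff), and the final shuffle
//then packs the two low halves into the low 64 bits of the result.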
   8720 
   8721 
   8722 //********* Vector rounding narrowing shift right by constant *************************
   8723 //****************************************************************************************
   8724 int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   8725 _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
   8726 {
   8727     int8x8_t res64;
   8728     __m128i r16;
   8729     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   8730     r16  = vrshrq_n_s16(a,b);
   8731     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8732     return64(r16);
   8733 }
   8734 
   8735 int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   8736 _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
   8737 {
   8738     int16x4_t res64;
   8739     __m128i r32;
    _NEON2SSE_ALIGN_16 int8_t mask16_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }; //low 16 bits of each 32-bit lane
   8741     r32  = vrshrq_n_s32(a,b);
   8742     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask16_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8743     return64(r32);
   8744 }
   8745 
   8746 int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   8747 _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
   8748 {
   8749     int32x2_t res64;
   8750     __m128i r64;
   8751     r64  = vrshrq_n_s64(a,b);
   8752     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8753     return64(r64);
   8754 }
   8755 
   8756 uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   8757 _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
   8758 {
   8759     uint8x8_t res64;
   8760     __m128i mask, r16;
   8761     mask = _mm_set1_epi16(0xff);
   8762     r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8763     r16 = _mm_and_si128(r16, mask); //to avoid saturation
   8764     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8765     return64(r16);
   8766 }
   8767 
   8768 uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   8769 _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
   8770 {
   8771     uint16x4_t res64;
   8772     __m128i mask, r32;
   8773     mask = _mm_set1_epi32(0xffff);
    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8775     r32 = _mm_and_si128(r32, mask); //to avoid saturation
   8776     r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8777     return64(r32);
   8778 }
   8779 
   8780 uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   8781 _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
   8782 {
   8783     uint32x2_t res64;
   8784     __m128i r64;
   8785     r64  = vrshrq_n_u64(a,b);
   8786     r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8787     return64(r64);
   8788 }
   8789 
   8790 //************* Vector rounding narrowing saturating shift right by constant ************
   8791 //****************************************************************************************
   8792 int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
   8793 _NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
   8794 {
   8795     int8x8_t res64;
   8796     __m128i r16;
   8797     r16  = vrshrq_n_s16(a,b);
   8798     r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8799     return64(r16);
   8800 }
   8801 
   8802 int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
   8803 _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
   8804 {
   8805     int16x4_t res64;
   8806     __m128i r32;
   8807     r32  = vrshrq_n_s32(a,b);
   8808     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
   8809     return64(r32);
   8810 }
   8811 
   8812 int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
   8813 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   8814 {
   8815     //no optimal SIMD solution found
   8816     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
   8817     int32x2_t res;
   8818     _mm_store_si128((__m128i*)atmp, a);
   8819     maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
   8820     res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
   8821     maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
   8822     res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
   8823     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
   8824     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
   8825     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
   8826     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
   8827     res.m64_i32[0] = (int32_t)res64[0];
   8828     res.m64_i32[1] = (int32_t)res64[1];
   8829     return res;
   8830 }
   8831 
uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.U16 d0,q0,#8
   8834 {
   8835     uint8x8_t res64;
   8836     __m128i r16;
   8837     r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8838     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8839     return64(r16);
   8840 }
   8841 
   8842 uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
   8843 _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
   8844 {
   8845     uint16x4_t res64;
    r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8847     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8)
   8848     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8849     return64(r32);
   8850 }
   8851 
   8852 uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
   8853 _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
   8854 {
   8855     //serial solution may be faster
   8856     uint32x2_t res64;
   8857     __m128i r64, res_hi, zero;
   8858     zero = _mm_setzero_si128();
   8859     r64  = vrshrq_n_u64(a,b);
   8860     res_hi = _mm_srli_epi64(r64,  32);
   8861     res_hi = _mm_cmpgt_epi32(res_hi, zero);
   8862     r64 = _mm_or_si128(r64, res_hi);
   8863     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8864     return64(r64);
   8865 }
   8866 
   8867 //************** Vector widening shift left by constant ****************
   8868 //************************************************************************
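//Note: the widening forms first extend each lane to twice its width and then shift, so no bits are lost:
//with a hypothetical u8 lane of 200 and b == 3, vshll_n_u8 produces the 16-bit value 1600.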
   8869 int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
   8870 _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
   8871 {
   8872     __m128i r;
   8873     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   8874     return _mm_slli_epi16 (r, b);
   8875 }
   8876 
   8877 int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
   8878 _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
   8879 {
   8880     __m128i r;
   8881     r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
   8882     return _mm_slli_epi32 (r, b);
   8883 }
   8884 
   8885 int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
   8886 _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
   8887 {
   8888     __m128i r;
   8889     r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
   8890     return _mm_slli_epi64 (r, b);
   8891 }
   8892 
   8893 uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
   8894 _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
   8895 {
   8896     //no uint8 to uint16 conversion available, manual conversion used
   8897     __m128i zero,  r;
   8898     zero = _mm_setzero_si128 ();
   8899     r = _mm_unpacklo_epi8(_pM128i(a), zero);
   8900     return _mm_slli_epi16 (r, b);
   8901 }
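//Interleaving with a zero vector (_mm_unpacklo_epi8 above) zero-extends the low eight u8 lanes to u16 with
//plain SSE2, whereas the signed widening variants above go through the _MM_CVTEPI8_EPI16 conversion instead.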
   8902 
uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
_NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.U16 q0,d0,#0
   8905 {
   8906     //no uint16 to uint32 conversion available, manual conversion used
   8907     __m128i zero,  r;
   8908     zero = _mm_setzero_si128 ();
   8909     r = _mm_unpacklo_epi16(_pM128i(a), zero);
   8910     return _mm_slli_epi32 (r, b);
   8911 }
   8912 
   8913 uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
   8914 _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
   8915 {
   8916     //no uint32 to uint64 conversion available, manual conversion used
   8917     __m128i zero,  r;
   8918     zero = _mm_setzero_si128 ();
   8919     r = _mm_unpacklo_epi32(_pM128i(a), zero);
   8920     return _mm_slli_epi64 (r, b);
   8921 }
   8922 
   8923 //************************************************************************************
   8924 //**************************** Shifts with insert ************************************
   8925 //************************************************************************************
//Each element in a vector is shifted by an immediate value,
//and the results are inserted into the destination vector. Bits shifted out of each element are lost.
   8928 
   8929 //**************** Vector shift right and insert ************************************
//Only the leftmost "c" bits of "a" survive the operation; all remaining bits are taken from "b" shifted right by "c".
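//A worked example with hypothetical byte values: vsri_n_u8 with a = 0xAB, b = 0xCD and c = 4 keeps the high
//four bits of a (0xA0) and inserts b >> 4 (0x0C), giving 0xAC in every lane.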
   8932 int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   8933 _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
   8934 {
   8935     int8x8_t res64;
   8936     return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
   8937 }
   8938 
   8939 
   8940 int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   8941 _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
   8942 {
   8943     int16x4_t res64;
   8944     return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
   8945 }
   8946 
   8947 
   8948 int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   8949 _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
   8950 {
   8951     int32x2_t res64;
   8952     return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
   8953 }
   8954 
   8955 
   8956 int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   8957 _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
   8958 {
   8959     int64x1_t res;
   8960     if (c ==64)
   8961         res = a;
   8962     else{
   8963         res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros
   8964     }
   8965     return res;
   8966 }
   8967 
   8968 uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   8969 #define vsri_n_u8 vsri_n_s8
   8970 
   8971 uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   8972 #define vsri_n_u16 vsri_n_s16
   8973 
   8974 uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   8975 #define vsri_n_u32 vsri_n_s32
   8976 
   8977 
   8978 uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   8979 #define vsri_n_u64 vsri_n_s64
   8980 
   8981 poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   8982 #define vsri_n_p8 vsri_n_u8
   8983 
   8984 poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   8985 #define vsri_n_p16 vsri_n_u16
   8986 
   8987 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   8988 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
   8989 {
   8990     __m128i maskA, a_masked;
   8991     uint8x16_t b_shift;
   8992     _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
   8993     maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
   8994     a_masked = _mm_and_si128 (a, maskA);
   8995     b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
   8996     return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
   8997 }
   8998 
   8999 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   9000 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
   9001 {
    //to keep only the leftmost "c" bits of a we shift right and then back left, zeroing its (16-c) rightmost bits
   9003     uint16x8_t b_shift;
   9004     uint16x8_t a_c;
   9005     b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
   9006     a_c = vshrq_n_u16( a, (16 - c));
    a_c  = _mm_slli_epi16(a_c, (16 - c)); //the shift pair zeroes the (16-c) rightmost bits of a
   9008     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
   9009 }
   9010 
   9011 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   9012 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
   9013 {
    //to keep only the leftmost "c" bits of a we shift right and then back left, zeroing its (32-c) rightmost bits
   9015     uint32x4_t b_shift;
   9016     uint32x4_t a_c;
   9017     b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
   9018     a_c = vshrq_n_u32( a, (32 - c));
    a_c  = _mm_slli_epi32(a_c, (32 - c)); //the shift pair zeroes the (32-c) rightmost bits of a
   9020     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
   9021 }
   9022 
   9023 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   9024 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
   9025 {
   9026     //serial solution may be faster
   9027     uint64x2_t b_shift;
   9028     uint64x2_t a_c;
   9029     b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
   9030     a_c = _mm_srli_epi64(a, (64 - c));
    a_c  = _mm_slli_epi64(a_c, (64 - c)); //the shift pair zeroes the (64-c) rightmost bits of a
   9032     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
   9033 }
   9034 
   9035 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   9036 #define vsriq_n_u8 vsriq_n_s8
   9037 
   9038 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   9039 #define vsriq_n_u16 vsriq_n_s16
   9040 
   9041 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   9042 #define vsriq_n_u32 vsriq_n_s32
   9043 
   9044 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   9045 #define vsriq_n_u64 vsriq_n_s64
   9046 
   9047 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   9048 #define vsriq_n_p8 vsriq_n_u8
   9049 
   9050 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   9051 #define vsriq_n_p16 vsriq_n_u16
   9052 
   9053 //***** Vector shift left and insert *********************************************
   9054 //*********************************************************************************
//Only the rightmost "c" bits of "a" survive the operation; all remaining bits are taken from "b" shifted left by "c".
//The left shift fills the low end of "b" with zeros, so we only need to combine the masked "a" with the shifted "b".
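//A worked example with hypothetical byte values: vsli_n_u8 with a = 0xAB, b = 0xCD and c = 4 keeps the low
//four bits of a (0x0B) and inserts b << 4 (0xD0), giving 0xDB in every lane.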
   9057 int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   9058 _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
   9059 {
   9060     int8x8_t res64;
   9061     return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
   9062 }
   9063 
   9064 
   9065 int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   9066 _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
   9067 {
   9068     int16x4_t res64;
   9069     return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
   9070 }
   9071 
   9072 
   9073 int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   9074 _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
   9075 {
   9076     int32x2_t res64;
   9077     return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
   9078 }
   9079 
   9080 int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   9081 _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
   9082 {
   9083     int64x1_t res;
    if (c == 0)
        res.m64_i64[0] = b.m64_i64[0]; //all bits come from b; also avoids the out-of-range shift by 64 below
    else
        res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
   9085     return res;
   9086 }
   9087 
   9088 
   9089 uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   9090 #define vsli_n_u8 vsli_n_s8
   9091 
   9092 uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   9093 #define vsli_n_u16 vsli_n_s16
   9094 
   9095 uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   9096 #define vsli_n_u32 vsli_n_s32
   9097 
   9098 uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   9099 #define vsli_n_u64 vsli_n_s64
   9100 
   9101 poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   9102 #define vsli_n_p8 vsli_n_u8
   9103 
   9104 poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   9105 #define vsli_n_p16 vsli_n_u16
   9106 
   9107 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   9108 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
   9109 {
   9110     __m128i maskA, a_masked;
   9111     int8x16_t b_shift;
   9112     _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
   9113     maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
   9114     b_shift = vshlq_n_s8( b, c);
   9115     a_masked = _mm_and_si128 (a, maskA);
   9116     return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
   9117 }
   9118 
   9119 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   9120 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
   9121 {
   9122     //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
   9123     int16x8_t b_shift;
   9124     int16x8_t a_c;
   9125     b_shift = vshlq_n_s16( b, c);
   9126     a_c = vshlq_n_s16( a, (16 - c));
   9127     a_c  = _mm_srli_epi16(a_c, (16 - c));
   9128     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
   9129 }
   9130 
   9131 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   9132 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
   9133 {
   9134     //solution may be  not optimal compared with the serial one
   9135     //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
   9136     int32x4_t b_shift;
   9137     int32x4_t a_c;
   9138     b_shift = vshlq_n_s32( b, c);
   9139     a_c = vshlq_n_s32( a, (32 - c));
   9140     a_c  = _mm_srli_epi32(a_c, (32 - c));
   9141     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
   9142 }
   9143 
   9144 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   9145 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
   9146 {
   9147     //solution may be  not optimal compared with the serial one
   9148     //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
   9149     int64x2_t b_shift;
   9150     int64x2_t a_c;
   9151     b_shift = vshlq_n_s64( b, c);
   9152     a_c = vshlq_n_s64( a, (64 - c));
   9153     a_c  = _mm_srli_epi64(a_c, (64 - c));
   9154     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
   9155 }
   9156 
   9157 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   9158 #define vsliq_n_u8 vsliq_n_s8
   9159 
   9160 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   9161 #define vsliq_n_u16 vsliq_n_s16
   9162 
   9163 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   9164 #define vsliq_n_u32 vsliq_n_s32
   9165 
   9166 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   9167 #define vsliq_n_u64 vsliq_n_s64
   9168 
   9169 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   9170 #define vsliq_n_p8 vsliq_n_u8
   9171 
   9172 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   9173 #define vsliq_n_p16 vsliq_n_u16
   9174 
   9175 // ***********************************************************************************************
   9176 // ****************** Loads and stores of a single vector ***************************************
   9177 // ***********************************************************************************************
   9178 //Performs loads and stores of a single vector of some type.
   9179 //*******************************  Loads ********************************************************
   9180 // ***********************************************************************************************
    9181 //We assume ptr is NOT aligned in the general case and therefore use __m128i _mm_loadu_si128 ((__m128i*) ptr).
    9182 //On SSE3-capable systems the unaligned load __m128i _mm_lddqu_si128 (__m128i const* p) may be advantageous:
    9183 //it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned address.
    9184 //If ptr is known to be 16-byte aligned then __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead (see the usage sketch after the macro below).
   9185 #define LOAD_SI128(ptr) \
   9186         ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
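//A minimal usage sketch: the run-time alignment check above picks the aligned load when possible
//and falls back to the unaligned form otherwise; copy16, dst and src are hypothetical names.
/*
static void copy16(uint8_t * dst, uint8_t const * src)
{
    __m128i v = LOAD_SI128(src);         //_mm_load_si128 if src is 16-byte aligned, _mm_loadu_si128 otherwise
    _mm_storeu_si128((__m128i*)dst, v);  //store with no alignment assumption
}
*/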
   9187 
   9188 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   9189 #define vld1q_u8 LOAD_SI128
   9190 
   9191 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   9192 #define vld1q_u16 LOAD_SI128
   9193 
   9194 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   9195 #define vld1q_u32 LOAD_SI128
   9196 
   9197 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9198 #define vld1q_u64 LOAD_SI128
   9199 
   9200 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   9201 #define vld1q_s8 LOAD_SI128
   9202 
   9203 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   9204 #define vld1q_s16 LOAD_SI128
   9205 
   9206 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   9207 #define vld1q_s32 LOAD_SI128
   9208 
   9209 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9210 #define vld1q_s64 LOAD_SI128
   9211 
   9212 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
   9213 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
   9214 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
   9215 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
   9216 __m128 f2;
   9217 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
   9218 }*/
   9219 
   9220 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   9221 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
   9222 {
    9223     if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16-byte aligned
   9224         return _mm_load_ps(ptr);
   9225     else
   9226         return _mm_loadu_ps(ptr);
   9227 }
   9228 
   9229 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   9230 #define vld1q_p8  LOAD_SI128
   9231 
   9232 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   9233 #define vld1q_p16 LOAD_SI128
   9234 
   9235 uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
   9236 #define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
   9237 
   9238 uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
   9239 #define vld1_u16 vld1_u8
   9240 
   9241 uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
   9242 #define vld1_u32 vld1_u8
   9243 
   9244 
   9245 uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   9246 #define vld1_u64 vld1_u8
   9247 
   9248 int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
   9249 #define vld1_s8 vld1_u8
   9250 
   9251 int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
   9252 #define vld1_s16 vld1_u16
   9253 
   9254 int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
   9255 #define vld1_s32 vld1_u32
   9256 
   9257 int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   9258 #define vld1_s64 vld1_u64
   9259 
   9260 float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
   9261 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
   9262 
   9263 float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
   9264 _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
   9265 {
   9266     float32x2_t res;
   9267     res.m64_f32[0] = *(ptr);
   9268     res.m64_f32[1] = *(ptr + 1);
   9269     return res;
   9270 }
   9271 
   9272 poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
   9273 #define vld1_p8 vld1_u8
   9274 
   9275 poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
   9276 #define vld1_p16 vld1_u16
   9277 
   9278 //***********************************************************************************************************
   9279 //******* Lane load functions - insert the data at  vector's given position (lane) *************************
   9280 //***********************************************************************************************************
   9281 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
   9282 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   9283 
   9284 uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9285 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   9286 
   9287 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   9288 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
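//A minimal usage sketch: replacing a single lane of a vector with a value loaded from memory,
//leaving the other lanes intact; refresh_lane0, pix and acc are hypothetical names.
/*
static uint32x4_t refresh_lane0(uint32_t const * pix, uint32x4_t acc)
{
    return vld1q_lane_u32(pix, acc, 0); //acc with lane 0 replaced by *pix
}
*/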
   9289 
   9290 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
    9291 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
   9292 
   9293 
   9294 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
   9295 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   9296 
   9297 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9298 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   9299 
   9300 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   9301 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
   9302 
   9303 float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9304 //current IA SIMD doesn't support float16
   9305 
   9306 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   9307 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
   9308 {
    9309     //ptr may be not aligned, so load the scalar separately and insert it into the given lane
   9310     __m128 p;
   9311     p = _mm_set1_ps(*(ptr));
   9312     return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
   9313 }
   9314 
   9315 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
   9316 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
   9317 
   9318 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
   9319 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   9320 
   9321 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9322 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   9323 
   9324 uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   9325 _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
   9326 {
   9327     uint8x8_t res;
   9328     res = vec;
   9329     res.m64_u8[lane] = *(ptr);
   9330     return res;
   9331 }
   9332 
   9333 uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9334 _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
   9335 {
   9336     uint16x4_t res;
   9337     res = vec;
   9338     res.m64_u16[lane] = *(ptr);
   9339     return res;
   9340 }
   9341 
   9342 uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   9343 _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
   9344 {
   9345     uint32x2_t res;
   9346     res = vec;
   9347     res.m64_u32[lane] = *(ptr);
   9348     return res;
   9349 }
   9350 
   9351 uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
   9352 _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
   9353 {
   9354     uint64x1_t res;
   9355     res.m64_u64[0] = *(ptr);
   9356     return res;
   9357 }
   9358 
   9359 
   9360 int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   9361 #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
   9362 
   9363 int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9364 #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
   9365 
   9366 int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   9367 #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
   9368 
   9369 float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9370 //current IA SIMD doesn't support float16
   9371 
   9372 float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   9373 _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
   9374 {
   9375     float32x2_t res;
   9376     res = vec;
   9377     res.m64_f32[lane] = *(ptr);
   9378     return res;
   9379 }
   9380 
   9381 int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
   9382 #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
   9383 
   9384 poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   9385 #define vld1_lane_p8 vld1_lane_u8
   9386 
   9387 poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9388 #define vld1_lane_p16 vld1_lane_s16
   9389 
   9390 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
   9391 // ******************************************************************************************************************
   9392 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9393 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
   9394 
   9395 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9396 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
   9397 
   9398 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9399 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
   9400 
   9401 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   9402 _NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
   9403 {
   9404     _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
   9405     return LOAD_SI128(val);
   9406 }
   9407 
   9408 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9409 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
   9410 
   9411 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9412 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
   9413 
   9414 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9415 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
   9416 
   9417 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   9418 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
   9419 
   9420 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   9421 //current IA SIMD doesn't support float16, need to go to 32 bits
   9422 
   9423 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9424 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
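//A minimal usage sketch: broadcasting a scalar coefficient from memory to all lanes and applying it;
//scale4 and k are hypothetical names. float32x4_t maps to __m128 here, so plain SSE arithmetic applies.
/*
static float32x4_t scale4(float32x4_t v, float32_t const * k)
{
    float32x4_t kk = vld1q_dup_f32(k); //all four lanes set to *k
    return _mm_mul_ps(v, kk);
}
*/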
   9425 
   9426 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9427 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
   9428 
   9429 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9430 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
   9431 
   9432 uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9433 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
   9434 {
   9435     uint8x8_t res;
   9436     int i;
   9437     for(i = 0; i<8; i++) {
   9438         res.m64_u8[i] =  *(ptr);
   9439     }
   9440     return res;
   9441 }
   9442 
   9443 uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9444 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
   9445 {
   9446     uint16x4_t res;
   9447     int i;
   9448     for(i = 0; i<4; i++) {
   9449         res.m64_u16[i] =  *(ptr);
   9450     }
   9451     return res;
   9452 }
   9453 
   9454 uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9455 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
   9456 {
   9457     uint32x2_t res;
   9458     res.m64_u32[0] = *(ptr);
   9459     res.m64_u32[1] = *(ptr);
   9460     return res;
   9461 }
   9462 
   9463 uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   9464 _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
   9465 {
   9466     uint64x1_t res;
   9467     res.m64_u64[0] = *(ptr);
   9468     return res;
   9469 }
   9470 
   9471 int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9472 #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
   9473 
   9474 
   9475 int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9476 #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
   9477 
   9478 
   9479 int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9480 #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
   9481 
   9482 
   9483 int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   9484 #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
   9485 
   9486 float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   9487 //current IA SIMD doesn't support float16
   9488 
   9489 float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9490 _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
   9491 {
   9492     float32x2_t res;
   9493     res.m64_f32[0] = *(ptr);
   9494     res.m64_f32[1] = res.m64_f32[0];
    9495     return res; //both lanes contain the loaded value
   9496 }
   9497 
   9498 poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9499 #define vld1_dup_p8 vld1_dup_u8
   9500 
   9501 
   9502 poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9503 #define vld1_dup_p16 vld1_dup_u16
   9504 
   9505 
   9506 //*************************************************************************************
   9507 //********************************* Store **********************************************
   9508 //*************************************************************************************
    9509 //If ptr is 16-byte aligned and you need to store data without polluting the cache then use void _mm_stream_si128 ((__m128i*)ptr, val) instead (see the sketch after the macro below).
    9510 //Here we assume that a NOT 16-byte aligned ptr is possible; if it is aligned, _mm_store_si128 is used, as shown in the following macro.
   9511 #define STORE_SI128(ptr, val) \
   9512         (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
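//A minimal sketch of the non-temporal store mentioned above, assuming a 16-byte aligned destination;
//fill_aligned, dst16 and n16 are hypothetical names.
/*
static void fill_aligned(uint8_t * dst16, uint8x16_t val, int n16)
{
    int i;
    for (i = 0; i < n16; i++)
        _mm_stream_si128((__m128i*)(dst16 + 16 * i), val); //bypasses the cache
    _mm_sfence(); //make the streaming stores visible before the data is reused
}
*/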
   9513 
   9514 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
   9515 #define vst1q_u8 STORE_SI128
   9516 
   9517 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
   9518 #define vst1q_u16 STORE_SI128
   9519 
   9520 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
   9521 #define vst1q_u32 STORE_SI128
   9522 
   9523 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
   9524 #define vst1q_u64 STORE_SI128
   9525 
   9526 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
   9527 #define vst1q_s8 STORE_SI128
   9528 
   9529 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
   9530 #define vst1q_s16 STORE_SI128
   9531 
   9532 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
   9533 #define vst1q_s32 STORE_SI128
   9534 
   9535 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
   9536 #define vst1q_s64 STORE_SI128
   9537 
   9538 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
   9539 // IA32 SIMD doesn't work with 16bit floats currently
   9540 
   9541 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
   9542 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
   9543 {
    9544     if( ((unsigned long)(ptr) & 15)  == 0 ) //16-byte aligned
   9545         _mm_store_ps (ptr, val);
   9546     else
   9547         _mm_storeu_ps (ptr, val);
   9548 }
   9549 
   9550 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
   9551 #define vst1q_p8  vst1q_u8
   9552 
   9553 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
   9554 #define vst1q_p16 vst1q_u16
   9555 
   9556 void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
   9557 _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
   9558 {
   9559     int i;
   9560     for (i = 0; i<8; i++) {
   9561         *(ptr + i) = ((uint8_t*)&val)[i];
   9562     }
   9563     //_mm_storel_epi64((__m128i*)ptr, val);
   9564     return;
   9565 }
   9566 
   9567 void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
   9568 _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
   9569 {
   9570     int i;
   9571     for (i = 0; i<4; i++) {
   9572         *(ptr + i) = ((uint16_t*)&val)[i];
   9573     }
   9574     //_mm_storel_epi64((__m128i*)ptr, val);
   9575     return;
   9576 }
   9577 
   9578 void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
   9579 _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
   9580 {
   9581     int i;
   9582     for (i = 0; i<2; i++) {
   9583         *(ptr + i) = ((uint32_t*)&val)[i];
   9584     }
   9585     //_mm_storel_epi64((__m128i*)ptr, val);
   9586     return;
   9587 }
   9588 
   9589 void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
   9590 _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
   9591 {
   9592     *(ptr) = *((uint64_t*)&val);
   9593     //_mm_storel_epi64((__m128i*)ptr, val);
   9594     return;
   9595 }
   9596 
   9597 void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
   9598 #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
   9599 
   9600 void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
   9601 #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
   9602 
   9603 void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
   9604 #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
   9605 
   9606 void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
   9607 #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
   9608 
   9609 void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
   9610 //current IA SIMD doesn't support float16
   9611 
   9612 void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
   9613 _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
   9614 {
   9615     *(ptr) =   val.m64_f32[0];
   9616     *(ptr + 1) = val.m64_f32[1];
   9617     return;
   9618 }
   9619 
   9620 void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
   9621 #define vst1_p8 vst1_u8
   9622 
   9623 void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
   9624 #define vst1_p16 vst1_u16
   9625 
   9626 //***********Store a lane of a vector into memory (extract given lane) *********************
   9627 //******************************************************************************************
   9628 void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
   9629 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
   9630 
   9631 void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9632 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
   9633 
   9634 void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
   9635 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
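//A minimal usage sketch: writing a single lane back to memory; store_lane2 and dst are hypothetical names.
/*
static void store_lane2(uint32_t * dst, uint32x4_t v)
{
    vst1q_lane_u32(dst, v, 2); //stores only lane 2 (one 32-bit value) to *dst
}
*/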
   9636 
   9637 void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
   9638 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
   9639 
   9640 void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
   9641 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
   9642 
   9643 void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9644 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
   9645 
   9646 void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
   9647 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
   9648 
   9649 void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
   9650 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
   9651 
   9652 void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9653 //current IA SIMD doesn't support float16
   9654 
   9655 void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
   9656 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
   9657 {
   9658     int32_t ilane;
   9659     ilane = _MM_EXTRACT_PS(val,lane);
   9660     *(ptr) =  *((float*)&ilane);
   9661 }
   9662 
   9663 void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
   9664 #define vst1q_lane_p8   vst1q_lane_u8
   9665 
   9666 void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9667 #define vst1q_lane_p16   vst1q_lane_s16
   9668 
   9669 void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
   9670 _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
   9671 {
   9672     *(ptr) = val.m64_u8[lane];
   9673 }
   9674 
   9675 void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9676 _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
   9677 {
   9678     *(ptr) = val.m64_u16[lane];
   9679 }
   9680 
   9681 void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
   9682 _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
   9683 {
   9684     *(ptr) = val.m64_u32[lane];
   9685 }
   9686 
   9687 void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
   9688 _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
   9689 {
   9690     *(ptr) = val.m64_u64[0];
   9691 }
   9692 
   9693 void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
   9694 #define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
   9695 
   9696 void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9697 #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
   9698 
   9699 void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
   9700 #define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
   9701 
   9702 
   9703 void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
   9704 #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
   9705 
   9706 
   9707 void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9708 //current IA SIMD doesn't support float16
   9709 
   9710 void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
   9711 _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
   9712 {
   9713     *(ptr) = val.m64_f32[lane];
   9714 }
   9715 
   9716 void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
   9717 #define vst1_lane_p8 vst1_lane_u8
   9718 
   9719 void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9720 #define vst1_lane_p16 vst1_lane_s16
   9721 
   9722 //***********************************************************************************************
   9723 //**************** Loads and stores of an N-element structure **********************************
   9724 //***********************************************************************************************
    9725 //These intrinsics load or store an n-element structure. The corresponding array structures are defined at the beginning of this file.
    9726 //We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" section above.
   9727 //****************** 2 elements load  *********************************************
   9728 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   9729 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
   9730 {
   9731     uint8x16x2_t v;
   9732     v.val[0] = vld1q_u8(ptr);
   9733     v.val[1] = vld1q_u8((ptr + 16));
   9734     v = vuzpq_s8(v.val[0], v.val[1]);
   9735     return v;
   9736 }
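//A minimal usage sketch: de-interleaving 32 bytes of (x,y) pairs into separate x and y vectors;
//split_xy, xy, x and y are hypothetical names.
/*
static void split_xy(uint8_t const * xy, uint8x16_t * x, uint8x16_t * y)
{
    uint8x16x2_t planes = vld2q_u8(xy); //xy holds x0,y0,x1,y1,...,x15,y15
    *x = planes.val[0];                 //x0,x1,...,x15
    *y = planes.val[1];                 //y0,y1,...,y15
}
*/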
   9737 
   9738 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   9739 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
   9740 {
   9741     uint16x8x2_t v;
   9742     v.val[0] = vld1q_u16( ptr);
   9743     v.val[1] = vld1q_u16( (ptr + 8));
   9744     v = vuzpq_s16(v.val[0], v.val[1]);
   9745     return v;
   9746 }
   9747 
   9748 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   9749 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
   9750 {
   9751     uint32x4x2_t v;
   9752     v.val[0] = vld1q_u32 ( ptr);
   9753     v.val[1] = vld1q_u32 ( (ptr + 4));
   9754     v = vuzpq_s32(v.val[0], v.val[1]);
   9755     return v;
   9756 }
   9757 
   9758 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
   9759 #define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
   9760 
   9761 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   9762 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
   9763 
   9764 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   9765 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
   9766 
   9767 
   9768 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
   9769 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   9770 
   9771 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   9772 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
   9773 {
   9774     float32x4x2_t v;
   9775     v.val[0] =  vld1q_f32 (ptr);
   9776     v.val[1] =  vld1q_f32 ((ptr + 4));
   9777     v = vuzpq_f32(v.val[0], v.val[1]);
   9778     return v;
   9779 }
   9780 
   9781 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   9782 #define  vld2q_p8 vld2q_u8
   9783 
   9784 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   9785 #define vld2q_p16 vld2q_u16
   9786 
   9787 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   9788 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
   9789 {
   9790     uint8x8x2_t v;
   9791     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
   9792     __m128i ld128;
    9793     ld128 = vld1q_u8(ptr); //load both interleaved halves into one 128-bit register
   9794     ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
   9795     vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   9796     return v;
   9797 }
   9798 
   9799 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   9800 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
   9801 {
   9802     _NEON2SSE_ALIGN_16 uint16x4x2_t v;
   9803     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
   9804     __m128i ld128;
    9805     ld128 = vld1q_u16(ptr); //load both interleaved halves into one 128-bit register
   9806     ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
   9807     vst1q_u16((v.val), ld128);
   9808     return v;
   9809 }
   9810 
   9811 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   9812 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
   9813 {
   9814     _NEON2SSE_ALIGN_16 uint32x2x2_t v;
   9815     __m128i ld128;
    9816     ld128 = vld1q_u32(ptr); //load both interleaved halves into one 128-bit register
   9817     ld128 = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
   9818     vst1q_u32((v.val), ld128);
   9819     return v;
   9820 }
   9821 
   9822 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9823 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
   9824 {
   9825     uint64x1x2_t v;
   9826     v.val[0].m64_u64[0] = *(ptr);
   9827     v.val[1].m64_u64[0] = *(ptr + 1);
   9828     return v;
   9829 }
   9830 
   9831 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   9832 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
   9833 
   9834 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   9835 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
   9836 
   9837 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   9838 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
   9839 
   9840 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9841 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
   9842 
   9843 float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
   9844 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
   9845 
   9846 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   9847 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
   9848 {
   9849     float32x2x2_t v;
   9850     v.val[0].m64_f32[0] = *(ptr);
   9851     v.val[0].m64_f32[1] = *(ptr + 2);
   9852     v.val[1].m64_f32[0] = *(ptr + 1);
   9853     v.val[1].m64_f32[1] = *(ptr + 3);
   9854     return v;
   9855 }
   9856 
   9857 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   9858 #define vld2_p8 vld2_u8
   9859 
   9860 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   9861 #define vld2_p16 vld2_u16
   9862 
   9863 //******************** Triplets ***************************************
   9864 //*********************************************************************
   9865 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   9866 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
   9867 {
   9868     //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
   9869     //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
   9870     //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
   9871     //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
   9872     uint8x16x3_t v;
   9873     __m128i tmp0, tmp1,tmp2, tmp3;
   9874     _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
   9875     _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
   9876     _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
   9877 
   9878     v.val[0] =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
   9879     v.val[1] =  vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
   9880     v.val[2] =  vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
   9881 
    9882     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14
    9883     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
    9884     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13, 2,5,8,11,14, 0,3,6,9,12,15
   9885 
   9886     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
   9887     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
   9888     tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
   9889     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
   9890     v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
   9891     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
   9892 
   9893     tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
   9894     tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
    9895     v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15, b:1,4,7,10,13, 0,0,0,0,0
    9896     v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0, b:0,3,6,9,12,15, b:1,4,7,10,13
    9897     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13, b:0,3,6,9,12,15, b:1,4,7,10,13
   9898     v.val[1] =  _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
   9899     v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
   9900     tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
   9901     tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
   9902     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
   9903 
   9904     tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
   9905     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
   9906     v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
   9907     v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
   9908     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
   9909     tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
   9910     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
   9911     return v;
   9912 }
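//A minimal usage sketch: splitting 16 packed RGB pixels (48 bytes) into R, G and B planes;
//split_rgb, rgb, r, g and b are hypothetical names.
/*
static void split_rgb(uint8_t const * rgb, uint8x16_t * r, uint8x16_t * g, uint8x16_t * b)
{
    uint8x16x3_t px = vld3q_u8(rgb);
    *r = px.val[0]; //r0,r1,...,r15
    *g = px.val[1]; //g0,g1,...,g15
    *b = px.val[2]; //b0,b1,...,b15
}
*/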
   9913 
   9914 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   9915 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
   9916 {
   9917     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
   9918     uint16x8x3_t v;
   9919     __m128i tmp0, tmp1,tmp2, tmp3;
   9920     _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
   9921     _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
   9922     _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
   9923 
   9924     v.val[0] =  vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
   9925     v.val[1] =  vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
   9926     v.val[2] =  vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
   9927 
   9928     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
   9929     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
   9930     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
   9931 
   9932     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
   9933     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
   9934     tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
   9935     tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
   9936     v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
   9937     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
   9938 
   9939     tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
   9940     tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
   9941     v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
   9942     v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
   9943     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
   9944     v.val[1] =  _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
   9945     v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
   9946     tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
   9947     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
   9948     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
   9949 
   9950     tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
   9951     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
   9952     v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
   9953     v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
   9954     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
   9955     tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
   9956     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
   9957     return v;
   9958 }
   9959 
   9960 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   9961 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
   9962 {
   9963     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   9964     uint32x4x3_t v;
   9965     __m128i tmp0, tmp1,tmp2, tmp3;
   9966     v.val[0] =  vld1q_u32 (ptr); //a0,a1,a2,a3,
   9967     v.val[1] =  vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
   9968     v.val[2] =  vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
   9969 
   9970     tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
   9971     tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
   9972     tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
   9973 
   9974     tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
   9975     v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
   9976     tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
   9977     v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
   9978     v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
   9979     v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
   9980     return v;
   9981 }
   9982 
   9983 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   9984 #define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
   9985 
   9986 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   9987 #define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
   9988 
   9989 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   9990 #define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
   9991 
   9992 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   9993 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   9994 
   9995 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   9996 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
   9997 {
   9998     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   9999     float32x4x3_t v;
   10000     __m128 tmp0, tmp1,tmp2, tmp3;
   10001     v.val[0] =  vld1q_f32 (ptr); //a0,a1,a2,a3,
   10002     v.val[1] =  vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
   10003     v.val[2] =  vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
   10004 
   10005     tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
   10006     tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
   10007     tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
   10008     tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
   10009 
   10010     v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
   10011     tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
   10012     v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
   10013     v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
   10014     v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
   10015     return v;
   10016 }
   10017 
   10018 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   10019 #define vld3q_p8 vld3q_u8
   10020 
   10021 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   10022 #define vld3q_p16 vld3q_u16
   10023 
   10024 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   10025 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
   10026 {
   10027     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
   10028     uint8x8x3_t v;
   10029     __m128i val0, val1, val2, tmp0, tmp1;
   10030     _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
   10031     _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
   10032     val0 =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
   10033     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
   10034 
   10035     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
   10036     tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
   10037     val0 = _mm_slli_si128(tmp0,10);
   10038     val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
   10039     val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
   10040     val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
   10041     _M64(v.val[0], val0);
    10042     val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0, a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5
   10043     val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
   10044     val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
   10045     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
   10046     val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
   10047     _M64(v.val[1], val1);
   10048 
   10049     tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
   10050     val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
   10051     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
   10052     val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
   10053     _M64(v.val[2], val2);
   10054     return v;
   10055 }
   10056 
   10057 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10058 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
   10059 {
   10060     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   10061     uint16x4x3_t v;
   10062     __m128i val0, val1, val2, tmp0, tmp1;
   10063     _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
   10064     val0 =  vld1q_u16 (ptr); //a0,a1,a2,a3,  b0,b1,b2,b3
   10065     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
   10066 
   10067     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
   10068     tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01     : c1, c2, c0, c3,
   10069     val0 = _mm_slli_si128(tmp0,10);
   10070     val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
   10071     val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
   10072     val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
   10073     val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
   10074     _M64(v.val[0], val0);
   10075 
   10076     val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
   10077     val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
   10078     val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
   10079     val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
   10080     val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
   10081     _M64(v.val[1], val1);
   10082 
   10083     tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
   10084     tmp1 = _mm_srli_si128(tmp1,4);
   10085     tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
   10086     val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
   10087     _M64(v.val[2], val2);
   10088     return v;
   10089 }
   10090 
   10091 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   10092 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
   10093 {
   10094     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
   10095     uint32x2x3_t v;
   10096     __m128i val0, val1, val2;
   10097     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
   10098     val2 =   _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
   10099 
   10100     val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
   10101     _M64(v.val[0], val0);
   10102     val2 =  _mm_slli_si128(val2, 8); //x, x,c0,c1,
   10103     val1 =  _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
   10104     _M64(v.val[1], val1);
   10105     val2 =  _mm_srli_si128(val1, 8); //b0, c1, x, x,
   10106     _M64(v.val[2], val2);
   10107     return v;
   10108 }
   10109 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10110 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
   10111 {
   10112     uint64x1x3_t v;
   10113     v.val[0].m64_u64[0] = *(ptr);
   10114     v.val[1].m64_u64[0] = *(ptr + 1);
   10115     v.val[2].m64_u64[0] = *(ptr + 2);
   10116     return v;
   10117 }
   10118 
   10119 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   10120 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
   10121 
   10122 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10123 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
   10124 
   10125 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   10126 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
   10127 
   10128 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10129 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
   10130 
   10131 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10132 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10133 
   10134 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   10135 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
   10136 {
   10137     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
   10138     float32x2x3_t v;
   10139     v.val[0].m64_f32[0] = *(ptr);
   10140     v.val[0].m64_f32[1] = *(ptr + 3);
   10141 
   10142     v.val[1].m64_f32[0] = *(ptr + 1);
   10143     v.val[1].m64_f32[1] = *(ptr + 4);
   10144 
   10145     v.val[2].m64_f32[0] = *(ptr + 2);
   10146     v.val[2].m64_f32[1] = *(ptr + 5);
   10147     return v;
   10148 }
   10149 
   10150 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   10151 #define vld3_p8 vld3_u8
   10152 
   10153 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10154 #define vld3_p16 vld3_u16
   10155 
   10156 //***************  Quadruples load ********************************
   10157 //*****************************************************************
   10158 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   10159 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
   10160 {
   10161     uint8x16x4_t v;
   10162     __m128i tmp3, tmp2, tmp1, tmp0;
   10163 
   10164     v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
   10165     v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
   10166     v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
   10167     v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
   10168 
   10169     tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
   10170     tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
   10171     tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
   10172     tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
   10173 
   10174     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
   10175     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
   10176     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
   10177     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
   10178 
   10179     tmp0 =  _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
   10180     tmp1 =  _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
   10181     tmp2 =  _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
   10182     tmp3 =  _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
   10183 
   10184     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
   10185     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
   10186     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
   10187     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
   10188     return v;
   10189 }
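//Illustrative sketch (added comment, compiled out): the three unpack stages above together perform a
//byte-wise transpose, so vld4q_u8 de-interleaves 64 packed bytes into four 16-byte streams.
//The buffer below is an assumption made up for this example only.
#if 0
static void example_vld4q_u8(void)
{
    _NEON2SSE_ALIGN_16 uint8_t buf[64];
    int i;
    for (i = 0; i < 64; i++) buf[i] = (uint8_t)i; //0,1,2,3, 4,5,6,7, ... 60,61,62,63
    {
        uint8x16x4_t v = vld4q_u8(buf);
        //v.val[0] = {0,4,...,60}, v.val[1] = {1,5,...,61}, v.val[2] = {2,6,...,62}, v.val[3] = {3,7,...,63}
    }
}
#endif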
   10190 
   10191 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10192 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
   10193 {
   10194     uint16x8x4_t v;
   10195     __m128i tmp3, tmp2, tmp1, tmp0;
   10196     tmp0  =  vld1q_u16 (ptr); //a0,a1,a2,...a7
   10197     tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
   10198     tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
   10199     tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
   10200     v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
   10201     v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
   10202     v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
   10203     v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
   10204     tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
   10205     tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
   10206     tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
   10207     tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
   10208     v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
   10209     v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
   10210     v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
   10211     v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
   10212     return v;
   10213 }
   10214 
   10215 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   10216 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
   10217 {
   10218     uint32x4x4_t v;
   10219     __m128i tmp3, tmp2, tmp1, tmp0;
   10220     v.val[0] =  vld1q_u32 (ptr);
   10221     v.val[1] =  vld1q_u32 ((ptr + 4));
   10222     v.val[2] =  vld1q_u32 ((ptr + 8));
   10223     v.val[3] =  vld1q_u32 ((ptr + 12));
   10224     tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
   10225     tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
   10226     tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
   10227     tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
   10228     v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
   10229     v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
   10230     v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
   10231     v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
   10232     return v;
   10233 }
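//Illustrative sketch (added comment, compiled out): for 32-bit elements the de-interleave above is a
//plain 4x4 transpose of the four consecutive 128-bit rows (unpack epi32, then unpack epi64).
//The buffer below is an assumption made up for this example only.
#if 0
static void example_vld4q_u32(void)
{
    _NEON2SSE_ALIGN_16 uint32_t buf[16] = {0,1,2,3,  4,5,6,7,  8,9,10,11,  12,13,14,15};
    uint32x4x4_t v = vld4q_u32(buf);
    //v.val[0] = {0,4,8,12}, v.val[1] = {1,5,9,13}, v.val[2] = {2,6,10,14}, v.val[3] = {3,7,11,15}
}
#endif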
   10234 
   10235 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   10236 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
   10237 
   10238 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10239 #define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
   10240 
   10241 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   10242 #define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
   10243 
   10244 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10245 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10246 
   10247 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   10248 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
   10249 {
   10250     float32x4x4_t v;
   10251     __m128 tmp3, tmp2, tmp1, tmp0;
   10252 
   10253     v.val[0] =  vld1q_f32 ((float*) ptr);
   10254     v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
   10255     v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
   10256     v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
   10257     tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
   10258     tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
   10259     tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
   10260     tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
   10261     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
   10262     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
   10263     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
   10264     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
   10265     return v;
   10266 }
   10267 
   10268 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   10269 #define vld4q_p8 vld4q_u8
   10270 
   10271 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10272 #define vld4q_p16 vld4q_s16
   10273 
   10274 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   10275 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
   10276 {
   10277     uint8x8x4_t v;
   10278     __m128i sh0, sh1;
   10279     __m128i val0,  val2;
   10280     _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
   10281 
    10282     val0 = vld1q_u8(( ptr)); //load the first and second 64 bits (future val[0], val[1])
    10283     val2 = vld1q_u8(( ptr + 16)); //load the third and fourth 64 bits (future val[2], val[3])
   10284 
   10285     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
   10286     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
   10287     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
   10288     vst1q_u8(&v.val[0], val0 );
   10289     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
   10290     vst1q_u8(&v.val[2], val2 );
   10291     return v;
   10292 }
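//Illustrative sketch (added comment, compiled out): mask4_8 above groups bytes by index modulo 4, so a
//single pshufb per 16-byte half followed by a dword unpack produces the four de-interleaved 8-byte results.
//The buffer below is an assumption made up for this example only.
#if 0
static void example_vld4_u8(void)
{
    _NEON2SSE_ALIGN_16 uint8_t buf[32];
    int i;
    for (i = 0; i < 32; i++) buf[i] = (uint8_t)i;
    {
        uint8x8x4_t v = vld4_u8(buf);
        //v.val[0] = {0,4,...,28}, v.val[1] = {1,5,...,29}, v.val[2] = {2,6,...,30}, v.val[3] = {3,7,...,31}
    }
}
#endif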
   10293 
   10294 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10295 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
   10296 {
   10297     uint16x4x4_t v;
   10298     __m128i sh0, sh1;
   10299     __m128i val0, val2;
   10300     _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
    10301     val0 = vld1q_u16 ( (ptr)); //load the first and second 64 bits (future val[0], val[1])
    10302     val2 = vld1q_u16 ( (ptr + 8)); //load the third and fourth 64 bits (future val[2], val[3])
   10303     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
   10304     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
   10305     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
   10306     vst1q_u16(&v.val[0], val0 );
   10307     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
   10308     vst1q_u16(&v.val[2], val2 );
   10309     return v;
   10310 }
   10311 
   10312 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   10313 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
   10314 {
   10315     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   10316     uint32x2x4_t v;
   10317     __m128i val0, val01, val2;
   10318     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
   10319     val2 =  vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
   10320     val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
   10321     val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
   10322     vst1q_u32(&v.val[0], val01);
   10323     vst1q_u32(&v.val[2], val2 );
   10324     return v;
   10325 }
   10326 
   10327 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10328 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
   10329 {
   10330     uint64x1x4_t v;
    10331     v.val[0].m64_u64[0] = *(ptr); //load the first 64 bits into val[0]
    10332     v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64 bits into val[1]
    10333     v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64 bits into val[2]
    10334     v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64 bits into val[3]
   10335     return v;
   10336 }
   10337 
   10338 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   10339 #define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
   10340 
   10341 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10342 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
   10343 
   10344 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   10345 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
   10346 
   10347 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10348 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
   10349 
   10350 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10351 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10352 
   10353 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   10354 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
   10355 {
   10356     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   10357     float32x2x4_t res;
   10358     res.val[0].m64_f32[0] = *(ptr);
   10359     res.val[0].m64_f32[1] = *(ptr + 4);
   10360     res.val[1].m64_f32[0] = *(ptr + 1);
   10361     res.val[1].m64_f32[1] = *(ptr + 5);
   10362     res.val[2].m64_f32[0] = *(ptr + 2);
   10363     res.val[2].m64_f32[1] = *(ptr + 6);
   10364     res.val[3].m64_f32[0] = *(ptr + 3);
   10365     res.val[3].m64_f32[1] = *(ptr + 7);
   10366     return res;
   10367 }
   10368 
   10369 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   10370 #define vld4_p8 vld4_u8
   10371 
   10372 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10373 #define vld4_p16 vld4_u16
   10374 
   10375 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
   10376 //*******************************************************************************************************************
   10377 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   10378 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
   10379 {
   10380     uint8x8x2_t v;
   10381     __m128i val0, val1;
   10382     val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
   10383     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
   10384     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
   10385     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   10386     vst1q_u8(v.val, val0);
   10387     return v;
   10388 }
   10389 
   10390 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10391 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
   10392 {
   10393     uint16x4x2_t v;
   10394     __m128i val0, val1;
   10395     val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
   10396     val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
   10397     _M64(v.val[0], val0);
   10398     val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
   10399     _M64(v.val[1], val1);
   10400     return v;
   10401 }
   10402 
   10403 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   10404 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
   10405 {
   10406     uint32x2x2_t v;
   10407     __m128i val0;
   10408     val0 = LOAD_SI128(ptr); //0,1,x,x
   10409     val0 = _mm_shuffle_epi32(val0,   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
   10410     vst1q_u32(v.val, val0);
   10411     return v;
   10412 }
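//Illustrative sketch (added comment, compiled out): the _dup loads broadcast ptr[0] to every lane of
//val[0] and ptr[1] to every lane of val[1]. The buffer below is an assumption made up for this example;
//it is padded to 16 bytes because the helpers above read a full 128-bit register.
#if 0
static void example_vld2_dup_u32(void)
{
    _NEON2SSE_ALIGN_16 uint32_t ab[4] = {7, 9, 0, 0}; //only ab[0] and ab[1] are meaningful
    uint32x2x2_t v = vld2_dup_u32(ab);
    //v.val[0] = {7, 7}, v.val[1] = {9, 9}
}
#endif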
   10413 
   10414 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   10415 #define vld2_dup_u64 vld2_u64
   10416 
   10417 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   10418 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
   10419 
   10420 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10421 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
   10422 
   10423 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   10424 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
   10425 
   10426 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   10427 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
   10428 
   10429 float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10430 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10431 
   10432 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   10433 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
   10434 {
   10435     float32x2x2_t v;
   10436     v.val[0].m64_f32[0] = *(ptr); //0,0
   10437     v.val[0].m64_f32[1] = *(ptr); //0,0
   10438     v.val[1].m64_f32[0] = *(ptr + 1); //1,1
   10439     v.val[1].m64_f32[1] = *(ptr + 1); //1,1
   10440     return v;
   10441 }
   10442 
   10443 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   10444 #define vld2_dup_p8 vld2_dup_u8
   10445 
   10446 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10447 #define vld2_dup_p16 vld2_dup_s16
   10448 
    10449 //************* Duplicate (or propagate) triplets: *******************
   10450 //********************************************************************
   10451 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
   10452 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10453 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10454 {
   10455     uint8x8x3_t v;
   10456     __m128i val0, val1, val2;
   10457     val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
   10458     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
   10459     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
   10460     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   10461     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
   10462     vst1q_u8(v.val, val0);
   10463     _M64(v.val[2], val2);
   10464     return v;
   10465 }
   10466 
   10467 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10468 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10469 {
   10470     uint16x4x3_t v;
   10471     __m128i val0, val1, val2;
   10472     val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
   10473     val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
   10474     val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
   10475     val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
   10476     _M64(v.val[0], val0);
   10477     _M64(v.val[1], val1);
   10478     _M64(v.val[2], val2);
   10479     return v;
   10480 }
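//Illustrative sketch (added comment, compiled out): _mm_shufflelo_epi16 with controls 0 (0x00), 85 (0x55)
//and 170 (0xAA) broadcasts word 0, 1 or 2 across the low 64 bits, which is exactly the per-value
//duplication needed here. The buffer below is an assumption made up for this example (padded to 16 bytes).
#if 0
static void example_vld3_dup_u16(void)
{
    _NEON2SSE_ALIGN_16 uint16_t abc[8] = {1, 2, 3, 0, 0, 0, 0, 0}; //only the first three values are used
    uint16x4x3_t v = vld3_dup_u16(abc);
    //v.val[0] = {1,1,1,1}, v.val[1] = {2,2,2,2}, v.val[2] = {3,3,3,3}
}
#endif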
   10481 
   10482 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10483 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10484 {
   10485     uint32x2x3_t v;
   10486     __m128i val0, val1, val2;
   10487     val2 = LOAD_SI128(ptr); //0,1,2,x
   10488     val0 = _mm_shuffle_epi32(val2,   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
   10489     val1 = _mm_shuffle_epi32(val2,   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
   10490     val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
   10491     _M64(v.val[0], val0);
   10492     _M64(v.val[1], val1);
   10493     _M64(v.val[2], val2);
   10494     return v;
   10495 }
   10496 
   10497 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10498 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
   10499 {
   10500     uint64x1x3_t v;
   10501     v.val[0].m64_u64[0] = *(ptr);
   10502     v.val[1].m64_u64[0] = *(ptr + 1);
   10503     v.val[2].m64_u64[0] = *(ptr + 2);
   10504     return v;
   10505 }
   10506 
   10507 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10508 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
   10509 
   10510 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10511 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
   10512 
   10513 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10514 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
   10515 
   10516 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10517 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
   10518 
   10519 
   10520 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10521 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10522 
   10523 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10524 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10525 {
   10526     float32x2x3_t v;
   10527     int i;
   10528     for (i = 0; i<3; i++) {
   10529         v.val[i].m64_f32[0] = *(ptr + i);
   10530         v.val[i].m64_f32[1] = *(ptr + i);
   10531     }
   10532     return v;
   10533 }
   10534 
   10535 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10536 #define vld3_dup_p8 vld3_dup_u8
   10537 
   10538 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10539 #define vld3_dup_p16 vld3_dup_s16
   10540 
   10541 
   10542 //************* Duplicate (or propagate) quadruples: *******************
   10543 //***********************************************************************
   10544 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
   10545 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10546 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10547 {
   10548     uint8x8x4_t v;
   10549     __m128i val0, val1, val2;
   10550     val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
   10551     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
   10552     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
   10553     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   10554     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
   10555     vst1q_u8(&v.val[0], val0);
   10556     vst1q_u8(&v.val[2], val2);
   10557     return v;
   10558 }
   10559 
   10560 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10561 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10562 {
   10563     uint16x4x4_t v;
   10564     __m128i val0, val1, val2, val3;
   10565     val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
   10566     val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
   10567     val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
   10568     val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
   10569     val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
   10570     _M64(v.val[0], val0);
   10571     _M64(v.val[1], val1);
   10572     _M64(v.val[2], val2);
   10573     _M64(v.val[3], val3);
   10574     return v;
   10575 }
   10576 
   10577 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10578 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10579 {
   10580     uint32x2x4_t v;
   10581     __m128i val0, val1, val2, val3;
   10582     val3 = LOAD_SI128(ptr); //0,1,2,3
   10583     val0 = _mm_shuffle_epi32(val3,   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
   10584     val1 = _mm_shuffle_epi32(val3,   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
   10585     val2 = _mm_shuffle_epi32(val3,   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
    10586     val3 = _mm_shuffle_epi32(val3,   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
   10587     _M64(v.val[0], val0);
   10588     _M64(v.val[1], val1);
   10589     _M64(v.val[2], val2);
   10590     _M64(v.val[3], val3);
   10591     return v;
   10592 }
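//Illustrative sketch (added comment, compiled out): only the low 64 bits of each shuffle result matter
//because _M64 keeps just the lower half, and the shuffle controls above put element i into both of those
//low lanes. The buffer below is an assumption made up for this example.
#if 0
static void example_vld4_dup_u32(void)
{
    _NEON2SSE_ALIGN_16 uint32_t abcd[4] = {1, 2, 3, 4};
    uint32x2x4_t v = vld4_dup_u32(abcd);
    //v.val[0] = {1,1}, v.val[1] = {2,2}, v.val[2] = {3,3}, v.val[3] = {4,4}
}
#endif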
   10593 
   10594 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10595 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
   10596 {
   10597     uint64x1x4_t v;
   10598     v.val[0].m64_u64[0] = *(ptr);
   10599     v.val[1].m64_u64[0] = *(ptr + 1);
   10600     v.val[2].m64_u64[0] = *(ptr + 2);
   10601     v.val[3].m64_u64[0] = *(ptr + 3);
   10602     return v;
   10603 }
   10604 
   10605 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10606 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
   10607 
   10608 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10609 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
   10610 
   10611 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10612 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
   10613 
   10614 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10615 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
   10616 
   10617 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10618 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10619 
   10620 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10621 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10622 {
   10623     float32x2x4_t v;
   10624     int i;
   10625     for (i = 0; i<4; i++) {
   10626         v.val[i].m64_f32[0] = *(ptr + i);
   10627         v.val[i].m64_f32[1] = *(ptr + i);
   10628     }
   10629     return v;
   10630 }
   10631 
   10632 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10633 #define vld4_dup_p8 vld4_dup_u8
   10634 
   10635 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10636 #define vld4_dup_p16 vld4_dup_u16
   10637 
   10638 
   10639 //**********************************************************************************
   10640 //*******************Lane loads for  an N-element structures ***********************
   10641 //**********************************************************************************
   10642 //********************** Lane pairs  ************************************************
    10643 //does vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
    10644 //we assume src is 16-byte aligned
    10645 
    10646 //!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, producing a "formal parameter with __declspec(align('16')) won't be aligned" error.
    10647 //To work around this, all functions below take xxxxxx_2t pointers and the corresponding original functions are redefined as macros.
   10648 
   10649 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10650 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
   10651 {
   10652     uint16x8x2_t v;
   10653     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
   10654     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
   10655     return v;
   10656 }
   10657 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
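//Illustrative sketch (added comment, compiled out): vld2q_lane loads one interleaved element pair into
//the chosen lane of an existing vector pair, leaving every other lane untouched. The names and values
//below are assumptions made up for this example only (vdupq_n_u16 just provides a known starting value).
#if 0
static void example_vld2q_lane_u16(void)
{
    uint16_t pair[2] = {100, 200};
    uint16x8x2_t v;
    v.val[0] = vdupq_n_u16(0);
    v.val[1] = vdupq_n_u16(0);
    v = vld2q_lane_u16(pair, v, 3);
    //lane 3 of v.val[0] is now 100 and lane 3 of v.val[1] is 200; all other lanes stay 0
}
#endif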
   10658 
   10659 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   10660 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
   10661 {
   10662     uint32x4x2_t v;
   10663     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
   10664     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
   10665     return v;
   10666 }
   10667 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
   10668 
   10669 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10670 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
   10671 {
   10672     int16x8x2_t v;
   10673     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
   10674     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
   10675     return v;
   10676 }
   10677 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
   10678 
   10679 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   10680 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
   10681 {
   10682     int32x4x2_t v;
   10683     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
   10684     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
   10685     return v;
   10686 }
   10687 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
   10688 
   10689 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10690 //current IA SIMD doesn't support float16
   10691 
   10692 //float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   10693 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
   10694 {
   10695     float32x4x2_t v;
   10696     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
   10697     v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
   10698     return v;
   10699 }
   10700 #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
   10701 
   10702 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10703 #define vld2q_lane_p16 vld2q_lane_u16
   10704 
   10705 //uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   10706 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
   10707 {
   10708     uint8x8x2_t v;
   10709     v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
   10710     v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
   10711     return v;
   10712 }
   10713 #define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
   10714 
   10715 //uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   10716 _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t* src, __constrange(0,3) int lane)
   10717 {
   10718     uint16x4x2_t v;
   10719     v.val[0]  =  vld1_lane_u16(ptr, src->val[0], lane);
   10720     v.val[1]  = vld1_lane_u16((ptr + 1), src->val[1], lane);
   10721     return v;
   10722 }
   10723 #define vld2_lane_u16(ptr, src, lane) vld2_lane_u16_ptr(ptr, &src, lane)
   10724 
   10725 //uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
   10726 _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t* src, __constrange(0,1) int lane)
   10727 {
   10728     uint32x2x2_t v;
   10729     v.val[0]  =  vld1_lane_u32(ptr, src->val[0], lane);
   10730     v.val[1]  = vld1_lane_u32((ptr + 1), src->val[1], lane);
   10731     return v;
   10732 }
   10733 #define vld2_lane_u32(ptr, src, lane) vld2_lane_u32_ptr(ptr, &src, lane)
   10734 
   10735 //int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   10736 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
   10737 #define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
   10738 
   10739 //int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   10740 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   10741 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
   10742 
   10743 //int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
   10744 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   10745 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
   10746 
   10747 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   10748 //current IA SIMD doesn't support float16
   10749 
   10750 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   10751 _NEON2SSE_INLINE float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane)
   10752 {
   10753     float32x2x2_t v;
   10754     v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
   10755     v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
   10756     return v;
   10757 }
   10758 #define vld2_lane_f32(ptr, src, lane) vld2_lane_f32_ptr(ptr, &src, lane)
   10759 
   10760 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   10761 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
   10762 #define vld2_lane_p8 vld2_lane_u8
   10763 
   10764 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   10765 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   10766 #define vld2_lane_p16 vld2_lane_u16
   10767 
   10768 //*********** Lane triplets **********************
   10769 //*************************************************
    10770 //does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
    10771 //we assume src is 16-byte aligned
   10772 
   10773 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10774 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10775 {
   10776     uint16x8x3_t v;
   10777     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   10778     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   10779     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   10780     return v;
   10781 }
   10782 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
   10783 
   10784 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10785 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10786 {
   10787     uint32x4x3_t v;
   10788     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   10789     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   10790     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   10791     return v;
   10792 }
   10793 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
   10794 
   10795 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10796 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10797 {
   10798     int16x8x3_t v;
   10799     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   10800     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   10801     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   10802     return v;
   10803 }
   10804 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
   10805 
   10806 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10807 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10808 {
   10809     int32x4x3_t v;
   10810     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   10811     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   10812     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   10813     return v;
   10814 }
   10815 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
   10816 
   10817 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10818 //current IA SIMD doesn't support float16
   10819 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
   10820 
   10821 
   10822 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10823 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10824 {
   10825     float32x4x3_t v;
   10826     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
   10827     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
   10828     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
   10829     return v;
   10830 }
   10831 #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
   10832 
   10833 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10834 #define vld3q_lane_p16 vld3q_lane_u16
   10835 
   10836 //uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10837 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10838 {
   10839     uint8x8x3_t v;
   10840     v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
   10841     v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
   10842     v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
   10843     return v;
   10844 }
   10845 #define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
   10846 
   10847 //uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10848 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10849 {
   10850     uint16x4x3_t v;
   10851     v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
   10852     v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
   10853     v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
   10854     return v;
   10855 }
   10856 #define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
   10857 
   10858 //uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10859 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10860 {
   10861     //need to merge into 128 bit anyway
   10862     uint32x2x3_t v;
    10863     v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
    10864     v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
    10865     v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
   10866     return v;
   10867 }
   10868 #define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
   10869 
   10870 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10871 #define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
   10872 
   10873 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10874 #define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
   10875 
   10876 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10877 #define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
   10878 
   10879 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10880 //current IA SIMD doesn't support float16
   10881 
   10882 //float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10883 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10884 {
   10885     float32x2x3_t v;
   10886     v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
   10887     v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
   10888     v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
   10889     return v;
   10890 }
   10891 #define vld3_lane_f32(ptr,src,lane) vld3_lane_f32_ptr(ptr,&src,lane)
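//Illustrative sketch (added comment, compiled out): vld3_lane_f32 overwrites a single lane of each of
//the three vectors with three consecutive floats from memory. Names and values below are assumptions
//made up for this example only (vdup_n_f32 just provides a known starting value).
#if 0
static void example_vld3_lane_f32(void)
{
    float32_t xyz[3] = {1.0f, 2.0f, 3.0f};
    float32x2x3_t v;
    v.val[0] = vdup_n_f32(0.0f);
    v.val[1] = vdup_n_f32(0.0f);
    v.val[2] = vdup_n_f32(0.0f);
    v = vld3_lane_f32(xyz, v, 1); //only lane 1 of each val[i] is written
    //v.val[0] = {0.0f, 1.0f}, v.val[1] = {0.0f, 2.0f}, v.val[2] = {0.0f, 3.0f}
}
#endif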
   10892 
   10893 //poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10894 #define vld3_lane_p8 vld3_lane_u8
   10895 
   10896 //poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10897 #define vld3_lane_p16 vld3_lane_u16
   10898 
   10899 //******************* Lane Quadruples  load ***************************
   10900 //*********************************************************************
    10901 //does vld1_lane_xx: loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
    10902 //we assume src is 16-byte aligned
   10903 
   10904 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10905 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
   10906 {
   10907     uint16x8x4_t v;
   10908     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   10909     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   10910     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   10911     v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
   10912     return v;
   10913 }
   10914 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
   10915 
   10916 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10917 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
   10918 {
   10919     uint32x4x4_t v;
   10920     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   10921     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   10922     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   10923     v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
   10924     return v;
   10925 }
   10926 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
   10927 
   10928 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10929 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10930 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
   10931 
   10932 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10933 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10934 #define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
   10935 
   10936 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10937 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10938 //current IA SIMD doesn't support float16
   10939 
   10940 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10941 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
   10942 {
   10943     float32x4x4_t v;
   10944     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
   10945     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
   10946     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
   10947     v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
   10948     return v;
   10949 }
   10950 #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
   10951 
   10952 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10953 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10954 #define vld4q_lane_p16 vld4q_lane_u16
   10955 
   10956 //uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   10957 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
   10958 {
   10959     uint8x8x4_t v;
   10960     v.val[0] = vld1_lane_u8(ptr, src->val[0], lane);
   10961     v.val[1] = vld1_lane_u8((ptr + 1), src->val[1], lane);
   10962     v.val[2] = vld1_lane_u8((ptr + 2), src->val[2], lane);
   10963     v.val[3] = vld1_lane_u8((ptr + 3), src->val[3], lane);
   10964     return v;
   10965 }
   10966 #define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
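//Illustrative sketch (added comment, compiled out): vld4_lane_u8 inserts four consecutive bytes into the
//chosen lane of four existing 64-bit vectors. Names and values below are assumptions made up for this
//example only (vdup_n_u8 just provides a known starting value).
#if 0
static void example_vld4_lane_u8(void)
{
    uint8_t quad[4] = {10, 20, 30, 40};
    uint8x8x4_t v;
    v.val[0] = vdup_n_u8(0);
    v.val[1] = vdup_n_u8(0);
    v.val[2] = vdup_n_u8(0);
    v.val[3] = vdup_n_u8(0);
    v = vld4_lane_u8(quad, v, 5); //only lane 5 of each val[i] is written
    //lane 5 of v.val[0] is 10, of v.val[1] is 20, of v.val[2] is 30, of v.val[3] is 40
}
#endif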
   10967 
   10968 //uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   10969 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
   10970 {
   10971     uint16x4x4_t v;
   10972     v.val[0] = vld1_lane_u16(ptr, src->val[0], lane);
   10973     v.val[1] = vld1_lane_u16((ptr + 1), src->val[1], lane);
   10974     v.val[2] = vld1_lane_u16((ptr + 2), src->val[2], lane);
   10975     v.val[3] = vld1_lane_u16((ptr + 3), src->val[3], lane);
   10976     return v;
   10977 }
   10978 #define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
   10979 
   10980 //uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   10981 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
   10982 {
   10983     uint32x2x4_t v;
   10984     v.val[0] = vld1_lane_u32(ptr, src->val[0], lane);
   10985     v.val[1] = vld1_lane_u32((ptr + 1), src->val[1], lane);
   10986     v.val[2] = vld1_lane_u32((ptr + 2), src->val[2], lane);
   10987     v.val[3] = vld1_lane_u32((ptr + 3), src->val[3], lane);
   10988     return v;
   10989 }
   10990 #define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
   10991 
   10992 //int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   10993 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
   10994 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
   10995 
   10996 //int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   10997 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
   10998 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
   10999 
   11000 //int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11001 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
   11002 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
   11003 
   11004 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11005 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
   11006 //current IA SIMD doesn't support float16
   11007 
   11008 //float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11009 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
   11010 {
   11011     //serial solution may be faster
   11012     float32x2x4_t v;
   11013     v.val[0] = vld1_lane_f32(ptr, src->val[0], lane);
   11014     v.val[1] = vld1_lane_f32((ptr + 1), src->val[1], lane);
   11015     v.val[2] = vld1_lane_f32((ptr + 2), src->val[2], lane);
   11016     v.val[3] = vld1_lane_f32((ptr + 3), src->val[3], lane);
   11017     return v;
   11018 }
   11019 #define vld4_lane_f32(ptr,src,lane) vld4_lane_f32_ptr(ptr,&src,lane)
   11020 
   11021 //poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11022 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
   11023 #define vld4_lane_p8 vld4_lane_u8
   11024 
   11025 //poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11026 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
   11027 #define vld4_lane_p16 vld4_lane_u16
   11028 
   11029 //******************* Store duplets *********************************************
   11030 //********************************************************************************
    11031 //here we assume the ptr is 16-byte aligned. If it is not, use _mm_storeu_si128 as shown in the vst1q_u8 function
    11032 //If necessary, modify all store functions accordingly. See more comments on the "Store single" functions
   11033 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
   11034 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
   11035 {
   11036     uint8x16x2_t v;
   11037     v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
   11038     v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
   11039     vst1q_u8 (ptr, v.val[0]);
   11040     vst1q_u8 ((ptr + 16),  v.val[1]);
   11041 }
   11042 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
   11043 
   11044 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
   11045 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
   11046 {
   11047     uint16x8x2_t v;
   11048     v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
   11049     v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
   11050     vst1q_u16 (ptr, v.val[0]);
   11051     vst1q_u16 ((ptr + 8),  v.val[1]);
   11052 }
   11053 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
   11054 
   11055 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
   11056 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
   11057 {
   11058     uint32x4x2_t v;
   11059     v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
   11060     v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
   11061     vst1q_u32 (ptr, v.val[0]);
   11062     vst1q_u32 ((ptr + 4),  v.val[1]);
   11063 }
   11064 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
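//Illustrative sketch (added comment, compiled out): the vst2q stores interleave two vectors, turning a
//structure-of-arrays pair back into an array-of-structures layout in memory. Names and values below are
//assumptions made up for this example only (vdupq_n_u32 just provides known contents).
#if 0
static void example_vst2q_u32(void)
{
    _NEON2SSE_ALIGN_16 uint32_t out[8];
    uint32x4x2_t ab;
    ab.val[0] = vdupq_n_u32(1); //a = {1,1,1,1}
    ab.val[1] = vdupq_n_u32(2); //b = {2,2,2,2}
    vst2q_u32(out, ab);
    //out = {1,2, 1,2, 1,2, 1,2}
}
#endif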
   11065 
   11066 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
   11067 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
   11068 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
   11069 
   11070 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   11071 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
   11072 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
   11073 
   11074 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
   11075 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
   11076 #define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
   11077 
   11078 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   11079 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
   11080 // IA32 SIMD doesn't work with 16bit floats currently
   11081 
   11082 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
   11083 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
   11084 {
   11085     float32x4x2_t v;
   11086     v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
   11087     v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
   11088     vst1q_f32 (ptr, v.val[0]);
   11089     vst1q_f32 ((ptr + 4),  v.val[1]);
   11090 }
   11091 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
   11092 
   11093 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
   11094 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
   11095 #define vst2q_p8 vst2q_u8
   11096 
   11097 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   11098 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
   11099 #define vst2q_p16 vst2q_u16
   11100 
   11101 //void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
   11102 _NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
   11103 {
   11104     __m128i v0;
   11105     v0 = _mm_unpacklo_epi8(_pM128i(val->val[0]), _pM128i(val->val[1]));
   11106     vst1q_u8 (ptr, v0);
   11107 }
   11108 #define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
   11109 
   11110 //void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
   11111 _NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
   11112 {
   11113     __m128i v0;
   11114     v0 = _mm_unpacklo_epi16(_pM128i(val->val[0]), _pM128i(val->val[1]));
   11115     vst1q_u16 (ptr, v0);
   11116 }
   11117 #define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
   11118 
   11119 //void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
   11120 _NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
   11121 {
   11122     __m128i v0;
   11123     v0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1]));
   11124     vst1q_u32 (ptr, v0);
   11125 }
   11126 #define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
   11127 
   11128 
   11129 //void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
   11130 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
   11131 _NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
   11132 {
   11133     *(ptr) = val->val[0].m64_u64[0];
   11134     *(ptr + 1) = val->val[1].m64_u64[0];
   11135 }
   11136 #define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
   11137 
   11138 //void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
   11139 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
   11140 
   11141 //void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   11142 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
   11143 
   11144 //void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   11145 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
   11146 
   11147 //void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
   11148 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
   11149 
   11150 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   11151 //current IA SIMD doesn't support float16
   11152 
   11153 //void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   11154 _NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
   11155 {
   11156     *(ptr) =   val->val[0].m64_f32[0];
   11157     *(ptr + 1) = val->val[1].m64_f32[0];
   11158     *(ptr + 2) = val->val[0].m64_f32[1];
   11159     *(ptr + 3) = val->val[1].m64_f32[1];
   11160 }
   11161 #define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
   11162 
   11163 //void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
   11164 #define vst2_p8 vst2_u8
   11165 
   11166 //void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   11167 #define vst2_p16 vst2_u16
   11168 
   11169 //******************** Triplets store  *****************************************
   11170 //******************************************************************************
   11171 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
   11172 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
   11173 {
   11174     uint8x16x3_t v;
   11175     __m128i v0,v1,v2, cff, bldmask;
   11176     _NEON2SSE_ALIGN_16 uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
   11177     _NEON2SSE_ALIGN_16 uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
   11178     _NEON2SSE_ALIGN_16 uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
   11179     _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
   11180     _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
   11181     _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
   11182 
   11183     v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
   11184     v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
   11185     v1 =  _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
   11186     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
   11187     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
   11188     cff = _mm_cmpeq_epi8(v0, v0); //all ff
   11189     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
   11190     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
   11191     vst1q_u8(ptr,   v.val[0]);
   11192     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
   11193     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
   11194     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
   11195     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
   11196     vst1q_u8((ptr + 16),  v.val[1]);
   11197     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
   11198     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
   11199     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
   11200     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
   11201     vst1q_u8((ptr + 32),  v.val[2]);
   11202 }
   11203 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
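
//Usage sketch (illustrative only): vst3q_u8 converts three separate planes into an interleaved
//layout - the typical planar-to-packed RGB store. The names r, g, b and dst are assumptions:
//    uint8x16x3_t rgb;
//    rgb.val[0] = vld1q_u8(r);             //16 red samples
//    rgb.val[1] = vld1q_u8(g);             //16 green samples
//    rgb.val[2] = vld1q_u8(b);             //16 blue samples
//    vst3q_u8(dst, rgb);                   //dst <- r0,g0,b0, r1,g1,b1, ... (48 bytes)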
   11204 
   11205 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
   11206 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
   11207 {
   11208     uint16x8x3_t v;
   11209     __m128i v0,v1,v2, cff, bldmask;
   11210     _NEON2SSE_ALIGN_16 uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
   11211     _NEON2SSE_ALIGN_16 uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
   11212     _NEON2SSE_ALIGN_16 uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
   11213     _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
   11214     _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
   11215     _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
   11216 
   11217     v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
   11218     v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
   11219     v1 =  _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
   11220     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
   11221     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
   11222     cff = _mm_cmpeq_epi16(v0, v0); //all ff
   11223     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
   11224     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
   11225     vst1q_u16(ptr,      v.val[0]);
   11226     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
   11227     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
   11228     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
   11229     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
   11230     vst1q_u16((ptr + 8),  v.val[1]);
   11231     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
   11232     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
   11233     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
   11234     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
   11235     vst1q_u16((ptr + 16), v.val[2]);
   11236 }
   11237 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
   11238 
   11239 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
   11240 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
   11241 {
   11242     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
   11243     uint32x4x3_t v;
   11244     __m128i tmp0, tmp1,tmp2;
   11245     tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
   11246     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
   11247     tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
   11248     v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
   11249     v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
   11250     v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
   11251     tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
   11252     v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
   11253 
   11254     vst1q_u32(ptr,      v.val[0]);
   11255     vst1q_u32((ptr + 4),  v.val[1]);
   11256     vst1q_u32((ptr + 8),  v.val[2]);
   11257 }
   11258 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
   11259 
   11260 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
   11261 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
   11262 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
   11263 
   11264 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
   11265 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
   11266 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
   11267 
   11268 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
   11269 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
   11270 #define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
   11271 
   11272 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
   11273 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
   11274 // IA32 SIMD doesn't work with 16bit floats currently
   11275 
   11276 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
   11277 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
   11278 {
   11279     float32x4x3_t v;
   11280     __m128 tmp0, tmp1,tmp2;
   11281     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
   11282     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
   11283     tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
   11284     v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
   11285     v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
   11286     v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
   11287     tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
   11288     v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
   11289 
   11290     vst1q_f32( ptr,    v.val[0]);
   11291     vst1q_f32( (ptr + 4),  v.val[1]);
   11292     vst1q_f32( (ptr + 8),  v.val[2]);
   11293 }
   11294 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
   11295 
   11296 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
   11297 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
   11298 #define vst3q_p8 vst3q_u8
   11299 
   11300 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
   11301 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
   11302 #define vst3q_p16 vst3q_u16
   11303 
   11304 //void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
   11305 _NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
   11306 {
   11307     __m128i tmp, sh0, sh1, val0, val2;
   11308     _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
   11309     _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
   11310     _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
   11311     _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
   11312     tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]) );
   11313     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //indices bi > 15 wrap modulo 16 (bi -= 16)
   11314     val2 = _pM128i(val->val[2]);
   11315     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask0);
   11316     val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
   11317     vst1q_u8(ptr,   val0); //store as 128 bit structure
   11318     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //indices bi > 15 wrap modulo 16 (bi -= 16)
   11319     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask1);
   11320     val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
   11321     _M64((*(__m64_128*)(ptr + 16)),  val2); //need it to fit into *ptr memory
   11322 }
   11323 #define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
   11324 
   11325 //void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
   11326 _NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
   11327 {
   11328     __m128i tmp, val0, val1, val2;
   11329     _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
   11330     _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
   11331     _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //where all ones the result is taken from val0, otherwise from val1
   11332     _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //where all ones the result is taken from val1, otherwise from val0
   11333     tmp = _mm_unpacklo_epi64(_pM128i(val->val[0]), _pM128i(val->val[1]));
   11334     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
   11335     val2 = _pM128i(val->val[2]);
   11336     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
   11337     val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
   11338     vst1q_u16(ptr,    val0); //store as 128 bit structure
   11339     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
   11340     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
   11341     val1 = _MM_BLENDV_EPI8(val0, val1,  *(__m128i*)mask1f); //change the operands order
   11342     _M64((*(__m64_128*)(ptr + 8)),  val1); //need it to fit into *ptr memory
   11343 }
   11344 #define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
   11345 
   11346 //void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
   11347 _NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
   11348 {
   11349     //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
   11350     __m128i val0, val1;
   11351     val0 = _mm_unpacklo_epi64(_pM128i(val->val[1]), _pM128i(val->val[2])); //val[0]: 1,4,2,5
   11352     val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
   11353     val1 = _mm_srli_si128(val0, 8); //4,5, x,x
   11354     _M64((*(__m64_128*)(ptr + 4)),  val1);
   11355     val0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), val0); //0,1,3,2
   11356     val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
   11357     vst1q_u32(ptr, val0); //store as 128 bit structure
   11358 }
   11359 #define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
   11360 
   11361 //void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
   11362 _NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
   11363 {
   11364     *(ptr) = val->val[0].m64_u64[0];
   11365     *(ptr + 1) = val->val[1].m64_u64[0];
   11366     *(ptr + 2) = val->val[2].m64_u64[0];
   11367 }
   11368 #define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
   11369 
   11370 //void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val)  // VST3.8 {d0, d1, d2}, [r0]
   11371 #define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
   11372 
   11373 //void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val)  // VST3.16 {d0, d1, d2}, [r0]
   11374 #define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
   11375 
   11376 //void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
   11377 #define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
   11378 
   11379 //void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
   11380 #define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
   11381 
   11382 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   11383 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   11384 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   11385 
   11386 //void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
   11387 _NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
   11388 {
   11389     //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;   -> 0,2, 4,1, 3,5
   11390     *(ptr) =   val->val[0].m64_f32[0];
   11391     *(ptr + 1) = val->val[1].m64_f32[0];
   11392     *(ptr + 2) = val->val[2].m64_f32[0];
   11393     *(ptr + 3) = val->val[0].m64_f32[1];
   11394     *(ptr + 4) = val->val[1].m64_f32[1];
   11395     *(ptr + 5) = val->val[2].m64_f32[1];
   11396 }
   11397 #define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
   11398 
   11399 //void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
   11400 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
   11401 #define vst3_p8 vst3_u8
   11402 
   11403 //void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   11404 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
   11405 #define vst3_p16 vst3_s16
   11406 
   11407 //***************  Quadruples store ********************************
   11408 //*********************************************************************
   11409 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
   11410 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
   11411 {
   11412     __m128i tmp1, tmp2, res;
   11413     tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
   11414     tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
   11415     res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
   11416     vst1q_u8(ptr,  res);
   11417     res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
   11418     vst1q_u8((ptr + 16), res);
   11419     tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
   11420     tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
   11421     res = _mm_unpacklo_epi16(tmp1, tmp2); //
   11422     vst1q_u8((ptr + 32), res);
   11423     res = _mm_unpackhi_epi16(tmp1, tmp2); //
   11424     vst1q_u8((ptr + 48), res);
   11425 }
   11426 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
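
//Usage sketch (illustrative only): vst4q_u8 is the four-plane analogue, e.g. planar R,G,B,A to
//packed RGBA; with four members the interleave needs unpacks only, no shuffles or blends.
//The names r, g, b, a and dst are assumptions:
//    uint8x16x4_t rgba;
//    rgba.val[0] = vld1q_u8(r);  rgba.val[1] = vld1q_u8(g);
//    rgba.val[2] = vld1q_u8(b);  rgba.val[3] = vld1q_u8(a);
//    vst4q_u8(dst, rgba);                  //dst <- r0,g0,b0,a0, r1,g1,b1,a1, ... (64 bytes)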
   11427 
   11428 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
   11429 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
   11430 {
   11431     uint16x8x4_t v;
   11432     __m128i tmp1, tmp2;
   11433     tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
   11434     tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
   11435     v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
   11436     v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
   11437     tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
   11438     tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
   11439     v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
   11440     v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
   11441     vst1q_u16(ptr,     v.val[0]);
   11442     vst1q_u16((ptr + 8), v.val[1]);
   11443     vst1q_u16((ptr + 16),v.val[2]);
   11444     vst1q_u16((ptr + 24), v.val[3]);
   11445 }
   11446 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
   11447 
   11448 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
   11449 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
   11450 {
   11451     uint32x4x4_t v;
   11452     __m128i tmp1, tmp2;
   11453     tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
   11454     tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
   11455     v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
   11456     v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
   11457     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
   11458     tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
   11459     v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
   11460     v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
   11461     vst1q_u32(ptr,      v.val[0]);
   11462     vst1q_u32((ptr + 4),  v.val[1]);
   11463     vst1q_u32((ptr + 8),  v.val[2]);
   11464     vst1q_u32((ptr + 12), v.val[3]);
   11465 }
   11466 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
   11467 
   11468 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
   11469 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
   11470 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
   11471 
   11472 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
   11473 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
   11474 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
   11475 
   11476 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
   11477 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
   11478 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
   11479 
   11480 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
   11481 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
   11482 // IA32 SIMD doesn't work with 16bit floats currently
   11483 
   11484 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
   11485 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
   11486 {
   11487     __m128 tmp3, tmp2, tmp1, tmp0;
   11488     float32x4x4_t v;
   11489     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
   11490     tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
   11491     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
   11492     tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
   11493     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
   11494     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
   11495     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
   11496     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
   11497     vst1q_f32(ptr,   v.val[0]);
   11498     vst1q_f32((ptr + 4), v.val[1]);
   11499     vst1q_f32((ptr + 8), v.val[2]);
   11500     vst1q_f32((ptr + 12), v.val[3]);
   11501 }
   11502 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
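
//Note: the unpack/movelh/movehl sequence above is the classic SSE 4x4 float transpose, so an
//equivalent sketch (assuming float32x4x4_t v and float32_t* ptr) can use the xmmintrin.h helper:
//    __m128 r0 = v.val[0], r1 = v.val[1], r2 = v.val[2], r3 = v.val[3];
//    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);    //in-place 4x4 transpose macro
//    vst1q_f32(ptr,      r0);
//    vst1q_f32(ptr + 4,  r1);
//    vst1q_f32(ptr + 8,  r2);
//    vst1q_f32(ptr + 12, r3);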
   11503 
   11504 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
   11505 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
   11506 #define vst4q_p8 vst4q_u8
   11507 
   11508 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
   11509 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
   11510 #define vst4q_p16 vst4q_s16
   11511 
   11512 //void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
   11513 _NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
   11514 {
   11515     __m128i sh0, sh1, val0, val2;
   11516     sh0 = _mm_unpacklo_epi8(_pM128i(val->val[0]),_pM128i(val->val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
   11517     sh1 = _mm_unpacklo_epi8(_pM128i(val->val[2]),_pM128i(val->val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
   11518     val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
   11519     val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
   11520     vst1q_u8(ptr,    val0);
   11521     vst1q_u8((ptr + 16),  val2);
   11522 }
   11523 #define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
   11524 
   11525 //void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
   11526 _NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
   11527 {
   11528     __m128i sh0, sh1, val0, val2;
   11529     sh0 = _mm_unpacklo_epi16(_pM128i(val->val[0]),_pM128i(val->val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
   11530     sh1 = _mm_unpacklo_epi16(_pM128i(val->val[2]),_pM128i(val->val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
   11531     val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
   11532     val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
   11533     vst1q_u16(ptr,      val0); //store as 128 bit structure
   11534     vst1q_u16((ptr + 8),  val2);
   11535 }
   11536 #define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
   11537 
   11538 //void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
   11539 _NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
   11540 {
   11541     //0,4,   1,5,  2,6,  3,7
   11542     __m128i sh0, sh1, val0, val1;
   11543     sh0 = _mm_unpacklo_epi32(_pM128i(val->val[0]), _pM128i(val->val[1])); //0,1,4,5
   11544     sh1 = _mm_unpacklo_epi32(_pM128i(val->val[2]), _pM128i(val->val[3])); //2,3,6,7
   11545     val0 = _mm_unpacklo_epi64(sh0,sh1); //
   11546     val1 = _mm_unpackhi_epi64(sh0,sh1); //
   11547     vst1q_u32(ptr,     val0); //store as 128 bit structure
   11548     vst1q_u32((ptr + 4),  val1);
   11549 }
   11550 #define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
   11551 
   11552 //void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
   11553 _NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
   11554 {
   11555     *(ptr) =  val->val[0].m64_u64[0];
   11556     *(ptr + 1) =  val->val[1].m64_u64[0];
   11557     *(ptr + 2) =  val->val[2].m64_u64[0];
   11558     *(ptr + 3) =  val->val[3].m64_u64[0];
   11559 }
   11560 #define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
   11561 
   11562 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
   11563 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
   11564 
   11565 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
   11566 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
   11567 
   11568 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
   11569 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
   11570 
   11571 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
   11572 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
   11573 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
   11574 
   11575 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   11576 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
   11577 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   11578 
   11579 //void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
   11580 _NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
   11581 {
   11582     //0,4,   1,5,  2,6,  3,7 -> 0,1, 2,3, 4,5, 6,7
   11583     *(ptr) =   val->val[0].m64_f32[0];
   11584     *(ptr + 1) = val->val[1].m64_f32[0];
   11585     *(ptr + 2) = val->val[2].m64_f32[0];
   11586     *(ptr + 3) = val->val[3].m64_f32[0];
   11587     *(ptr + 4) = val->val[0].m64_f32[1];
   11588     *(ptr + 5) = val->val[1].m64_f32[1];
   11589     *(ptr + 6) = val->val[2].m64_f32[1];
   11590     *(ptr + 7) = val->val[3].m64_f32[1];
   11591 }
   11592 #define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
   11593 
   11594 //void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
   11595 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);
   11596 #define vst4_p8 vst4_u8
   11597 
   11598 //void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   11599 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);
   11600 #define vst4_p16 vst4_u16
   11601 
   11602 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors  *********************
   11603 //********************************************************************************************************************
   11604 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
   11605 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
   11606 {
   11607     vst1q_lane_s16(ptr, val->val[0], lane);
   11608     vst1q_lane_s16((ptr + 1), val->val[1], lane);
   11609 }
   11610 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
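
//Usage sketch (illustrative only): a lane store writes one element from each structure member to
//consecutive memory locations. Assuming uint16x8x2_t pair and uint16_t* dst:
//    vst2q_lane_u16(dst, pair, 3);         //dst[0] <- lane 3 of pair.val[0], dst[1] <- lane 3 of pair.val[1]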
   11611 
   11612 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
   11613 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
   11614 {
   11615     vst1q_lane_u32(ptr, val->val[0], lane);
   11616     vst1q_lane_u32((ptr + 1), val->val[1], lane);
   11617 }
   11618 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
   11619 
   11620 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   11621 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
   11622 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
   11623 
   11624 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
   11625 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
   11626 #define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
   11627 
   11628 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   11629 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
   11630 //current IA SIMD doesn't support float16
   11631 
   11632 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
   11633 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
   11634 {
   11635     vst1q_lane_f32(ptr, val->val[0], lane);
   11636     vst1q_lane_f32((ptr + 1), val->val[1], lane);
   11637 }
   11638 #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
   11639 
   11640 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   11641 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
   11642 #define vst2q_lane_p16 vst2q_lane_s16
   11643 
   11644 //void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   11645 void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0], d1[0]}, [r0]
   11646 _NEON2SSE_INLINE void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t* val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
   11647 {
   11648     *(ptr) = val->val[0].m64_u8[lane];
   11649     *(ptr + 1) = val->val[1].m64_u8[lane];
   11650 }
   11651 #define vst2_lane_u8(ptr, val, lane) vst2_lane_u8_ptr(ptr, &val, lane)
   11652 
   11653 //void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   11654 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
   11655 _NEON2SSE_INLINE void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane)
   11656 {
   11657     *(ptr) = val->val[0].m64_u16[lane];
   11658     *(ptr + 1) = val->val[1].m64_u16[lane];
   11659 }
   11660 #define vst2_lane_u16(ptr, val, lane) vst2_lane_u16_ptr(ptr, &val, lane)
   11661 
   11662 //void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
   11663 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
   11664 _NEON2SSE_INLINE void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane)
   11665 {
   11666     *(ptr) = val->val[0].m64_u32[lane];
   11667     *(ptr + 1) = val->val[1].m64_u32[lane];
   11668 }
   11669 #define vst2_lane_u32(ptr, val, lane) vst2_lane_u32_ptr(ptr, &val, lane)
   11670 
   11671 //void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   11672 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
   11673 #define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
   11674 
   11675 //void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   11676 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
   11677 #define vst2_lane_s16(ptr, val, lane)  vst2_lane_u16((uint16_t*)ptr, val, lane)
   11678 
   11679 //void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
   11680 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
   11681 #define vst2_lane_s32(ptr, val, lane)  vst2_lane_u32((uint32_t*)ptr, val, lane)
   11682 
   11683 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
   11684 //current IA SIMD doesn't support float16
   11685 
   11686 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
   11687 _NEON2SSE_INLINE void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane)
   11688 {
   11689     *(ptr) = val->val[0].m64_f32[lane];
   11690     *(ptr + 1) = val->val[1].m64_f32[lane];
   11691 }
   11692 #define vst2_lane_f32(ptr,src,lane) vst2_lane_f32_ptr(ptr,&src,lane)
   11693 
   11694 //void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   11695 #define vst2_lane_p8 vst2_lane_u8
   11696 
   11697 //void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   11698 #define vst2_lane_p16 vst2_lane_u16
   11699 
   11700 //************************* Triple lanes  stores *******************************************************
   11701 //*******************************************************************************************************
   11702 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11703 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
   11704 {
   11705     vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
   11706     vst1q_lane_u16((ptr + 2), val->val[2], lane);
   11707 }
   11708 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
   11709 
   11710 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   11711 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
   11712 {
   11713     vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
   11714     vst1q_lane_u32((ptr + 2), val->val[2], lane);
   11715 }
   11716 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
   11717 
   11718 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11719 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
   11720 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
   11721 
   11722 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   11723 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
   11724 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
   11725 
   11726 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11727 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
   11728 //current IA SIMD doesn't support float16
   11729 
   11730 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   11731 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
   11732 {
   11733     vst1q_lane_f32(ptr,   val->val[0], lane);
   11734     vst1q_lane_f32((ptr + 1),   val->val[1], lane);
   11735     vst1q_lane_f32((ptr + 2), val->val[2], lane);
   11736 }
   11737 #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
   11738 
   11739 //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11740 void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
   11741 #define vst3q_lane_p16 vst3q_lane_s16
   11742 
   11743 //void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   11744 _NEON2SSE_INLINE void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t* val, __constrange(0,7) int lane)
   11745 {
   11746     *(ptr) =     val->val[0].m64_u8[lane];
   11747     *(ptr + 1) = val->val[1].m64_u8[lane];
   11748     *(ptr + 2) = val->val[2].m64_u8[lane];
   11749 }
   11750 #define vst3_lane_u8(ptr, val, lane) vst3_lane_u8_ptr(ptr, &val, lane)
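
//Usage sketch (illustrative only): store pixel number `lane` from three planar d-registers as one
//packed r,g,b triple. The names rgb and dst are assumptions:
//    uint8x8x3_t rgb;                      //rgb.val[0] = red, val[1] = green, val[2] = blue samples
//    vst3_lane_u8(dst, rgb, 4);            //dst[0],dst[1],dst[2] <- red[4], green[4], blue[4]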
   11751 
   11752 //void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11753 _NEON2SSE_INLINE void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t* val, __constrange(0,3) int lane)
   11754 {
   11755     *(ptr) =     val->val[0].m64_u16[lane];
   11756     *(ptr + 1) = val->val[1].m64_u16[lane];
   11757     *(ptr + 2) = val->val[2].m64_u16[lane];
   11758 }
   11759 #define vst3_lane_u16(ptr, val, lane) vst3_lane_u16_ptr(ptr, &val, lane)
   11760 
   11761 //void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   11762 _NEON2SSE_INLINE void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t* val, __constrange(0,1) int lane)
   11763 {
   11764     *(ptr) =     val->val[0].m64_u32[lane];
   11765     *(ptr + 1) = val->val[1].m64_u32[lane];
   11766     *(ptr + 2) = val->val[2].m64_u32[lane];
   11767 }
   11768 #define vst3_lane_u32(ptr, val, lane) vst3_lane_u32_ptr(ptr, &val, lane)
   11769 
   11770 //void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   11771 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
   11772 #define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
   11773 
   11774 //void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11775 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
   11776 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
   11777 
   11778 //void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   11779 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
   11780 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
   11781 
   11782 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11783 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
   11784 //current IA SIMD doesn't support float16
   11785 
   11786 //void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   11787 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
   11788 _NEON2SSE_INLINE void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane)
   11789 {
   11790     *(ptr) = val->val[0].m64_f32[lane];
   11791     *(ptr + 1) = val->val[1].m64_f32[lane];
   11792     *(ptr + 2) = val->val[2].m64_f32[lane];
   11793 }
   11794 #define vst3_lane_f32(ptr,val,lane) vst3_lane_f32_ptr(ptr,&val,lane)
   11795 
   11796 //void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   11797 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
   11798 #define vst3_lane_p8 vst3_lane_u8
   11799 
   11800 //void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11801 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
   11802 #define vst3_lane_p16 vst3_lane_s16
   11803 
   11804 //******************************** Quadruple lanes stores ***********************************************
   11805 //*******************************************************************************************************
   11806 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11807 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
   11808 {
   11809     vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
   11810     vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
   11811 }
   11812 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
   11813 
   11814 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11815 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
   11816 {
   11817     vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
   11818     vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
   11819 }
   11820 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
   11821 
   11822 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11823 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
   11824 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
   11825 
   11826 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11827 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
   11828 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
   11829 
   11830 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11831 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
   11832 //current IA SIMD doesn't support float16
   11833 
   11834 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11835 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
   11836 {
   11837     vst1q_lane_f32(ptr,   val->val[0], lane);
   11838     vst1q_lane_f32((ptr + 1), val->val[1], lane);
   11839     vst1q_lane_f32((ptr + 2), val->val[2], lane);
   11840     vst1q_lane_f32((ptr + 3), val->val[3], lane);
   11841 }
   11842 #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
   11843 
   11844 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11845 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
   11846 #define vst4q_lane_p16 vst4q_lane_u16
   11847 
   11848 //void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11849 _NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
   11850 {
   11851     *(ptr) =     val->val[0].m64_u8[lane];
   11852     *(ptr + 1) = val->val[1].m64_u8[lane];
   11853     *(ptr + 2) = val->val[2].m64_u8[lane];
   11854     *(ptr + 3) = val->val[3].m64_u8[lane];
   11855 }
   11856 #define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
   11857 
   11858 //void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11859 _NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
   11860 {
   11861     *(ptr) =     val->val[0].m64_u16[lane];
   11862     *(ptr + 1) = val->val[1].m64_u16[lane];
   11863     *(ptr + 2) = val->val[2].m64_u16[lane];
   11864     *(ptr + 3) = val->val[3].m64_u16[lane];
   11865 }
   11866 #define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
   11867 
   11868 //void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11869 _NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
   11870 {
   11871     *(ptr) =     val->val[0].m64_u32[lane];
   11872     *(ptr + 1) = val->val[1].m64_u32[lane];
   11873     *(ptr + 2) = val->val[2].m64_u32[lane];
   11874     *(ptr + 3) = val->val[3].m64_u32[lane];
   11875 }
   11876 #define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
   11877 
   11878 //void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11879 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
   11880 
   11881 //void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11882 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
   11883 
   11884 //void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11885 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
   11886 
   11887 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11888 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
   11889 //current IA SIMD doesn't support float16
   11890 
   11891 void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11892 _NEON2SSE_INLINE void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t* val, __constrange(0,1) int lane)
   11893 {
   11894     *(ptr) = val->val[0].m64_f32[lane];
   11895     *(ptr + 1) = val->val[1].m64_f32[lane];
   11896     *(ptr + 2) = val->val[2].m64_f32[lane];
   11897     *(ptr + 3) = val->val[3].m64_f32[lane];
   11898 }
   11899 #define vst4_lane_f32(ptr,val,lane) vst4_lane_f32_ptr(ptr,&val,lane)
   11900 
   11901 //void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11902 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
   11903 #define vst4_lane_p8 vst4_lane_u8
   11904 
   11905 //void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11906 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
   11907 #define vst4_lane_p16 vst4_lane_u16
   11908 
   11909 //**************************************************************************************************
   11910 //************************ Extract lanes from a vector ********************************************
   11911 //**************************************************************************************************
   11912 //These intrinsics extract a single lane (element) from a vector.
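//Usage sketch (illustrative only): for 64-bit (d-register) vectors the lane is read directly from
//the emulation structure; for 128-bit (q-register) vectors an SSE extract helper is used
//(an SSE4.1 extract instruction when USE_SSE4 is defined). The names d and q are assumptions:
//    uint8x8_t  d;
//    uint8x16_t q;
//    uint8_t lo = vget_lane_u8(d, 5);      //plain member access, no SIMD instruction
//    uint8_t hi = vgetq_lane_u8(q, 12);    //_MM_EXTRACT_EPI8
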
   11913 uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   11914 #define vget_lane_u8(vec, lane) vec.m64_u8[lane]
   11915 
   11916 uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
   11917 #define vget_lane_u16(vec, lane) vec.m64_u16[lane]
   11918 
   11919 
   11920 uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   11921 #define vget_lane_u32(vec, lane) vec.m64_u32[lane]
   11922 
   11923 int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
   11924 #define vget_lane_s8(vec, lane) vec.m64_i8[lane]
   11925 
   11926 int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
   11927 #define vget_lane_s16(vec, lane) vec.m64_i16[lane]
   11928 
   11929 int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   11930 #define vget_lane_s32(vec, lane) vec.m64_i32[lane]
   11931 
   11932 poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   11933 #define vget_lane_p8 vget_lane_u8
   11934 
   11935 poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
   11936 #define vget_lane_p16 vget_lane_u16
   11937 
   11938 float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   11939 #define vget_lane_f32(vec, lane) vec.m64_f32[lane]
   11940 
   11941 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   11942 #define vgetq_lane_u8 _MM_EXTRACT_EPI8
   11943 
   11944 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
   11945 #define  vgetq_lane_u16 _MM_EXTRACT_EPI16
   11946 
   11947 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   11948 #define vgetq_lane_u32 _MM_EXTRACT_EPI32
   11949 
   11950 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
   11951 #define vgetq_lane_s8 vgetq_lane_u8
   11952 
   11953 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
   11954 #define vgetq_lane_s16 vgetq_lane_u16
   11955 
   11956 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   11957 #define vgetq_lane_s32 vgetq_lane_u32
   11958 
   11959 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   11960 #define vgetq_lane_p8 vgetq_lane_u8
   11961 
   11962 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
   11963 #define vgetq_lane_p16 vgetq_lane_u16
   11964 
   11965 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   11966 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
   11967 {
   11968     int32_t ilane;
   11969     ilane = _MM_EXTRACT_PS(vec,lane);
   11970     return *(float*)&ilane;
   11971 }
   11972 
   11973 int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   11974 #define vget_lane_s64(vec, lane) vec.m64_i64[0]
   11975 
   11976 uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   11977 #define vget_lane_u64(vec, lane) vec.m64_u64[0]
   11978 
   11979 
   11980 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
   11981 #define vgetq_lane_s64 (int64_t) vgetq_lane_u64
   11982 
   11983 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
   11984 #define vgetq_lane_u64 _MM_EXTRACT_EPI64
   11985 
   11986 // ***************** Set lanes within a vector ********************************************
   11987 // **************************************************************************************
   11988 //These intrinsics set a single lane (element) within a vector.
    11989 //these are the same functions as the vld1_lane_xx ones, but they take the value to be set directly.
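          //Usage sketch (illustrative comment only):
          //    uint32x2_t v   = vdup_n_u32(0);
          //    uint32x2_t res = vset_lane_u32(100, v, 1);   //res lanes are {0, 100}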
   11990 
   11991 uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   11992 _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
   11993 {
   11994     uint8_t val;
   11995     val = value;
   11996     return vld1_lane_u8(&val, vec,  lane);
   11997 }
   11998 
   11999 uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   12000 _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
   12001 {
   12002     uint16_t val;
   12003     val = value;
   12004     return vld1_lane_u16(&val, vec,  lane);
   12005 }
   12006 
   12007 uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   12008 _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
   12009 {
   12010     uint32_t val;
   12011     val = value;
   12012     return vld1_lane_u32(&val, vec,  lane);
   12013 }
   12014 
   12015 int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   12016 _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
   12017 {
   12018     int8_t val;
   12019     val = value;
   12020     return vld1_lane_s8(&val, vec,  lane);
   12021 }
   12022 
   12023 int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   12024 _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
   12025 {
   12026     int16_t val;
   12027     val = value;
   12028     return vld1_lane_s16(&val, vec,  lane);
   12029 }
   12030 
   12031 int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   12032 _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
   12033 {
   12034     int32_t val;
   12035     val = value;
   12036     return vld1_lane_s32(&val, vec,  lane);
   12037 }
   12038 
   12039 poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   12040 #define vset_lane_p8  vset_lane_u8
   12041 
   12042 poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   12043 #define vset_lane_p16  vset_lane_u16
   12044 
   12045 float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   12046 _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
   12047 {
   12048     float32_t val;
   12049     val = value;
   12050     return vld1_lane_f32(&val, vec,  lane);
   12051 }
   12052 
   12053 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   12054 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
   12055 {
   12056     uint8_t val;
   12057     val = value;
   12058     return vld1q_lane_u8(&val, vec,  lane);
   12059 }
   12060 
   12061 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   12062 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
   12063 {
   12064     uint16_t val;
   12065     val = value;
   12066     return vld1q_lane_u16(&val, vec,  lane);
   12067 }
   12068 
   12069 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   12070 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
   12071 {
   12072     uint32_t val;
   12073     val = value;
   12074     return vld1q_lane_u32(&val, vec,  lane);
   12075 }
   12076 
   12077 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   12078 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
   12079 {
   12080     int8_t val;
   12081     val = value;
   12082     return vld1q_lane_s8(&val, vec,  lane);
   12083 }
   12084 
   12085 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   12086 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
   12087 {
   12088     int16_t val;
   12089     val = value;
   12090     return vld1q_lane_s16(&val, vec,  lane);
   12091 }
   12092 
   12093 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   12094 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
   12095 {
   12096     int32_t val;
   12097     val = value;
   12098     return vld1q_lane_s32(&val, vec,  lane);
   12099 }
   12100 
   12101 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   12102 #define vsetq_lane_p8 vsetq_lane_u8
   12103 
   12104 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   12105 #define vsetq_lane_p16 vsetq_lane_u16
   12106 
   12107 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   12108 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
   12109 {
   12110     float32_t val;
   12111     val = value;
   12112     return vld1q_lane_f32(&val, vec,  lane);
   12113 }
   12114 
   12115 int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   12116 _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
   12117 {
   12118     int64_t val;
   12119     val = value;
   12120     return vld1_lane_s64(&val, vec,  lane);
   12121 }
   12122 
   12123 uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   12124 _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
   12125 {
   12126     uint64_t val;
   12127     val = value;
   12128     return vld1_lane_u64(&val, vec,  lane);
   12129 }
   12130 
   12131 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
   12132 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
   12133 {
    12134     int64_t val;
   12135     val = value;
   12136     return vld1q_lane_s64(&val, vec,  lane);
   12137 }
   12138 
   12139 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
   12140 #define vsetq_lane_u64 vsetq_lane_s64
   12141 
   12142 // *******************************************************************************
   12143 // **************** Initialize a vector from bit pattern ***************************
   12144 // *******************************************************************************
   12145 //These intrinsics create a vector from a literal bit pattern.
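          //The 64-bit argument is reinterpreted, not converted; an illustrative comment-only sketch:
          //    uint64_t bits = 0x0706050403020100ULL;
          //    uint8x8_t v = vcreate_u8(bits);   //lane i holds byte i of bits (little endian), so v.m64_u8[i] == i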
   12146 int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
   12147 #define vcreate_s8(a)  (*(__m64_128*)&(a))
   12148 
   12149 
   12150 int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
   12151 #define vcreate_s16  vcreate_s8
   12152 
   12153 int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
   12154 #define vcreate_s32  vcreate_s8
   12155 
   12156 float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
    12157 //no IA32 SIMD available
   12158 
   12159 float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
   12160 #define vcreate_f32(a)  (*(__m64_128*)&(a))
   12161 
   12162 uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
   12163 #define vcreate_u8 vcreate_s8
   12164 
   12165 uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
   12166 #define vcreate_u16 vcreate_s16
   12167 
   12168 uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
   12169 #define vcreate_u32 vcreate_s32
   12170 
   12171 uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
   12172 #define vcreate_u64  vcreate_s8
   12173 
   12174 
   12175 poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
   12176 #define vcreate_p8 vcreate_u8
   12177 
   12178 poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
   12179 #define vcreate_p16 vcreate_u16
   12180 
   12181 int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
   12182 #define vcreate_s64 vcreate_u64
   12183 
   12184 //********************* Set all lanes to same value ********************************
   12185 //*********************************************************************************
   12186 //These intrinsics set all lanes to the same value.
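          //Usage sketch (illustrative comment only):
          //    int16x8_t v = vdupq_n_s16(-1);   //all eight lanes hold -1, maps to a single _mm_set1_epi16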
   12187 uint8x8_t   vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
   12188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t  vdup_n_u8(uint8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12189 {
   12190     uint8x8_t res;
   12191     int i;
   12192     for (i = 0; i<8; i++) {
   12193         res.m64_u8[i] = value;
   12194     }
   12195     return res;
   12196 }
   12197 
   12198 uint16x4_t   vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
   12199 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t  vdup_n_u16(uint16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12200 {
   12201     uint16x4_t res;
   12202     int i;
   12203     for (i = 0; i<4; i++) {
   12204         res.m64_u16[i] = value;
   12205     }
   12206     return res;
   12207 }
   12208 
   12209 uint32x2_t   vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
   12210 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t  vdup_n_u32(uint32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12211 {
   12212     uint32x2_t res;
   12213     res.m64_u32[0] = value;
   12214     res.m64_u32[1] = value;
   12215     return res;
   12216 }
   12217 
   12218 int8x8_t   vdup_n_s8(int8_t value); // VDUP.8 d0,r0
   12219 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t  vdup_n_s8(int8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12220 {
   12221     int8x8_t res;
   12222     int i;
   12223     for (i = 0; i<8; i++) {
   12224         res.m64_i8[i] = value;
   12225     }
   12226     return res;
   12227 }
   12228 
   12229 int16x4_t   vdup_n_s16(int16_t value); // VDUP.16 d0,r0
   12230 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t  vdup_n_s16(int16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12231 {
   12232     int16x4_t res;
   12233     int i;
   12234     for (i = 0; i<4; i++) {
   12235         res.m64_i16[i] = value;
   12236     }
   12237     return res;
   12238 }
   12239 
   12240 int32x2_t   vdup_n_s32(int32_t value); // VDUP.32 d0,r0
   12241 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t  vdup_n_s32(int32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12242 {
   12243     int32x2_t res;
   12244     res.m64_i32[0] = value;
   12245     res.m64_i32[1] = value;
   12246     return res;
   12247 }
   12248 
   12249 poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
   12250 #define vdup_n_p8 vdup_n_u8
   12251 
   12252 poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
   12253 #define vdup_n_p16 vdup_n_s16
   12254 
   12255 float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
   12256 _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
   12257 {
   12258     float32x2_t res;
   12259     res.m64_f32[0] = value;
   12260     res.m64_f32[1] = value;
   12261     return res;
   12262 }
   12263 
   12264 uint8x16_t   vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
   12265 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
   12266 
   12267 uint16x8_t   vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
   12268 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
   12269 
   12270 uint32x4_t   vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
   12271 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
   12272 
   12273 int8x16_t   vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
   12274 #define vdupq_n_s8 _mm_set1_epi8
   12275 
   12276 int16x8_t   vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
   12277 #define vdupq_n_s16 _mm_set1_epi16
   12278 
   12279 int32x4_t   vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
   12280 #define vdupq_n_s32 _mm_set1_epi32
   12281 
   12282 poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
   12283 #define  vdupq_n_p8 vdupq_n_u8
   12284 
   12285 poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
   12286 #define  vdupq_n_p16 vdupq_n_u16
   12287 
   12288 float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
   12289 #define vdupq_n_f32 _mm_set1_ps
   12290 
   12291 int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
   12292 _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
   12293 {
   12294     int64x1_t res;
   12295     res.m64_i64[0] = value;
   12296     return res;
   12297 }
   12298 
   12299 uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
   12300 _NEON2SSE_INLINE uint64x1_t  vdup_n_u64(uint64_t value)
   12301 {
   12302     uint64x1_t res;
   12303     res.m64_u64[0] = value;
   12304     return res;
   12305 }
   12306 
   12307 int64x2_t   vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
   12308 _NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
   12309 {
   12310     _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
   12311     return LOAD_SI128(value2);
   12312 }
   12313 
   12314 uint64x2_t   vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
   12315 _NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
   12316 {
   12317     _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
   12318     return LOAD_SI128(val);
   12319 }
   12320 
   12321 //****  Set all lanes to same value  ************************
    12322 //Same functions as above - just aliases.********************
    12323 //They probably reflect the fact that the 128-bit versions use the VMOV instruction **********
   12324 uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
    12325 #define vmov_n_u8 vdup_n_u8
   12326 
   12327 uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
    12328 #define vmov_n_u16 vdup_n_u16
   12329 
   12330 uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
   12331 #define vmov_n_u32 vdup_n_u32
   12332 
   12333 int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
   12334 #define vmov_n_s8 vdup_n_s8
   12335 
   12336 int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
   12337 #define vmov_n_s16 vdup_n_s16
   12338 
   12339 int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
   12340 #define vmov_n_s32 vdup_n_s32
   12341 
   12342 poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
   12343 #define vmov_n_p8 vdup_n_u8
   12344 
   12345 poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
   12346 #define vmov_n_p16 vdup_n_s16
   12347 
   12348 float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
   12349 #define vmov_n_f32 vdup_n_f32
   12350 
   12351 uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
   12352 #define vmovq_n_u8 vdupq_n_u8
   12353 
   12354 uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
    12355 #define vmovq_n_u16 vdupq_n_u16
   12356 
   12357 uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
   12358 #define vmovq_n_u32 vdupq_n_u32
   12359 
   12360 int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
   12361 #define vmovq_n_s8 vdupq_n_s8
   12362 
   12363 int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
   12364 #define vmovq_n_s16 vdupq_n_s16
   12365 
   12366 int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
   12367 #define vmovq_n_s32 vdupq_n_s32
   12368 
   12369 poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
   12370 #define vmovq_n_p8 vdupq_n_u8
   12371 
   12372 poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
   12373 #define vmovq_n_p16 vdupq_n_s16
   12374 
   12375 float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
   12376 #define vmovq_n_f32 vdupq_n_f32
   12377 
   12378 int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
   12379 #define vmov_n_s64 vdup_n_s64
   12380 
   12381 uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
   12382 #define vmov_n_u64 vdup_n_u64
   12383 
   12384 int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
   12385 #define vmovq_n_s64 vdupq_n_s64
   12386 
   12387 uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
   12388 #define vmovq_n_u64 vdupq_n_u64
   12389 
   12390 //**************Set all lanes to the value of one lane of a vector *************
   12391 //****************************************************************************
    12392 //here a shuffle is a better solution than lane extraction followed by a set1 function
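          //Usage sketch (illustrative comment only, values made up):
          //    uint64_t bits = 0x0000000200000001ULL;   //two 32-bit lanes: {1, 2}
          //    uint32x2_t d = vcreate_u32(bits);
          //    uint32x4_t q = vdupq_lane_u32(d, 1);     //all four lanes of q hold 2, done with a single _mm_shuffle_epi32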
   12393 uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   12394 _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
   12395 {
   12396     uint8x8_t res;
   12397     uint8_t valane;
   12398     int i = 0;
   12399     valane = vec.m64_u8[lane];
   12400     for (i = 0; i<8; i++) {
   12401         res.m64_u8[i] = valane;
   12402     }
   12403     return res;
   12404 }
   12405 
   12406 uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   12407 _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
   12408 {
   12409     uint16x4_t res;
   12410     uint16_t valane;
   12411     valane = vec.m64_u16[lane];
   12412     res.m64_u16[0] = valane;
   12413     res.m64_u16[1] = valane;
   12414     res.m64_u16[2] = valane;
   12415     res.m64_u16[3] = valane;
   12416     return res;
   12417 }
   12418 
   12419 uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   12420 _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
   12421 {
   12422     uint32x2_t res;
   12423     res.m64_u32[0] = vec.m64_u32[lane];
   12424     res.m64_u32[1] = res.m64_u32[0];
   12425     return res;
   12426 }
   12427 
   12428 int8x8_t vdup_lane_s8(int8x8_t vec,  __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   12429 #define vdup_lane_s8 vdup_lane_u8
   12430 
   12431 int16x4_t vdup_lane_s16(int16x4_t vec,  __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   12432 #define vdup_lane_s16 vdup_lane_u16
   12433 
   12434 int32x2_t vdup_lane_s32(int32x2_t vec,  __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   12435 #define vdup_lane_s32 vdup_lane_u32
   12436 
   12437 poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   12438 #define vdup_lane_p8 vdup_lane_u8
   12439 
   12440 poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   12441 #define vdup_lane_p16 vdup_lane_s16
   12442 
   12443 float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   12444 _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
   12445 {
   12446     float32x2_t res;
   12447     res.m64_f32[0] = vec.m64_f32[lane];
   12448     res.m64_f32[1] = res.m64_f32[0];
   12449     return res;
   12450 }
   12451 
   12452 uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   12453 _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
   12454 {
   12455     _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane, lane};
   12456     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
   12457 }
   12458 
   12459 uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   12460 _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
   12461 {
   12462     //we could use 8bit shuffle for 16 bit as well
   12463     const int8_t lane16 = ((int8_t) lane) << 1;
   12464     _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1,
   12465                                                 lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1, lane16, lane16 + 1};
   12466     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
   12467 }
   12468 
   12469 uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   12470 #define vdupq_lane_u32(vec,  lane) _mm_shuffle_epi32 (_pM128i(vec),  lane | (lane << 2) | (lane << 4) | (lane << 6))
   12471 
   12472 int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   12473 #define vdupq_lane_s8 vdupq_lane_u8
   12474 
   12475 int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   12476 #define vdupq_lane_s16 vdupq_lane_u16
   12477 
   12478 int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   12479 #define vdupq_lane_s32 vdupq_lane_u32
   12480 
   12481 poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   12482 #define vdupq_lane_p8 vdupq_lane_u8
   12483 
   12484 poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   12485 #define vdupq_lane_p16 vdupq_lane_s16
   12486 
   12487 float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   12488 #define  vdupq_lane_f32(vec, lane)  _mm_load1_ps((vec.m64_f32 + lane))
   12489 
   12490 int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   12491 #define vdup_lane_s64(vec,lane) vec
   12492 
   12493 uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   12494 #define vdup_lane_u64(vec,lane) vec
   12495 
   12496 int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
   12497 _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
   12498 {
   12499     __m128i vec128;
   12500     vec128 = _pM128i(vec);
   12501     return _mm_unpacklo_epi64(vec128,vec128);
   12502 }
   12503 
   12504 uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
   12505 #define vdupq_lane_u64 vdupq_lane_s64
   12506 
   12507 // ********************************************************************
   12508 // ********************  Combining vectors *****************************
   12509 // ********************************************************************
    12510 //These intrinsics join two 64 bit vectors into a single 128 bit vector.
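          //Usage sketch (illustrative comment only):
          //    int16x4_t lo = vdup_n_s16(1), hi = vdup_n_s16(2);
          //    int16x8_t q  = vcombine_s16(lo, hi);   //lanes 0..3 hold 1, lanes 4..7 hold 2 (one _mm_unpacklo_epi64)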
   12511 int8x16_t   vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
   12512 #define vcombine_s8(low, high)   _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
   12513 
   12514 int16x8_t   vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
   12515 #define vcombine_s16(low, high)    _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
   12516 
   12517 int32x4_t   vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
   12518 #define vcombine_s32(low, high)   _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
   12519 
   12520 int64x2_t   vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
   12521 #define vcombine_s64(low, high)   _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) )
   12522 
   12523 float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
   12524 //current IA SIMD doesn't support float16
   12525 
   12526 float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
   12527 _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
   12528 {
   12529     __m128i res;
   12530     res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
   12531     return _M128(res);
   12532 }
   12533 
   12534 uint8x16_t   vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
   12535 #define vcombine_u8 vcombine_s8
   12536 
   12537 uint16x8_t   vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
   12538 #define vcombine_u16 vcombine_s16
   12539 
   12540 uint32x4_t   vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
   12541 #define vcombine_u32 vcombine_s32
   12542 
   12543 uint64x2_t   vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
   12544 #define vcombine_u64 vcombine_s64
   12545 
   12546 poly8x16_t   vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
   12547 #define vcombine_p8 vcombine_u8
   12548 
   12549 poly16x8_t   vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
   12550 #define vcombine_p16 vcombine_u16
   12551 
   12552 //**********************************************************************
   12553 //************************* Splitting vectors **************************
   12554 //**********************************************************************
   12555 //**************** Get high part ******************************************
   12556 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
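          //Usage sketch (illustrative comment only, reusing the vcombine example above):
          //    int16x4_t lo = vdup_n_s16(1), hi = vdup_n_s16(2);
          //    int16x8_t q  = vcombine_s16(lo, hi);
          //    int16x4_t h  = vget_high_s16(q);   //all four lanes hold 2
          //    int16x4_t l  = vget_low_s16(q);    //all four lanes hold 1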
   12557 int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
   12558 _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
   12559 {
   12560     int8x8_t res64;
   12561     __m128i res;
   12562     res = _mm_unpackhi_epi64(a,a); //SSE2
   12563     return64(res);
   12564 }
   12565 
   12566 int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
   12567 _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
   12568 {
   12569     int16x4_t res64;
   12570     __m128i res;
   12571     res =  _mm_unpackhi_epi64(a,a); //SSE2
   12572     return64(res);
   12573 }
   12574 
   12575 int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
   12576 _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
   12577 {
   12578     int32x2_t res64;
   12579     __m128i res;
   12580     res =  _mm_unpackhi_epi64(a,a); //SSE2
   12581     return64(res);
   12582 }
   12583 
   12584 int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
   12585 _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
   12586 {
   12587     int64x1_t res64;
   12588     __m128i res;
   12589     res =  _mm_unpackhi_epi64(a,a); //SSE2
   12590     return64(res);
   12591 }
   12592 
   12593 float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
   12594 // IA32 SIMD doesn't work with 16bit floats currently
   12595 
   12596 float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
   12597 _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
   12598 {
   12599     __m128i res;
   12600     __m64_128 res64;
   12601     res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
   12602     return64(res);
   12603 }
   12604 
   12605 uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
   12606 #define vget_high_u8 vget_high_s8
   12607 
   12608 uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
   12609 #define vget_high_u16 vget_high_s16
   12610 
   12611 uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
   12612 #define vget_high_u32 vget_high_s32
   12613 
   12614 uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
   12615 #define vget_high_u64 vget_high_s64
   12616 
   12617 poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
   12618 #define vget_high_p8 vget_high_u8
   12619 
   12620 poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
   12621 #define vget_high_p16 vget_high_u16
   12622 
   12623 //********************** Get low part **********************
   12624 //**********************************************************
   12625 int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
   12626 _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
   12627 {
    12628     int8x8_t res64;
   12629     return64(a);
   12630 }
   12631 
   12632 int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
   12633 _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
   12634 {
   12635     int16x4_t res64;
   12636     return64(a);
   12637 }
   12638 
   12639 int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
   12640 _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
   12641 {
   12642     int32x2_t res64;
   12643     return64(a);
   12644 }
   12645 
   12646 int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
   12647 _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
   12648 {
   12649     int64x1_t res64;
   12650     return64 (a);
   12651 }
   12652 
   12653 float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
   12654 // IA32 SIMD doesn't work with 16bit floats currently
   12655 
   12656 float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
   12657 _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
   12658 {
   12659     float32x2_t res64;
   12660     _M64f(res64, a);
   12661     return res64;
   12662 }
   12663 
   12664 uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
   12665 #define vget_low_u8 vget_low_s8
   12666 
   12667 uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
   12668 #define vget_low_u16 vget_low_s16
   12669 
   12670 uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
   12671 #define vget_low_u32 vget_low_s32
   12672 
   12673 uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
   12674 #define vget_low_u64 vget_low_s64
   12675 
   12676 poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
   12677 #define vget_low_p8 vget_low_u8
   12678 
   12679 poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
   12680 #define vget_low_p16 vget_low_s16
   12681 
   12682 //**************************************************************************
   12683 //************************ Converting vectors **********************************
   12684 //**************************************************************************
   12685 //************* Convert from float ***************************************
    12686 // the current SSE rounding mode may need to be set accordingly via _MM_SET_ROUNDING_MODE(x)
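          //Illustrative comment-only sketch: the truncating conversion rounds toward zero
          //    float32x4_t f = vdupq_n_f32(-1.7f);
          //    int32x4_t   i = vcvtq_s32_f32(f);   //all lanes hold -1 (_mm_cvttps_epi32 truncates)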
   12687 int32x2_t   vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
   12688 _NEON2SSE_INLINE int32x2_t   vcvt_s32_f32(float32x2_t a)
   12689 {
   12690     int32x2_t res64;
   12691     __m128i res;
   12692     res =  _mm_cvttps_epi32(_pM128(a)); //use low 64 bits of result only
   12693     return64(res);
   12694 }
   12695 
   12696 uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
   12697 _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
   12698 {
    12699     //may not be efficient compared with a serial solution
   12700     uint32x2_t res64;
   12701     __m128i res;
   12702     res = vcvtq_u32_f32(_pM128(a));
   12703     return64(res);
   12704 }
   12705 
   12706 int32x4_t   vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
   12707 #define vcvtq_s32_f32 _mm_cvttps_epi32
   12708 
   12709 uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
   12710 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
   12711 {
    12712     //No single instruction SSE solution: clamp the negative lanes to zero, subtract 2^31 from the lanes too big for a signed conversion, convert, then add 2^31 back
    12713     __m128i resi;
    12714     __m128 zero,  mask, a_pos, mask_f_max_si, res;
    12715     _NEON2SSE_ALIGN_16 float32_t fmax_signed[4] = {2147483648.f, 2147483648.f, 2147483648.f, 2147483648.f}; //2^31
    12716     zero = _mm_setzero_ps();
    12717     mask = _mm_cmpgt_ps(a, zero);
    12718     a_pos = _mm_and_ps(a, mask); //negative lanes become zero
    12719     mask_f_max_si = _mm_cmpge_ps(a_pos, *(__m128*)fmax_signed); //lanes that do not fit into a signed int32
    12720     res =  _mm_sub_ps(a_pos, _mm_and_ps(mask_f_max_si, *(__m128*)fmax_signed)); //if the input fits into signed we don't subtract anything
    12721     resi = _mm_cvttps_epi32(res);
    12722     return _mm_add_epi32(resi, _mm_slli_epi32(*(__m128i*)&mask_f_max_si, 31)); //add 2^31 back as an integer for the big lanes
   12723 }
   12724 
   12725 // ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
   12726 //*************************************************************************************************
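          //i.e. the result is the input multiplied by 2^b and then truncated to an integer; an illustrative comment-only sketch:
          //    float32x2_t f = vdup_n_f32(1.5f);
          //    int32x2_t   q = vcvt_n_s32_f32(f, 8);   //1.5 as Q8 fixed point, both lanes hold 384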
   12727 int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
   12728 _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
   12729 {
   12730     int32x2_t res64;
   12731     return64(vcvtq_n_s32_f32(_pM128(a),b));
   12732 }
   12733 
   12734 uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
   12735 _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
   12736 {
   12737     uint32x2_t res;
   12738     float convconst;
   12739     convconst = (float)((uint32_t)1 << b);
   12740     res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
   12741     res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
   12742     return res;
   12743 }
   12744 
   12745 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
   12746 _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
   12747 {
   12748     float convconst;
   12749     _NEON2SSE_ALIGN_16 uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   12750     __m128 cconst128;
   12751     __m128i mask, res;
    12752     convconst = (float)((uint32_t)1 << b);
   12753     cconst128 = vdupq_n_f32(convconst);
   12754     res =  _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
   12755     mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
   12756     return _mm_xor_si128 (res,  mask); //res saturated for 0x80000000
   12757 }
   12758 
   12759 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
   12760 _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
   12761 {
   12762     float convconst;
   12763     __m128 cconst128;
    12764     convconst = (float)((uint32_t)1 << b);
   12765     cconst128 = vdupq_n_f32(convconst);
   12766     return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
   12767 }
   12768 
   12769 //***************** Convert to float *************************
   12770 //*************************************************************
   12771 float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
   12772 _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
   12773 {
   12774     float32x2_t res;
   12775     res.m64_f32[0] = (float) a.m64_i32[0];
   12776     res.m64_f32[1] = (float) a.m64_i32[1];
   12777     return res;
   12778 }
   12779 
   12780 float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
   12781 _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
   12782 {
   12783     float32x2_t res;
   12784     res.m64_f32[0] = (float) a.m64_u32[0];
   12785     res.m64_f32[1] = (float) a.m64_u32[1];
   12786     return res;
   12787 }
   12788 
   12789 float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
   12790 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
   12791 
   12792 float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
   12793 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
   12794 {
    12795     //the solution may not be optimal
   12796     __m128 two16, fHi, fLo;
   12797     __m128i hi, lo;
   12798     two16 = _mm_set1_ps((float)0x10000); //2^16
   12799     // Avoid double rounding by doing two exact conversions
   12800     // of high and low 16-bit segments
   12801     hi = _mm_srli_epi32(a, 16);
   12802     lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
   12803     fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
   12804     fLo = _mm_cvtepi32_ps(lo);
   12805     // do single rounding according to current rounding mode
   12806     return _mm_add_ps(fHi, fLo);
   12807 }
   12808 
   12809 // ***** Convert to the float from fixed point  with   the number of fraction bits specified by b ***********
   12810 float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
   12811 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
   12812 {
   12813     float32x2_t res;
   12814     float convconst;
   12815     convconst = (float)(1. / ((uint32_t)1 << b));
   12816     res.m64_f32[0] =  a.m64_i32[0] * convconst;
   12817     res.m64_f32[1] = a.m64_i32[1] * convconst;
   12818     return res;
   12819 }
   12820 
   12821 float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
   12822 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
   12823 {
   12824     float32x2_t res;
   12825     float convconst;
   12826     convconst = (float)(1. / ((uint32_t)1 << b));
   12827     res.m64_f32[0] =  a.m64_u32[0] * convconst;
   12828     res.m64_f32[1] = a.m64_u32[1] * convconst;
   12829     return res;
   12830 }
   12831 
   12832 float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
   12833 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
   12834 {
   12835     float convconst;
   12836     __m128 cconst128, af;
   12837     convconst = (float)(1. / ((uint32_t)1 << b));
   12838     af = _mm_cvtepi32_ps(a);
   12839     cconst128 = vdupq_n_f32(convconst);
   12840     return _mm_mul_ps(af,cconst128);
   12841 }
   12842 
   12843 float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
   12844 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
   12845 {
   12846     float convconst;
   12847     __m128 cconst128, af;
    12848     convconst = (float)(1. / ((uint32_t)1 << b));
   12849     af = vcvtq_f32_u32(a);
   12850     cconst128 = vdupq_n_f32(convconst);
   12851     return _mm_mul_ps(af,cconst128);
   12852 }
   12853 
   12854 //**************Convert between floats ***********************
   12855 //************************************************************
   12856 float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
    12857 //Intel SIMD doesn't support 16-bit floats currently
   12858 
   12859 float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
    12860 //Intel SIMD doesn't support 16-bit floats currently, the only solution is to store the 16-bit floats and load them as 32 bits
   12861 
   12862 //************Vector narrow integer conversion (truncation) ******************
   12863 //****************************************************************************
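          //Usage sketch (illustrative comment only): plain truncation, no saturation
          //    int16x8_t q = vdupq_n_s16(0x0123);
          //    int8x8_t  d = vmovn_s16(q);   //every lane holds 0x23, the high byte of each element is discarded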
   12864 int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
   12865 _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
   12866 {
   12867     int8x8_t res64;
   12868     __m128i res;
   12869     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   12870     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
   12871     return64(res);
   12872 }
   12873 
   12874 int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
   12875 _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
   12876 {
   12877     int16x4_t res64;
   12878     __m128i res;
   12879     _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
   12880     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
   12881     return64(res);
   12882 }
   12883 
   12884 int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
   12885 _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
   12886 {
    12887     //may not be efficient compared with a serial implementation
   12888     int32x2_t res64;
   12889     __m128i res;
   12890     res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
   12891     return64(res);
   12892 }
   12893 
   12894 uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
   12895 #define vmovn_u16 vmovn_s16
   12896 
   12897 uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
   12898 #define vmovn_u32 vmovn_s32
   12899 
   12900 uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
   12901 #define vmovn_u64 vmovn_s64
   12902 
   12903 //**************** Vector long move   ***********************
   12904 //***********************************************************
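          //Usage sketch (illustrative comment only): each lane is sign or zero extended to the doubled width
          //    int8x8_t  d = vdup_n_s8(-1);
          //    int16x8_t q = vmovl_s8(d);   //all eight 16-bit lanes hold -1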
   12905 int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
   12906 #define vmovl_s8(a) _MM_CVTEPI8_EPI16(_pM128i(a)) //SSE4.1
   12907 
   12908 int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
   12909 #define vmovl_s16(a) _MM_CVTEPI16_EPI32(_pM128i(a)) //SSE4.1
   12910 
   12911 int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
   12912 #define vmovl_s32(a)  _MM_CVTEPI32_EPI64(_pM128i(a)) //SSE4.1
   12913 
   12914 uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
   12915 #define vmovl_u8(a) _MM_CVTEPU8_EPI16(_pM128i(a)) //SSE4.1
   12916 
   12917 uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0
   12918 #define vmovl_u16(a) _MM_CVTEPU16_EPI32(_pM128i(a)) //SSE4.1
   12919 
   12920 uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
   12921 #define vmovl_u32(a)  _MM_CVTEPU32_EPI64(_pM128i(a)) //SSE4.1
   12922 
   12923 //*************Vector saturating narrow integer*****************
   12924 //**************************************************************
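          //Usage sketch (illustrative comment only): values outside the narrow range saturate
          //    int16x8_t q = vdupq_n_s16(300);
          //    int8x8_t  d = vqmovn_s16(q);   //all lanes hold 127 (saturated by _mm_packs_epi16)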
   12925 int8x8_t   vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
   12926 _NEON2SSE_INLINE int8x8_t   vqmovn_s16(int16x8_t a)
   12927 {
   12928     int8x8_t res64;
   12929     __m128i res;
   12930     res = _mm_packs_epi16(a, a);
   12931     return64(res);
   12932 }
   12933 
   12934 int16x4_t   vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
   12935 _NEON2SSE_INLINE int16x4_t   vqmovn_s32(int32x4_t a)
   12936 {
   12937     int16x4_t res64;
   12938     __m128i res;
   12939     res = _mm_packs_epi32(a, a);
   12940     return64(res);
   12941 }
   12942 
   12943 int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
   12944 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
   12945 {
   12946     int32x2_t res;
   12947     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   12948     _mm_store_si128((__m128i*)atmp, a);
   12949     if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
   12950     if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
   12951     if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
   12952     if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
   12953     res.m64_i32[0] = (int32_t)atmp[0];
   12954     res.m64_i32[1] = (int32_t)atmp[1];
   12955     return res;
   12956 }
   12957 
   12958 uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0
   12959 _NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0
   12960 {
   12961     //no uint16 to uint8 conversion in SSE, need truncate to max signed first
   12962     uint8x8_t res64;
   12963     __m128i c7fff, a_trunc;
   12964     c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
   12965     a_trunc =  _mm_and_si128(a,  c7fff); // a truncated to max signed
   12966     a_trunc =  _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
   12967     return64(a_trunc);
   12968 }
   12969 
   12970 uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
   12971 _NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
   12972 {
    12973     //no uint32 to uint16 conversion in SSE: clear the 31-st bit, then push the lanes where it was set back above 0xffff so the unsigned pack saturates them
    12974     uint16x4_t res64;
    12975     __m128i c7fffffff, a_trunc, mask_trunc;
    12976     c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-st bit set to zero
    12977     a_trunc =  _mm_and_si128(a,  c7fffffff); // a with the 31-st bit cleared
              mask_trunc = _mm_and_si128(_mm_cmpgt_epi32(a_trunc, a), c7fffffff); //0x7fffffff for the lanes where the 31-st bit had been set
    12978     a_trunc = _MM_PACKUS1_EPI32 (_mm_or_si128(a_trunc, mask_trunc)); //use low 64bits only
   12979     return64(a_trunc);
   12980 }
   12981 
   12982 uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
   12983 _NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
   12984 {
   12985     //serial solution may be faster
   12986     uint32x2_t res64;
   12987     __m128i res_hi, mask;
   12988     mask = _mm_setzero_si128();
   12989     res_hi = _mm_srli_epi64(a, 32);
   12990     res_hi = _mm_cmpeq_epi32(res_hi, mask);
   12991     mask = _mm_cmpeq_epi32(mask,mask); //all fff
    12992     mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to flag the numbers that need more than 32 bits
   12993     res_hi = _mm_or_si128(a, mask);
   12994     res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   12995     return64(res_hi);
   12996 }
   12997 //************* Vector saturating narrow integer signed->unsigned **************
   12998 //*****************************************************************************
   12999 uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
   13000 _NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
   13001 {
   13002     uint8x8_t res64;
   13003     __m128i res;
   13004     res = _mm_packus_epi16(a, a); //use low 64bits only
   13005     return64(res);
   13006 }
   13007 
   13008 uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
   13009 _NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
   13010 {
   13011     uint16x4_t res64;
   13012     __m128i res;
   13013     res = _MM_PACKUS1_EPI32(a); //use low 64bits only
   13014     return64(res);
   13015 }
   13016 
   13017 uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
   13018 _NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
   13019 {
   13020     uint32x2_t res64;
   13021     __m128i res_hi,res_lo, zero, cmp;
   13022     zero = _mm_setzero_si128();
   13023     res_hi = _mm_srli_epi64(a,  32);
   13024     cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
   13025     res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
   13026     cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
    13027     res_lo =  _mm_or_si128(res_lo, cmp); //if cmp is positive we are out of 32 bits and need to saturate to 0xffffffff
   13028     res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   13029     return64(res_lo);
   13030 }
   13031 
   13032 // ********************************************************
   13033 // **************** Table look up **************************
   13034 // ********************************************************
   13035 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
   13036 //in a table and generate a new vector. Indexes out of range return 0.
    13037 //for Intel SIMD (PSHUFB) we need to set the MSB of the out of range indexes to 1 to get a zero return
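          //Usage sketch (illustrative comment only, values made up):
          //    uint64_t tbl_bits = 0x1716151413121110ULL, idx_bits = 0x01010101C8000607ULL;
          //    uint8x8_t res = vtbl1_u8(vcreate_u8(tbl_bits), vcreate_u8(idx_bits));
          //    //index bytes (low to high) 7,6,0,0xC8,1,1,1,1 give res bytes 0x17,0x16,0x10,0x00,0x11,0x11,0x11,0x11 - index 0xC8 is out of range so its lane is 0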
   13038 uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   13039 _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
   13040 {
   13041     uint8x8_t res64;
   13042     __m128i c7, maskgt, bmask, b128;
   13043     c7 = _mm_set1_epi8 (7);
   13044     b128 = _pM128i(b);
   13045     maskgt = _mm_cmpgt_epi8(b128,c7);
   13046     bmask = _mm_or_si128(b128,maskgt);
   13047     bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
   13048     return64(bmask);
   13049 }
   13050 
   13051 int8x8_t vtbl1_s8(int8x8_t a,  int8x8_t b); // VTBL.8 d0, {d0}, d0
   13052 #define vtbl1_s8 vtbl1_u8
   13053 
   13054 poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   13055 #define vtbl1_p8 vtbl1_u8
   13056 
    13057 //Special trick to avoid the "__declspec(align('8')) won't be aligned" error
   13058 //uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13059 uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13060 _NEON2SSE_INLINE uint8x8_t vtbl2_u8_ptr(uint8x8x2_t* a, uint8x8_t b)
   13061 {
   13062     uint8x8_t res64;
   13063     __m128i c15, a01, maskgt15, bmask, b128;
   13064     c15 = _mm_set1_epi8 (15);
   13065     b128 = _pM128i(b);
   13066     maskgt15 = _mm_cmpgt_epi8(b128,c15);
   13067     bmask = _mm_or_si128(b128, maskgt15);
   13068     a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]), _pM128i(a->val[1]));
   13069     a01 =  _mm_shuffle_epi8(a01, bmask);
   13070     return64(a01);
   13071 }
   13072 #define vtbl2_u8(a, b) vtbl2_u8_ptr(&a, b)
   13073 
   13074 //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13075 #define vtbl2_s8 vtbl2_u8
   13076 
   13077 //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13078 #define vtbl2_p8 vtbl2_u8
   13079 
    13080 //Special trick to avoid the "__declspec(align('16')) won't be aligned" error
   13081 //uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13082 _NEON2SSE_INLINE uint8x8_t vtbl3_u8_ptr(uint8x8x3_t* a, uint8x8_t b)
   13083 {
    13084     //the solution may not be optimal
   13085     uint8x8_t res64;
   13086     __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
   13087     c15 = _mm_set1_epi8 (15);
   13088     c23 = _mm_set1_epi8 (23);
   13089     b128 = _pM128i(b);
   13090     maskgt23 = _mm_cmpgt_epi8(b128,c23);
   13091     bmask = _mm_or_si128(b128, maskgt23);
   13092     maskgt15 = _mm_cmpgt_epi8(b128,c15);
   13093     a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
   13094     sh0 =  _mm_shuffle_epi8(a01, bmask);
    13095     sh1 =  _mm_shuffle_epi8(_pM128i(a->val[2]), bmask); //for bi>15 only the low 4 bits are used, i.e. bi is wrapped (bi-=16)
   13096     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
   13097     return64(sh0);
   13098 }
   13099 #define vtbl3_u8(a,b) vtbl3_u8_ptr(&a,b)
   13100 
   13101 //int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13102 int8x8_t vtbl3_s8_ptr(int8x8x3_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13103 #define vtbl3_s8 vtbl3_u8
   13104 
   13105 //poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13106 poly8x8_t vtbl3_p8_ptr(poly8x8x3_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13107 #define vtbl3_p8 vtbl3_u8
   13108 
   13109 //uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13110 _NEON2SSE_INLINE uint8x8_t vtbl4_u8_ptr(uint8x8x4_t* a, uint8x8_t b)
   13111 {
    13112     //the solution may not be optimal
   13113     uint8x8_t res64;
   13114     __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
   13115     c15 = _mm_set1_epi8 (15);
   13116     c31 = _mm_set1_epi8 (31);
   13117     b128 = _pM128i(b);
   13118     maskgt31 = _mm_cmpgt_epi8(b128,c31);
   13119     bmask = _mm_or_si128(b128, maskgt31);
   13120     maskgt15 = _mm_cmpgt_epi8(b128,c15);
   13121     a01 = _mm_unpacklo_epi64(_pM128i(a->val[0]),_pM128i(a->val[1]));
   13122     a23 = _mm_unpacklo_epi64(_pM128i(a->val[2]),_pM128i(a->val[3]));
   13123     sh0 =  _mm_shuffle_epi8(a01, bmask);
    13124     sh1 =  _mm_shuffle_epi8(a23, bmask); //for bi>15 only the low 4 bits are used, i.e. bi is wrapped (bi-=16)
   13125     sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
   13126     return64(sh0);
   13127 }
   13128 #define vtbl4_u8(a,b) vtbl4_u8_ptr(&a,b)
   13129 
   13130 //int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13131 int8x8_t vtbl4_s8_ptr(int8x8x4_t* a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13132 #define vtbl4_s8 vtbl4_u8
   13133 
   13134 //poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13135 poly8x8_t vtbl4_p8_ptr(poly8x8x4_t* a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13136 #define vtbl4_p8 vtbl4_u8
   13137 
   13138 //****************** Extended table look up intrinsics ***************************
   13139 //**********************************************************************************
    13140 //VTBX (Vector Table Extension) works in the same way as VTBL does,
   13141 // except that indexes out of range leave the destination element unchanged.
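          //Usage sketch (illustrative comment only), compare with the vtbl1_u8 example above:
          //    uint64_t tbl_bits = 0x1716151413121110ULL, idx_bits = 0x0101010114000607ULL;
          //    uint8x8_t res = vtbx1_u8(vdup_n_u8(0x99), vcreate_u8(tbl_bits), vcreate_u8(idx_bits));
          //    //same lookup as above except that index 0x14 is out of range, so that lane keeps the destination value 0x99 instead of becoming 0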
   13142 
   13143 uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   13144 _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
   13145 {
   13146     uint8x8_t res64;
   13147     __m128i c7, maskgt, sh, c128;
   13148     c7 = _mm_set1_epi8 (7);
   13149     c128 = _pM128i(c);
   13150     maskgt = _mm_cmpgt_epi8(c128,c7);
   13151     c7 = _mm_and_si128(maskgt,_pM128i(a));
   13152     sh = _mm_shuffle_epi8(_pM128i(b),c128);
   13153     sh = _mm_andnot_si128(maskgt,sh);
   13154     sh =  _mm_or_si128(sh,c7);
   13155     return64(sh);
   13156 }
   13157 
   13158 int8x8_t vtbx1_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
   13159 #define vtbx1_s8 vtbx1_u8
   13160 
   13161 poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   13162 #define vtbx1_p8 vtbx1_u8
   13163 
    13164 //Special trick to avoid the "__declspec(align('8')) won't be aligned" error
   13165 //uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   13166 uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   13167 _NEON2SSE_INLINE uint8x8_t vtbx2_u8_ptr(uint8x8_t a, uint8x8x2_t* b, uint8x8_t c)
   13168 {
   13169     uint8x8_t res64;
   13170     __m128i c15, b01, maskgt15, sh, c128;
   13171     c15 = _mm_set1_epi8 (15);
   13172     c128 = _pM128i(c);
   13173     maskgt15 = _mm_cmpgt_epi8(c128, c15);
   13174     c15 = _mm_and_si128(maskgt15, _pM128i(a));
   13175     b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]), _pM128i(b->val[1]));
   13176     sh =  _mm_shuffle_epi8(b01, c128);
   13177     sh = _mm_andnot_si128(maskgt15, sh);
   13178     sh =  _mm_or_si128(sh,c15);
   13179     return64(sh);
   13180 }
   13181 #define vtbx2_u8(a, b, c) vtbx2_u8_ptr(a, &b, c)
   13182 
   13183 //int8x8_t vtbx2_s8(int8x8_t a,  int8x8x2_t b, int8x8_t c);  // VTBX.8 d0, {d0, d1}, d0
   13184 #define vtbx2_s8 vtbx2_u8
   13185 
   13186 //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   13187 #define vtbx2_p8 vtbx2_u8
   13188 
   13189 //uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2}, d0
   13190 _NEON2SSE_INLINE uint8x8_t vtbx3_u8_ptr(uint8x8_t a, uint8x8x3_t* b, uint8x8_t c)
   13191 {
    13192     //the solution may not be optimal
   13193     uint8x8_t res64;
   13194     __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
   13195     c15 = _mm_set1_epi8 (15);
   13196     c23 = _mm_set1_epi8 (23);
   13197     c128 = _pM128i(c);
   13198     maskgt15 = _mm_cmpgt_epi8(c128,c15);
   13199     maskgt23 = _mm_cmpgt_epi8(c128,c23);
   13200     c23 = _mm_and_si128(maskgt23, _pM128i(a));
   13201     b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
   13202     sh0 =  _mm_shuffle_epi8(b01, c128);
    13203     sh1 =  _mm_shuffle_epi8(_pM128i(b->val[2]), c128); //for bi>15 only the low 4 bits are used, i.e. bi is wrapped (bi-=16)
   13204     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
   13205     sh0 = _mm_andnot_si128(maskgt23,sh0);
   13206     sh0 = _mm_or_si128(sh0,c23);
   13207     return64(sh0);
   13208 }
   13209 #define vtbx3_u8(a, b, c) vtbx3_u8_ptr(a, &b, c)
   13210 
   13211 //int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   13212 int8x8_t vtbx3_s8_ptr(int8x8_t a, int8x8x3_t* b, int8x8_t c);
   13213 #define vtbx3_s8 vtbx3_u8
   13214 
   13215 //poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   13216 poly8x8_t vtbx3_p8_ptr(poly8x8_t a, poly8x8x3_t* b, uint8x8_t c);
   13217 #define vtbx3_p8 vtbx3_u8
   13218 
   13219 //uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) // VTBX.8 d0, {d0, d1, d2, d3}, d0
   13220 _NEON2SSE_INLINE uint8x8_t vtbx4_u8_ptr(uint8x8_t a, uint8x8x4_t* b, uint8x8_t c)
   13221 {
    13222     //the solution may not be optimal
   13223     uint8x8_t res64;
   13224     __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
   13225     c15 = _mm_set1_epi8 (15);
   13226     c31 = _mm_set1_epi8 (31);
   13227     c128 = _pM128i(c);
   13228     maskgt15 = _mm_cmpgt_epi8(c128,c15);
   13229     maskgt31 = _mm_cmpgt_epi8(c128,c31);
   13230     c31 = _mm_and_si128(maskgt31, _pM128i(a));
   13231 
   13232     b01 = _mm_unpacklo_epi64(_pM128i(b->val[0]),_pM128i(b->val[1]));
   13233     b23 = _mm_unpacklo_epi64(_pM128i(b->val[2]),_pM128i(b->val[3]));
   13234     sh0 =  _mm_shuffle_epi8(b01, c128);
   13235     sh1 =  _mm_shuffle_epi8(b23, c128); //for bi>15 the index wraps (bi -= 16)
   13236     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
   13237     sh0 = _mm_andnot_si128(maskgt31,sh0);
   13238     sh0 =  _mm_or_si128(sh0,c31);
   13239     return64(sh0);
   13240 }
   13241 #define vtbx4_u8(a, b, c) vtbx4_u8_ptr(a, &b, c)
   13242 
   13243 //int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   13244 int8x8_t vtbx4_s8_ptr(int8x8_t a, int8x8x4_t* b, int8x8_t c);
   13245 #define vtbx4_s8 vtbx4_u8
   13246 
   13247 //poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   13248 poly8x8_t vtbx4_p8_ptr(poly8x8_t a, poly8x8x4_t* b, uint8x8_t c);
   13249 #define vtbx4_p8 vtbx4_u8
   13250 
   13251 //*************************************************************************************************
   13252 // *************************** Operations with a scalar value *********************************
   13253 //*************************************************************************************************
   13254 
   13255 //******* Vector multiply accumulate by scalar *************************************************
   13256 //**********************************************************************************************
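         //Usage sketch (illustrative values): the *_lane_* forms below read a single lane of v
         //and behave as  dst[i] = a[i] + b[i] * v[l]  (element-wise, no saturation), e.g.
         //vmla_lane_s16({1,1,1,1}, {2,2,2,2}, {10,20,30,40}, 2) == {61,61,61,61}.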
   13257 int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
   13258 _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
   13259 {
   13260     int16_t c;
   13261     int16x4_t scalar;
   13262     c = vget_lane_s16(v, l);
   13263     scalar = vdup_n_s16(c);
   13264     return vmla_s16(a, b, scalar);
   13265 }
   13266 
   13267 int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
   13268 _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
   13269 {
   13270     int32_t c;
   13271     int32x2_t scalar;
   13272     c = vget_lane_s32(v, l);
   13273     scalar = vdup_n_s32(c);
   13274     return vmla_s32(a, b, scalar);
   13275 }
   13276 
   13277 uint16x4_t vmla_lane_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
   13278 #define vmla_lane_u16 vmla_lane_s16
   13279 
   13280 
   13281 uint32x2_t vmla_lane_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
   13282 #define vmla_lane_u32 vmla_lane_s32
   13283 
   13284 float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
   13285 _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
   13286 {
   13287     float32_t vlane;
   13288     float32x2_t c;
   13289     vlane = vget_lane_f32(v, l);
   13290     c = vdup_n_f32(vlane);
   13291     return vmla_f32(a,b,c);
   13292 }
   13293 
   13294 int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
   13295 _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
   13296 {
   13297     int16_t vlane;
   13298     int16x8_t c;
   13299     vlane = vget_lane_s16(v, l);
   13300     c = vdupq_n_s16(vlane);
   13301     return vmlaq_s16(a,b,c);
   13302 }
   13303 
   13304 int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
   13305 _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
   13306 {
   13307     int32_t vlane;
   13308     int32x4_t c;
   13309     vlane = vget_lane_s32(v, l);
   13310     c = vdupq_n_s32(vlane);
   13311     return vmlaq_s32(a,b,c);
   13312 }
   13313 
   13314 uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
   13315 #define vmlaq_lane_u16 vmlaq_lane_s16
   13316 
   13317 uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
   13318 #define vmlaq_lane_u32 vmlaq_lane_s32
   13319 
   13320 float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
   13321 _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
   13322 {
   13323     float32_t vlane;
   13324     float32x4_t c;
   13325     vlane = vget_lane_f32(v, l);
   13326     c = vdupq_n_f32(vlane);
   13327     return vmlaq_f32(a,b,c);
   13328 }
   13329 
   13330 //***************** Vector widening multiply accumulate by scalar **********************
   13331 //***************************************************************************************
   13332 int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
   13333 _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
   13334 {
   13335     int16_t vlane;
   13336     int16x4_t c;
   13337     vlane = vget_lane_s16(v, l);
   13338     c = vdup_n_s16(vlane);
   13339     return vmlal_s16(a, b, c);
   13340 }
   13341 
   13342 int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
   13343 _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
   13344 {
   13345     int32_t vlane;
   13346     int32x2_t c;
   13347     vlane = vget_lane_s32(v, l);
   13348     c = vdup_n_s32(vlane);
   13349     return vmlal_s32(a, b, c);
   13350 }
   13351 
   13352 uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0, d0, d0[0]
   13353 _NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.U16 q0, d0, d0[0]
   13354 {
   13355     uint16_t vlane;
   13356     uint16x4_t c;
   13357     vlane = vget_lane_u16(v, l);
   13358     c = vdup_n_u16(vlane);
   13359     return vmlal_u16(a, b, c);
   13360 }
   13361 
   13362 uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
   13363 _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
   13364 {
   13365     uint32_t vlane;
   13366     uint32x2_t c;
   13367     vlane = vget_lane_u32(v, l);
   13368     c = vdup_n_u32(vlane);
   13369     return vmlal_u32(a, b, c);
   13370 }
   13371 
   13372 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
   13373 // ************************************************************************************************
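         //Usage sketch (illustrative values): vqdmlal_lane computes, in the widened type,
         //    dst[i] = sat(a[i] + 2 * b[i] * v[l]);
         //e.g. for the s16 form, b[i] = 0x4000 and v[l] = 0x4000 add 2*0x4000*0x4000 = 0x20000000
         //to a[i], saturating to INT32_MAX on overflow.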
   13374 int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
   13375 _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
   13376 {
   13377     int16_t vlane;
   13378     int16x4_t c;
   13379     vlane = vget_lane_s16(v, l);
   13380     c = vdup_n_s16(vlane);
   13381     return vqdmlal_s16(a, b, c);
   13382 }
   13383 
   13384 int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
   13385 _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
   13386 {
   13387     int32_t vlane;
   13388     int32x2_t c;
   13389     vlane = vget_lane_s32(v, l);
   13390     c = vdup_n_s32(vlane);
   13391     return vqdmlal_s32(a, b, c);
   13392 }
   13393 
   13394 // ****** Vector multiply subtract by scalar *****************
   13395 // *************************************************************
   13396 int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
   13397 _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
   13398 {
   13399     int16_t vlane;
   13400     int16x4_t c;
   13401     vlane = vget_lane_s16(v, l);
   13402     c = vdup_n_s16(vlane);
   13403     return vmls_s16(a, b, c);
   13404 }
   13405 
   13406 int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
   13407 _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
   13408 {
   13409     int32_t vlane;
   13410     int32x2_t c;
   13411     vlane = vget_lane_s32(v, l);
   13412     c = vdup_n_s32(vlane);
   13413     return vmls_s32(a, b, c);
   13414 }
   13415 
   13416 uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
   13417 _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
   13418 {
   13419     uint16_t vlane;
   13420     uint16x4_t c;
   13421     vlane = vget_lane_s16(v, l);
   13422     c = vdup_n_s16(vlane);
   13423     return vmls_s16(a, b, c);
   13424 }
   13425 
   13426 uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
   13427 _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
   13428 {
   13429     uint32_t vlane;
   13430     uint32x2_t c;
   13431     vlane = vget_lane_u32(v, l);
   13432     c = vdup_n_u32(vlane);
   13433     return vmls_u32(a, b, c);
   13434 }
   13435 
   13436 float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
   13437 _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
   13438 {
   13439     float32_t vlane;
   13440     float32x2_t c;
   13441     vlane = (float) vget_lane_f32(v, l);
   13442     c = vdup_n_f32(vlane);
   13443     return vmls_f32(a,b,c);
   13444 }
   13445 
   13446 int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
   13447 _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
   13448 {
   13449     int16_t vlane;
   13450     int16x8_t c;
   13451     vlane = vget_lane_s16(v, l);
   13452     c = vdupq_n_s16(vlane);
   13453     return vmlsq_s16(a, b,c);
   13454 }
   13455 
   13456 int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
   13457 _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
   13458 {
   13459     int32_t vlane;
   13460     int32x4_t c;
   13461     vlane = vget_lane_s32(v, l);
   13462     c = vdupq_n_s32(vlane);
   13463     return vmlsq_s32(a,b,c);
   13464 }
   13465 
   13466 uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
   13467 _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
   13468 {
   13469     uint16_t vlane;
   13470     uint16x8_t c;
   13471     vlane = vget_lane_u16(v, l);
   13472     c = vdupq_n_u16(vlane);
   13473     return vmlsq_u16(a,b,c);
   13474 }
   13475 
   13476 uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
   13477 _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
   13478 {
   13479     uint32_t vlane;
   13480     uint32x4_t c;
   13481     vlane = vget_lane_u32(v, l);
   13482     c = vdupq_n_u32(vlane);
   13483     return vmlsq_u32(a,b,c);
   13484 }
   13485 
   13486 float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
   13487 _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
   13488 {
   13489     float32_t vlane;
   13490     float32x4_t c;
   13491     vlane = (float) vget_lane_f32(v, l);
   13492     c = vdupq_n_f32(vlane);
   13493     return vmlsq_f32(a,b,c);
   13494 }
   13495 
   13496 // **** Vector widening multiply subtract by scalar ****
   13497 // ****************************************************
   13498 int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
   13499 _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
   13500 {
   13501     int16_t vlane;
   13502     int16x4_t c;
   13503     vlane = vget_lane_s16(v, l);
   13504     c = vdup_n_s16(vlane);
   13505     return vmlsl_s16(a, b, c);
   13506 }
   13507 
   13508 int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
   13509 _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
   13510 {
   13511     int32_t vlane;
   13512     int32x2_t c;
   13513     vlane = vget_lane_s32(v, l);
   13514     c = vdup_n_s32(vlane);
   13515     return vmlsl_s32(a, b, c);
   13516 }
   13517 
   13518 uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0, d0, d0[0]
   13519 _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.U16 q0, d0, d0[0]
   13520 {
   13521     uint16_t vlane;
   13522     uint16x4_t c;
   13523     vlane = vget_lane_u16(v, l);
   13524     c = vdup_n_u16(vlane);
   13525     return vmlsl_u16(a, b, c); //unsigned widening: zero-extends the 16-bit inputs
   13526 }
   13527 
   13528 uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
   13529 _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
   13530 {
   13531     uint32_t vlane;
   13532     uint32x2_t c;
   13533     vlane = vget_lane_u32(v, l);
   13534     c = vdup_n_u32(vlane);
   13535     return vmlsl_u32(a, b, c);
   13536 }
   13537 
   13538 //********* Vector widening saturating doubling multiply subtract by scalar **************************
   13539 //******************************************************************************************************
   13540 int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
   13541 _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
   13542 {
   13543     int16_t vlane;
   13544     int16x4_t c;
   13545     vlane = vget_lane_s16(v, l);
   13546     c = vdup_n_s16(vlane);
   13547     return vqdmlsl_s16(a, b, c);
   13548 }
   13549 
   13550 int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
   13551 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
   13552 {
   13553     int32_t vlane;
   13554     int32x2_t c;
   13555     vlane = vget_lane_s32(v, l);
   13556     c = vdup_n_s32(vlane);
   13557     return vqdmlsl_s32(a, b, c);
   13558 }
   13559 //********** Vector multiply with scalar *****************************
   13560 int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
   13561 _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
   13562 {
   13563     int16x4_t b16x4;
   13564     b16x4 = vdup_n_s16(b);
   13565     return vmul_s16(a, b16x4);
   13566 }
   13567 
   13568 int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
   13569 _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
   13570 {
   13571     //serial solution looks faster
   13572     int32x2_t b32x2;
   13573     b32x2 = vdup_n_s32(b);
   13574     return vmul_s32(a, b32x2);
   13575 }
   13576 
   13577 float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
   13578 _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
   13579 {
   13580     float32x2_t b32x2;
   13581     b32x2 = vdup_n_f32(b);
   13582     return vmul_f32(a, b32x2);
   13583 }
   13584 
   13585 uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
   13586 _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
   13587 {
   13588     uint16x4_t b16x4;
   13589     b16x4 = vdup_n_s16(b);
   13590     return vmul_s16(a, b16x4);
   13591 }
   13592 
   13593 uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
   13594 _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
   13595 {
   13596     //serial solution looks faster
   13597     uint32x2_t b32x2;
   13598     b32x2 = vdup_n_u32(b);
   13599     return vmul_u32(a, b32x2);
   13600 }
   13601 
   13602 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
   13603 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
   13604 {
   13605     int16x8_t b16x8;
   13606     b16x8 = vdupq_n_s16(b);
   13607     return vmulq_s16(a, b16x8);
   13608 }
   13609 
   13610 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
   13611 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
   13612 {
   13613     int32x4_t b32x4;
   13614     b32x4 = vdupq_n_s32(b);
   13615     return vmulq_s32(a, b32x4);
   13616 }
   13617 
   13618 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
   13619 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
   13620 {
   13621     float32x4_t b32x4;
   13622     b32x4 = vdupq_n_f32(b);
   13623     return vmulq_f32(a, b32x4);
   13624 }
   13625 
   13626 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
   13627 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
   13628 {
   13629     uint16x8_t b16x8;
   13630     b16x8 = vdupq_n_s16(b);
   13631     return vmulq_s16(a, b16x8);
   13632 }
   13633 
   13634 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
   13635 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
   13636 {
   13637     uint32x4_t b32x4;
   13638     b32x4 = vdupq_n_u32(b);
   13639     return vmulq_u32(a, b32x4);
   13640 }
   13641 
   13642 //********** Vector multiply lane *****************************
   13643 int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
   13644 _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
   13645 {
   13646     int16x4_t b16x4;
   13647     int16_t vlane;
   13648     vlane = vget_lane_s16(b, c);
   13649     b16x4 = vdup_n_s16(vlane);
   13650     return vmul_s16(a, b16x4);
   13651 }
   13652 
   13653 int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
   13654 _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
   13655 {
   13656     int32x2_t b32x2;
   13657     int32_t vlane;
   13658     vlane = vget_lane_s32(b, c);
   13659     b32x2 = vdup_n_s32(vlane);
   13660     return vmul_s32(a, b32x2);
   13661 }
   13662 
   13663 float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
   13664 _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
   13665 {
   13666     float32x2_t b32x2;
   13667     float32_t vlane;
   13668     vlane = vget_lane_f32(b, c);
   13669     b32x2 = vdup_n_f32(vlane);
   13670     return vmul_f32(a, b32x2);
   13671 }
   13672 
   13673 uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
   13674 #define vmul_lane_u16 vmul_lane_s16
   13675 
   13676 uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
   13677 #define vmul_lane_u32 vmul_lane_s32
   13678 
   13679 int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
   13680 _NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
   13681 {
   13682     int16x8_t b16x8;
   13683     int16_t vlane;
   13684     vlane = vget_lane_s16(b, c);
   13685     b16x8 = vdupq_n_s16(vlane);
   13686     return vmulq_s16(a, b16x8);
   13687 }
   13688 
   13689 int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
   13690 _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
   13691 {
   13692     int32x4_t b32x4;
   13693     int32_t vlane;
   13694     vlane = vget_lane_s32(b, c);
   13695     b32x4 = vdupq_n_s32(vlane);
   13696     return vmulq_s32(a, b32x4);
   13697 }
   13698 
   13699 float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
   13700 _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
   13701 {
   13702     float32x4_t b32x4;
   13703     float32_t vlane;
   13704     vlane = vget_lane_f32(b, c);
   13705     b32x4 = vdupq_n_f32(vlane);
   13706     return vmulq_f32(a, b32x4);
   13707 }
   13708 
   13709 uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
   13710 #define vmulq_lane_u16 vmulq_lane_s16
   13711 
   13712 uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
   13713 #define vmulq_lane_u32 vmulq_lane_s32
   13714 
   13715 //**** Vector long multiply with scalar ************
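         //Usage sketch (illustrative values): the *_n_* long forms widen before multiplying,
         //    dst[i] = (widened)vec1[i] * (widened)val2;
         //the unsigned forms zero-extend, e.g. vmull_n_u16 with vec1[i] = 0xFFFF and val2 = 2
         //gives 0x0001FFFE with no truncation.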
   13716 int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
   13717 _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
   13718 {
   13719     int16x4_t b16x4;
   13720     b16x4 = vdup_n_s16(val2);
   13721     return vmull_s16(vec1, b16x4);
   13722 }
   13723 
   13724 int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
   13725 _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
   13726 {
   13727     int32x2_t b32x2;
   13728     b32x2 = vdup_n_s32(val2);
   13729     return vmull_s32(vec1, b32x2);
   13730 }
   13731 
   13732 uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
   13733 _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.U16 q0,d0,d0[0]
   13734 {
   13735     uint16x4_t b16x4;
   13736     b16x4 = vdup_n_u16(val2);
   13737     return vmull_u16(vec1, b16x4); //unsigned widening: zero-extends the 16-bit inputs
   13738 }
   13739 
   13740 uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
   13741 _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
   13742 {
   13743     uint32x2_t b32x2;
   13744     b32x2 = vdup_n_u32(val2);
   13745     return vmull_u32(vec1, b32x2);
   13746 }
   13747 
   13748 //**** Vector long multiply by scalar ****
   13749 int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
   13750 _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
   13751 {
   13752     int16_t vlane;
   13753     int16x4_t b;
   13754     vlane = vget_lane_s16(val2, val3);
   13755     b = vdup_n_s16(vlane);
   13756     return vmull_s16(vec1, b);
   13757 }
   13758 
   13759 int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
   13760 _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
   13761 {
   13762     int32_t vlane;
   13763     int32x2_t b;
   13764     vlane = vget_lane_s32(val2, val3);
   13765     b = vdup_n_s32(vlane);
   13766     return vmull_s32(vec1, b);
   13767 }
   13768 
   13769 uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
   13770 _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.U16 q0,d0,d0[0]
   13771 {
   13772     uint16_t vlane;
   13773     uint16x4_t b;
   13774     vlane = vget_lane_u16(val2, val3);
   13775     b = vdup_n_u16(vlane);
   13776     return vmull_u16(vec1, b); //unsigned widening: zero-extends the 16-bit inputs
   13777 }
   13778 
   13779 uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
   13780 _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
   13781 {
   13782     uint32_t vlane;
   13783     uint32x2_t b;
   13784     vlane = vget_lane_u32(val2, val3);
   13785     b = vdup_n_u32(vlane);
   13786     return vmull_u32(vec1, b);
   13787 }
   13788 
   13789 //********* Vector saturating doubling long multiply with scalar  *******************
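         //Usage sketch (illustrative values): vqdmull_n computes dst[i] = sat(2 * vec1[i] * val2)
         //widened to twice the element size; for the s16 form the only input that actually
         //saturates is (-32768) * (-32768), which yields INT32_MAX instead of 0x80000000.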
   13790 int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
   13791 _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
   13792 {
   13793     //the serial solution may be faster due to saturation
   13794     int16x4_t b;
   13795     b = vdup_n_s16(val2);
   13796     return vqdmull_s16(vec1, b);
   13797 }
   13798 
   13799 int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
   13800 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
   13801 {
   13802     int32x2_t b;
   13803     b = vdup_n_s32(val2);
   13804     return vqdmull_s32(vec1,b); //slow serial function!!!!
   13805 }
   13806 
   13807 //************* Vector saturating doubling long multiply by scalar ***********************************************
   13808 int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
   13809 _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
   13810 {
   13811     int16_t c;
   13812     int16x4_t scalar;
   13813     c = vget_lane_s16(val2, val3);
   13814     scalar = vdup_n_s16(c);
   13815     return vqdmull_s16(vec1, scalar);
   13816 }
   13817 
   13818 
   13819 int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULL.S32 q0,d0,d0[0]
   13820 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
   13821 {
   13822     int32_t c;
   13823     int32x2_t scalar;
   13824     c = vget_lane_s32(val2, val3);
   13825     scalar = vdup_n_s32(c);
   13826     return vqdmull_s32(vec1,scalar); //slow serial function!!!!
   13827 }
   13828 
   13829 // *****Vector saturating doubling multiply high with scalar *****
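         //Usage sketch (illustrative values): vqdmulh returns the high half of the doubled product,
         //    dst[i] = sat(2 * vec1[i] * val2) >> 16   (>> 32 for the s32 forms);
         //e.g. vqdmulh_n_s16 with vec1[i] = 0x4000 and val2 = 0x2000 gives 0x1000.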
   13830 int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2); //  VQDMULH.S16 d0,d0,d0[0]
   13831 _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2)
   13832 {
   13833     int16x4_t res64;
   13834     return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
   13835 }
   13836 
   13837 int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2); //  VQDMULH.S32 d0,d0,d0[0]
   13838 _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2)
   13839 {
   13840     int32x2_t res64;
   13841     return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
   13842 }
   13843 
   13844 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); //  VQDMULH.S16 q0,q0,d0[0]
   13845 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) //  VQDMULH.S16 q0,q0,d0[0]
   13846 {
   13847     //solution may not be optimal
   13848     int16x8_t scalar;
   13849     scalar = vdupq_n_s16(val2);
   13850     return vqdmulhq_s16(vec1, scalar);
   13851 }
   13852 
   13853 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); //  VQDMULH.S32 q0,q0,d0[0]
   13854 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13855 {
   13856     int32x4_t scalar;
   13857     scalar = vdupq_n_s32(val2);
   13858     return vqdmulhq_s32(vec1, scalar);
   13859 }
   13860 
   13861 //***** Vector saturating doubling multiply high by scalar ****************
   13862 int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 d0,d0,d0[0]
   13863 _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 d0,d0,d0[0]
   13864 {
   13865     //solution may not be optimal
   13866     int16_t vlane;
   13867     int16x4_t scalar;
   13868     vlane = vget_lane_s16(val2, val3);
   13869     scalar = vdup_n_s16(vlane);
   13870     return vqdmulh_s16(vec1, scalar);
   13871 }
   13872 
   13873 int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 d0,d0,d0[0]
   13874 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13875 {
   13876     int32_t vlane;
   13877     int32x2_t scalar;
   13878     vlane = vget_lane_s32(val2, val3);
   13879     scalar = vdup_n_s32(vlane);
   13880     return vqdmulh_s32(vec1, scalar);
   13881 }
   13882 
   13883 int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 q0,q0,d0[0]
   13884 _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 q0,q0,d0[0]
   13885 {
   13886     //solution may not be optimal
   13887     int16_t vlane;
   13888     int16x8_t scalar;
   13889     vlane = vget_lane_s16(val2, val3);
   13890     scalar = vdupq_n_s16(vlane );
   13891     return vqdmulhq_s16(vec1, scalar);
   13892 }
   13893 
   13894 int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 q0,q0,d0[0]
   13895 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13896 {
   13897     //solution may not be optimal
   13898     int32_t vlane;
   13899     int32x4_t scalar;
   13900     vlane = vgetq_lane_s32(_pM128i(val2), val3);
   13901     scalar = vdupq_n_s32(vlane );
   13902     return vqdmulhq_s32(vec1, scalar);
   13903 }
   13904 
   13905 //******** Vector saturating rounding doubling multiply high with scalar ***
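         //Usage sketch (illustrative values): vqrdmulh is the rounding variant of vqdmulh,
         //    dst[i] = sat((2 * vec1[i] * val2 + 0x8000) >> 16)   (rounding constant 0x80000000 for the s32 forms);
         //e.g. vqrdmulh_n_s16 with vec1[i] = 3 and val2 = 0x2000 gives 1 where vqdmulh gives 0.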
   13906 int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
   13907 _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
   13908 {
   13909     //solution may not be optimal
   13910     int16x4_t scalar;
   13911     scalar = vdup_n_s16(val2);
   13912     return vqrdmulh_s16(vec1, scalar);
   13913 }
   13914 
   13915 int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
   13916 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13917 {
   13918     int32x2_t scalar;
   13919     scalar = vdup_n_s32(val2);
   13920     return vqrdmulh_s32(vec1, scalar);
   13921 }
   13922 
   13923 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
   13924 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
   13925 {
   13926     //solution may not be optimal
   13927     int16x8_t scalar;
   13928     scalar = vdupq_n_s16(val2);
   13929     return vqrdmulhq_s16(vec1, scalar);
   13930 }
   13931 
   13932 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
   13933 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13934 {
   13935     int32x4_t scalar;
   13936     scalar = vdupq_n_s32(val2);
   13937     return vqrdmulhq_s32(vec1, scalar);
   13938 }
   13939 
   13940 //********* Vector rounding saturating doubling multiply high by scalar  ****
   13941 int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
   13942 _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
   13943 {
   13944     //solution may not be optimal
   13945     int16_t vlane;
   13946     int16x4_t scalar;
   13947     vlane = vget_lane_s16(val2, val3);
   13948     scalar = vdup_n_s16(vlane);
   13949     return vqrdmulh_s16(vec1, scalar);
   13950 }
   13951 
   13952 int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
   13953 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13954 {
   13955     int32_t vlane;
   13956     int32x2_t scalar;
   13957     vlane = vget_lane_s32(val2, val3);
   13958     scalar = vdup_n_s32(vlane);
   13959     return vqrdmulh_s32(vec1, scalar);
   13960 }
   13961 
   13962 int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
   13963 _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
   13964 {
   13965     //solution may not be optimal
   13966     int16_t vlane;
   13967     int16x8_t scalar;
   13968     vlane = vget_lane_s16(val2, val3);
   13969     scalar = vdupq_n_s16(vlane);
   13970     return vqrdmulhq_s16(vec1, scalar);
   13971 }
   13972 
   13973 int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
   13974 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13975 {
   13976     //solution may not be optimal
   13977     int32_t vlane;
   13978     int32x4_t scalar;
   13979     vlane = vgetq_lane_s32(_pM128i(val2), val3);
   13980     scalar = vdupq_n_s32(vlane );
   13981     return vqrdmulhq_s32(vec1, scalar);
   13982 }
   13983 
   13984 //**************Vector multiply accumulate with scalar *******************
   13985 int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
   13986 _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
   13987 {
   13988     int16x4_t scalar;
   13989     scalar = vdup_n_s16(c);
   13990     return vmla_s16(a, b, scalar);
   13991 }
   13992 
   13993 int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
   13994 _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
   13995 {
   13996     int32x2_t scalar;
   13997     scalar = vdup_n_s32(c);
   13998     return vmla_s32(a, b, scalar);
   13999 }
   14000 
   14001 uint16x4_t vmla_n_u16(uint16x4_t a,  uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
   14002 #define vmla_n_u16 vmla_n_s16
   14003 
   14004 
   14005 uint32x2_t vmla_n_u32(uint32x2_t a,  uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
   14006 #define vmla_n_u32 vmla_n_s32
   14007 
   14008 
   14009 float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
   14010 _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
   14011 {
   14012     float32x2_t scalar;
   14013     scalar = vdup_n_f32(c);
   14014     return vmla_f32(a, b, scalar);
   14015 }
   14016 
   14017 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
   14018 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
   14019 {
   14020     int16x8_t scalar;
   14021     scalar = vdupq_n_s16(c);
   14022     return vmlaq_s16(a,b,scalar);
   14023 }
   14024 
   14025 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
   14026 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
   14027 {
   14028     int32x4_t scalar;
   14029     scalar = vdupq_n_s32(c);
   14030     return vmlaq_s32(a,b,scalar);
   14031 }
   14032 
   14033 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
   14034 #define vmlaq_n_u16 vmlaq_n_s16
   14035 
   14036 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
   14037 #define vmlaq_n_u32 vmlaq_n_s32
   14038 
   14039 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
   14040 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
   14041 {
   14042     float32x4_t scalar;
   14043     scalar = vdupq_n_f32(c);
   14044     return vmlaq_f32(a,b,scalar);
   14045 }
   14046 
   14047 //************Vector widening multiply accumulate with scalar****************************
   14048 int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
   14049 _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
   14050 {
   14051     int16x4_t vc;
   14052     vc = vdup_n_s16(c);
   14053     return vmlal_s16(a, b, vc);
   14054 }
   14055 
   14056 int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
   14057 _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
   14058 {
   14059     int32x2_t vc;
   14060     vc = vdup_n_s32(c);
   14061     return vmlal_s32(a, b, vc);
   14062 }
   14063 
   14064 uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
   14065 _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.U16 q0, d0, d0[0]
   14066 {
   14067     uint16x4_t vc;
   14068     vc = vdup_n_u16(c);
   14069     return vmlal_u16(a, b, vc); //unsigned widening: zero-extends the 16-bit inputs
   14070 }
   14071 
   14072 uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
   14073 _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
   14074 {
   14075     uint32x2_t vc;
   14076     vc = vdup_n_u32(c);
   14077     return vmlal_u32(a, b, vc);
   14078 }
   14079 
   14080 //************ Vector widening saturating doubling multiply accumulate with scalar **************
   14081 int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
   14082 _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
   14083 {
   14084     //not an optimal SIMD solution, serial may be faster
   14085     int16x4_t vc;
   14086     vc = vdup_n_s16(c);
   14087     return vqdmlal_s16(a, b, vc);
   14088 }
   14089 
   14090 int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
   14091 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   14092 {
   14093     int32x2_t vc;
   14094     vc = vdup_n_s32(c);
   14095     return vqdmlal_s32(a, b, vc);
   14096 }
   14097 
   14098 //******** Vector multiply subtract with scalar **************
   14099 int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
   14100 _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
   14101 {
   14102     int16x4_t vc;
   14103     vc = vdup_n_s16(c);
   14104     return vmls_s16(a, b, vc);
   14105 }
   14106 
   14107 int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
   14108 _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
   14109 {
   14110     int32x2_t vc;
   14111     vc = vdup_n_s32(c);
   14112     return vmls_s32(a, b, vc);
   14113 }
   14114 
   14115 uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
   14116 _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
   14117 {
   14118     uint16x4_t vc;
   14119     vc = vdup_n_s16(c);
   14120     return vmls_s16(a, b, vc);
   14121 }
   14122 
   14123 uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
   14124 _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
   14125 {
   14126     uint32x2_t vc;
   14127     vc = vdup_n_u32(c);
   14128     return vmls_u32(a, b, vc);
   14129 }
   14130 
   14131 float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
   14132 _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
   14133 {
   14134     float32x2_t res;
   14135     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
   14136     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
   14137     return res;
   14138 }
   14139 
   14140 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
   14141 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
   14142 {
   14143     int16x8_t vc;
   14144     vc = vdupq_n_s16(c);
   14145     return vmlsq_s16(a, b,vc);
   14146 }
   14147 
   14148 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
   14149 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
   14150 {
   14151     int32x4_t vc;
   14152     vc = vdupq_n_s32(c);
   14153     return vmlsq_s32(a,b,vc);
   14154 }
   14155 
   14156 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
   14157 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
   14158 {
   14159     uint16x8_t vc;
   14160     vc = vdupq_n_u16(c);
   14161     return vmlsq_u16(a,b,vc); //16-bit lanes throughout
   14162 }
   14163 
   14164 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
   14165 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
   14166 {
   14167     uint32x4_t vc;
   14168     vc = vdupq_n_u32(c);
   14169     return vmlsq_u32(a,b,vc);
   14170 }
   14171 
   14172 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
   14173 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
   14174 {
   14175     float32x4_t vc;
   14176     vc = vdupq_n_f32(c);
   14177     return vmlsq_f32(a,b,vc);
   14178 }
   14179 
   14180 //**** Vector widening multiply subtract with scalar ******
   14181 int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
   14182 _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
   14183 {
   14184     int16x4_t vc;
   14185     vc = vdup_n_s16(c);
   14186     return vmlsl_s16(a, b, vc);
   14187 }
   14188 
   14189 int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
   14190 _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
   14191 {
   14192     int32x2_t vc;
   14193     vc = vdup_n_s32(c);
   14194     return vmlsl_s32(a, b, vc);
   14195 }
   14196 
   14197 uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
   14198 _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.U16 q0, d0, d0[0]
   14199 {
   14200     uint16x4_t vc;
   14201     vc = vdup_n_u16(c);
   14202     return vmlsl_u16(a, b, vc);
   14203 }
   14204 
   14205 uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
   14206 _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
   14207 {
   14208     uint32x2_t vc;
   14209     vc = vdup_n_u32(c);
   14210     return vmlsl_u32(a, b, vc);
   14211 }
   14212 
   14213 //***** Vector widening saturating doubling multiply subtract with scalar *********
   14214 //**********************************************************************************
   14215 int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
   14216 _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
   14217 {
   14218     int16x4_t vc;
   14219     vc = vdup_n_s16(c);
   14220     return vqdmlsl_s16(a, b, vc);
   14221 }
   14222 
   14223 int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
   14224 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   14225 {
   14226     int32x2_t vc;
   14227     vc = vdup_n_s32(c);
   14228     return vqdmlsl_s32(a, b, vc);
   14229 }
   14230 
   14231 //*******************  Vector extract ***********************************************
   14232 //*************************************************************************************
   14233 //VEXT (Vector Extract) extracts  elements from the bottom end of the second operand
   14234 //vector and the top end of the first, concatenates them, and places the result in the destination vector
   14235 //c elements from the bottom end of the second operand and (8-c) from the top end of the first
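         //Usage sketch (illustrative values): vext_s8(a, b, 3) with a = {0,1,2,3,4,5,6,7} and
         //b = {8,9,10,11,12,13,14,15} yields {3,4,5,6,7,8,9,10} - the top five elements of a
         //followed by the bottom three elements of b.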
   14236 int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   14237 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
   14238 {
   14239     int8x8_t res;
   14240     int i;
   14241     for (i = 0; i<8 - c; i++) {
   14242         res.m64_i8[i] = a.m64_i8[i + c];
   14243     }
   14244     for(i = 0; i<c; i++) {
   14245         res.m64_i8[8 - c + i] = b.m64_i8[i];
   14246     }
   14247     return res;
   14248 }
   14249 
   14250 uint8x8_t vext_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   14251 #define vext_u8 vext_s8
   14252 //same result tested
   14253 
   14254 poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   14255 #define vext_p8 vext_u8
   14256 
   14257 int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   14258 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
   14259 {
   14260     int16x4_t res;
   14261     int i;
   14262     for (i = 0; i<4 - c; i++) {
   14263         res.m64_i16[i] = a.m64_i16[i + c];
   14264     }
   14265     for(i = 0; i<c; i++) {
   14266         res.m64_i16[4 - c + i] = b.m64_i16[i];
   14267     }
   14268     return res;
   14269 }
   14270 
   14271 uint16x4_t vext_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   14272 #define vext_u16 vext_s16
   14273 
   14274 poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   14275 #define vext_p16 vext_s16
   14276 
   14277 int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   14278 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
   14279 {
   14280     int32x2_t res;
   14281     if (c==0) {
   14282         res.m64_i32[0] = a.m64_i32[0];
   14283         res.m64_i32[1] = a.m64_i32[1];
   14284     } else {
   14285         res.m64_i32[0] = a.m64_i32[1];
   14286         res.m64_i32[1] = b.m64_i32[0];
   14287     }
   14288     return res;
   14289 }
   14290 
   14291 float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   14292 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
   14293 {
   14294     float32x2_t res;
   14295     if (c==0) {
   14296         res.m64_f32[0] = a.m64_f32[0];
   14297         res.m64_f32[1] = a.m64_f32[1];
   14298     } else {
   14299         res.m64_f32[0] = a.m64_f32[1];
   14300         res.m64_f32[1] = b.m64_f32[0];
   14301     }
   14302     return res;
   14303 }
   14304 
   14305 uint32x2_t vext_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   14306 #define vext_u32 vext_s32
   14307 
   14308 
   14309 int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   14310 #define vext_s64(a,b,c) a
   14311 
   14312 uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   14313 #define vext_u64(a,b,c) a
   14314 
   14315 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   14316 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
   14317 
   14318 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   14319 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
   14320 
   14321 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   14322 #define vextq_p8 vextq_s8
   14323 
   14324 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   14325 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
   14326 
   14327 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   14328 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
   14329 
   14330 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   14331 #define vextq_p16 vextq_s16
   14332 
   14333 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   14334 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
   14335 
   14336 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   14337 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
   14338 
   14339 float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   14340 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
   14341 
   14342 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
   14343 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
   14344 
   14345 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
   14346 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
   14347 
   14348 //************ Reverse vector elements (swap endianness)*****************
   14349 //*************************************************************************
   14350 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
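         //Usage sketch (illustrative values): on the byte vector {0,1,2,3,4,5,6,7},
         //vrev64_s8 gives {7,6,5,4,3,2,1,0}, vrev32_s8 gives {3,2,1,0,7,6,5,4} and
         //vrev16_s8 gives {1,0,3,2,5,4,7,6}.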
   14351 int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
   14352 _NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
   14353 {
   14354     int8x8_t res64;
   14355     __m128i res;
   14356     res = vrev64q_s8(_pM128i(vec));
   14357     return64(res);
   14358 }
   14359 
   14360 int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
   14361 _NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
   14362 {
   14363     int16x4_t res64;
   14364     __m128i res;
   14365     res = vrev64q_s16(_pM128i(vec));
   14366     return64(res);
   14367 }
   14368 
   14369 int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
   14370 _NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
   14371 {
   14372     int32x2_t res;
   14373     res.m64_i32[0] = vec.m64_i32[1];
   14374     res.m64_i32[1] = vec.m64_i32[0];
   14375     return res;
   14376 }
   14377 
   14378 uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
   14379 #define vrev64_u8 vrev64_s8
   14380 
   14381 uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
   14382 #define vrev64_u16 vrev64_s16
   14383 
   14384 uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
   14385 #define vrev64_u32 vrev64_s32
   14386 
   14387 poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
   14388 #define vrev64_p8 vrev64_u8
   14389 
   14390 poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
   14391 #define vrev64_p16 vrev64_u16
   14392 
   14393 float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
   14394 _NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
   14395 {
   14396     float32x2_t res;
   14397     res.m64_f32[0] = vec.m64_f32[1];
   14398     res.m64_f32[1] = vec.m64_f32[0];
   14399     return res;
   14400 }
   14401 
   14402 int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
   14403 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
   14404 {
   14405     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
   14406     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   14407 }
   14408 
   14409 int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
   14410 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
   14411 {
   14412     //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
   14413     _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
   14414     return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
   14415 }
   14416 
   14417 int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
   14418 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
   14419 {
   14420     return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
   14421 }
   14422 
   14423 uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
   14424 #define vrev64q_u8 vrev64q_s8
   14425 
   14426 uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
   14427 #define vrev64q_u16 vrev64q_s16
   14428 
   14429 uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
   14430 #define vrev64q_u32 vrev64q_s32
   14431 
   14432 poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
   14433 #define vrev64q_p8 vrev64q_u8
   14434 
   14435 poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
   14436 #define vrev64q_p16 vrev64q_u16
   14437 
   14438 float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
   14439 #define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
   14440 
   14441 //********************  32 bit shuffles **********************
   14442 //************************************************************
   14443 int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
   14444 _NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
   14445 {
   14446     int8x8_t res64;
   14447     __m128i res;
   14448     res = vrev32q_s8(_pM128i(vec));
   14449     return64(res);
   14450 }
   14451 
   14452 int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
   14453 _NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
   14454 {
   14455     int16x4_t res64;
   14456     __m128i res;
   14457     res = vrev32q_s16(_pM128i(vec));
   14458     return64(res);
   14459 }
   14460 
   14461 uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
   14462 #define vrev32_u8 vrev32_s8
   14463 
   14464 uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
   14465 #define vrev32_u16 vrev32_s16
   14466 
   14467 poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
   14468 #define vrev32_p8 vrev32_u8
   14469 
   14470 poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
   14471 #define vrev32_p16 vrev32_u16
   14472 
   14473 int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
   14474 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
   14475 {
   14476     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
   14477     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   14478 }
   14479 
   14480 int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
   14481 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
   14482 {
   14483     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
   14484     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   14485 }
   14486 
   14487 uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
   14488 #define vrev32q_u8 vrev32q_s8
   14489 
   14490 uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
   14491 #define vrev32q_u16 vrev32q_s16
   14492 
   14493 poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
   14494 #define vrev32q_p8 vrev32q_u8
   14495 
   14496 poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
   14497 #define vrev32q_p16 vrev32q_u16
   14498 
   14499 //*************  16 bit shuffles **********************
   14500 //******************************************************
   14501 int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
   14502 _NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
   14503 {
   14504     int8x8_t res64;
   14505     __m128i res;
   14506     res = vrev16q_s8(_pM128i(vec));
   14507     return64(res);
   14508 }
   14509 
   14510 uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
   14511 #define vrev16_u8 vrev16_s8
   14512 
   14513 poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
   14514 #define vrev16_p8 vrev16_u8
   14515 
   14516 int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
   14517 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
   14518 {
   14519     _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
   14520     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
   14521 }
   14522 
   14523 uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
   14524 #define vrev16q_u8 vrev16q_s8
   14525 
   14526 poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
   14527 #define vrev16q_p8 vrev16q_u8
   14528 
   14529 //*********************************************************************
   14530 //**************** Other single operand arithmetic *******************
   14531 //*********************************************************************
   14532 
   14533 //*********** Absolute: Vd[i] = |Va[i]| **********************************
   14534 //************************************************************************
   14535 int8x8_t   vabs_s8(int8x8_t a); // VABS.S8 d0,d0
   14536 _NEON2SSE_INLINE int8x8_t   vabs_s8(int8x8_t a)
   14537 {
   14538     int8x8_t res64;
   14539     __m128i res;
   14540     res = _mm_abs_epi8(_pM128i(a));
   14541     return64(res);
   14542 }
   14543 
   14544 
   14545 int16x4_t   vabs_s16(int16x4_t a); // VABS.S16 d0,d0
   14546 _NEON2SSE_INLINE int16x4_t   vabs_s16(int16x4_t a)
   14547 {
   14548     int16x4_t res64;
   14549     __m128i res;
   14550     res = _mm_abs_epi16(_pM128i(a));
   14551     return64(res);
   14552 }
   14553 
   14554 int32x2_t   vabs_s32(int32x2_t a); // VABS.S32 d0,d0
   14555 _NEON2SSE_INLINE int32x2_t   vabs_s32(int32x2_t a)
   14556 {
   14557     int32x2_t res64;
   14558     __m128i res;
   14559     res = _mm_abs_epi32(_pM128i(a));
   14560     return64(res);
   14561 }
   14562 
   14563 float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
   14564 _NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
   14565 {
   14566     float32x4_t res;
   14567     __m64_128 res64;
   14568     _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
   14569     res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
   14570     _M64f(res64, res);
   14571     return res64;
   14572 }
   14573 
   14574 int8x16_t   vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
   14575 #define vabsq_s8 _mm_abs_epi8
   14576 
   14577 int16x8_t   vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
   14578 #define vabsq_s16 _mm_abs_epi16
   14579 
   14580 int32x4_t   vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
   14581 #define vabsq_s32 _mm_abs_epi32
   14582 
   14583 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
   14584 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
   14585 {
   14586     _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
   14587     return _mm_and_ps (a, *(__m128*)c7fffffff);
   14588 }
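
//An illustrative sketch only (the _neon2sse_example_* helper below is not part of the NEON API):
//it demonstrates the sign-mask trick used by vabsq_f32 above - clearing bit 31 with 0x7fffffff
//turns negative lanes positive without changing their magnitude.
_NEON2SSE_INLINE int _neon2sse_example_vabsq_f32(void)
{
    float32x4_t v = _mm_setr_ps(-3.5f, 0.0f, 2.25f, -0.0f);
    float32x4_t r = vabsq_f32(v); //3.5f, 0.0f, 2.25f, 0.0f
    return _mm_movemask_ps(_mm_cmplt_ps(r, _mm_setzero_ps())) == 0; //1: no lane is negative any more
}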
   14589 
   14590 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
   14591 //**********************************************************************
    14592 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, so saturation takes place (see the illustrative sketch after vqabsq_s32 below)
   14593 int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
   14594 _NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
   14595 {
   14596     int8x8_t res64;
   14597     __m128i res;
   14598     res = vqabsq_s8(_pM128i(a));
   14599     return64(res);
   14600 }
   14601 
   14602 int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
   14603 _NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
   14604 {
   14605     int16x4_t res64;
   14606     __m128i res;
   14607     res = vqabsq_s16(_pM128i(a));
   14608     return64(res);
   14609 }
   14610 
   14611 int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
   14612 _NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
   14613 {
   14614     int32x2_t res64;
   14615     __m128i res;
   14616     res = vqabsq_s32(_pM128i(a));
   14617     return64(res);
   14618 }
   14619 
   14620 int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
   14621 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
   14622 {
   14623     __m128i c_128, abs, abs_cmp;
   14624     c_128 = _mm_set1_epi8 (0x80); //-128
   14625     abs = _mm_abs_epi8 (a);
   14626     abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
   14627     return _mm_xor_si128 (abs,  abs_cmp);
   14628 }
   14629 
   14630 int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
   14631 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
   14632 {
   14633     __m128i c_32768, abs, abs_cmp;
   14634     c_32768 = _mm_set1_epi16 (0x8000); //-32768
   14635     abs = _mm_abs_epi16 (a);
   14636     abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
   14637     return _mm_xor_si128 (abs,  abs_cmp);
   14638 }
   14639 
   14640 int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
   14641 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
   14642 {
   14643     __m128i c80000000, abs, abs_cmp;
   14644     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
   14645     abs = _mm_abs_epi32 (a);
   14646     abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
   14647     return _mm_xor_si128 (abs,  abs_cmp);
   14648 }
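
//An illustrative sketch only (the helper below is not part of the NEON API): the saturation described
//above in action - vqabsq_s8 maps -128 to +127, whereas the plain _mm_abs_epi8 would wrap back to -128.
_NEON2SSE_INLINE int _neon2sse_example_vqabs_saturation(void)
{
    int8x16_t most_neg = _mm_set1_epi8(-128); //all 16 lanes hold INT8_MIN
    int8x16_t sat_abs = vqabsq_s8(most_neg); //every lane becomes +127 (0x7f)
    return _mm_movemask_epi8(_mm_cmpeq_epi8(sat_abs, _mm_set1_epi8(127))) == 0xffff; //1 if all lanes match
}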
   14649 
   14650 //*************** Negate: Vd[i] = - Va[i] *************************************
   14651 //*****************************************************************************
    14652 //Several Negate implementations are possible for SIMD,
    14653 //e.g. the _mm_sign_epi* function (a, vector of negative numbers), but the following one gives good performance (a sketch of that alternative follows vnegq_f32 below):
    14654 int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
   14655 _NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
   14656 {
   14657     int8x8_t res64;
   14658     __m128i res;
   14659     res = vnegq_s8(_pM128i(a));
   14660     return64(res);
   14661 }
   14662 
    14663 int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
   14664 _NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
   14665 {
   14666     int16x4_t res64;
   14667     __m128i res;
   14668     res = vnegq_s16(_pM128i(a));
   14669     return64(res);
   14670 }
   14671 
    14672 int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
   14673 _NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
   14674 {
   14675     int32x2_t res64;
   14676     __m128i res;
   14677     res = vnegq_s32(_pM128i(a));
   14678     return64(res);
   14679 }
   14680 
    14681 float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
    14682 _NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
   14683 {
   14684     float32x4_t res;
   14685     __m64_128 res64;
   14686     _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   14687     res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
   14688     _M64f(res64, res);
   14689     return res64;
   14690 }
   14691 
    14692 int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
    14693 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
   14694 {
   14695     __m128i zero;
   14696     zero = _mm_setzero_si128 ();
   14697     return _mm_sub_epi8 (zero, a);
   14698 } //or _mm_sign_epi8 (a, negative numbers vector)
   14699 
    14700 int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
    14701 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
   14702 {
   14703     __m128i zero;
   14704     zero = _mm_setzero_si128 ();
   14705     return _mm_sub_epi16 (zero, a);
   14706 } //or _mm_sign_epi16 (a, negative numbers vector)
   14707 
    14708 int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
    14709 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
   14710 {
   14711     __m128i zero;
   14712     zero = _mm_setzero_si128 ();
   14713     return _mm_sub_epi32 (zero, a);
   14714 } //or _mm_sign_epi32 (a, negative numbers vector)
   14715 
    14716 float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
    14717 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
   14718 {
   14719     _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   14720     return _mm_xor_ps (a, *(__m128*) c80000000);
   14721 }
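
//An illustrative sketch only (the helper below is not part of the NEON API): the _mm_sign_epi*
//alternative mentioned in the comments above - applying the sign of -1 negates every lane,
//with the same -128 wrap-around behaviour as the subtraction-based vnegq_s8.
_NEON2SSE_INLINE __m128i _neon2sse_example_neg_via_sign(__m128i a)
{
    __m128i minus_one = _mm_set1_epi8(-1); //the "vector of negative numbers"
    return _mm_sign_epi8(a, minus_one); //negates each signed 8-bit lane of a
}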
   14722 
   14723 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
   14724 //***************************************************************************************
    14725 //For signed-integer data types, the negation of the most negative value cannot be produced without saturation; with saturation it becomes the maximum positive value (see the illustrative sketch after vqnegq_s32 below)
    14726 int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
   14727 _NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
   14728 {
   14729     int8x8_t res64;
   14730     __m128i res;
   14731     res = vqnegq_s8(_pM128i(a));
   14732     return64(res);
   14733 }
   14734 
    14735 int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
   14736 _NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
   14737 {
   14738     int16x4_t res64;
   14739     __m128i res;
   14740     res = vqnegq_s16(_pM128i(a));
   14741     return64(res);
   14742 }
   14743 
    14744 int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
   14745 _NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
   14746 {
   14747     int32x2_t res64;
   14748     __m128i res;
   14749     res = vqnegq_s32(_pM128i(a));
   14750     return64(res);
   14751 }
   14752 
    14753 int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
    14754 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
   14755 {
   14756     __m128i zero;
   14757     zero = _mm_setzero_si128 ();
    14758     return _mm_subs_epi8 (zero, a); //saturating subtraction
   14759 }
   14760 
    14761 int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
    14762 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
   14763 {
   14764     __m128i zero;
   14765     zero = _mm_setzero_si128 ();
    14766     return _mm_subs_epi16 (zero, a); //saturating subtraction
   14767 }
   14768 
    14769 int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
    14770 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
   14771 {
    14772     //this solution may not be optimal compared with a serial one
   14773     __m128i c80000000, zero, sub, cmp;
   14774     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
   14775     zero = _mm_setzero_si128 ();
    14776     sub =  _mm_sub_epi32 (zero, a); //subtraction
   14777     cmp = _mm_cmpeq_epi32 (a, c80000000);
   14778     return _mm_xor_si128 (sub,  cmp);
   14779 }
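
//An illustrative sketch only (the helper below is not part of the NEON API): the saturating behaviour
//described above - vqnegq_s32 maps INT32_MIN to INT32_MAX instead of wrapping back to INT32_MIN.
_NEON2SSE_INLINE int _neon2sse_example_vqneg_saturation(void)
{
    int32x4_t most_neg = _mm_set1_epi32(0x80000000); //all four lanes hold INT32_MIN
    int32x4_t sat_neg = vqnegq_s32(most_neg); //every lane becomes 0x7fffffff
    return _mm_movemask_epi8(_mm_cmpeq_epi32(sat_neg, _mm_set1_epi32(0x7fffffff))) == 0xffff; //1 if all lanes match
}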
   14780 
   14781 //****************** Count leading zeros ********************************
   14782 //**************************************************************************
    14783 //no corresponding vector intrinsics in IA32, so it needs to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
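
//An illustrative scalar reference only (the helper below is not part of the NEON API): it defines
//what every 8-bit lane of the vector versions below computes, assuming nothing beyond standard C.
_NEON2SSE_INLINE int _neon2sse_example_clz8_scalar(int x) //x is treated as an unsigned 8-bit lane
{
    int n;
    x &= 0xff;
    for (n = 0; n < 8; n++) { //scan from the MSB down to the first set bit
        if (x & (0x80 >> n)) break;
    }
    return n; //e.g. clz8(0x10) == 3, clz8(0) == 8
}
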
   14784 int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
   14785 _NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
   14786 {
   14787     int8x8_t res64;
   14788     __m128i res;
   14789     res = vclzq_s8(_pM128i(a));
   14790     return64(res);
   14791 }
   14792 
   14793 int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
   14794 _NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
   14795 {
   14796     int16x4_t res64;
   14797     __m128i res;
   14798     res = vclzq_s16(_pM128i(a));
   14799     return64(res);
   14800 }
   14801 
   14802 int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
   14803 _NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
   14804 {
   14805     int32x2_t res64;
   14806     __m128i res;
   14807     res = vclzq_s32(_pM128i(a));
   14808     return64(res);
   14809 }
   14810 
   14811 
   14812 uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
   14813 #define vclz_u8 vclz_s8
   14814 
   14815 uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
   14816 #define vclz_u16 vclz_s16
   14817 
   14818 uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
   14819 #define vclz_u32 vclz_s32
   14820 
   14821 int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
   14822 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
   14823 {
   14824     _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
   14825                                                     /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
   14826                                                     /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
   14827                                                     /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0                          };
   14828     __m128i maskLOW, c4, lowclz, mask, hiclz;
   14829     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
   14830     c4 = _mm_set1_epi8(4);
   14831     lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
   14832     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
   14833     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
   14834     hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
   14835     mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
   14836     lowclz = _mm_and_si128(lowclz,mask);
   14837     return _mm_add_epi8(lowclz, hiclz);
   14838 }
   14839 
   14840 int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
   14841 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
   14842 {
   14843     __m128i c7, res8x16, res8x16_swap;
   14844     _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   14845     _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
   14846     c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
   14847     res8x16 = vclzq_s8(a);
    14848     res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
   14849     res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
   14850     res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
   14851     c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
   14852     res8x16 = _mm_and_si128(res8x16, c7); //lowclz
   14853     return _mm_add_epi16(res8x16_swap, res8x16);
   14854 }
   14855 
   14856 int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
   14857 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
   14858 {
   14859     __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
   14860     c55555555 = _mm_set1_epi32(0x55555555);
   14861     c33333333 = _mm_set1_epi32(0x33333333);
   14862     c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
   14863     c3f = _mm_set1_epi32(0x3f);
   14864     c32 = _mm_set1_epi32(32);
   14865     tmp = _mm_srli_epi32(a, 1);
   14866     res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
   14867     tmp = _mm_srli_epi32(res, 2);
   14868     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
   14869     tmp = _mm_srli_epi32(res, 4);
   14870     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
   14871     tmp = _mm_srli_epi32(res, 8);
   14872     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
   14873     tmp = _mm_srli_epi32(res, 16);
   14874     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
   14875 
   14876     tmp = _mm_srli_epi32(res, 1);
   14877     tmp = _mm_and_si128(tmp, c55555555);
   14878     res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
   14879 
   14880     tmp = _mm_srli_epi32(res, 2);
   14881     tmp = _mm_and_si128(tmp, c33333333);
   14882     tmp1 = _mm_and_si128(res, c33333333);
   14883     res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
   14884 
   14885     tmp = _mm_srli_epi32(res, 4);
   14886     tmp = _mm_add_epi32(tmp, res);
   14887     res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
   14888 
   14889     tmp = _mm_srli_epi32(res, 8);
   14890     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
   14891 
   14892     tmp = _mm_srli_epi32(res, 16);
   14893     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
   14894 
   14895     res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
   14896 
   14897     return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
   14898 }
   14899 
   14900 uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
   14901 #define vclzq_u8 vclzq_s8
   14902 
   14903 uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
   14904 #define vclzq_u16 vclzq_s16
   14905 
   14906 uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
   14907 #define vclzq_u32 vclzq_s32
   14908 
   14909 //************** Count leading sign bits **************************
   14910 //********************************************************************
    14911 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
    14912 // the topmost bit that are the same as the topmost bit, in each element of a vector
    14913 //No corresponding vector intrinsics in IA32, so it needs to be implemented.
    14914 //While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
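
//An illustrative scalar reference only (the helper below is not part of the NEON API): the per-lane
//meaning of the VCLS emulation below - invert negative lanes so the bits to be counted become
//leading zeros, count them, then subtract one for the sign bit itself.
_NEON2SSE_INLINE int _neon2sse_example_cls8_scalar(int x) //x is treated as a signed 8-bit lane
{
    int n;
    x &= 0xff;
    if (x & 0x80) x = (~x) & 0xff; //flip negative lanes
    for (n = 0; n < 8; n++) {
        if (x & (0x80 >> n)) break;
    }
    return n - 1; //e.g. cls8(1) == 6, cls8(0xff) == 7, cls8(0x40) == 0
}
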
   14915 int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
   14916 _NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
   14917 {
   14918     int8x8_t res64;
   14919     __m128i res;
   14920     res = vclsq_s8(_pM128i(a));
   14921     return64(res);
   14922 }
   14923 
   14924 int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
   14925 _NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
   14926 {
   14927     int16x4_t res64;
   14928     __m128i res;
   14929     res = vclsq_s16(_pM128i(a));
   14930     return64(res);
   14931 }
   14932 
   14933 int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
   14934 _NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
   14935 {
   14936     int32x2_t res64;
   14937     __m128i res;
   14938     res = vclsq_s32(_pM128i(a));
   14939     return64(res);
   14940 }
   14941 
   14942 int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
   14943 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
   14944 {
   14945     __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
   14946     cff = _mm_cmpeq_epi8 (a,a); //0xff
   14947     c80 = _mm_set1_epi8(0x80);
   14948     c1 = _mm_set1_epi8(1);
   14949     a_mask = _mm_and_si128(a, c80);
   14950     a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
   14951     a_neg = _mm_xor_si128(a, cff);
   14952     a_neg = _mm_and_si128(a_mask, a_neg);
   14953     a_pos = _mm_andnot_si128(a_mask, a);
   14954     a_comb = _mm_or_si128(a_pos, a_neg);
   14955     a_comb = vclzq_s8(a_comb);
   14956     return _mm_sub_epi8(a_comb, c1);
   14957 }
   14958 
   14959 int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
   14960 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
   14961 {
   14962     __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
   14963     cffff = _mm_cmpeq_epi16(a,a);
   14964     c8000 =  _mm_slli_epi16(cffff, 15); //0x8000
   14965     c1 = _mm_srli_epi16(cffff,15); //0x1
   14966     a_mask = _mm_and_si128(a, c8000);
   14967     a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
   14968     a_neg = _mm_xor_si128(a, cffff);
   14969     a_neg = _mm_and_si128(a_mask, a_neg);
   14970     a_pos = _mm_andnot_si128(a_mask, a);
   14971     a_comb = _mm_or_si128(a_pos, a_neg);
   14972     a_comb = vclzq_s16(a_comb);
   14973     return _mm_sub_epi16(a_comb, c1);
   14974 }
   14975 
   14976 int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
   14977 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
   14978 {
   14979     __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
   14980     cffffffff = _mm_cmpeq_epi32(a,a);
   14981     c80000000 =  _mm_slli_epi32(cffffffff, 31); //0x80000000
   14982     c1 = _mm_srli_epi32(cffffffff,31); //0x1
   14983     a_mask = _mm_and_si128(a, c80000000);
   14984     a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
   14985     a_neg = _mm_xor_si128(a, cffffffff);
   14986     a_neg = _mm_and_si128(a_mask, a_neg);
   14987     a_pos = _mm_andnot_si128(a_mask, a);
   14988     a_comb = _mm_or_si128(a_pos, a_neg);
   14989     a_comb = vclzq_s32(a_comb);
   14990     return _mm_sub_epi32(a_comb, c1);
   14991 }
   14992 
   14993 //************************* Count number of set bits   ********************************
   14994 //*************************************************************************************
    14995 //No corresponding SIMD solution. One option is to extract each element of a, convert it to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) on it;
    14996 //another option is the following nibble-table algorithm (a scalar reference follows vcntq_u8 below):
   14997 
   14998 uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
   14999 _NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
   15000 {
   15001     uint8x8_t res64;
   15002     __m128i res;
   15003     res = vcntq_u8(_pM128i(a));
   15004     return64(res);
   15005 }
   15006 
   15007 int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
   15008 #define vcnt_s8 vcnt_u8
   15009 
   15010 poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
   15011 #define vcnt_p8 vcnt_u8
   15012 
   15013 uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
   15014 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
   15015 {
   15016     _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
   15017                                                         /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
   15018                                                         /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
   15019                                                         /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4                                   };
   15020     __m128i maskLOW, mask, lowpopcnt, hipopcnt;
   15021     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
   15022     mask = _mm_and_si128(a, maskLOW);
   15023     lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
   15024     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
   15025     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
   15026     hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
   15027     return _mm_add_epi8(lowpopcnt, hipopcnt);
   15028 }
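
//An illustrative scalar reference only (the helper below is not part of the NEON API): the nibble-table
//idea used by vcntq_u8 above, written out for a single 8-bit lane.
_NEON2SSE_INLINE int _neon2sse_example_popcount8_scalar(int x) //x is treated as an unsigned 8-bit lane
{
    const int8_t table[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4}; //bits set in every possible nibble
    x &= 0xff;
    return table[x & 0x0f] + table[x >> 4]; //low nibble + high nibble, as vcntq_u8 does per lane via _mm_shuffle_epi8
}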
   15029 
   15030 int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
   15031 #define vcntq_s8 vcntq_u8
   15032 
   15033 poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
   15034 #define vcntq_p8 vcntq_u8
   15035 
   15036 //**************************************************************************************
   15037 //*********************** Logical operations ****************************************
   15038 //**************************************************************************************
   15039 //************************** Bitwise not ***********************************
    15040 //Several bitwise NOT implementations are possible for SIMD, e.g. "xor" with all ones, but the following one gives good performance (a sketch of the xor variant follows vmvnq_s32 below)
   15041 int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
   15042 _NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
   15043 {
   15044     int8x8_t res64;
   15045     __m128i res;
   15046     res = vmvnq_s8(_pM128i(a));
   15047     return64(res);
   15048 }
   15049 
   15050 int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
   15051 _NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
   15052 {
   15053     int16x4_t res64;
   15054     __m128i res;
   15055     res = vmvnq_s16(_pM128i(a));
   15056     return64(res);
   15057 }
   15058 
   15059 int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
   15060 _NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
   15061 {
   15062     int32x2_t res64;
   15063     __m128i res;
   15064     res = vmvnq_s32(_pM128i(a));
   15065     return64(res);
   15066 }
   15067 
   15068 uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
   15069 #define vmvn_u8 vmvn_s8
   15070 
   15071 uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
   15072 #define vmvn_u16 vmvn_s16
   15073 
   15074 uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
   15075 #define vmvn_u32 vmvn_s32
   15076 
   15077 poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
   15078 #define vmvn_p8 vmvn_u8
   15079 
   15080 int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
   15081 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
   15082 {
   15083     __m128i c1;
   15084     c1 = _mm_cmpeq_epi8 (a,a); //0xff
   15085     return _mm_andnot_si128 (a, c1);
   15086 }
   15087 
   15088 int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
   15089 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
   15090 {
   15091     __m128i c1;
   15092     c1 = _mm_cmpeq_epi16 (a,a); //0xffff
   15093     return _mm_andnot_si128 (a, c1);
   15094 }
   15095 
   15096 int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
   15097 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
   15098 {
   15099     __m128i c1;
   15100     c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
   15101     return _mm_andnot_si128 (a, c1);
   15102 }
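
//An illustrative sketch only (the helper below is not part of the NEON API): the "xor with all ones"
//alternative mentioned above; it produces the same result as the andnot-based vmvnq_* implementations.
_NEON2SSE_INLINE __m128i _neon2sse_example_mvn_via_xor(__m128i a)
{
    __m128i ones = _mm_cmpeq_epi32(a, a); //all bits set
    return _mm_xor_si128(a, ones); //bitwise NOT of a
}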
   15103 
   15104 uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
   15105 #define vmvnq_u8 vmvnq_s8
   15106 
   15107 uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
   15108 #define vmvnq_u16 vmvnq_s16
   15109 
   15110 uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
   15111 #define vmvnq_u32 vmvnq_s32
   15112 
   15113 poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
   15114 #define vmvnq_p8 vmvnq_u8
   15115 
   15116 //****************** Bitwise and ***********************
   15117 //******************************************************
   15118 int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
   15119 _NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
   15120 {
   15121     int8x8_t res64;
   15122     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
   15123 }
   15124 
   15125 int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
   15126 _NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
   15127 {
   15128     int16x4_t res64;
   15129     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
   15130 }
   15131 
   15132 int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
   15133 _NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
   15134 {
   15135     int32x2_t res64;
   15136     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
   15137 }
   15138 
   15139 
   15140 int64x1_t vand_s64(int64x1_t a,  int64x1_t b); // VAND d0,d0,d0
   15141 _NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a,  int64x1_t b)
   15142 {
   15143     int64x1_t res;
   15144     res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
   15145     return res;
   15146 }
   15147 
   15148 uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
   15149 #define vand_u8 vand_s8
   15150 
   15151 uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
   15152 #define vand_u16 vand_s16
   15153 
   15154 uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
   15155 #define vand_u32 vand_s32
   15156 
   15157 uint64x1_t vand_u64(uint64x1_t a,  uint64x1_t b); // VAND d0,d0,d0
   15158 #define vand_u64 vand_s64
   15159 
   15160 
   15161 int8x16_t   vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
   15162 #define vandq_s8 _mm_and_si128
   15163 
   15164 int16x8_t   vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
   15165 #define vandq_s16 _mm_and_si128
   15166 
   15167 int32x4_t   vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
   15168 #define vandq_s32 _mm_and_si128
   15169 
   15170 int64x2_t   vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
   15171 #define vandq_s64 _mm_and_si128
   15172 
   15173 uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
   15174 #define vandq_u8 _mm_and_si128
   15175 
   15176 uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
   15177 #define vandq_u16 _mm_and_si128
   15178 
   15179 uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
   15180 #define vandq_u32 _mm_and_si128
   15181 
   15182 uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
   15183 #define vandq_u64 _mm_and_si128
   15184 
   15185 //******************** Bitwise or *********************************
   15186 //******************************************************************
   15187 int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
   15188 _NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
   15189 {
   15190     int8x8_t res64;
   15191     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
   15192 }
   15193 
   15194 
   15195 int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
   15196 _NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
   15197 {
   15198     int16x4_t res64;
   15199     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
   15200 }
   15201 
   15202 
   15203 int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
   15204 _NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
   15205 {
   15206     int32x2_t res64;
   15207     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
   15208 }
   15209 
   15210 
   15211 int64x1_t vorr_s64(int64x1_t a,  int64x1_t b); // VORR d0,d0,d0
   15212 _NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b)
   15213 {
   15214     int64x1_t res;
   15215     res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
   15216     return res;
   15217 }
   15218 
   15219 uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
   15220 #define vorr_u8 vorr_s8
   15221 
   15222 uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
   15223 #define vorr_u16 vorr_s16
   15224 
   15225 uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
   15226 #define vorr_u32 vorr_s32
   15227 
   15228 uint64x1_t vorr_u64(uint64x1_t a,  uint64x1_t b); // VORR d0,d0,d0
   15229 #define vorr_u64 vorr_s64
   15230 
   15231 int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
   15232 #define vorrq_s8 _mm_or_si128
   15233 
   15234 int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
   15235 #define vorrq_s16 _mm_or_si128
   15236 
   15237 int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
   15238 #define vorrq_s32 _mm_or_si128
   15239 
   15240 int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
   15241 #define vorrq_s64 _mm_or_si128
   15242 
   15243 uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
   15244 #define vorrq_u8 _mm_or_si128
   15245 
   15246 uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
   15247 #define vorrq_u16 _mm_or_si128
   15248 
   15249 uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
   15250 #define vorrq_u32 _mm_or_si128
   15251 
   15252 uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
   15253 #define vorrq_u64 _mm_or_si128
   15254 
   15255 //************* Bitwise exclusive or (EOR or XOR) ******************
   15256 //*******************************************************************
   15257 int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
   15258 _NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
   15259 {
   15260     int8x8_t res64;
   15261     return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
   15262 }
   15263 
   15264 int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
   15265 #define veor_s16 veor_s8
   15266 
   15267 int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
   15268 #define veor_s32 veor_s8
   15269 
   15270 int64x1_t veor_s64(int64x1_t a,  int64x1_t b); // VEOR d0,d0,d0
   15271 _NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a,  int64x1_t b)
   15272 {
   15273     int64x1_t res;
   15274     res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
   15275     return res;
   15276 }
   15277 
   15278 uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
   15279 #define veor_u8 veor_s8
   15280 
   15281 uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
   15282 #define veor_u16 veor_s16
   15283 
   15284 uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
   15285 #define veor_u32 veor_s32
   15286 
   15287 uint64x1_t veor_u64(uint64x1_t a,  uint64x1_t b); // VEOR d0,d0,d0
   15288 #define veor_u64 veor_s64
   15289 
   15290 int8x16_t   veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
   15291 #define veorq_s8 _mm_xor_si128
   15292 
   15293 int16x8_t   veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
   15294 #define veorq_s16 _mm_xor_si128
   15295 
   15296 int32x4_t   veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
   15297 #define veorq_s32 _mm_xor_si128
   15298 
   15299 int64x2_t   veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
   15300 #define veorq_s64 _mm_xor_si128
   15301 
   15302 uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
   15303 #define veorq_u8 _mm_xor_si128
   15304 
   15305 uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
   15306 #define veorq_u16 _mm_xor_si128
   15307 
   15308 uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
   15309 #define veorq_u32 _mm_xor_si128
   15310 
   15311 uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
   15312 #define veorq_u64 _mm_xor_si128
   15313 
   15314 //********************** Bit Clear **********************************
   15315 //*******************************************************************
   15316 //Logical AND complement (AND negation or AND NOT)
   15317 int8x8_t   vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
   15318 _NEON2SSE_INLINE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b)
   15319 {
   15320     int8x8_t res64;
   15321     return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
   15322 }
   15323 
   15324 int16x4_t   vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
   15325 #define vbic_s16 vbic_s8
   15326 
   15327 int32x2_t   vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
   15328 #define vbic_s32 vbic_s8
   15329 
   15330 int64x1_t   vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
   15331 _NEON2SSE_INLINE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b)
   15332 {
   15333     int64x1_t res;
   15334     res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
   15335     return res;
   15336 }
   15337 
   15338 uint8x8_t   vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
   15339 #define vbic_u8 vbic_s8
   15340 
   15341 uint16x4_t   vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
   15342 #define vbic_u16 vbic_s16
   15343 
   15344 uint32x2_t   vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
   15345 #define vbic_u32 vbic_s32
   15346 
   15347 uint64x1_t   vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
   15348 #define vbic_u64 vbic_s64
   15349 
   15350 int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
   15351 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15352 
   15353 int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
   15354 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15355 
   15356 int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
   15357 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15358 
   15359 int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
   15360 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15361 
   15362 uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
   15363 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15364 
   15365 uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
   15366 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15367 
   15368 uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
   15369 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15370 
   15371 uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
   15372 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
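
//An illustrative sketch only (the helper below is not part of the NEON API): VBIC computes a & (~b),
//which is why the operands of _mm_andnot_si128 (it computes (~first) & second) appear swapped above.
_NEON2SSE_INLINE int _neon2sse_example_vbic(void)
{
    uint8x16_t x = _mm_set1_epi8(0x0f);
    uint8x16_t y = _mm_set1_epi8(0x05);
    uint8x16_t r = vbicq_u8(x, y); //0x0f & ~0x05 = 0x0a in every lane
    return _mm_movemask_epi8(_mm_cmpeq_epi8(r, _mm_set1_epi8(0x0a))) == 0xffff; //1 if all lanes match
}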
   15373 
   15374 //**************** Bitwise OR complement ********************************
    15375 //****************************************************************************
    15376 //no exact IA32 match, so it needs to be implemented as follows
   15377 int8x8_t vorn_s8(int8x8_t a,  int8x8_t b); // VORN d0,d0,d0
   15378 _NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b)
   15379 {
   15380     int8x8_t res64;
   15381     return64(vornq_s8(_pM128i(a), _pM128i(b)));
   15382 }
   15383 
   15384 
   15385 int16x4_t vorn_s16(int16x4_t a,  int16x4_t b); // VORN d0,d0,d0
   15386 _NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b)
   15387 {
   15388     int16x4_t res64;
   15389     return64(vornq_s16(_pM128i(a), _pM128i(b)));
   15390 }
   15391 
   15392 
   15393 int32x2_t vorn_s32(int32x2_t a,  int32x2_t b); // VORN d0,d0,d0
   15394 _NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b)
   15395 {
   15396     int32x2_t res64;
   15397     return64(vornq_s32(_pM128i(a), _pM128i(b)));
   15398 }
   15399 
   15400 
   15401 int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
   15402 _NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
   15403 {
   15404     int64x1_t res;
   15405     res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
   15406     return res;
   15407 }
   15408 
   15409 uint8x8_t vorn_u8(uint8x8_t a,  uint8x8_t b); // VORN d0,d0,d0
   15410 #define vorn_u8 vorn_s8
   15411 
   15412 
   15413 uint16x4_t vorn_u16(uint16x4_t a,  uint16x4_t b); // VORN d0,d0,d0
   15414 #define vorn_u16 vorn_s16
   15415 
   15416 uint32x2_t vorn_u32(uint32x2_t a,  uint32x2_t b); // VORN d0,d0,d0
   15417 #define vorn_u32 vorn_s32
   15418 
   15419 uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
   15420 #define vorn_u64 vorn_s64
   15421 
   15422 
   15423 int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
   15424 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
   15425 {
   15426     __m128i b1;
   15427     b1 = vmvnq_s8( b); //bitwise not for b
   15428     return _mm_or_si128 (a, b1);
   15429 }
   15430 
   15431 int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
   15432 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
   15433 {
   15434     __m128i b1;
   15435     b1 = vmvnq_s16( b); //bitwise not for b
   15436     return _mm_or_si128 (a, b1);
   15437 }
   15438 
   15439 int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
   15440 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
   15441 {
   15442     __m128i b1;
   15443     b1 = vmvnq_s32( b); //bitwise not for b
   15444     return _mm_or_si128 (a, b1);
   15445 }
   15446 
   15447 int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
   15448 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
   15449 {
   15450     __m128i c1, b1;
   15451     c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
   15452     b1 = _mm_andnot_si128 (b, c1);
   15453     return _mm_or_si128 (a, b1);
   15454 }
   15455 
   15456 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
   15457 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
   15458 {
   15459     __m128i b1;
   15460     b1 = vmvnq_u8( b); //bitwise not for b
   15461     return _mm_or_si128 (a, b1);
   15462 }
   15463 
   15464 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
   15465 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
   15466 {
   15467     __m128i b1;
   15468     b1 = vmvnq_s16( b); //bitwise not for b
   15469     return _mm_or_si128 (a, b1);
   15470 }
   15471 
   15472 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
   15473 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
   15474 {
   15475     __m128i b1;
   15476     b1 = vmvnq_u32( b); //bitwise not for b
   15477     return _mm_or_si128 (a, b1);
   15478 }
   15479 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
   15480 #define vornq_u64 vornq_s64
   15481 
   15482 //********************* Bitwise Select *****************************
   15483 //******************************************************************
    15484 //Note: on ARM this intrinsic may compile to any of VBSL/VBIF/VBIT depending on register allocation.
   15485 
   15486 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
   15487 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
   15488 
   15489 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
   15490 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
   15491 
   15492 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
   15493 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
   15494 
    15495 //Only VBSL is implemented for SIMD here (see the illustrative sketch after vbslq_s8 below)
   15496 int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
   15497 _NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
   15498 {
   15499     int8x8_t res64;
   15500     __m128i res;
   15501     res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
   15502     return64(res);
   15503 }
   15504 
   15505 int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
   15506 #define vbsl_s16 vbsl_s8
   15507 
   15508 int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
   15509 #define vbsl_s32 vbsl_s8
   15510 
   15511 int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
   15512 _NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
   15513 {
   15514     int64x1_t res;
   15515     res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
   15516     return res;
   15517 }
   15518 
   15519 uint8x8_t vbsl_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
   15520 #define vbsl_u8 vbsl_s8
   15521 
   15522 uint16x4_t vbsl_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
   15523 #define vbsl_u16 vbsl_s8
   15524 
   15525 uint32x2_t vbsl_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
   15526 #define vbsl_u32 vbsl_s8
   15527 
   15528 uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
   15529 #define vbsl_u64 vbsl_s64
   15530 
   15531 float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
   15532 _NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
   15533 {
   15534     __m128 sel1, sel2;
   15535     __m64_128 res64;
   15536     sel1 = _mm_and_ps   (_pM128(a), _pM128(b));
   15537     sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
   15538     sel1 = _mm_or_ps (sel1, sel2);
   15539     _M64f(res64, sel1);
   15540     return res64;
   15541 }
   15542 
   15543 poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
   15544 #define  vbsl_p8 vbsl_s8
   15545 
   15546 poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
   15547 #define  vbsl_p16 vbsl_s8
   15548 
   15549 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
   15550 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
   15551 {
   15552     __m128i sel1, sel2;
   15553     sel1 = _mm_and_si128   (a, b);
   15554     sel2 = _mm_andnot_si128 (a, c);
   15555     return _mm_or_si128 (sel1, sel2);
   15556 }
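
//An illustrative sketch only (the helper below is not part of the NEON API): the (a & b) | (~a & c)
//selection described above, with a per-byte mask - lanes where the mask is all ones come from b,
//the remaining lanes come from c.
_NEON2SSE_INLINE int _neon2sse_example_vbsl(void)
{
    uint8x16_t mask = _mm_setr_epi8(-1,0,-1,0, -1,0,-1,0, -1,0,-1,0, -1,0,-1,0); //0xff selects b, 0x00 selects c
    int8x16_t b = _mm_set1_epi8(1);
    int8x16_t c = _mm_set1_epi8(2);
    int8x16_t r = vbslq_s8(mask, b, c); //1,2,1,2,... across the 16 lanes
    return _mm_cvtsi128_si32(r) == 0x02010201; //the four low lanes are 1,2,1,2 (little endian)
}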
   15557 
   15558 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
   15559 #define vbslq_s16 vbslq_s8
   15560 
   15561 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
   15562 #define vbslq_s32 vbslq_s8
   15563 
   15564 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
   15565 #define vbslq_s64 vbslq_s8
   15566 
   15567 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
   15568 #define vbslq_u8 vbslq_s8
   15569 
   15570 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
   15571 #define vbslq_u16 vbslq_s8
   15572 
   15573 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
   15574 #define vbslq_u32 vbslq_s8
   15575 
   15576 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
   15577 #define vbslq_u64 vbslq_s8
   15578 
   15579 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
   15580 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
   15581 {
   15582     __m128 sel1, sel2;
   15583     sel1 = _mm_and_ps   (*(__m128*)&a, b);
   15584     sel2 = _mm_andnot_ps (*(__m128*)&a, c);
   15585     return _mm_or_ps (sel1, sel2);
   15586 }
   15587 
   15588 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
   15589 #define vbslq_p8 vbslq_u8
   15590 
   15591 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
   15592 #define vbslq_p16 vbslq_s8
   15593 
   15594 //************************************************************************************
   15595 //**************** Transposition operations ****************************************
   15596 //************************************************************************************
   15597 //*****************  Vector Transpose ************************************************
   15598 //************************************************************************************
   15599 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
    15600 // making the result look like (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....)
   15601 int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
   15602 _NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
   15603 {
   15604     int8x8x2_t val;
   15605     __m128i tmp, val0;
   15606     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; //mask8_trnsp
   15607     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
   15608     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask16_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
   15609     vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
   15610     return val;
   15611 }
   15612 
   15613 int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
   15614 _NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
   15615 {
   15616     int16x4x2_t val;
   15617     __m128i tmp, val0;
   15618     _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
   15619     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
   15620     val0 =  _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
   15621     vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
   15622     return val;
   15623 }
   15624 
   15625 int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
   15626 _NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
   15627 {
   15628     int32x2x2_t val;
   15629     __m128i val0;
   15630     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
   15631     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
   15632     return val;
   15633 }
   15634 
   15635 uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
   15636 #define vtrn_u8 vtrn_s8
   15637 
   15638 uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
   15639 #define vtrn_u16 vtrn_s16
   15640 
   15641 uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
   15642 #define vtrn_u32 vtrn_s32
   15643 
   15644 float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
   15645 _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
   15646 {
   15647     float32x2x2_t val;
   15648     val.val[0].m64_f32[0] = a.m64_f32[0];
   15649     val.val[0].m64_f32[1] = b.m64_f32[0];
   15650     val.val[1].m64_f32[0] = a.m64_f32[1];
   15651     val.val[1].m64_f32[1] = b.m64_f32[1];
   15652     return val; //a0,b0,a1,b1
   15653 }
   15654 
   15655 poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
   15656 #define  vtrn_p8 vtrn_u8
   15657 
   15658 poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
   15659 #define  vtrn_p16 vtrn_s16
   15660 
    15661 int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
   15662 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
   15663 {
   15664     int8x16x2_t r8x16;
   15665     __m128i a_sh, b_sh;
   15666     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
   15667     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
   15668     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
   15669 
   15670     r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
   15671     r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
   15672     return r8x16;
   15673 }
   15674 
   15675 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
   15676 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
   15677 {
   15678     int16x8x2_t v16x8;
   15679     __m128i a_sh, b_sh;
   15680     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
   15681     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
   15682     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
   15683     v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
   15684     v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
   15685     return v16x8;
   15686 }
   15687 
   15688 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
   15689 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
   15690 {
    15691     //this may not be an optimal solution compared with a serial one
   15692     int32x4x2_t v32x4;
   15693     __m128i a_sh, b_sh;
   15694     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
   15695     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
   15696 
   15697     v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
   15698     v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3,  b3
   15699     return v32x4;
   15700 }
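
//An illustrative sketch only (the helper below is not part of the NEON API): the 2 x 2 transposition
//described above for the 32-bit case - {a0,a1,a2,a3} and {b0,b1,b2,b3} become {a0,b0,a2,b2} and {a1,b1,a3,b3}.
_NEON2SSE_INLINE int _neon2sse_example_vtrnq_s32(void)
{
    int32x4x2_t r = vtrnq_s32(_mm_setr_epi32(0, 1, 2, 3), _mm_setr_epi32(10, 11, 12, 13));
    return (_mm_cvtsi128_si32(r.val[0]) == 0) && (_mm_cvtsi128_si32(r.val[1]) == 1); //val[0] starts with a0, val[1] with a1
}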
   15701 
   15702 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
   15703 #define vtrnq_u8 vtrnq_s8
   15704 
   15705 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
   15706 #define vtrnq_u16 vtrnq_s16
   15707 
   15708 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
   15709 #define vtrnq_u32 vtrnq_s32
   15710 
   15711 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
   15712 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
   15713 {
    15714     //this may not be an optimal solution compared with a serial one
   15715     float32x4x2_t f32x4;
   15716     __m128 a_sh, b_sh;
    15717     a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
    15718     b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
   15719 
   15720     f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
   15721     f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3,  b3
   15722     return f32x4;
   15723 }
   15724 
   15725 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
   15726 #define vtrnq_p8 vtrnq_s8
   15727 
   15728 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
   15729 #define vtrnq_p16 vtrnq_s16
   15730 
   15731 //***************** Interleave elements ***************************
   15732 //*****************************************************************
   15733 //output has (a0,b0,a1,b1, a2,b2,.....)
   15734 int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
   15735 _NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
   15736 {
   15737     int8x8x2_t val;
   15738     __m128i val0;
   15739     val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
   15740     vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15741     return val;
   15742 }
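
//Illustrative usage sketch for the interleave ("zip") operations (never compiled):
//vld1_s8 is assumed to be provided elsewhere in this header.
#if 0
static void vzip_s8_example(void)
{
    _NEON2SSE_ALIGN_16 int8_t a_in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    _NEON2SSE_ALIGN_16 int8_t b_in[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int8x8x2_t r = vzip_s8(vld1_s8(a_in), vld1_s8(b_in));
    //r.val[0] = {0,10, 1,11, 2,12, 3,13} and r.val[1] = {4,14, 5,15, 6,16, 7,17},
    //i.e. the output is (a0,b0, a1,b1, a2,b2, ...) as described above
}
#endif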
   15743 
   15744 int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
   15745 _NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
   15746 {
   15747     int16x4x2_t val;
   15748     __m128i val0;
   15749     val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
   15750     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15751     return val;
   15752 }
   15753 
   15754 int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
   15755 #define vzip_s32 vtrn_s32
   15756 
   15757 uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
   15758 #define vzip_u8 vzip_s8
   15759 
   15760 uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
   15761 #define vzip_u16 vzip_s16
   15762 
   15763 uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
   15764 #define vzip_u32 vzip_s32
   15765 
   15766 float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
   15767 #define vzip_f32 vtrn_f32
   15768 
   15769 poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
   15770 #define vzip_p8 vzip_u8
   15771 
   15772 poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
   15773 #define vzip_p16 vzip_u16
   15774 
   15775 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
   15776 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
   15777 {
   15778     int8x16x2_t r8x16;
   15779     r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
   15780     r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
   15781     return r8x16;
   15782 }
   15783 
   15784 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
   15785 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
   15786 {
   15787     int16x8x2_t r16x8;
   15788     r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
   15789     r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
   15790     return r16x8;
   15791 }
   15792 
   15793 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
   15794 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
   15795 {
   15796     int32x4x2_t r32x4;
   15797     r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
   15798     r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
   15799     return r32x4;
   15800 }
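
//Illustrative usage sketch for the full-width (q register) zip (never compiled):
#if 0
static void vzipq_s32_example(void)
{
    _NEON2SSE_ALIGN_16 int32_t a_in[4] = {0, 1, 2, 3};
    _NEON2SSE_ALIGN_16 int32_t b_in[4] = {10, 11, 12, 13};
    int32x4x2_t r = vzipq_s32(vld1q_s32(a_in), vld1q_s32(b_in));
    //r.val[0] = {0, 10, 1, 11} (low halves interleaved), r.val[1] = {2, 12, 3, 13} (high halves)
}
#endif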
   15801 
   15802 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
   15803 #define vzipq_u8 vzipq_s8
   15804 
   15805 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
   15806 #define vzipq_u16 vzipq_s16
   15807 
   15808 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
   15809 #define vzipq_u32 vzipq_s32
   15810 
   15811 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
   15812 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
   15813 {
   15814     float32x4x2_t f32x4;
   15815     f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
   15816     f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
   15817     return f32x4;
   15818 }
   15819 
   15820 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
   15821 #define vzipq_p8 vzipq_u8
   15822 
   15823 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
   15824 #define vzipq_p16 vzipq_u16
   15825 
   15826 //*********************** De-Interleave elements *************************
   15827 //*************************************************************************
    15828 //As a result of these functions the first val contains (a0,a2,a4,...,b0,b2,b4,...) and the second val contains (a1,a3,a5,...,b1,b3,b5,...)
    15829 //IA32 SIMD has no such instructions, so a shuffle is required
   15830 int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
   15831 _NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
   15832 {
   15833     int8x8x2_t val;
   15834     __m128i tmp, val0;
   15835     _NEON2SSE_ALIGN_16 int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13,  2, 6, 10, 14, 3, 7, 11,15};
   15836     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
   15837     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6),  (a1, a3, a5, a7, b1,b3, b5, b7)
   15838     vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15839     return val;
   15840 }
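
//Illustrative usage sketch for the de-interleave ("unzip") operations (never compiled):
#if 0
static void vuzp_s8_example(void)
{
    _NEON2SSE_ALIGN_16 int8_t a_in[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    _NEON2SSE_ALIGN_16 int8_t b_in[8] = {10, 11, 12, 13, 14, 15, 16, 17};
    int8x8x2_t r = vuzp_s8(vld1_s8(a_in), vld1_s8(b_in));
    //r.val[0] = {0, 2, 4, 6, 10, 12, 14, 16}  - even-indexed elements of a, then of b
    //r.val[1] = {1, 3, 5, 7, 11, 13, 15, 17}  - odd-indexed elements of a, then of b
}
#endif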
   15841 
   15842 int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
   15843 _NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
   15844 {
   15845     int16x4x2_t val;
   15846     __m128i tmp, val0;
   15847     _NEON2SSE_ALIGN_16 int8_t maskdlv16[16] = {0,1,  8,9,  2,3, 10,11,  4,5, 12,13, 6,7, 14,15};
   15848     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
   15849     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
   15850     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15851     return val;
   15852 }
   15853 
   15854 int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
   15855 _NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
   15856 {
   15857     int32x2x2_t val;
   15858     __m128i val0;
   15859     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
   15860     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15861     return val;
   15862 }
   15863 
   15864 uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
   15865 #define vuzp_u8 vuzp_s8
   15866 
   15867 uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
   15868 #define vuzp_u16 vuzp_s16
   15869 
   15870 uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
   15871 #define vuzp_u32 vuzp_s32
   15872 
   15873 float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
   15874 #define vuzp_f32 vzip_f32
   15875 
   15876 poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
   15877 #define vuzp_p8 vuzp_u8
   15878 
   15879 poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
   15880 #define vuzp_p16 vuzp_u16
   15881 
   15882 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
   15883 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
   15884 {
   15885     int8x16x2_t v8x16;
   15886     __m128i a_sh, b_sh;
   15887     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
   15888     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
   15889     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
    15890     //unpack64 is needed to combine the lower (upper) 64 bits of a with the lower (upper) 64 bits of b
    15891     v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14
   15892     v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
   15893     return v8x16;
   15894 }
   15895 
   15896 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
   15897 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
   15898 {
   15899     int16x8x2_t v16x8;
   15900     __m128i a_sh, b_sh;
   15901     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
   15902     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
   15903     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
   15904     v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
   15905     v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
   15906     return v16x8;
   15907 }
   15908 
   15909 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
   15910 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
   15911 {
    15912     //may not be an optimal solution compared with a serial one
   15913     int32x4x2_t v32x4;
   15914     __m128i a_sh, b_sh;
   15915     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
   15916     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
   15917 
   15918     v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
   15919     v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
   15920     return v32x4;
   15921 }
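
//Illustrative usage sketch for the q-register unzip (never compiled):
#if 0
static void vuzpq_s32_example(void)
{
    _NEON2SSE_ALIGN_16 int32_t a_in[4] = {0, 1, 2, 3};
    _NEON2SSE_ALIGN_16 int32_t b_in[4] = {10, 11, 12, 13};
    int32x4x2_t r = vuzpq_s32(vld1q_s32(a_in), vld1q_s32(b_in));
    //r.val[0] = {0, 2, 10, 12} (even-indexed elements), r.val[1] = {1, 3, 11, 13} (odd-indexed elements)
}
#endif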
   15922 
   15923 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
   15924 #define vuzpq_u8 vuzpq_s8
   15925 
   15926 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
   15927 #define vuzpq_u16 vuzpq_s16
   15928 
   15929 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
   15930 #define vuzpq_u32 vuzpq_s32
   15931 
   15932 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
   15933 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
   15934 {
   15935     float32x4x2_t v32x4;
    15936     v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness
    15937     v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness
   15938     return v32x4;
   15939 }
   15940 
   15941 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
   15942 #define vuzpq_p8 vuzpq_u8
   15943 
   15944 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
   15945 #define vuzpq_p16 vuzpq_u16
   15946 
   15947 //##############################################################################################
   15948 //*********************** Reinterpret cast intrinsics.******************************************
   15949 //##############################################################################################
    15950 // Not a part of the official NEON instruction set, but available in the gcc compiler *********************
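
//The reinterpret casts below follow two patterns. When the source and destination map to the
//same SSE type (the q-register integer and poly types are all __m128i here), the macro body is
//left empty, so the call expands to just its argument and costs nothing. When a float32 vector
//is involved, an explicit bit-cast (_M128i / _mm_castps_si128 or a pointer cast) is used because
//float32x4_t maps to __m128 rather than __m128i. Illustrative sketch (never compiled), assuming
//vld1q_u32 is provided elsewhere in this header:
#if 0
static void vreinterpretq_example(void)
{
    _NEON2SSE_ALIGN_16 uint32_t u_in[4] = {1, 2, 3, 4};
    uint32x4_t u = vld1q_u32(u_in);
    poly8x16_t p = vreinterpretq_p8_u32 (u); //empty macro: expands to just (u), no instruction emitted
    poly8x16_t q = vreinterpretq_p8_f32 (vreinterpretq_f32_u32(u)); //f32 path: explicit bit-casts, the data does not change
}
#endif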
   15951 poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
   15952 #define vreinterpret_p8_u32
   15953 
   15954 poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
   15955 #define vreinterpret_p8_u16
   15956 
   15957 poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
   15958 #define vreinterpret_p8_u8
   15959 
   15960 poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
   15961 #define vreinterpret_p8_s32
   15962 
   15963 poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
   15964 #define vreinterpret_p8_s16
   15965 
   15966 poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
   15967 #define vreinterpret_p8_s8
   15968 
   15969 poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
   15970 #define vreinterpret_p8_u64
   15971 
   15972 poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
   15973 #define vreinterpret_p8_s64
   15974 
   15975 poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
   15976 #define vreinterpret_p8_f32
   15977 
   15978 poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
   15979 #define vreinterpret_p8_p16
   15980 
   15981 poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
   15982 #define vreinterpretq_p8_u32
   15983 
   15984 poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
   15985 #define vreinterpretq_p8_u16
   15986 
   15987 poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
   15988 #define vreinterpretq_p8_u8
   15989 
   15990 poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
   15991 #define vreinterpretq_p8_s32
   15992 
   15993 poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
   15994 #define vreinterpretq_p8_s16
   15995 
   15996 poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
   15997 #define vreinterpretq_p8_s8
   15998 
   15999 poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
   16000 #define vreinterpretq_p8_u64
   16001 
   16002 poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
   16003 #define vreinterpretq_p8_s64
   16004 
   16005 poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
   16006 #define vreinterpretq_p8_f32(t) _M128i(t)
   16007 
   16008 poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
   16009 #define vreinterpretq_p8_p16
   16010 
   16011 poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
   16012 #define vreinterpret_p16_u32
   16013 
   16014 poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
   16015 #define vreinterpret_p16_u16
   16016 
   16017 poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
   16018 #define vreinterpret_p16_u8
   16019 
   16020 poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
   16021 #define vreinterpret_p16_s32
   16022 
   16023 poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
   16024 #define vreinterpret_p16_s16
   16025 
   16026 poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
   16027 #define vreinterpret_p16_s8
   16028 
   16029 poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
   16030 #define vreinterpret_p16_u64
   16031 
   16032 poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
   16033 #define vreinterpret_p16_s64
   16034 
   16035 poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
   16036 #define vreinterpret_p16_f32
   16037 
   16038 poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
   16039 #define vreinterpret_p16_p8
   16040 
   16041 poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
   16042 #define vreinterpretq_p16_u32
   16043 
   16044 poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
   16045 #define vreinterpretq_p16_u16
   16046 
   16047 poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
   16048 #define vreinterpretq_p16_s32
   16049 
   16050 poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
   16051 #define vreinterpretq_p16_s16
   16052 
   16053 poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
   16054 #define vreinterpretq_p16_s8
   16055 
   16056 poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
   16057 #define vreinterpretq_p16_u64
   16058 
   16059 poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
   16060 #define vreinterpretq_p16_s64
   16061 
   16062 poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
   16063 #define vreinterpretq_p16_f32(t) _M128i(t)
   16064 
   16065 poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
   16066 #define vreinterpretq_p16_p8  vreinterpretq_s16_p8
   16067 
   16068 //****  Integer to float  ******
   16069 float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
   16070 #define vreinterpret_f32_u32(t) (*(__m64_128*)&(t))
   16071 
   16072 
   16073 float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
   16074 #define vreinterpret_f32_u16 vreinterpret_f32_u32
   16075 
   16076 
   16077 float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
   16078 #define vreinterpret_f32_u8 vreinterpret_f32_u32
   16079 
   16080 
   16081 float32x2_t vreinterpret_f32_s32 (int32x2_t t);
   16082 #define vreinterpret_f32_s32 vreinterpret_f32_u32
   16083 
   16084 
   16085 float32x2_t vreinterpret_f32_s16 (int16x4_t t);
   16086 #define vreinterpret_f32_s16 vreinterpret_f32_u32
   16087 
   16088 float32x2_t vreinterpret_f32_s8 (int8x8_t t);
   16089 #define vreinterpret_f32_s8 vreinterpret_f32_u32
   16090 
   16091 
   16092 float32x2_t vreinterpret_f32_u64(uint64x1_t t);
   16093 #define vreinterpret_f32_u64 vreinterpret_f32_u32
   16094 
   16095 
   16096 float32x2_t vreinterpret_f32_s64 (int64x1_t t);
   16097 #define vreinterpret_f32_s64 vreinterpret_f32_u32
   16098 
   16099 
   16100 float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
   16101 #define vreinterpret_f32_p16 vreinterpret_f32_u32
   16102 
   16103 float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
   16104 #define vreinterpret_f32_p8 vreinterpret_f32_u32
   16105 
   16106 float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
    16107 #define vreinterpretq_f32_u32(t) (*(__m128*)&(t))
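
//Illustrative sketch (never compiled): reinterpreting integer bit patterns as floats changes
//only the type, not the bits, so the IEEE-754 pattern 0x3F800000 becomes exactly 1.0f.
//vdupq_n_u32 is assumed to be provided elsewhere in this header.
#if 0
static void vreinterpretq_f32_u32_example(void)
{
    uint32x4_t bits = vdupq_n_u32(0x3F800000); //bit pattern of 1.0f in every lane
    float32x4_t f = vreinterpretq_f32_u32(bits); //every lane of f is now exactly 1.0f
}
#endif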
   16108 
   16109 float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
   16110 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32
   16111 
   16112 float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
   16113 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32
   16114 
   16115 float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
   16116 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32
   16117 
   16118 float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
   16119 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32
   16120 
   16121 float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
   16122 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32
   16123 
   16124 float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
   16125 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32
   16126 
   16127 float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
   16128 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32
   16129 
   16130 float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
   16131 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32
   16132 
   16133 float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
   16134 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32
   16135 
   16136 //*** Integer type conversions ******************
    16137 //no conversion is necessary for the following functions because the source and destination share the same underlying data type
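
//Because each integer reinterpret below is an empty macro, the call disappears during
//preprocessing and only the operand remains; this compiles because the corresponding source and
//destination vector types share one underlying representation in this header (e.g. all
//q-register integer types are __m128i). Sketch (never compiled):
#if 0
static void vreinterpretq_s64_u32_example(void)
{
    _NEON2SSE_ALIGN_16 uint32_t u_in[4] = {1, 2, 3, 4};
    uint32x4_t u = vld1q_u32(u_in);
    int64x2_t s64 = vreinterpretq_s64_u32 (u); //expands to just (u): the same 128 bits, viewed as two int64 lanes
}
#endif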
   16138 int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
   16139 #define vreinterpret_s64_u32
   16140 
   16141 int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
   16142 #define vreinterpret_s64_u16
   16143 
   16144 int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
   16145 #define vreinterpret_s64_u8
   16146 
   16147 int64x1_t vreinterpret_s64_s32 (int32x2_t t);
   16148 #define  vreinterpret_s64_s32
   16149 
   16150 int64x1_t vreinterpret_s64_s16 (int16x4_t t);
   16151 #define vreinterpret_s64_s16
   16152 
   16153 int64x1_t vreinterpret_s64_s8 (int8x8_t t);
   16154 #define  vreinterpret_s64_s8
   16155 
   16156 int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
   16157 #define  vreinterpret_s64_u64
   16158 
   16159 int64x1_t vreinterpret_s64_f32 (float32x2_t t);
   16160 #define  vreinterpret_s64_f32
   16161 
   16162 int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
   16163 #define vreinterpret_s64_p16
   16164 
   16165 int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
   16166 #define vreinterpret_s64_p8
   16167 
   16168 int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
   16169 #define vreinterpretq_s64_u32
   16170 
    16171 int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
   16172 #define vreinterpretq_s64_s16
   16173 
   16174 int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
   16175 #define vreinterpretq_s64_u8
   16176 
   16177 int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
   16178 #define vreinterpretq_s64_s32
   16179 
    16180 int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
   16181 #define vreinterpretq_s64_u16
   16182 
   16183 int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
   16184 #define vreinterpretq_s64_s8
   16185 
   16186 int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
   16187 #define vreinterpretq_s64_u64
   16188 
   16189 int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
   16190 #define vreinterpretq_s64_f32(t) _M128i(t)
   16191 
   16192 int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
   16193 #define vreinterpretq_s64_p16
   16194 
   16195 int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
   16196 #define vreinterpretq_s64_p8
   16197 
   16198 uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
   16199 #define vreinterpret_u64_u32
   16200 
   16201 uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
   16202 #define vreinterpret_u64_u16
   16203 
   16204 uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
   16205 #define vreinterpret_u64_u8
   16206 
   16207 uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
   16208 #define vreinterpret_u64_s32
   16209 
   16210 uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
   16211 #define vreinterpret_u64_s16
   16212 
   16213 uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
   16214 #define vreinterpret_u64_s8
   16215 
   16216 uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
   16217 #define vreinterpret_u64_s64
   16218 
   16219 uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
   16220 #define vreinterpret_u64_f32
   16221 
   16222 uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
   16223 #define vreinterpret_u64_p16
   16224 
   16225 uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
   16226 #define vreinterpret_u64_p8
   16227 
   16228 uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
   16229 #define vreinterpretq_u64_u32
   16230 
   16231 uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
   16232 #define vreinterpretq_u64_u16
   16233 
   16234 uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
   16235 #define vreinterpretq_u64_u8
   16236 
   16237 uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
   16238 #define vreinterpretq_u64_s32
   16239 
   16240 uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
   16241 #define vreinterpretq_u64_s16
   16242 
   16243 uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
   16244 #define vreinterpretq_u64_s8
   16245 
   16246 uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
   16247 #define vreinterpretq_u64_s64
   16248 
   16249 uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
   16250 #define vreinterpretq_u64_f32(t) _M128i(t)
   16251 
   16252 uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
   16253 #define vreinterpretq_u64_p16
   16254 
   16255 uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
   16256 #define vreinterpretq_u64_p8
   16257 
   16258 int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
   16259 #define vreinterpret_s8_u32
   16260 
   16261 int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
   16262 #define vreinterpret_s8_u16
   16263 
   16264 int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
   16265 #define vreinterpret_s8_u8
   16266 
   16267 int8x8_t vreinterpret_s8_s32 (int32x2_t t);
   16268 #define vreinterpret_s8_s32
   16269 
   16270 int8x8_t vreinterpret_s8_s16 (int16x4_t t);
   16271 #define vreinterpret_s8_s16
   16272 
   16273 int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
   16274 #define vreinterpret_s8_u64
   16275 
   16276 int8x8_t vreinterpret_s8_s64 (int64x1_t t);
   16277 #define vreinterpret_s8_s64
   16278 
   16279 int8x8_t vreinterpret_s8_f32 (float32x2_t t);
   16280 #define vreinterpret_s8_f32
   16281 
   16282 int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
   16283 #define vreinterpret_s8_p16
   16284 
   16285 int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
   16286 #define vreinterpret_s8_p8
   16287 
   16288 int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
   16289 #define vreinterpretq_s8_u32
   16290 
   16291 int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
   16292 #define vreinterpretq_s8_u16
   16293 
   16294 int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
   16295 #define vreinterpretq_s8_u8
   16296 
   16297 int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
   16298 #define vreinterpretq_s8_s32
   16299 
   16300 int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
   16301 #define vreinterpretq_s8_s16
   16302 
   16303 int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
   16304 #define vreinterpretq_s8_u64
   16305 
   16306 int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
   16307 #define vreinterpretq_s8_s64
   16308 
   16309 int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
   16310 #define vreinterpretq_s8_f32(t) _M128i(t)
   16311 
   16312 int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
   16313 #define vreinterpretq_s8_p16
   16314 
   16315 int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
   16316 #define vreinterpretq_s8_p8
   16317 
   16318 int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
   16319 #define vreinterpret_s16_u32
   16320 
   16321 int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
   16322 #define vreinterpret_s16_u16
   16323 
   16324 int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
   16325 #define vreinterpret_s16_u8
   16326 
   16327 int16x4_t vreinterpret_s16_s32 (int32x2_t t);
   16328 #define vreinterpret_s16_s32
   16329 
   16330 int16x4_t vreinterpret_s16_s8 (int8x8_t t);
   16331 #define vreinterpret_s16_s8
   16332 
   16333 int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
   16334 #define vreinterpret_s16_u64
   16335 
   16336 int16x4_t vreinterpret_s16_s64 (int64x1_t t);
   16337 #define vreinterpret_s16_s64
   16338 
   16339 int16x4_t vreinterpret_s16_f32 (float32x2_t t);
   16340 #define vreinterpret_s16_f32
   16341 
   16342 
   16343 int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
   16344 #define vreinterpret_s16_p16
   16345 
   16346 int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
   16347 #define vreinterpret_s16_p8
   16348 
   16349 int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
   16350 #define vreinterpretq_s16_u32
   16351 
   16352 int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
   16353 #define vreinterpretq_s16_u16
   16354 
   16355 int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
   16356 #define vreinterpretq_s16_u8
   16357 
   16358 int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
   16359 #define vreinterpretq_s16_s32
   16360 
   16361 int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
   16362 #define vreinterpretq_s16_s8
   16363 
   16364 int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
   16365 #define vreinterpretq_s16_u64
   16366 
   16367 int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
   16368 #define vreinterpretq_s16_s64
   16369 
   16370 int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
   16371 #define vreinterpretq_s16_f32(t) _M128i(t)
   16372 
   16373 int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
   16374 #define vreinterpretq_s16_p16
   16375 
   16376 int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
   16377 #define vreinterpretq_s16_p8
   16378 
   16379 int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
   16380 #define vreinterpret_s32_u32
   16381 
   16382 int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
   16383 #define vreinterpret_s32_u16
   16384 
   16385 int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
   16386 #define vreinterpret_s32_u8
   16387 
   16388 int32x2_t vreinterpret_s32_s16 (int16x4_t t);
   16389 #define vreinterpret_s32_s16
   16390 
   16391 int32x2_t vreinterpret_s32_s8 (int8x8_t t);
   16392 #define vreinterpret_s32_s8
   16393 
   16394 int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
   16395 #define vreinterpret_s32_u64
   16396 
   16397 int32x2_t vreinterpret_s32_s64 (int64x1_t t);
   16398 #define vreinterpret_s32_s64
   16399 
   16400 int32x2_t vreinterpret_s32_f32 (float32x2_t t);
   16401 #define vreinterpret_s32_f32
   16402 
   16403 int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
   16404 #define vreinterpret_s32_p16
   16405 
   16406 int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
   16407 #define vreinterpret_s32_p8
   16408 
   16409 int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
   16410 #define vreinterpretq_s32_u32
   16411 
   16412 int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
   16413 #define vreinterpretq_s32_u16
   16414 
   16415 int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
   16416 #define vreinterpretq_s32_u8
   16417 
   16418 int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
   16419 #define vreinterpretq_s32_s16
   16420 
   16421 int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
   16422 #define vreinterpretq_s32_s8
   16423 
   16424 int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
   16425 #define vreinterpretq_s32_u64
   16426 
   16427 int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
   16428 #define vreinterpretq_s32_s64
   16429 
   16430 int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
   16431 #define vreinterpretq_s32_f32(t)  _mm_castps_si128(t) //(*(__m128i*)&(t))
   16432 
   16433 int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
   16434 #define vreinterpretq_s32_p16
   16435 
   16436 int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
   16437 #define vreinterpretq_s32_p8
   16438 
   16439 uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
   16440 #define vreinterpret_u8_u32
   16441 
   16442 uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
   16443 #define vreinterpret_u8_u16
   16444 
   16445 uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
   16446 #define vreinterpret_u8_s32
   16447 
   16448 uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
   16449 #define vreinterpret_u8_s16
   16450 
   16451 uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
   16452 #define vreinterpret_u8_s8
   16453 
   16454 uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
   16455 #define vreinterpret_u8_u64
   16456 
   16457 uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
   16458 #define vreinterpret_u8_s64
   16459 
   16460 uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
   16461 #define vreinterpret_u8_f32
   16462 
   16463 uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
   16464 #define vreinterpret_u8_p16
   16465 
   16466 uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
   16467 #define vreinterpret_u8_p8
   16468 
   16469 uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
   16470 #define vreinterpretq_u8_u32
   16471 
   16472 uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
   16473 #define vreinterpretq_u8_u16
   16474 
   16475 uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
   16476 #define vreinterpretq_u8_s32
   16477 
   16478 uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
   16479 #define vreinterpretq_u8_s16
   16480 
   16481 uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
   16482 #define vreinterpretq_u8_s8
   16483 
   16484 uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
   16485 #define vreinterpretq_u8_u64
   16486 
   16487 uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
   16488 #define vreinterpretq_u8_s64
   16489 
   16490 uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
   16491 #define vreinterpretq_u8_f32(t) _M128i(t)
   16492 
   16493 
   16494 uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
   16495 #define vreinterpretq_u8_p16
   16496 
   16497 uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
   16498 #define vreinterpretq_u8_p8
   16499 
   16500 uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
   16501 #define vreinterpret_u16_u32
   16502 
   16503 uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
   16504 #define vreinterpret_u16_u8
   16505 
   16506 uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
   16507 #define vreinterpret_u16_s32
   16508 
   16509 uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
   16510 #define vreinterpret_u16_s16
   16511 
   16512 uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
   16513 #define vreinterpret_u16_s8
   16514 
   16515 uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
   16516 #define vreinterpret_u16_u64
   16517 
   16518 uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
   16519 #define vreinterpret_u16_s64
   16520 
   16521 uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
   16522 #define vreinterpret_u16_f32
   16523 
   16524 uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
   16525 #define vreinterpret_u16_p16
   16526 
   16527 uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
   16528 #define vreinterpret_u16_p8
   16529 
   16530 uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
   16531 #define vreinterpretq_u16_u32
   16532 
   16533 uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
   16534 #define vreinterpretq_u16_u8
   16535 
   16536 uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
   16537 #define vreinterpretq_u16_s32
   16538 
   16539 uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
   16540 #define vreinterpretq_u16_s16
   16541 
   16542 uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
   16543 #define vreinterpretq_u16_s8
   16544 
   16545 uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
   16546 #define vreinterpretq_u16_u64
   16547 
   16548 uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
   16549 #define vreinterpretq_u16_s64
   16550 
   16551 uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
   16552 #define vreinterpretq_u16_f32(t) _M128i(t)
   16553 
   16554 uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
   16555 #define vreinterpretq_u16_p16
   16556 
   16557 uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
   16558 #define vreinterpretq_u16_p8
   16559 
   16560 uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
   16561 #define vreinterpret_u32_u16
   16562 
   16563 uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
   16564 #define vreinterpret_u32_u8
   16565 
   16566 uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
   16567 #define vreinterpret_u32_s32
   16568 
   16569 uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
   16570 #define vreinterpret_u32_s16
   16571 
   16572 uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
   16573 #define vreinterpret_u32_s8
   16574 
   16575 uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
   16576 #define vreinterpret_u32_u64
   16577 
   16578 uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
   16579 #define vreinterpret_u32_s64
   16580 
   16581 uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
   16582 #define vreinterpret_u32_f32
   16583 
   16584 uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
   16585 #define vreinterpret_u32_p16
   16586 
   16587 uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
   16588 #define vreinterpret_u32_p8
   16589 
   16590 uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
   16591 #define vreinterpretq_u32_u16
   16592 
   16593 uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
   16594 #define vreinterpretq_u32_u8
   16595 
   16596 uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
   16597 #define vreinterpretq_u32_s32
   16598 
   16599 uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
   16600 #define vreinterpretq_u32_s16
   16601 
   16602 uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
   16603 #define vreinterpretq_u32_s8
   16604 
   16605 uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
   16606 #define vreinterpretq_u32_u64
   16607 
   16608 uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
   16609 #define vreinterpretq_u32_s64
   16610 
   16611 uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
   16612 #define  vreinterpretq_u32_f32(t) _M128i(t)
   16613 
   16614 uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
   16615 #define vreinterpretq_u32_p16
   16616 
   16617 uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
   16618 #define vreinterpretq_u32_p8
   16619 
   16620 #endif /* NEON2SSE_H */
   16621