Home | History | Annotate | Download | only in include
      1 //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina (at) intel.com
      2 
      3 //*** Copyright (C) 2012-2014 Intel Corporation.  All rights reserved.
      4 
      5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      6 
      7 //By downloading, copying, installing or using the software you agree to this license.
      8 //If you do not agree to this license, do not download, install, copy or use the software.
      9 
     10 //                              License Agreement
     11 
     12 //Permission to use, copy, modify, and/or distribute this software for any
     13 //purpose with or without fee is hereby granted, provided that the above
     14 //copyright notice and this permission notice appear in all copies.
     15 
     16 //THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
     17 //REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
     18 //AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
     19 //INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
     20 //LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
     21 //OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
     22 //PERFORMANCE OF THIS SOFTWARE.
     23 
     24 //*****************************************************************************************
     25 // This file is intended to simplify ARM->IA32 porting
     26 // It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
     27 // and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below
     28 // MMX instruction set is not used due to performance overhead and the necessity to use the
     29 // EMMS instruction (_mm_empty())for mmx-x87 floating point switching
     30 //*****************************************************************************************
     31 
     32 //!!!!!!!  To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual.
     33 //!!!!!!!  Please pay attention at #define USE_SSSE3 and USE_SSE4 below - you need to define them for newest Intel platforms for
     34 //!!!!!!!  greater performance. It can be done by -mssse3 or -msse4.2 (which also implies -mssse3) compiler switch.
     35 
     36 #ifndef NEON2SSE_H
     37 #define NEON2SSE_H
     38 
     39 #ifndef USE_SSE4
     40     #if defined(__SSE4_2__)
     41         #define USE_SSE4
     42         #define USE_SSSE3
     43     #endif
     44 #endif
     45 
     46 #ifndef USE_SSSE3
     47     #if defined(__SSSE3__)
     48         #define USE_SSSE3
     49     #endif
     50 #endif
     51 
     52 #include <xmmintrin.h>     //SSE
     53 #include <emmintrin.h>     //SSE2
     54 #include <pmmintrin.h>     //SSE3
     55 
     56 #ifdef USE_SSSE3
     57     #include <tmmintrin.h>     //SSSE3
     58 #else
     59 # warning "Some functions require SSSE3 or higher."
     60 #endif
     61 
     62 #ifdef USE_SSE4
     63     #include <smmintrin.h>     //SSE4.1
     64     #include <nmmintrin.h>     //SSE4.2
     65 #endif
     66 
     67 /*********************************************************************************************************************/
     68 //    data types conversion
     69 /*********************************************************************************************************************/
     70 
     71 typedef __m128 float32x4_t;
     72 
     73 typedef __m128 float16x8_t;         //not supported by IA, for compartibility
     74 
     75 typedef __m128i int8x16_t;
     76 typedef __m128i int16x8_t;
     77 typedef __m128i int32x4_t;
     78 typedef __m128i int64x2_t;
     79 typedef __m128i uint8x16_t;
     80 typedef __m128i uint16x8_t;
     81 typedef __m128i uint32x4_t;
     82 typedef __m128i uint64x2_t;
     83 typedef __m128i poly8x16_t;
     84 typedef __m128i poly16x8_t;
     85 
     86 #if defined(_MSC_VER) && (_MSC_VER < 1300)
     87     typedef signed char int8_t;
     88     typedef unsigned char uint8_t;
     89     typedef signed short int16_t;
     90     typedef unsigned short uint16_t;
     91     typedef signed int int32_t;
     92     typedef unsigned int uint32_t;
     93     typedef signed long long int64_t;
     94     typedef unsigned long long uint64_t;
     95 #elif defined(_MSC_VER)
     96     typedef signed __int8 int8_t;
     97     typedef unsigned __int8 uint8_t;
     98     typedef signed __int16 int16_t;
     99     typedef unsigned __int16 uint16_t;
    100     typedef signed __int32 int32_t;
    101     typedef unsigned __int32 uint32_t;
    102 
    103 typedef signed long long int64_t;
    104 typedef unsigned long long uint64_t;
    105 #else
    106     #include <stdint.h>
    107     #include <limits.h>
    108 #endif
    109 #if defined(_MSC_VER)
    110 #define SINT_MIN     (-2147483647 - 1)    /* min signed int value */
    111 #define SINT_MAX       2147483647         /* max signed int value */
    112 #else
    113 #define SINT_MIN     INT_MIN              /* min signed int value */
    114 #define SINT_MAX     INT_MAX              /* max signed int value */
    115 #endif
    116 
    117 typedef   float float32_t;
    118 #if !defined(__clang__)
    119 typedef   float __fp16;
    120 #endif
    121 
    122 typedef  uint8_t poly8_t;
    123 typedef  uint16_t poly16_t;
    124 
    125 //MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type  as functions arguments resulting in
    126 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned.  To avoid it we need the special trick for functions that use these types
    127 
    128 //Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!!
    129 struct int8x16x2_t {
    130     int8x16_t val[2];
    131 };
    132 struct int16x8x2_t {
    133     int16x8_t val[2];
    134 };
    135 struct int32x4x2_t {
    136     int32x4_t val[2];
    137 };
    138 struct int64x2x2_t {
    139     int64x2_t val[2];
    140 };
    141 
    142 typedef struct int8x16x2_t int8x16x2_t;         //for C compilers to make them happy
    143 typedef struct int16x8x2_t int16x8x2_t;         //for C compilers to make them happy
    144 typedef struct int32x4x2_t int32x4x2_t;         //for C compilers to make them happy
    145 typedef struct int64x2x2_t int64x2x2_t;         //for C compilers to make them happy
    146 //to avoid pointers conversion
    147 typedef  int8x16x2_t int8x8x2_t;
    148 typedef  int16x8x2_t int16x4x2_t;
    149 typedef  int32x4x2_t int32x2x2_t;
    150 typedef  int64x2x2_t int64x1x2_t;
    151 
    152 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */
    153 typedef struct int8x16x2_t uint8x16x2_t;
    154 typedef struct int16x8x2_t uint16x8x2_t;
    155 typedef struct int32x4x2_t uint32x4x2_t;
    156 typedef struct int64x2x2_t uint64x2x2_t;
    157 typedef struct int8x16x2_t poly8x16x2_t;
    158 typedef struct int16x8x2_t poly16x8x2_t;
    159 
    160 typedef  int8x8x2_t uint8x8x2_t;
    161 typedef  int16x4x2_t uint16x4x2_t;
    162 typedef  int32x2x2_t uint32x2x2_t;
    163 typedef  int64x1x2_t uint64x1x2_t;
    164 typedef  int8x8x2_t poly8x8x2_t;
    165 typedef  int16x4x2_t poly16x4x2_t;
    166 
    167 //float
    168 struct float32x4x2_t {
    169     float32x4_t val[2];
    170 };
    171 struct float16x8x2_t {
    172     float16x8_t val[2];
    173 };
    174 typedef struct float32x4x2_t float32x4x2_t;         //for C compilers to make them happy
    175 typedef struct float16x8x2_t float16x8x2_t;         //for C compilers to make them happy
    176 typedef  float32x4x2_t float32x2x2_t;
    177 typedef  float16x8x2_t float16x4x2_t;
    178 
    179 //4
    180 struct int8x16x4_t {
    181     int8x16_t val[4];
    182 };
    183 struct int16x8x4_t {
    184     int16x8_t val[4];
    185 };
    186 struct int32x4x4_t {
    187     int32x4_t val[4];
    188 };
    189 struct int64x2x4_t {
    190     int64x2_t val[4];
    191 };
    192 
    193 typedef struct int8x16x4_t int8x16x4_t;         //for C compilers to make them happy
    194 typedef struct int16x8x4_t int16x8x4_t;         //for C compilers to make them happy
    195 typedef struct int32x4x4_t int32x4x4_t;         //for C compilers to make them happy
    196 typedef struct int64x2x4_t int64x2x4_t;         //for C compilers to make them happy
    197 typedef  int8x16x4_t int8x8x4_t;
    198 typedef  int16x8x4_t int16x4x4_t;
    199 typedef  int32x4x4_t int32x2x4_t;
    200 typedef  int64x2x4_t int64x1x4_t;
    201 
    202 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
    203 typedef int8x8x4_t uint8x8x4_t;
    204 typedef int16x4x4_t uint16x4x4_t;
    205 typedef int32x2x4_t uint32x2x4_t;
    206 typedef int64x1x4_t uint64x1x4_t;
    207 typedef uint8x8x4_t poly8x8x4_t;
    208 typedef uint16x4x4_t poly16x4x4_t;
    209 
    210 typedef struct int8x16x4_t uint8x16x4_t;
    211 typedef struct int16x8x4_t uint16x8x4_t;
    212 typedef struct int32x4x4_t uint32x4x4_t;
    213 typedef struct int64x2x4_t uint64x2x4_t;
    214 typedef struct int8x16x4_t poly8x16x4_t;
    215 typedef struct int16x8x4_t poly16x8x4_t;
    216 
    217 struct float32x4x4_t {
    218     float32x4_t val[4];
    219 };
    220 struct float16x8x4_t {
    221     float16x8_t val[4];
    222 };
    223 
    224 typedef struct float32x4x4_t float32x4x4_t;         //for C compilers to make them happy
    225 typedef struct float16x8x4_t float16x8x4_t;         //for C compilers to make them happy
    226 typedef  float32x4x4_t float32x2x4_t;
    227 typedef  float16x8x4_t float16x4x4_t;
    228 
    229 //3
    230 struct int16x8x3_t {
    231     int16x8_t val[3];
    232 };
    233 struct int32x4x3_t {
    234     int32x4_t val[3];
    235 };
    236 struct int64x2x3_t {
    237     int64x2_t val[3];
    238 };
    239 struct int8x16x3_t {
    240     int8x16_t val[3];
    241 };
    242 
    243 typedef struct int16x8x3_t int16x8x3_t;         //for C compilers to make them happy
    244 typedef struct int32x4x3_t int32x4x3_t;         //for C compilers to make them happy
    245 typedef struct int64x2x3_t int64x2x3_t;         //for C compilers to make them happy
    246 typedef struct int8x16x3_t int8x16x3_t;         //for C compilers to make them happy
    247 typedef  int16x8x3_t int16x4x3_t;
    248 typedef  int32x4x3_t int32x2x3_t;
    249 typedef  int64x2x3_t int64x1x3_t;
    250 typedef  int8x16x3_t int8x8x3_t;
    251 
    252 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/
    253 typedef struct int8x16x3_t uint8x16x3_t;
    254 typedef struct int16x8x3_t uint16x8x3_t;
    255 typedef struct int32x4x3_t uint32x4x3_t;
    256 typedef struct int64x2x3_t uint64x2x3_t;
    257 typedef struct int8x16x3_t poly8x16x3_t;
    258 typedef struct int16x8x3_t poly16x8x3_t;
    259 typedef int8x8x3_t uint8x8x3_t;
    260 typedef int16x4x3_t uint16x4x3_t;
    261 typedef int32x2x3_t uint32x2x3_t;
    262 typedef int64x1x3_t uint64x1x3_t;
    263 typedef int8x8x3_t poly8x8x3_t;
    264 typedef int16x4x3_t poly16x4x3_t;
    265 
    266 //float
    267 struct float32x4x3_t {
    268     float32x4_t val[3];
    269 };
    270 struct float16x8x3_t {
    271     float16x8_t val[3];
    272 };
    273 
    274 typedef struct float32x4x3_t float32x4x3_t;         //for C compilers to make them happy
    275 typedef struct float16x8x3_t float16x8x3_t;         //for C compilers to make them happy
    276 typedef  float32x4x3_t float32x2x3_t;
    277 typedef  float16x8x3_t float16x4x3_t;
    278 
    279 //****************************************************************************
    280 //****** Porting auxiliary macros ********************************************
    281 #define _M128i(a) (*(__m128i*)&(a))
    282 #define _M128d(a) (*(__m128d*)&(a))
    283 #define _M128(a) (*(__m128*)&(a))
    284 #define _Ui64(a) (*(uint64_t*)&(a))
    285 #define _UNSIGNED_T(a) u##a
    286 
    287 #define _SIGNBIT64 ((uint64_t)1 << 63)
    288 #define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
    289 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
    290 
    291 #define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
    292 #define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
    293 
    294 //***************  functions attributes  ********************************************
    295 //***********************************************************************************
    296 #ifdef __GNUC__
    297     #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
    298     #define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
    299     #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    300     #if _GCC_VERSION <  40500
    301         #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
    302     #else
    303         #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
    304     #endif
    305 #elif defined(_MSC_VER)|| defined (__INTEL_COMPILER)
    306     #define _NEON2SSE_ALIGN_16  __declspec(align(16))
    307     #define _NEON2SSE_INLINE __inline
    308     #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
    309 #else
    310     #define _NEON2SSE_ALIGN_16  __declspec(align(16))
    311     #define _NEON2SSE_INLINE inline
    312     #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
    313 #endif
    314 
    315 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    316 #define __constrange(min,max)  const
    317 #define __transfersize(size)
    318 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    319 
    320 //*************************************************************************
    321 //*************************************************************************
    322 //*********  Functions declarations as declared in original arm_neon.h *****
    323 //*************************************************************************
    324 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
    325 
    326 int8x16_t vaddq_s8(int8x16_t a, int8x16_t b);         // VADD.I8 q0,q0,q0
    327 int16x8_t vaddq_s16(int16x8_t a, int16x8_t b);         // VADD.I16 q0,q0,q0
    328 int32x4_t vaddq_s32(int32x4_t a, int32x4_t b);         // VADD.I32 q0,q0,q0
    329 int64x2_t vaddq_s64(int64x2_t a, int64x2_t b);         // VADD.I64 q0,q0,q0
    330 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b);         // VADD.F32 q0,q0,q0
    331 uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b);         // VADD.I8 q0,q0,q0
    332 uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b);         // VADD.I16 q0,q0,q0
    333 uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b);         // VADD.I32 q0,q0,q0
    334 uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b);         // VADD.I64 q0,q0,q0
    335 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
    336 
    337 //Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i]
    338 
    339 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
    340 
    341 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b);         // VHADD.S8 q0,q0,q0
    342 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b);         // VHADD.S16 q0,q0,q0
    343 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b);         // VHADD.S32 q0,q0,q0
    344 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b);         // VHADD.U8 q0,q0,q0
    345 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b);         // VHADD.U16 q0,q0,q0
    346 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b);         // VHADD.U32 q0,q0,q0
    347 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
    348 
    349 int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b);         // VRHADD.S8 q0,q0,q0
    350 int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b);         // VRHADD.S16 q0,q0,q0
    351 int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b);         // VRHADD.S32 q0,q0,q0
    352 uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b);         // VRHADD.U8 q0,q0,q0
    353 uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b);         // VRHADD.U16 q0,q0,q0
    354 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b);         // VRHADD.U32 q0,q0,q0
    355 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
    356 
    357 int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b);         // VQADD.S8 q0,q0,q0
    358 int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b);         // VQADD.S16 q0,q0,q0
    359 int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b);         // VQADD.S32 q0,q0,q0
    360 int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b);         // VQADD.S64 q0,q0,q0
    361 uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b);         // VQADD.U8 q0,q0,q0
    362 uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b);         // VQADD.U16 q0,q0,q0
    363 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b);         // VQADD.U32 q0,q0,q0
    364 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b);         // VQADD.U64 q0,q0,q0
    365 //Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i]
    366 
    367 //Vector rounding add high half: vraddhn
    368 
    369 //Multiplication
    370 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
    371 
    372 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b);         // VMUL.I8 q0,q0,q0
    373 int16x8_t vmulq_s16(int16x8_t a, int16x8_t b);         // VMUL.I16 q0,q0,q0
    374 int32x4_t vmulq_s32(int32x4_t a, int32x4_t b);         // VMUL.I32 q0,q0,q0
    375 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b);         // VMUL.F32 q0,q0,q0
    376 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b);         // VMUL.I8 q0,q0,q0
    377 uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b);         // VMUL.I16 q0,q0,q0
    378 uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b);         // VMUL.I32 q0,q0,q0
    379 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b);         // VMUL.P8 q0,q0,q0
    380 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
    381 
    382 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLA.I8 q0,q0,q0
    383 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLA.I16 q0,q0,q0
    384 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLA.I32 q0,q0,q0
    385 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLA.F32 q0,q0,q0
    386 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLA.I8 q0,q0,q0
    387 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLA.I16 q0,q0,q0
    388 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLA.I32 q0,q0,q0
    389 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
    390 
    391 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
    392 
    393 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLS.I8 q0,q0,q0
    394 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLS.I16 q0,q0,q0
    395 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLS.I32 q0,q0,q0
    396 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLS.F32 q0,q0,q0
    397 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLS.I8 q0,q0,q0
    398 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLS.I16 q0,q0,q0
    399 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLS.I32 q0,q0,q0
    400 //Vector multiply subtract long
    401 
    402 //Vector saturating doubling multiply high
    403 
    404 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b);         // VQDMULH.S16 q0,q0,q0
    405 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b);         // VQDMULH.S32 q0,q0,q0
    406 //Vector saturating rounding doubling multiply high
    407 
    408 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b);         // VQRDMULH.S16 q0,q0,q0
    409 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b);         // VQRDMULH.S32 q0,q0,q0
    410 //Vector saturating doubling multiply accumulate long
    411 
    412 //Vector saturating doubling multiply subtract long
    413 
    414 //Vector long multiply
    415 
    416 //Vector saturating doubling long multiply
    417 
    418 //Subtraction
    419 //Vector subtract
    420 
    421 int8x16_t vsubq_s8(int8x16_t a, int8x16_t b);         // VSUB.I8 q0,q0,q0
    422 int16x8_t vsubq_s16(int16x8_t a, int16x8_t b);         // VSUB.I16 q0,q0,q0
    423 int32x4_t vsubq_s32(int32x4_t a, int32x4_t b);         // VSUB.I32 q0,q0,q0
    424 int64x2_t vsubq_s64(int64x2_t a, int64x2_t b);         // VSUB.I64 q0,q0,q0
    425 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b);         // VSUB.F32 q0,q0,q0
    426 uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b);         // VSUB.I8 q0,q0,q0
    427 uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b);         // VSUB.I16 q0,q0,q0
    428 uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b);         // VSUB.I32 q0,q0,q0
    429 uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b);         // VSUB.I64 q0,q0,q0
    430 //Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
    431 
    432 //Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i]
    433 
    434 //Vector saturating subtract
    435 
    436 int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b);         // VQSUB.S8 q0,q0,q0
    437 int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b);         // VQSUB.S16 q0,q0,q0
    438 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b);         // VQSUB.S32 q0,q0,q0
    439 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b);         // VQSUB.S64 q0,q0,q0
    440 uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b);         // VQSUB.U8 q0,q0,q0
    441 uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b);         // VQSUB.U16 q0,q0,q0
    442 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b);         // VQSUB.U32 q0,q0,q0
    443 uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b);         // VQSUB.U64 q0,q0,q0
    444 //Vector halving subtract
    445 
    446 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b);         // VHSUB.S8 q0,q0,q0
    447 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b);         // VHSUB.S16 q0,q0,q0
    448 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b);         // VHSUB.S32 q0,q0,q0
    449 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b);         // VHSUB.U8 q0,q0,q0
    450 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b);         // VHSUB.U16 q0,q0,q0
    451 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b);         // VHSUB.U32 q0,q0,q0
    452 //Vector subtract high half
    453 
    454 //Vector rounding subtract high half
    455 
    456 //Comparison
    457 //Vector compare equal
    458 
    459 uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b);         // VCEQ.I8 q0, q0, q0
    460 uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b);         // VCEQ.I16 q0, q0, q0
    461 uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b);         // VCEQ.I32 q0, q0, q0
    462 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b);         // VCEQ.F32 q0, q0, q0
    463 uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b);         // VCEQ.I8 q0, q0, q0
    464 uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b);         // VCEQ.I16 q0, q0, q0
    465 uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b);         // VCEQ.I32 q0, q0, q0
    466 uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b);         // VCEQ.I8 q0, q0, q0
    467 //Vector compare greater-than or equal
    468 
    469 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
    470 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
    471 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
    472 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
    473 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
    474 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.U16 q0, q0, q0
    475 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
    476 //Vector compare less-than or equal
    477 
    478 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
    479 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
    480 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
    481 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
    482 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
    483 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.U16 q0, q0, q0
    484 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
    485 //Vector compare greater-than
    486 
    487 uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
    488 uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
    489 uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
    490 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
    491 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
    492 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.U16 q0, q0, q0
    493 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
    494 //Vector compare less-than
    495 
    496 uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
    497 uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
    498 uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
    499 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
    500 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
    501 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.U16 q0, q0, q0
    502 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
    503 //Vector compare absolute greater-than or equal
    504 
    505 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
    506 //Vector compare absolute less-than or equal
    507 
    508 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
    509 //Vector compare absolute greater-than
    510 
    511 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
    512 //Vector compare absolute less-than
    513 
    514 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
    515 //Vector test bits
    516 
    517 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b);         // VTST.8 q0, q0, q0
    518 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b);         // VTST.16 q0, q0, q0
    519 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b);         // VTST.32 q0, q0, q0
    520 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b);         // VTST.8 q0, q0, q0
    521 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b);         // VTST.16 q0, q0, q0
    522 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b);         // VTST.32 q0, q0, q0
    523 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b);         // VTST.8 q0, q0, q0
    524 //Absolute difference
    525 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
    526 
    527 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b);         // VABD.S8 q0,q0,q0
    528 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b);         // VABD.S16 q0,q0,q0
    529 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b);         // VABD.S32 q0,q0,q0
    530 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b);         // VABD.U8 q0,q0,q0
    531 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b);         // VABD.U16 q0,q0,q0
    532 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b);         // VABD.U32 q0,q0,q0
    533 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b);         // VABD.F32 q0,q0,q0
    534 //Absolute difference - long
    535 
    536 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
    537 
    538 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VABA.S8 q0,q0,q0
    539 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VABA.S16 q0,q0,q0
    540 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VABA.S32 q0,q0,q0
    541 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VABA.U8 q0,q0,q0
    542 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VABA.U16 q0,q0,q0
    543 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VABA.U32 q0,q0,q0
    544 //Absolute difference and accumulate - long
    545 
    546 //Max/Min
    547 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
    548 
    549 int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b);         // VMAX.S8 q0,q0,q0
    550 int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b);         // VMAX.S16 q0,q0,q0
    551 int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b);         // VMAX.S32 q0,q0,q0
    552 uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b);         // VMAX.U8 q0,q0,q0
    553 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b);         // VMAX.U16 q0,q0,q0
    554 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b);         // VMAX.U32 q0,q0,q0
    555 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b);         // VMAX.F32 q0,q0,q0
    556 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
    557 
    558 int8x16_t vminq_s8(int8x16_t a, int8x16_t b);         // VMIN.S8 q0,q0,q0
    559 int16x8_t vminq_s16(int16x8_t a, int16x8_t b);         // VMIN.S16 q0,q0,q0
    560 int32x4_t vminq_s32(int32x4_t a, int32x4_t b);         // VMIN.S32 q0,q0,q0
    561 uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b);         // VMIN.U8 q0,q0,q0
    562 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b);         // VMIN.U16 q0,q0,q0
    563 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b);         // VMIN.U32 q0,q0,q0
    564 float32x4_t vminq_f32(float32x4_t a, float32x4_t b);         // VMIN.F32 q0,q0,q0
    565 //Pairwise addition
    566 //Pairwise add
    567 
    568 //Long pairwise add
    569 
    570 int16x8_t vpaddlq_s8(int8x16_t a);         // VPADDL.S8 q0,q0
    571 int32x4_t vpaddlq_s16(int16x8_t a);         // VPADDL.S16 q0,q0
    572 int64x2_t vpaddlq_s32(int32x4_t a);         // VPADDL.S32 q0,q0
    573 uint16x8_t vpaddlq_u8(uint8x16_t a);         // VPADDL.U8 q0,q0
    574 uint32x4_t vpaddlq_u16(uint16x8_t a);         // VPADDL.U16 q0,q0
    575 uint64x2_t vpaddlq_u32(uint32x4_t a);         // VPADDL.U32 q0,q0
    576 //Long pairwise add and accumulate
    577 
    578 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b);         // VPADAL.S8 q0,q0
    579 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b);         // VPADAL.S16 q0,q0
    580 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b);         // VPADAL.S32 q0,q0
    581 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b);         // VPADAL.U8 q0,q0
    582 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b);         // VPADAL.U16 q0,q0
    583 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b);         // VPADAL.U32 q0,q0
    584 //Folding maximum vpmax -> takes maximum of adjacent pairs
    585 
    586 //Folding minimum vpmin -> takes minimum of adjacent pairs
    587 
    588 //Reciprocal/Sqrt
    589 
    590 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b);         // VRECPS.F32 q0, q0, q0
    591 
    592 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b);         // VRSQRTS.F32 q0, q0, q0
    593 //Shifts by signed variable
    594 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
    595 
    596 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b);         // VSHL.S8 q0,q0,q0
    597 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b);         // VSHL.S16 q0,q0,q0
    598 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b);         // VSHL.S32 q0,q0,q0
    599 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b);         // VSHL.S64 q0,q0,q0
    600 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b);         // VSHL.U8 q0,q0,q0
    601 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b);         // VSHL.U16 q0,q0,q0
    602 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b);         // VSHL.U32 q0,q0,q0
    603 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b);         // VSHL.U64 q0,q0,q0
    604 //Vector saturating shift left: (negative values shift right)
    605 
    606 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b);         // VQSHL.S8 q0,q0,q0
    607 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b);         // VQSHL.S16 q0,q0,q0
    608 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b);         // VQSHL.S32 q0,q0,q0
    609 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b);         // VQSHL.S64 q0,q0,q0
    610 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b);         // VQSHL.U8 q0,q0,q0
    611 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b);         // VQSHL.U16 q0,q0,q0
    612 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b);         // VQSHL.U32 q0,q0,q0
    613 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b);         // VQSHL.U64 q0,q0,q0
    614 //Vector rounding shift left: (negative values shift right)
    615 
    616 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b);         // VRSHL.S8 q0,q0,q0
    617 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b);         // VRSHL.S16 q0,q0,q0
    618 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b);         // VRSHL.S32 q0,q0,q0
    619 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b);         // VRSHL.S64 q0,q0,q0
    620 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b);         // VRSHL.U8 q0,q0,q0
    621 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b);         // VRSHL.U16 q0,q0,q0
    622 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b);         // VRSHL.U32 q0,q0,q0
    623 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b);         // VRSHL.U64 q0,q0,q0
    624 //Vector saturating rounding shift left: (negative values shift right)
    625 
    626 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b);         // VQRSHL.S8 q0,q0,q0
    627 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b);         // VQRSHL.S16 q0,q0,q0
    628 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b);         // VQRSHL.S32 q0,q0,q0
    629 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b);         // VQRSHL.S64 q0,q0,q0
    630 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b);         // VQRSHL.U8 q0,q0,q0
    631 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b);         // VQRSHL.U16 q0,q0,q0
    632 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b);         // VQRSHL.U32 q0,q0,q0
    633 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);         // VQRSHL.U64 q0,q0,q0
    634 //Shifts by a constant
    635 //Vector shift right by constant
    636 
    637 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VSHR.S8 q0,q0,#8
    638 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VSHR.S16 q0,q0,#16
    639 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VSHR.S32 q0,q0,#32
    640 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VSHR.S64 q0,q0,#64
    641 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VSHR.U8 q0,q0,#8
    642 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VSHR.U16 q0,q0,#16
    643 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VSHR.U32 q0,q0,#32
    644 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VSHR.U64 q0,q0,#64
    645 //Vector shift left by constant
    646 
    647 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
    648 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
    649 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
    650 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
    651 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
    652 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
    653 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
    654 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
    655 //Vector rounding shift right by constant
    656 
    657 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VRSHR.S8 q0,q0,#8
    658 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VRSHR.S16 q0,q0,#16
    659 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VRSHR.S32 q0,q0,#32
    660 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VRSHR.S64 q0,q0,#64
    661 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VRSHR.U8 q0,q0,#8
    662 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VRSHR.U16 q0,q0,#16
    663 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VRSHR.U32 q0,q0,#32
    664 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VRSHR.U64 q0,q0,#64
    665 //Vector shift right by constant and accumulate
    666 
    667 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRA.S8 q0,q0,#8
    668 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRA.S16 q0,q0,#16
    669 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRA.S32 q0,q0,#32
    670 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRA.S64 q0,q0,#64
    671 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRA.U8 q0,q0,#8
    672 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRA.U16 q0,q0,#16
    673 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRA.U32 q0,q0,#32
    674 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRA.U64 q0,q0,#64
    675 //Vector rounding shift right by constant and accumulate
    676 
    677 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VRSRA.S8 q0,q0,#8
    678 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VRSRA.S16 q0,q0,#16
    679 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VRSRA.S32 q0,q0,#32
    680 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VRSRA.S64 q0,q0,#64
    681 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VRSRA.U8 q0,q0,#8
    682 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VRSRA.U16 q0,q0,#16
    683 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VRSRA.U32 q0,q0,#32
    684 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VRSRA.U64 q0,q0,#64
    685 //Vector saturating shift left by constant
    686 
    687 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHL.S8 q0,q0,#0
    688 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHL.S16 q0,q0,#0
    689 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHL.S32 q0,q0,#0
    690 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHL.S64 q0,q0,#0
    691 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VQSHL.U8 q0,q0,#0
    692 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VQSHL.U16 q0,q0,#0
    693 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VQSHL.U32 q0,q0,#0
    694 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VQSHL.U64 q0,q0,#0
    695 //Vector signed->unsigned saturating shift left by constant
    696 
    697 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHLU.S8 q0,q0,#0
    698 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHLU.S16 q0,q0,#0
    699 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHLU.S32 q0,q0,#0
    700 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHLU.S64 q0,q0,#0
    701 //Vector narrowing shift right by constant
    702 
    703 //Vector signed->unsigned narrowing saturating shift right by constant
    704 
    705 //Vector signed->unsigned rounding narrowing saturating shift right by constant
    706 
    707 //Vector narrowing saturating shift right by constant
    708 
    709 //Vector rounding narrowing shift right by constant
    710 
    711 //Vector rounding narrowing saturating shift right by constant
    712 
    713 //Vector widening shift left by constant
    714 
    715 //Shifts with insert
    716 //Vector shift right and insert
    717 
    718 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
    719 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
    720 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
    721 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
    722 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
    723 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
    724 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
    725 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
    726 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
    727 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
    728 //Vector shift left and insert
    729 
    730 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
    731 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
    732 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
    733 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
    734 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
    735 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
    736 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
    737 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
    738 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
    739 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
    740 //Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
    741 //Load a single vector from memory
    742 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
    743 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
    744 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
    745 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
    746 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
    747 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
    748 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
    749 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
    750 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr);         // VLD1.16 {d0, d1}, [r0]
    751 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
    752 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
    753 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
    754 
    755 //Load a single lane from memory
    756 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
    757 uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
    758 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
    759 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
    760 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
    761 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane);         //VLD1.16 {d0[0]}, [r0]
    762 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane);         //VLD1.32 {d0[0]}, [r0]
    763 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
    764 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane);         //VLD1.64 {d0}, [r0]
    765 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane);         //VLD1.8 {d0[0]}, [r0]
    766 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
    767 
    768 //Load all lanes of vector with same value from memory
    769 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
    770 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
    771 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
    772 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr);         // VLD1.64 {d0}, [r0]
    773 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
    774 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
    775 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
    776 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr);         // VLD1.64 {d0}, [r0]
    777 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr);         // VLD1.16 {d0[]}, [r0]
    778 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
    779 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
    780 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
    781 
    782 //Store a single vector or lane. Stores all lanes or a single lane of a vector.
    783 //Store a single vector into memory
    784 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val);         // VST1.8 {d0, d1}, [r0]
    785 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val);         // VST1.16 {d0, d1}, [r0]
    786 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val);         // VST1.32 {d0, d1}, [r0]
    787 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val);         // VST1.64 {d0, d1}, [r0]
    788 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val);         // VST1.8 {d0, d1}, [r0]
    789 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val);         // VST1.16 {d0, d1}, [r0]
    790 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val);         // VST1.32 {d0, d1}, [r0]
    791 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val);         // VST1.64 {d0, d1}, [r0]
    792 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val);         // VST1.16 {d0, d1}, [r0]
    793 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val);         // VST1.32 {d0, d1}, [r0]
    794 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val);         // VST1.8 {d0, d1}, [r0]
    795 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val);         // VST1.16 {d0, d1}, [r0]
    796 
    797 //Store a lane of a vector into memory
    798 //Loads of an N-element structure
    799 //Load N-element structure from memory
    800 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
    801 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
    802 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
    803 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
    804 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
    805 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
    806 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr);         // VLD2.16 {d0, d2}, [r0]
    807 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
    808 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
    809 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
    810 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
    811 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
    812 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
    813 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
    814 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
    815 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
    816 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
    817 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
    818 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
    819 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
    820 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
    821 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
    822 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
    823 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
    824 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
    825 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
    826 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
    827 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
    828 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
    829 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
    830 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
    831 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
    832 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
    833 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
    834 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
    835 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
    836 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
    837 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
    838 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
    839 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
    840 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
    841 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
    842 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
    843 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
    844 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
    845 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
    846 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
    847 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
    848 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
    849 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
    850 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
    851 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
    852 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
    853 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
    854 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
    855 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
    856 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
    857 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
    858 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
    859 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
    860 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
    861 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
    862 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
    863 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
    864 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
    865 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
    866 //Load all lanes of N-element structure with same value from memory
    867 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
    868 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
    869 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
    870 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
    871 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
    872 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
    873 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
    874 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
    875 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
    876 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
    877 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
    878 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
    879 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
    880 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
    881 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
    882 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
    883 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
    884 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
    885 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
    886 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
    887 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
    888 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
    889 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
    890 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
    891 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
    892 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
    893 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
    894 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
    895 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
    896 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
    897 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
    898 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
    899 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
    900 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
    901 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
    902 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
    903 //Load a single lane of N-element structure from memory
    904 //the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
    905 uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
    906 uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
    907 int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
    908 int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
    909 float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
    910 float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane);         // VLD2.32 {d0[0], d2[0]}, [r0]
    911 poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane);         // VLD2.16 {d0[0], d2[0]}, [r0]
    912 uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
    913 uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
    914 uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
    915 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
    916 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane);         //VLD2.16 {d0[0], d1[0]}, [r0]
    917 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane);         //VLD2.32 {d0[0], d1[0]}, [r0]
    918 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
    919 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
    920 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane);         //VLD2.8 {d0[0], d1[0]}, [r0]
    921 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
    922 uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
    923 uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
    924 int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
    925 int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
    926 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
    927 float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane);         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
    928 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
    929 uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
    930 uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
    931 uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
    932 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
    933 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane);         //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
    934 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane);         //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
    935 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
    936 float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
    937 poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane);         //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
    938 poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
    939 uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    940 uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    941 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    942 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    943 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    944 float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    945 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
    946 uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    947 uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    948 uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane);         // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    949 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    950 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);         //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    951 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);         //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    952 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    953 float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane);         // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    954 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);         //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    955 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);         // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
    956 //Store N-element structure to memory
    957 void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
    958 void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
    959 void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
    960 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
    961 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
    962 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
    963 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
    964 void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val);         // VST2.32 {d0, d2}, [r0]
    965 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);         // VST2.8 {d0, d2}, [r0]
    966 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);         // VST2.16 {d0, d2}, [r0]
    967 void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
    968 void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
    969 void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
    970 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);         // VST1.64 {d0, d1}, [r0]
    971 void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
    972 void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
    973 void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
    974 void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val);         // VST1.64 {d0, d1}, [r0]
    975 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
    976 void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
    977 void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val);         // VST2.8 {d0, d1}, [r0]
    978 void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val);         // VST2.16 {d0, d1}, [r0]
    979 void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
    980 void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
    981 void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
    982 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
    983 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
    984 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
    985 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
    986 void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val);         // VST3.32 {d0, d2, d4}, [r0]
    987 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);         // VST3.8 {d0, d2, d4}, [r0]
    988 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);         // VST3.16 {d0, d2, d4}, [r0]
    989 void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
    990 void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
    991 void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
    992 void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val);         // VST1.64 {d0, d1, d2}, [r0]
    993 void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
    994 void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
    995 void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
    996 void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val);         // VST1.64 {d0, d1, d2}, [r0]
    997 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
    998 void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val);         // VST3.32 {d0, d1, d2}, [r0]
    999 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);         // VST3.8 {d0, d1, d2}, [r0]
   1000 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
   1001 void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
   1002 void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
   1003 void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
   1004 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
   1005 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
   1006 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
   1007 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
   1008 void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val);         // VST4.32 {d0, d2, d4, d6}, [r0]
   1009 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);         // VST4.8 {d0, d2, d4, d6}, [r0]
   1010 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);         // VST4.16 {d0, d2, d4, d6}, [r0]
   1011 void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
   1012 void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
   1013 void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
   1014 void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val);         // VST1.64 {d0, d1, d2, d3}, [r0]
   1015 void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
   1016 void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
   1017 void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
   1018 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);         // VST1.64 {d0, d1, d2, d3}, [r0]
   1019 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
   1020 void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val);         // VST4.32 {d0, d1, d2, d3}, [r0]
   1021 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);         // VST4.8 {d0, d1, d2, d3}, [r0]
   1022 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);         // VST4.16 {d0, d1, d2, d3}, [r0]
   1023 //Store a single lane of N-element structure to memory
   1024 void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
   1025 void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane);         // VST2.32{d0[0], d2[0]}, [r0]
   1026 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
   1027 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);         // VST2.32{d0[0], d2[0]}, [r0]
   1028 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
   1029 void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane);         //VST2.32 {d0[0], d2[0]}, [r0]
   1030 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);         // VST2.16{d0[0], d2[0]}, [r0]
   1031 void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8{d0[0], d1[0]}, [r0]
   1032 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
   1033 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
   1034 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8 {d0[0],d1[0]}, [r0]
   1035 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
   1036 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
   1037 void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
   1038 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32{d0[0], d1[0]}, [r0]
   1039 void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane);         // VST2.8{d0[0], d1[0]}, [r0]
   1040 void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16{d0[0], d1[0]}, [r0]
   1041 void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1042 void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane);         // VST3.32{d0[0], d2[0], d4[0]}, [r0]
   1043 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1044 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);         // VST3.32{d0[0], d2[0], d4[0]}, [r0]
   1045 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1046 void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane);         //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   1047 void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);         // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1048 void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8{d0[0], d1[0], d2[0]}, [r0]
   1049 void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1050 void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1051 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
   1052 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1053 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1054 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1055 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);         // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1056 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);         // VST3.8{d0[0], d1[0], d2[0]}, [r0]
   1057 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);         // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1058 void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1059 void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane);         // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1060 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1061 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);         // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1062 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1063 void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane);         //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1064 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);         // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1065 void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1066 void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1067 void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1068 void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
   1069 void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1070 void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1071 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1072 void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane);         // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1073 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);         // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1074 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);         // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1075 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
   1076 
   1077 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
   1078 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.U16 r0, d0[0]
   1079 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
   1080 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane);         // VMOV.S8 r0, d0[0]
   1081 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane);         // VMOV.S16 r0, d0[0]
   1082 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
   1083 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
   1084 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.U16 r0, d0[0]
   1085 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
   1086 
   1087 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
   1088 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
   1089 //Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector.
   1090 
   1091 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
   1092 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
   1093 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
   1094 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
   1095 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
   1096 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
   1097 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
   1098 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
   1099 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
   1100 
   1101 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
   1102 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
   1103 //Initialize a vector from a literal bit pattern.
   1104 
   1105 //Set all lanes to same value
   1106 //Load all lanes of vector to the same literal value
   1107 
   1108 uint8x16_t vdupq_n_u8(uint8_t value);         // VDUP.8 q0,r0
   1109 uint16x8_t vdupq_n_u16(uint16_t value);         // VDUP.16 q0,r0
   1110 uint32x4_t vdupq_n_u32(uint32_t value);         // VDUP.32 q0,r0
   1111 int8x16_t vdupq_n_s8(int8_t value);         // VDUP.8 q0,r0
   1112 int16x8_t vdupq_n_s16(int16_t value);         // VDUP.16 q0,r0
   1113 int32x4_t vdupq_n_s32(int32_t value);         // VDUP.32 q0,r0
   1114 poly8x16_t vdupq_n_p8(poly8_t value);         // VDUP.8 q0,r0
   1115 poly16x8_t vdupq_n_p16(poly16_t value);         // VDUP.16 q0,r0
   1116 float32x4_t vdupq_n_f32(float32_t value);         // VDUP.32 q0,r0
   1117 
   1118 int64x2_t vdupq_n_s64(int64_t value);         // VMOV d0,r0,r0
   1119 uint64x2_t vdupq_n_u64(uint64_t value);         // VMOV d0,r0,r0
   1120 
   1121 uint8x16_t vmovq_n_u8(uint8_t value);         // VDUP.8 q0,r0
   1122 uint16x8_t vmovq_n_u16(uint16_t value);         // VDUP.16 q0,r0
   1123 uint32x4_t vmovq_n_u32(uint32_t value);         // VDUP.32 q0,r0
   1124 int8x16_t vmovq_n_s8(int8_t value);         // VDUP.8 q0,r0
   1125 int16x8_t vmovq_n_s16(int16_t value);         // VDUP.16 q0,r0
   1126 int32x4_t vmovq_n_s32(int32_t value);         // VDUP.32 q0,r0
   1127 poly8x16_t vmovq_n_p8(poly8_t value);         // VDUP.8 q0,r0
   1128 poly16x8_t vmovq_n_p16(poly16_t value);         // VDUP.16 q0,r0
   1129 float32x4_t vmovq_n_f32(float32_t value);         // VDUP.32 q0,r0
   1130 
   1131 int64x2_t vmovq_n_s64(int64_t value);         // VMOV d0,r0,r0
   1132 uint64x2_t vmovq_n_u64(uint64_t value);         // VMOV d0,r0,r0
   1133 //Load all lanes of the vector to the value of a lane of a vector
   1134 
   1135 //Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector.
   1136 
   1137 //Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors
   1138 
   1139 //Converting vectors. These intrinsics are used to convert vectors.
   1140 //Convert from float
   1141 
   1142 int32x4_t vcvtq_s32_f32(float32x4_t a);         // VCVT.S32.F32 q0, q0
   1143 uint32x4_t vcvtq_u32_f32(float32x4_t a);         // VCVT.U32.F32 q0, q0
   1144 
   1145 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.S32.F32 q0, q0, #32
   1146 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.U32.F32 q0, q0, #32
   1147 //Convert to float
   1148 
   1149 float32x4_t vcvtq_f32_s32(int32x4_t a);         // VCVT.F32.S32 q0, q0
   1150 float32x4_t vcvtq_f32_u32(uint32x4_t a);         // VCVT.F32.U32 q0, q0
   1151 
   1152 float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b);         // VCVT.F32.S32 q0, q0, #32
   1153 float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b);         // VCVT.F32.U32 q0, q0, #32
   1154 //Convert between floats
   1155 
   1156 //Vector narrow integer
   1157 
   1158 //Vector long move
   1159 
   1160 //Vector saturating narrow integer
   1161 
   1162 //Vector saturating narrow integer signed->unsigned
   1163 
   1164 //Table look up
   1165 
   1166 //Extended table look up intrinsics
   1167 
   1168 //Operations with a scalar value
   1169 //Vector multiply accumulate with scalar
   1170 
   1171 //Vector widening multiply accumulate with scalar
   1172 
   1173 //Vector widening saturating doubling multiply accumulate with scalar
   1174 
   1175 //Vector multiply subtract with scalar
   1176 
   1177 //Vector widening multiply subtract with scalar
   1178 
   1179 //Vector widening saturating doubling multiply subtract with scalar
   1180 
   1181 //Vector multiply by scalar
   1182 
   1183 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b);         // VMUL.I16 q0,q0,d0[0]
   1184 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b);         // VMUL.I32 q0,q0,d0[0]
   1185 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);         // VMUL.F32 q0,q0,d0[0]
   1186 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b);         // VMUL.I16 q0,q0,d0[0]
   1187 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b);         // VMUL.I32 q0,q0,d0[0]
   1188 //Vector long multiply with scalar
   1189 
   1190 //Vector long multiply by scalar
   1191 
   1192 //Vector saturating doubling long multiply with scalar
   1193 
   1194 //Vector saturating doubling long multiply by scalar
   1195 
   1196 //Vector saturating doubling multiply high with scalar
   1197 
   1198 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQDMULH.S16 q0,q0,d0[0]
   1199 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQDMULH.S32 q0,q0,d0[0]
   1200 //Vector saturating doubling multiply high by scalar
   1201 
   1202 //Vector saturating rounding doubling multiply high with scalar
   1203 
   1204 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQRDMULH.S16 q0,q0,d0[0]
   1205 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQRDMULH.S32 q0,q0,d0[0]
   1206 //Vector rounding saturating doubling multiply high by scalar
   1207 
   1208 //Vector multiply accumulate with scalar
   1209 
   1210 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLA.I16 q0, q0, d0[0]
   1211 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLA.I32 q0, q0, d0[0]
   1212 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLA.I16 q0, q0, d0[0]
   1213 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLA.I32 q0, q0, d0[0]
   1214 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLA.F32 q0, q0, d0[0]
   1215 //Vector widening multiply accumulate with scalar
   1216 
   1217 //Vector widening saturating doubling multiply accumulate with scalar
   1218 
   1219 //Vector multiply subtract with scalar
   1220 
   1221 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLS.I16 q0, q0, d0[0]
   1222 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLS.I32 q0, q0, d0[0]
   1223 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLS.I16 q0, q0, d0[0]
   1224 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLS.I32 q0, q0, d0[0]
   1225 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLS.F32 q0, q0, d0[0]
   1226 //Vector widening multiply subtract with scalar
   1227 
   1228 //Vector widening saturating doubling multiply subtract with scalar
   1229 
   1230 //Vector extract
   1231 
   1232 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
   1233 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
   1234 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
   1235 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
   1236 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
   1237 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
   1238 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
   1239 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
   1240 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
   1241 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
   1242 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
   1243 
   1244 int8x16_t vrev64q_s8(int8x16_t vec);         // VREV64.8 q0,q0
   1245 int16x8_t vrev64q_s16(int16x8_t vec);         // VREV64.16 q0,q0
   1246 int32x4_t vrev64q_s32(int32x4_t vec);         // VREV64.32 q0,q0
   1247 uint8x16_t vrev64q_u8(uint8x16_t vec);         // VREV64.8 q0,q0
   1248 uint16x8_t vrev64q_u16(uint16x8_t vec);         // VREV64.16 q0,q0
   1249 uint32x4_t vrev64q_u32(uint32x4_t vec);         // VREV64.32 q0,q0
   1250 poly8x16_t vrev64q_p8(poly8x16_t vec);         // VREV64.8 q0,q0
   1251 poly16x8_t vrev64q_p16(poly16x8_t vec);         // VREV64.16 q0,q0
   1252 float32x4_t vrev64q_f32(float32x4_t vec);         // VREV64.32 q0,q0
   1253 
   1254 int8x16_t vrev32q_s8(int8x16_t vec);         // VREV32.8 q0,q0
   1255 int16x8_t vrev32q_s16(int16x8_t vec);         // VREV32.16 q0,q0
   1256 uint8x16_t vrev32q_u8(uint8x16_t vec);         // VREV32.8 q0,q0
   1257 uint16x8_t vrev32q_u16(uint16x8_t vec);         // VREV32.16 q0,q0
   1258 poly8x16_t vrev32q_p8(poly8x16_t vec);         // VREV32.8 q0,q0
   1259 
   1260 int8x16_t vrev16q_s8(int8x16_t vec);         // VREV16.8 q0,q0
   1261 uint8x16_t vrev16q_u8(uint8x16_t vec);         // VREV16.8 q0,q0
   1262 poly8x16_t vrev16q_p8(poly8x16_t vec);         // VREV16.8 q0,q0
   1263 //Other single operand arithmetic
   1264 //Absolute: Vd[i] = |Va[i]|
   1265 
   1266 int8x16_t vabsq_s8(int8x16_t a);         // VABS.S8 q0,q0
   1267 int16x8_t vabsq_s16(int16x8_t a);         // VABS.S16 q0,q0
   1268 int32x4_t vabsq_s32(int32x4_t a);         // VABS.S32 q0,q0
   1269 float32x4_t vabsq_f32(float32x4_t a);         // VABS.F32 q0,q0
   1270 //Saturating absolute: Vd[i] = sat(|Va[i]|)
   1271 
   1272 int8x16_t vqabsq_s8(int8x16_t a);         // VQABS.S8 q0,q0
   1273 int16x8_t vqabsq_s16(int16x8_t a);         // VQABS.S16 q0,q0
   1274 int32x4_t vqabsq_s32(int32x4_t a);         // VQABS.S32 q0,q0
   1275 //Negate: Vd[i] = - Va[i]
   1276 
   1277 int8x16_t vnegq_s8(int8x16_t a);         // VNE//q0,q0
   1278 int16x8_t vnegq_s16(int16x8_t a);         // VNE//q0,q0
   1279 int32x4_t vnegq_s32(int32x4_t a);         // VNE//q0,q0
   1280 float32x4_t vnegq_f32(float32x4_t a);         // VNE//q0,q0
   1281 //Saturating Negate: sat(Vd[i] = - Va[i])
   1282 
   1283 int8x16_t vqnegq_s8(int8x16_t a);         // VQNE//q0,q0
   1284 int16x8_t vqnegq_s16(int16x8_t a);         // VQNE//q0,q0
   1285 int32x4_t vqnegq_s32(int32x4_t a);         // VQNE//q0,q0
   1286 //Count leading sign bits
   1287 
   1288 int8x16_t vclsq_s8(int8x16_t a);         // VCLS.S8 q0,q0
   1289 int16x8_t vclsq_s16(int16x8_t a);         // VCLS.S16 q0,q0
   1290 int32x4_t vclsq_s32(int32x4_t a);         // VCLS.S32 q0,q0
   1291 //Count leading zeros
   1292 
   1293 int8x16_t vclzq_s8(int8x16_t a);         // VCLZ.I8 q0,q0
   1294 int16x8_t vclzq_s16(int16x8_t a);         // VCLZ.I16 q0,q0
   1295 int32x4_t vclzq_s32(int32x4_t a);         // VCLZ.I32 q0,q0
   1296 uint8x16_t vclzq_u8(uint8x16_t a);         // VCLZ.I8 q0,q0
   1297 uint16x8_t vclzq_u16(uint16x8_t a);         // VCLZ.I16 q0,q0
   1298 uint32x4_t vclzq_u32(uint32x4_t a);         // VCLZ.I32 q0,q0
   1299 //Count number of set bits
   1300 
   1301 uint8x16_t vcntq_u8(uint8x16_t a);         // VCNT.8 q0,q0
   1302 int8x16_t vcntq_s8(int8x16_t a);         // VCNT.8 q0,q0
   1303 poly8x16_t vcntq_p8(poly8x16_t a);         // VCNT.8 q0,q0
   1304 //Reciprocal estimate
   1305 
   1306 float32x4_t vrecpeq_f32(float32x4_t a);         // VRECPE.F32 q0,q0
   1307 uint32x4_t vrecpeq_u32(uint32x4_t a);         // VRECPE.U32 q0,q0
   1308 //Reciprocal square root estimate
   1309 
   1310 float32x4_t vrsqrteq_f32(float32x4_t a);         // VRSQRTE.F32 q0,q0
   1311 uint32x4_t vrsqrteq_u32(uint32x4_t a);         // VRSQRTE.U32 q0,q0
   1312 //Logical operations
   1313 //Bitwise not
   1314 
   1315 int8x16_t vmvnq_s8(int8x16_t a);         // VMVN q0,q0
   1316 int16x8_t vmvnq_s16(int16x8_t a);         // VMVN q0,q0
   1317 int32x4_t vmvnq_s32(int32x4_t a);         // VMVN q0,q0
   1318 uint8x16_t vmvnq_u8(uint8x16_t a);         // VMVN q0,q0
   1319 uint16x8_t vmvnq_u16(uint16x8_t a);         // VMVN q0,q0
   1320 uint32x4_t vmvnq_u32(uint32x4_t a);         // VMVN q0,q0
   1321 poly8x16_t vmvnq_p8(poly8x16_t a);         // VMVN q0,q0
   1322 //Bitwise and
   1323 
   1324 int8x16_t vandq_s8(int8x16_t a, int8x16_t b);         // VAND q0,q0,q0
   1325 int16x8_t vandq_s16(int16x8_t a, int16x8_t b);         // VAND q0,q0,q0
   1326 int32x4_t vandq_s32(int32x4_t a, int32x4_t b);         // VAND q0,q0,q0
   1327 int64x2_t vandq_s64(int64x2_t a, int64x2_t b);         // VAND q0,q0,q0
   1328 uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b);         // VAND q0,q0,q0
   1329 uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b);         // VAND q0,q0,q0
   1330 uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b);         // VAND q0,q0,q0
   1331 uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b);         // VAND q0,q0,q0
   1332 //Bitwise or
   1333 
   1334 int8x16_t vorrq_s8(int8x16_t a, int8x16_t b);         // VORR q0,q0,q0
   1335 int16x8_t vorrq_s16(int16x8_t a, int16x8_t b);         // VORR q0,q0,q0
   1336 int32x4_t vorrq_s32(int32x4_t a, int32x4_t b);         // VORR q0,q0,q0
   1337 int64x2_t vorrq_s64(int64x2_t a, int64x2_t b);         // VORR q0,q0,q0
   1338 uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b);         // VORR q0,q0,q0
   1339 uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b);         // VORR q0,q0,q0
   1340 uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b);         // VORR q0,q0,q0
   1341 uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b);         // VORR q0,q0,q0
   1342 //Bitwise exclusive or (EOR or XOR)
   1343 
   1344 int8x16_t veorq_s8(int8x16_t a, int8x16_t b);         // VEOR q0,q0,q0
   1345 int16x8_t veorq_s16(int16x8_t a, int16x8_t b);         // VEOR q0,q0,q0
   1346 int32x4_t veorq_s32(int32x4_t a, int32x4_t b);         // VEOR q0,q0,q0
   1347 int64x2_t veorq_s64(int64x2_t a, int64x2_t b);         // VEOR q0,q0,q0
   1348 uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b);         // VEOR q0,q0,q0
   1349 uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b);         // VEOR q0,q0,q0
   1350 uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b);         // VEOR q0,q0,q0
   1351 uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b);         // VEOR q0,q0,q0
   1352 //Bit Clear
   1353 
   1354 int8x16_t vbicq_s8(int8x16_t a, int8x16_t b);         // VBIC q0,q0,q0
   1355 int16x8_t vbicq_s16(int16x8_t a, int16x8_t b);         // VBIC q0,q0,q0
   1356 int32x4_t vbicq_s32(int32x4_t a, int32x4_t b);         // VBIC q0,q0,q0
   1357 int64x2_t vbicq_s64(int64x2_t a, int64x2_t b);         // VBIC q0,q0,q0
   1358 uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b);         // VBIC q0,q0,q0
   1359 uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b);         // VBIC q0,q0,q0
   1360 uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b);         // VBIC q0,q0,q0
   1361 uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b);         // VBIC q0,q0,q0
   1362 //Bitwise OR complement
   1363 
   1364 int8x16_t vornq_s8(int8x16_t a, int8x16_t b);         // VORN q0,q0,q0
   1365 int16x8_t vornq_s16(int16x8_t a, int16x8_t b);         // VORN q0,q0,q0
   1366 int32x4_t vornq_s32(int32x4_t a, int32x4_t b);         // VORN q0,q0,q0
   1367 int64x2_t vornq_s64(int64x2_t a, int64x2_t b);         // VORN q0,q0,q0
   1368 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b);         // VORN q0,q0,q0
   1369 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b);         // VORN q0,q0,q0
   1370 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b);         // VORN q0,q0,q0
   1371 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b);         // VORN q0,q0,q0
   1372 //Bitwise Select
   1373 
   1374 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c);         // VBSL q0,q0,q0
   1375 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c);         // VBSL q0,q0,q0
   1376 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c);         // VBSL q0,q0,q0
   1377 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c);         // VBSL q0,q0,q0
   1378 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VBSL q0,q0,q0
   1379 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VBSL q0,q0,q0
   1380 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VBSL q0,q0,q0
   1381 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);         // VBSL q0,q0,q0
   1382 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c);         // VBSL q0,q0,q0
   1383 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c);         // VBSL q0,q0,q0
   1384 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c);         // VBSL q0,q0,q0
   1385 //Transposition operations
   1386 //Transpose elements
   1387 
   1388 int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b);         // VTRN.8 q0,q0
   1389 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b);         // VTRN.16 q0,q0
   1390 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b);         // VTRN.32 q0,q0
   1391 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b);         // VTRN.8 q0,q0
   1392 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b);         // VTRN.16 q0,q0
   1393 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b);         // VTRN.32 q0,q0
   1394 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b);         // VTRN.32 q0,q0
   1395 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b);         // VTRN.8 q0,q0
   1396 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b);         // VTRN.16 q0,q0
   1397 //Interleave elements
   1398 
   1399 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b);         // VZIP.8 q0,q0
   1400 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b);         // VZIP.16 q0,q0
   1401 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b);         // VZIP.32 q0,q0
   1402 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b);         // VZIP.8 q0,q0
   1403 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b);         // VZIP.16 q0,q0
   1404 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b);         // VZIP.32 q0,q0
   1405 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b);         // VZIP.32 q0,q0
   1406 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b);         // VZIP.8 q0,q0
   1407 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b);         // VZIP.16 q0,q0
   1408 //De-Interleave elements
   1409 
   1410 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b);         // VUZP.8 q0,q0
   1411 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b);         // VUZP.16 q0,q0
   1412 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b);         // VUZP.32 q0,q0
   1413 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b);         // VUZP.8 q0,q0
   1414 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b);         // VUZP.16 q0,q0
   1415 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b);         // VUZP.32 q0,q0
   1416 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b);         // VUZP.32 q0,q0
   1417 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b);         // VUZP.8 q0,q0
   1418 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b);         // VUZP.16 q0,q0
   1419 
   1420 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   1421 // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. While for release build it is not a must,
   1422 //for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal
   1423 //
   1424 #if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined __INTEL_COMPILER )&& defined NDEBUG     //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers.
   1425 
   1426     #if defined(USE_SSSE3)
   1427         #define _MM_ALIGNR_EPI8 _mm_alignr_epi8
   1428     #endif
   1429 
   1430     #define _MM_EXTRACT_EPI16  _mm_extract_epi16
   1431     #define _MM_INSERT_EPI16 _mm_insert_epi16
   1432     #ifdef USE_SSE4
   1433         #define _MM_EXTRACT_EPI8  _mm_extract_epi8
   1434         #define _MM_EXTRACT_EPI32  _mm_extract_epi32
   1435         #define _MM_EXTRACT_PS  _mm_extract_ps
   1436 
   1437         #define _MM_INSERT_EPI8  _mm_insert_epi8
   1438         #define _MM_INSERT_EPI32 _mm_insert_epi32
   1439         #define _MM_INSERT_PS    _mm_insert_ps
   1440     #ifdef  _M_X64
   1441             #define _MM_INSERT_EPI64 _mm_insert_epi64
   1442             #define _MM_EXTRACT_EPI64 _mm_extract_epi64
   1443     #endif
   1444     #endif     //SSE4
   1445 #else
   1446     #define _NEON2SSE_COMMA ,
   1447     #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
   1448             switch(LANE)         \
   1449         {                \
   1450         case 0:     return NAME(a b, 0); \
   1451         case 1:     return NAME(a b, 1); \
   1452         case 2:     return NAME(a b, 2); \
   1453         case 3:     return NAME(a b, 3); \
   1454         case 4:     return NAME(a b, 4); \
   1455         case 5:     return NAME(a b, 5); \
   1456         case 6:     return NAME(a b, 6); \
   1457         case 7:     return NAME(a b, 7); \
   1458         case 8:     return NAME(a b, 8); \
   1459         case 9:     return NAME(a b, 9); \
   1460         case 10:    return NAME(a b, 10); \
   1461         case 11:    return NAME(a b, 11); \
   1462         case 12:    return NAME(a b, 12); \
   1463         case 13:    return NAME(a b, 13); \
   1464         case 14:    return NAME(a b, 14); \
   1465         case 15:    return NAME(a b, 15); \
   1466         default:    return NAME(a b, 0); \
   1467         }
   1468 
   1469     #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
   1470             switch(LANE)              \
   1471         {                          \
   1472         case 0:  return NAME(vec p,0); \
   1473         case 1:  return NAME(vec p,1); \
   1474         case 2:  return NAME(vec p,2); \
   1475         case 3:  return NAME(vec p,3); \
   1476         case 4:  return NAME(vec p,4); \
   1477         case 5:  return NAME(vec p,5); \
   1478         case 6:  return NAME(vec p,6); \
   1479         case 7:  return NAME(vec p,7); \
   1480         default: return NAME(vec p,0); \
   1481         }
   1482 
   1483     #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
   1484             switch(LANE)              \
   1485         {                          \
   1486         case case0:  return NAME(vec p,case0); \
   1487         case case1:  return NAME(vec p,case1); \
   1488         case case2:  return NAME(vec p,case2); \
   1489         case case3:  return NAME(vec p,case3); \
   1490         default:     return NAME(vec p,case0); \
   1491         }
   1492 
   1493     #if defined(USE_SSSE3)
   1494     _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
   1495     {
   1496         _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
   1497     }
   1498     #endif
   1499 
   1500     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
   1501     {
   1502         _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
   1503     }
   1504 
   1505     _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
   1506     {
   1507         _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
   1508     }
   1509 
   1510     #ifdef USE_SSE4
   1511         _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
   1512         {
   1513             _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
   1514         }
   1515 
   1516         _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
   1517         {
   1518             _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
   1519         }
   1520 
   1521         _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
   1522         {
   1523             _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
   1524         }
   1525 
   1526         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
   1527         {
   1528             _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
   1529         }
   1530 
   1531         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
   1532         {
   1533             _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
   1534         }
   1535     #ifdef  _M_X64
   1536             _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
   1537             {
   1538                 switch(LANE)
   1539                 {
   1540                 case 0:
   1541                     return _mm_insert_epi64(vec,  p, 0);
   1542                 case 1:
   1543                     return _mm_insert_epi64(vec,  p, 1);
   1544                 default:
   1545                     return _mm_insert_epi64(vec,  p, 0);
   1546                 }
   1547             }
   1548 
   1549             _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
   1550             {
   1551                 if (LANE ==0) return _mm_extract_epi64(val, 0);
   1552                 else return _mm_extract_epi64(val, 1);
   1553             }
   1554     #endif
   1555         _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
   1556         {
   1557             _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
   1558         }
   1559 
   1560     #endif     //USE_SSE4
   1561 
   1562 #endif     //#ifdef NDEBUG
   1563 
   1564 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   1565 // Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices
   1566 // or for some specific commonly used operations implementation missing in SSE
   1567 #ifdef USE_SSE4
   1568     #define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
   1569     #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
   1570     #define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
   1571 
   1572     #define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
   1573     #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
   1574     #define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
   1575 
   1576     #define _MM_MAX_EPI8  _mm_max_epi8
   1577     #define _MM_MAX_EPI32 _mm_max_epi32
   1578     #define _MM_MAX_EPU16 _mm_max_epu16
   1579     #define _MM_MAX_EPU32 _mm_max_epu32
   1580 
   1581     #define _MM_MIN_EPI8  _mm_min_epi8
   1582     #define _MM_MIN_EPI32 _mm_min_epi32
   1583     #define _MM_MIN_EPU16 _mm_min_epu16
   1584     #define _MM_MIN_EPU32 _mm_min_epu32
   1585 
   1586     #define _MM_BLENDV_EPI8 _mm_blendv_epi8
   1587     #define _MM_PACKUS_EPI32 _mm_packus_epi32
   1588     #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
   1589 
   1590     #define _MM_MULLO_EPI32 _mm_mullo_epi32
   1591     #define _MM_MUL_EPI32  _mm_mul_epi32
   1592 #else     //no SSE4 !!!!!!
   1593     _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
   1594     {
   1595         __m128i zero = _mm_setzero_si128();
   1596         return _mm_unpacklo_epi8(a, zero);
   1597     }
   1598 
   1599     _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
   1600     {
   1601         __m128i zero = _mm_setzero_si128();
   1602         return _mm_unpacklo_epi16(a, zero);
   1603     }
   1604 
   1605     _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
   1606     {
   1607         __m128i zero = _mm_setzero_si128();
   1608         return _mm_unpacklo_epi32(a, zero);
   1609     }
   1610 
   1611     _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
   1612     {
   1613         __m128i zero = _mm_setzero_si128();
   1614         __m128i sign = _mm_cmpgt_epi8(zero, a);
   1615         return _mm_unpacklo_epi8(a, sign);
   1616     }
   1617 
   1618     _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
   1619     {
   1620         __m128i zero = _mm_setzero_si128();
   1621         __m128i sign = _mm_cmpgt_epi16(zero, a);
   1622         return _mm_unpacklo_epi16(a, sign);
   1623     }
   1624 
   1625     _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
   1626     {
   1627         __m128i zero = _mm_setzero_si128();
   1628         __m128i sign = _mm_cmpgt_epi32(zero, a);
   1629         return _mm_unpacklo_epi32(a, sign);
   1630     }
   1631 
   1632     _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
   1633     {
   1634         _NEON2SSE_ALIGN_16 int32_t tmp[4];
   1635         _mm_store_si128((__m128i*)tmp, vec);
   1636         return tmp[LANE];
   1637     }
   1638 
   1639     _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
   1640     {
   1641         _NEON2SSE_ALIGN_16 int8_t tmp[16];
   1642         _mm_store_si128((__m128i*)tmp, vec);
   1643         return (int)tmp[LANE];
   1644     }
   1645 
   1646     _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
   1647     {
   1648         _NEON2SSE_ALIGN_16 int32_t tmp[4];
   1649         _mm_store_si128((__m128i*)tmp, _M128i(vec));
   1650         return tmp[LANE];
   1651     }
   1652 
   1653     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
   1654     {
   1655         _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
   1656         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
   1657         __m128i vec_masked, p_masked;
   1658         pvec[LANE] = p;
   1659         mask[LANE] = 0x0;
   1660         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
   1661         p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
   1662         return _mm_or_si128(vec_masked, p_masked);
   1663     }
   1664 
   1665     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
   1666     {
   1667         _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
   1668         _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
   1669         __m128i vec_masked, p_masked;
   1670         pvec[LANE] = (int8_t)p;
   1671         mask[LANE] = 0x0;
   1672         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
   1673         p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
   1674         return _mm_or_si128(vec_masked, p_masked);
   1675     }
   1676 
   1677     _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
   1678     {
   1679         _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
   1680         __m128 tmp, vec_masked, p_masked;
   1681         mask[LANE >> 4] = 0x0;         //here the LANE is not actural lane, need to deal with it
   1682         vec_masked = _mm_and_ps (*(__m128*)mask,vec);         //ready for p
   1683         p_masked = _mm_andnot_ps (*(__m128*)mask, p);         //ready for vec
   1684         tmp = _mm_or_ps(vec_masked, p_masked);
   1685         return tmp;
   1686     }
   1687 
   1688     _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
   1689     {
   1690         __m128i cmp, resa, resb;
   1691         cmp = _mm_cmpgt_epi8 (a, b);
   1692         resa = _mm_and_si128 (cmp, a);
   1693         resb = _mm_andnot_si128 (cmp,b);
   1694         return _mm_or_si128(resa, resb);
   1695     }
   1696 
   1697     _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
   1698     {
   1699         __m128i cmp, resa, resb;
   1700         cmp = _mm_cmpgt_epi32(a, b);
   1701         resa = _mm_and_si128 (cmp, a);
   1702         resb = _mm_andnot_si128 (cmp,b);
   1703         return _mm_or_si128(resa, resb);
   1704     }
   1705 
   1706     _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
   1707     {
   1708         __m128i c8000, b_s, a_s, cmp;
   1709         c8000 = _mm_cmpeq_epi16 (a,a);         //0xffff
   1710         c8000 = _mm_slli_epi16 (c8000, 15);         //0x8000
   1711         b_s = _mm_sub_epi16 (b, c8000);
   1712         a_s = _mm_sub_epi16 (a, c8000);
   1713         cmp = _mm_cmpgt_epi16 (a_s, b_s);         //no unsigned comparison, need to go to signed
   1714         a_s = _mm_and_si128 (cmp,a);
   1715         b_s = _mm_andnot_si128 (cmp,b);
   1716         return _mm_or_si128(a_s, b_s);
   1717     }
   1718 
   1719     _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
   1720     {
   1721         __m128i c80000000, b_s, a_s, cmp;
   1722         c80000000 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
   1723         c80000000 = _mm_slli_epi32 (c80000000, 31);         //0x80000000
   1724         b_s = _mm_sub_epi32 (b, c80000000);
   1725         a_s = _mm_sub_epi32 (a, c80000000);
   1726         cmp = _mm_cmpgt_epi32 (a_s, b_s);         //no unsigned comparison, need to go to signed
   1727         a_s = _mm_and_si128 (cmp,a);
   1728         b_s = _mm_andnot_si128 (cmp,b);
   1729         return _mm_or_si128(a_s, b_s);
   1730     }
   1731 
   1732     _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
   1733     {
   1734         __m128i cmp, resa, resb;
   1735         cmp = _mm_cmpgt_epi8 (b, a);
   1736         resa = _mm_and_si128 (cmp, a);
   1737         resb = _mm_andnot_si128 (cmp,b);
   1738         return _mm_or_si128(resa, resb);
   1739     }
   1740 
   1741     _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
   1742     {
   1743         __m128i cmp, resa, resb;
   1744         cmp = _mm_cmpgt_epi32(b, a);
   1745         resa = _mm_and_si128 (cmp, a);
   1746         resb = _mm_andnot_si128 (cmp,b);
   1747         return _mm_or_si128(resa, resb);
   1748     }
   1749 
   1750     _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
   1751     {
   1752         __m128i c8000, b_s, a_s, cmp;
   1753         c8000 = _mm_cmpeq_epi16 (a,a);         //0xffff
   1754         c8000 = _mm_slli_epi16 (c8000, 15);         //0x8000
   1755         b_s = _mm_sub_epi16 (b, c8000);
   1756         a_s = _mm_sub_epi16 (a, c8000);
   1757         cmp = _mm_cmpgt_epi16 (b_s, a_s);         //no unsigned comparison, need to go to signed
   1758         a_s = _mm_and_si128 (cmp,a);
   1759         b_s = _mm_andnot_si128 (cmp,b);
   1760         return _mm_or_si128(a_s, b_s);
   1761     }
   1762 
   1763     _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
   1764     {
   1765         __m128i c80000000, b_s, a_s, cmp;
   1766         c80000000 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
   1767         c80000000 = _mm_slli_epi32 (c80000000, 31);         //0x80000000
   1768         b_s = _mm_sub_epi32 (b, c80000000);
   1769         a_s = _mm_sub_epi32 (a, c80000000);
   1770         cmp = _mm_cmpgt_epi32 (b_s, a_s);         //no unsigned comparison, need to go to signed
   1771         a_s = _mm_and_si128 (cmp,a);
   1772         b_s = _mm_andnot_si128 (cmp,b);
   1773         return _mm_or_si128(a_s, b_s);
   1774     }
   1775 
   1776     _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask)         //this is NOT exact implementation of _mm_blendv_epi8  !!!!! - please see below
   1777     {         //it assumes mask is either 0xff or 0  always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters.
   1778         __m128i a_masked, b_masked;
   1779         b_masked = _mm_and_si128 (mask,b);         //use b if mask 0xff
   1780         a_masked = _mm_andnot_si128 (mask,a);
   1781         return _mm_or_si128(a_masked, b_masked);
   1782     }
   1783 
   1784     #if defined(USE_SSSE3)
   1785     _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
   1786     {
   1787         _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
   1788         __m128i a16, b16, res, reshi,cmp, zero;
   1789         zero = _mm_setzero_si128();
   1790         a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
   1791         b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
   1792         res = _mm_unpacklo_epi64(a16, b16);         //result without saturation
   1793         reshi = _mm_unpackhi_epi64(a16, b16);         //hi part of result used for saturation
   1794         cmp = _mm_cmpgt_epi16(zero, reshi);         //if cmp<0 the result should be zero
   1795         res = _mm_andnot_si128(cmp,res);         //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
   1796         cmp = _mm_cmpgt_epi16(reshi,zero);         //if cmp positive
   1797         return _mm_or_si128(res, cmp);         //if cmp positive we are out of 16bits need to saturaate to 0xffff
   1798     }
   1799     #endif
   1800 
   1801     #if defined(USE_SSSE3)
   1802     _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
   1803     {
   1804         _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9,  12,13,  2,3, 6,7,10,11,14,15};
   1805         __m128i a16, res, reshi,cmp, zero;
   1806         zero = _mm_setzero_si128();
   1807         a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
   1808         reshi = _mm_unpackhi_epi64(a16, a16);         //hi part of result used for saturation
   1809         cmp = _mm_cmpgt_epi16(zero, reshi);         //if cmp<0 the result should be zero
   1810         res = _mm_andnot_si128(cmp, a16);         //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
   1811         cmp = _mm_cmpgt_epi16(reshi,zero);         //if cmp positive
   1812         return _mm_or_si128(res, cmp);         //if cmp positive we are out of 16bits need to saturaate to 0xffff
   1813     }
   1814     #endif
   1815 
   1816     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
   1817     {
   1818         _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
   1819         int64_t res64;
   1820         int i;
   1821         _mm_store_si128((__m128i*)atmp, a);
   1822         _mm_store_si128((__m128i*)btmp, b);
   1823         for (i = 0; i<4; i++) {
   1824             res64 = atmp[i] * btmp[i];
   1825             res[i] = (int)(res64 & 0xffffffff);
   1826         }
   1827         return _mm_load_si128((__m128i*)res);
   1828     }
   1829 
   1830     #if defined(USE_SSSE3)
   1831     _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
   1832     {
   1833         __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
   1834         sign = _mm_xor_si128 (a, b);
   1835         sign =  _mm_srai_epi32 (sign, 31);         //promote sign bit to all fields, all fff if negative and all 0 if positive
   1836         zero = _mm_setzero_si128();
   1837         a_neg = _mm_abs_epi32 (a);         //negate a and b
   1838         b_neg = _mm_abs_epi32 (b);         //negate a and b
   1839         mul_us = _mm_mul_epu32 (a_neg, b_neg);         //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result
   1840         mul_us_neg = _mm_sub_epi64(zero, mul_us);
   1841         mul_us_neg = _mm_and_si128(sign, mul_us_neg);
   1842         mul_us = _mm_andnot_si128(sign, mul_us);
   1843         return _mm_or_si128 (mul_us, mul_us_neg);
   1844     }
   1845     #endif
   1846 #endif     //SSE4
   1847 
   1848 #ifndef _MM_INSERT_EPI64     //special case of SSE4 and  _M_X64
   1849     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
   1850     {
   1851         _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
   1852         _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff,0xffffffffffffffff};
   1853         __m128i vec_masked, p_masked;
   1854         pvec[LANE] = p;
   1855         mask[LANE] = 0x0;
   1856         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec);         //ready for p
   1857         p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec);         //ready for vec
   1858         return _mm_or_si128(vec_masked, p_masked);
   1859     }
   1860 #endif
   1861 #ifndef _MM_EXTRACT_EPI64     //special case of SSE4 and  _M_X64
   1862     _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
   1863     {
   1864         _NEON2SSE_ALIGN_16 int64_t tmp[2];
   1865         _mm_store_si128((__m128i*)tmp, val);
   1866         return tmp[LANE];
   1867     }
   1868 #endif
   1869 
   1870 int32x4_t  vqd_s32(int32x4_t a);         //Doubling saturation for signed ints
   1871 _NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
   1872 {         //Overflow happens only if a and sum have the opposite signs
   1873     __m128i c7fffffff, res, res_sat, res_xor_a;
   1874     c7fffffff = _mm_set1_epi32(0x7fffffff);
   1875     res = _mm_slli_epi32 (a, 1);         // res = a*2
   1876     res_sat = _mm_srli_epi32(a, 31);
   1877     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   1878     res_xor_a = _mm_xor_si128(res, a);
   1879     res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
   1880     res_sat = _mm_and_si128(res_xor_a, res_sat);
   1881     res = _mm_andnot_si128(res_xor_a, res);
   1882     return _mm_or_si128(res, res_sat);
   1883 }
   1884 
   1885 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   1886 //*************************************************************************
   1887 //*************************************************************************
   1888 //*****************  Functions redefinition\implementatin starts here *****
   1889 //*************************************************************************
   1890 //*************************************************************************
   1891 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   1892 
   1893 /*If the unified intrinsics solutions is necessary please define your SSE intrinsics wrap here like in the following sample:
   1894 #ifdef ARM
   1895 #define vector_addq_s32 _mm_add_epi32
   1896 #else //if we have IA
   1897 #endif
   1898 
   1899 ********************************************************************************************
   1900 Functions below are organised in the following way:
   1901 
   1902 Each NEON intrinsic function has one of the following options:
   1903 1.  its x86 full equivalent SSE intrinsic - in this case x86 version just follows the NEON one under the corresponding #define statement
   1904 2.  x86 implementation using more than one x86 intrinsics. In this case it is shaped as inlined C function with return statement
   1905 3.  the reference to the NEON function returning the same result and implemented in x86 as above. In this case it is shaped as matching NEON function definition
   1906 4.  for about 5% of functions due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance
   1907 the serial implementation is provided along with the corresponding compiler warnin//these functions are on your app critical path
   1908 - please consider such functions removal from your code.
   1909 */
   1910 
   1911 //***********************************************************************
   1912 //************************      Vector add   *****************************
   1913 //***********************************************************************
   1914 
   1915 int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b);         // VADD.I8 q0,q0,q0
   1916 #define vaddq_s8 _mm_add_epi8
   1917 
   1918 int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b);         // VADD.I16 q0,q0,q0
   1919 #define vaddq_s16 _mm_add_epi16
   1920 
   1921 int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b);         // VADD.I32 q0,q0,q0
   1922 #define vaddq_s32 _mm_add_epi32
   1923 
   1924 int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b);         // VADD.I64 q0,q0,q0
   1925 #define vaddq_s64 _mm_add_epi64
   1926 
   1927 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b);         // VADD.F32 q0,q0,q0
   1928 #define vaddq_f32 _mm_add_ps
   1929 
   1930 uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b);         // VADD.I8 q0,q0,q0
   1931 #define vaddq_u8 _mm_add_epi8
   1932 
   1933 uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b);         // VADD.I16 q0,q0,q0
   1934 #define vaddq_u16 _mm_add_epi16
   1935 
   1936 uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b);         // VADD.I32 q0,q0,q0
   1937 #define vaddq_u32 _mm_add_epi32
   1938 
   1939 uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b);         // VADD.I64 q0,q0,q0
   1940 #define vaddq_u64 _mm_add_epi64
   1941 
   1942 //**************************** Vector long add *****************************:
   1943 //***********************************************************************
   1944 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
   1945 
   1946 //***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
   1947 //*************** *********************************************************************
   1948 
   1949 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
   1950 //*************************************************************************************************************************
   1951 
   1952 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b);         // VHADD.S8 q0,q0,q0
   1953 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
   1954 {         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   1955     __m128i tmp1, tmp2;
   1956     tmp1 = _mm_and_si128(a,b);
   1957     tmp2 = _mm_xor_si128(a,b);
   1958     tmp2 = vshrq_n_s8(tmp2,1);
   1959     return _mm_add_epi8(tmp1,tmp2);
   1960 }
   1961 
   1962 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b);         // VHADD.S1 6 q0,q0,q0
   1963 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
   1964 {         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   1965     __m128i tmp1, tmp2;
   1966     tmp1 = _mm_and_si128(a,b);
   1967     tmp2 = _mm_xor_si128(a,b);
   1968     tmp2 = _mm_srai_epi16(tmp2,1);
   1969     return _mm_add_epi16(tmp1,tmp2);
   1970 }
   1971 
   1972 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b);         // VHADD.S32 q0,q0,q0
   1973 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b)         // VHADD.S32 q0,q0,q0
   1974 {         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   1975     __m128i tmp1, tmp2;
   1976     tmp1 = _mm_and_si128(a,b);
   1977     tmp2 = _mm_xor_si128(a,b);
   1978     tmp2 = _mm_srai_epi32(tmp2,1);
   1979     return _mm_add_epi32(tmp1,tmp2);
   1980 }
   1981 
   1982 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b);         // VHADD.U8 q0,q0,q0
   1983 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b)         // VHADD.U8 q0,q0,q0
   1984 {
   1985     __m128i c1, sum, res;
   1986     c1 = _mm_set1_epi8(1);
   1987     sum = _mm_avg_epu8(a, b);         //result is rounded, need to compensate it
   1988     res = _mm_xor_si128(a, b);         //for rounding compensation
   1989     res = _mm_and_si128(res,c1);         //for rounding compensation
   1990     return _mm_sub_epi8 (sum, res);         //actual rounding compensation
   1991 }
   1992 
   1993 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b);         // VHADD.s16 q0,q0,q0
   1994 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b)         // VHADD.s16 q0,q0,q0
   1995 {
   1996     __m128i sum, res;
   1997     sum = _mm_avg_epu16(a, b);         //result is rounded, need to compensate it
   1998     res = _mm_xor_si128(a, b);         //for rounding compensation
   1999     res = _mm_slli_epi16 (res,15);         //shift left  then back right to
   2000     res = _mm_srli_epi16 (res,15);         //get 1 or zero
   2001     return _mm_sub_epi16 (sum, res);         //actual rounding compensation
   2002 }
   2003 
   2004 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b);         // VHADD.U32 q0,q0,q0
   2005 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b)         // VHADD.U32 q0,q0,q0
   2006 {         //need to avoid internal overflow, will use the (x&y)+((x^y)>>1).
   2007     __m128i tmp1, tmp2;
   2008     tmp1 = _mm_and_si128(a,b);
   2009     tmp2 = _mm_xor_si128(a,b);
   2010     tmp2 = _mm_srli_epi32(tmp2,1);
   2011     return _mm_add_epi32(tmp1,tmp2);
   2012 }
   2013 
   2014 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
   2015 //*****************************************************************************************************************************
   2016 
   2017 //SSE, result rounding!!!
   2018 
   2019 //SSE, result rounding!!!
   2020 
   2021 int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b);         // VRHADD.S8 q0,q0,q0
   2022 _NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b)         // VRHADD.S8 q0,q0,q0
   2023 {         //no signed average in x86 SIMD, go to unsigned
   2024     __m128i c128, au, bu, sum;
   2025     c128 = _mm_set1_epi8(128);
   2026     au = _mm_add_epi8(a, c128);
   2027     bu = _mm_add_epi8(b, c128);
   2028     sum = _mm_avg_epu8(au, bu);
   2029     return _mm_sub_epi8 (sum, c128);
   2030 }
   2031 
   2032 int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b);         // VRHADD.S16 q0,q0,q0
   2033 _NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b)         // VRHADD.S16 q0,q0,q0
   2034 {         //no signed average in x86 SIMD, go to unsigned
   2035     __m128i cx8000, au, bu, sum;
   2036     cx8000 = _mm_set1_epi16(0x8000);
   2037     au = _mm_add_epi16(a, cx8000);
   2038     bu = _mm_add_epi16(b, cx8000);
   2039     sum = _mm_avg_epu16(au, bu);
   2040     return _mm_sub_epi16 (sum, cx8000);
   2041 }
   2042 
   2043 int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b);         // VRHADD.S32 q0,q0,q0
   2044 _NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
   2045 {         //need to avoid overflow
   2046     __m128i a2, b2, res, sum;
   2047     a2 = _mm_srai_epi32(a,1);         //a2=a/2;
   2048     b2 = _mm_srai_epi32(b,1);         // b2=b/2;
   2049     res = _mm_or_si128(a,b);         //for rounding
   2050     res = _mm_slli_epi32 (res,31);         //shift left  then back right to
   2051     res = _mm_srli_epi32 (res,31);         //get 1 or zero
   2052     sum = _mm_add_epi32(a2,b2);
   2053     return _mm_add_epi32(sum,res);
   2054 }
   2055 
   2056 uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b);         // VRHADD.U8 q0,q0,q0
   2057 #define vrhaddq_u8 _mm_avg_epu8         //SSE2, results rounded
   2058 
   2059 uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b);         // VRHADD.s16 q0,q0,q0
   2060 #define vrhaddq_u16 _mm_avg_epu16         //SSE2, results rounded
   2061 
   2062 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b);         // VRHADD.U32 q0,q0,q0
   2063 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b)         // VRHADD.U32 q0,q0,q0
   2064 {         //need to avoid overflow
   2065     __m128i a2, b2, res, sum;
   2066     a2 = _mm_srli_epi32(a,1);         //a2=a/2;
   2067     b2 = _mm_srli_epi32(b,1);         // b2=b/2;
   2068     res = _mm_or_si128(a,b);         //for rounding
   2069     res = _mm_slli_epi32 (res,31);         //shift left  then back right to
   2070     res = _mm_srli_epi32 (res,31);         //get 1 or zero
   2071     sum = _mm_add_epi32(a2,b2);
   2072     return _mm_add_epi32(sum,res);
   2073 }
   2074 
   2075 //****************** VQADD: Vector saturating add ************************
   2076 //************************************************************************
   2077 
   2078 int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b);         // VQADD.S8 q0,q0,q0
   2079 #define vqaddq_s8 _mm_adds_epi8
   2080 
   2081 int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b);         // VQADD.S16 q0,q0,q0
   2082 #define vqaddq_s16 _mm_adds_epi16
   2083 
   2084 int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b);         // VQADD.S32 q0,q0,q0
   2085 _NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
   2086 {         //no corresponding x86 SIMD soulution, special tricks are necessary. Overflow happens only if a and b have the same sign and sum has the opposite sign
   2087     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
   2088     c7fffffff = _mm_set1_epi32(0x7fffffff);
   2089     res = _mm_add_epi32(a, b);
   2090     res_sat = _mm_srli_epi32(a, 31);
   2091     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   2092     res_xor_a = _mm_xor_si128(res, a);
   2093     b_xor_a_ = _mm_xor_si128(b, a);
   2094     res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
   2095     res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
   2096     res_sat = _mm_and_si128(res_xor_a, res_sat);
   2097     res = _mm_andnot_si128(res_xor_a, res);
   2098     return _mm_or_si128(res, res_sat);
   2099 }
   2100 
   2101 int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b);         // VQADD.S64 q0,q0,q0
   2102 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   2103 {
   2104     _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   2105     _mm_store_si128((__m128i*)atmp, a);
   2106     _mm_store_si128((__m128i*)btmp, b);
   2107     res[0] = atmp[0] + btmp[0];
   2108     res[1] = atmp[1] + btmp[1];
   2109 
   2110     atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
   2111     atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
   2112 
   2113     if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
   2114         res[0] = atmp[0];
   2115     }
   2116     if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
   2117         res[1] = atmp[1];
   2118     }
   2119     return _mm_load_si128((__m128i*)res);
   2120 }
   2121 
   2122 uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b);         // VQADD.U8 q0,q0,q0
   2123 #define vqaddq_u8 _mm_adds_epu8
   2124 
   2125 uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b);         // VQADD.s16 q0,q0,q0
   2126 #define vqaddq_u16 _mm_adds_epu16
   2127 
   2128 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b);         // VQADD.U32 q0,q0,q0
   2129 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
   2130 {
   2131     __m128i c80000000, cmp, subsum, suba, sum;
   2132     c80000000 = _mm_set1_epi32 (0x80000000);
   2133     sum = _mm_add_epi32 (a, b);
   2134     subsum = _mm_sub_epi32 (sum, c80000000);
   2135     suba = _mm_sub_epi32 (a, c80000000);
   2136     cmp = _mm_cmpgt_epi32 ( suba, subsum);         //no unsigned comparison, need to go to signed
   2137     return _mm_or_si128 (sum, cmp);         //saturation
   2138 }
   2139 
   2140 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b);         // VQADD.U64 q0,q0,q0
   2141 #ifdef USE_SSE4
   2142     _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
   2143     {
   2144         __m128i c80000000, sum, cmp, suba, subsum;
   2145         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
   2146         sum = _mm_add_epi64 (a, b);
   2147         subsum = _mm_sub_epi64 (sum, c80000000);
   2148         suba = _mm_sub_epi64 (a, c80000000);
   2149         cmp = _mm_cmpgt_epi64 ( suba, subsum);         //no unsigned comparison, need to go to signed, SSE4.2!!!
   2150         return _mm_or_si128 (sum, cmp);         //saturation
   2151     }
   2152 #else
   2153     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   2154     {
   2155         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   2156         _mm_store_si128((__m128i*)atmp, a);
   2157         _mm_store_si128((__m128i*)btmp, b);
   2158         res[0] = atmp[0] + btmp[0];
   2159         res[1] = atmp[1] + btmp[1];
   2160         if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
   2161         if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
   2162         return _mm_load_si128((__m128i*)(res));
   2163     }
   2164 #endif
   2165 
   2166 //******************* Vector add high half (truncated)  ******************
   2167 //************************************************************************
   2168 
   2169 //*********** Vector rounding add high half: vraddhn_<type> ******************.
   2170 //***************************************************************************
   2171 
   2172 //**********************************************************************************
   2173 //*********             Multiplication            *************************************
   2174 //**************************************************************************************
   2175 
   2176 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
   2177 //As we don't go to wider result functions are equal to "multiply low" in x86
   2178 
   2179 #if defined(USE_SSSE3)
   2180 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b);         // VMUL.I8 q0,q0,q0
   2181 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b)         // VMUL.I8 q0,q0,q0
   2182 {         // no 8 bit simd multiply, need to go to 16 bits
   2183       //solution may be not optimal
   2184     __m128i a16, b16, r16_1, r16_2;
   2185     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   2186     a16 = _MM_CVTEPI8_EPI16 (a);         // SSE 4.1
   2187     b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
   2188     r16_1 = _mm_mullo_epi16 (a16, b16);
   2189     //swap hi and low part of a and b to process the remaining data
   2190     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   2191     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   2192     a16 = _MM_CVTEPI8_EPI16 (a16);         // SSE 4.1
   2193     b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1  __m128i r16_2
   2194 
   2195     r16_2 = _mm_mullo_epi16 (a16, b16);
   2196     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd);         //return to 8 bit
   2197     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd);         //return to 8 bit
   2198 
   2199     return _mm_unpacklo_epi64(r16_1,  r16_2);
   2200 }
   2201 #endif
   2202 
   2203 int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b);         // VMUL.I16 q0,q0,q0
   2204 #define vmulq_s16 _mm_mullo_epi16
   2205 
   2206 int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b);         // VMUL.I32 q0,q0,q0
   2207 #define vmulq_s32 _MM_MULLO_EPI32         //SSE4.1
   2208 
   2209 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b);         // VMUL.F32 q0,q0,q0
   2210 #define vmulq_f32 _mm_mul_ps
   2211 
   2212 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b);         // VMUL.I8 q0,q0,q0
   2213 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b)         // VMUL.I8 q0,q0,q0
   2214 {         // no 8 bit simd multiply, need to go to 16 bits
   2215       //solution may be not optimal
   2216     __m128i maskff, a16, b16, r16_1, r16_2;
   2217     maskff = _mm_set1_epi16(0xff);
   2218     a16 = _MM_CVTEPU8_EPI16 (a);         // SSE 4.1
   2219     b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
   2220     r16_1 = _mm_mullo_epi16 (a16, b16);
   2221     r16_1 = _mm_and_si128(r16_1, maskff);         //to avoid saturation
   2222     //swap hi and low part of a and b to process the remaining data
   2223     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   2224     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   2225     a16 = _MM_CVTEPI8_EPI16 (a16);         // SSE 4.1
   2226     b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
   2227 
   2228     r16_2 = _mm_mullo_epi16 (a16, b16);
   2229     r16_2 = _mm_and_si128(r16_2, maskff);         //to avoid saturation
   2230     return _mm_packus_epi16 (r16_1,  r16_2);
   2231 }
   2232 
   2233 uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b);         // VMUL.I16 q0,q0,q0
   2234 #define vmulq_u16 _mm_mullo_epi16
   2235 
   2236 uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b);         // VMUL.I32 q0,q0,q0
   2237 #define vmulq_u32 _MM_MULLO_EPI32         //SSE4.1
   2238 
   2239 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b);         // VMUL.P8 q0,q0,q0
   2240 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
   2241 {         //may be optimized
   2242     __m128i c1, res, tmp, bmasked;
   2243     int i;
   2244     c1 = _mm_cmpeq_epi8 (a,a);         //all ones 0xff....
   2245     c1 = vshrq_n_u8(c1,7);         //0x1
   2246     bmasked = _mm_and_si128(b, c1);         //0x1
   2247     res = vmulq_u8(a, bmasked);
   2248     for(i = 1; i<8; i++) {
   2249         c1 = _mm_slli_epi16(c1,1);         //shift mask left by 1, 16 bit shift is OK here
   2250         bmasked = _mm_and_si128(b, c1);         //0x1
   2251         tmp = vmulq_u8(a, bmasked);
   2252         res = _mm_xor_si128(res, tmp);
   2253     }
   2254     return res;
   2255 }
   2256 
   2257 //************************* Vector long multiply ***********************************
   2258 //****************************************************************************
   2259 
   2260 //****************Vector saturating doubling long multiply **************************
   2261 //*****************************************************************
   2262 
   2263 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
   2264 //******************************************************************************************
   2265 
   2266 #if defined(USE_SSSE3)
   2267 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLA.I8 q0,q0,q0
   2268 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VMLA.I8 q0,q0,q0
   2269 {         //solution may be not optimal
   2270       // no 8 bit simd multiply, need to go to 16 bits
   2271     __m128i b16, c16, r16_1, a_2,r16_2;
   2272     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   2273     b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
   2274     c16 = _MM_CVTEPI8_EPI16 (c);         // SSE 4.1
   2275     r16_1 = _mm_mullo_epi16 (b16, c16);
   2276     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
   2277     r16_1 = _mm_add_epi8 (r16_1, a);
   2278     //swap hi and low part of a, b and c to process the remaining data
   2279     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   2280     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   2281     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   2282     b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
   2283     c16 = _MM_CVTEPI8_EPI16 (c16);         // SSE 4.1
   2284 
   2285     r16_2 = _mm_mullo_epi16 (b16, c16);
   2286     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   2287     r16_2 = _mm_add_epi8(r16_2, a_2);
   2288     return _mm_unpacklo_epi64(r16_1,r16_2);
   2289 }
   2290 #endif
   2291 
   2292 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLA.I16 q0,q0,q0
   2293 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VMLA.I16 q0,q0,q0
   2294 {
   2295     __m128i res;
   2296     res = _mm_mullo_epi16 (c, b);
   2297     return _mm_add_epi16 (res, a);
   2298 }
   2299 
   2300 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLA.I32 q0,q0,q0
   2301 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VMLA.I32 q0,q0,q0
   2302 {
   2303     __m128i res;
   2304     res = _MM_MULLO_EPI32 (c,  b);         //SSE4.1
   2305     return _mm_add_epi32 (res, a);
   2306 }
   2307 
   2308 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLA.F32 q0,q0,q0
   2309 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c)         // VMLA.F32 q0,q0,q0
   2310 {         //fma is coming soon, but right now:
   2311     __m128 res;
   2312     res = _mm_mul_ps (c, b);
   2313     return _mm_add_ps (a, res);
   2314 }
   2315 
   2316 #if defined(USE_SSSE3)
   2317 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLA.I8 q0,q0,q0
   2318 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)         // VMLA.I8 q0,q0,q0
   2319 {         //solution may be not optimal
   2320       // no 8 bit simd multiply, need to go to 16 bits
   2321     __m128i b16, c16, r16_1, a_2, r16_2;
   2322     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   2323     b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
   2324     c16 = _MM_CVTEPU8_EPI16 (c);         // SSE 4.1
   2325     r16_1 = _mm_mullo_epi16 (b16, c16);
   2326     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
   2327     r16_1 = _mm_add_epi8 (r16_1, a);
   2328     //swap hi and low part of a, b and c to process the remaining data
   2329     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   2330     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   2331     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   2332     b16 = _MM_CVTEPU8_EPI16 (b16);         // SSE 4.1
   2333     c16 = _MM_CVTEPU8_EPI16 (c16);         // SSE 4.1
   2334 
   2335     r16_2 = _mm_mullo_epi16 (b16, c16);
   2336     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   2337     r16_2 = _mm_add_epi8(r16_2, a_2);
   2338     return _mm_unpacklo_epi64(r16_1,r16_2);
   2339 }
   2340 #endif
   2341 
   2342 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLA.I16 q0,q0,q0
   2343 #define vmlaq_u16 vmlaq_s16
   2344 
   2345 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLA.I32 q0,q0,q0
   2346 #define vmlaq_u32 vmlaq_s32
   2347 
   2348 //**********************  Vector widening multiply accumulate (long multiply accumulate):
   2349 //                          vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
   2350 //********************************************************************************************
   2351 
   2352 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
   2353 //********************************************************************************************
   2354 
   2355 #if defined(USE_SSSE3)
   2356 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VMLS.I8 q0,q0,q0
   2357 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VMLS.I8 q0,q0,q0
   2358 {         //solution may be not optimal
   2359       // no 8 bit simd multiply, need to go to 16 bits
   2360     __m128i b16, c16, r16_1, a_2, r16_2;
   2361     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   2362     b16 = _MM_CVTEPI8_EPI16 (b);         // SSE 4.1
   2363     c16 = _MM_CVTEPI8_EPI16 (c);         // SSE 4.1
   2364     r16_1 = _mm_mullo_epi16 (b16, c16);
   2365     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
   2366     r16_1 = _mm_sub_epi8 (a, r16_1);
   2367     //swap hi and low part of a, b, c to process the remaining data
   2368     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   2369     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   2370     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   2371     b16 = _MM_CVTEPI8_EPI16 (b16);         // SSE 4.1
   2372     c16 = _MM_CVTEPI8_EPI16 (c16);         // SSE 4.1
   2373 
   2374     r16_2 = _mm_mullo_epi16 (b16, c16);
   2375     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   2376     r16_2 = _mm_sub_epi8 (a_2, r16_2);
   2377     return _mm_unpacklo_epi64(r16_1,r16_2);
   2378 }
   2379 #endif
   2380 
   2381 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VMLS.I16 q0,q0,q0
   2382 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VMLS.I16 q0,q0,q0
   2383 {
   2384     __m128i res;
   2385     res = _mm_mullo_epi16 (c, b);
   2386     return _mm_sub_epi16 (a, res);
   2387 }
   2388 
   2389 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VMLS.I32 q0,q0,q0
   2390 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VMLS.I32 q0,q0,q0
   2391 {
   2392     __m128i res;
   2393     res = _MM_MULLO_EPI32 (c, b);         //SSE4.1
   2394     return _mm_sub_epi32 (a, res);
   2395 }
   2396 
   2397 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c);         // VMLS.F32 q0,q0,q0
   2398 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c)         // VMLS.F32 q0,q0,q0
   2399 {
   2400     __m128 res;
   2401     res = _mm_mul_ps (c, b);
   2402     return _mm_sub_ps (a, res);
   2403 }
   2404 
   2405 #if defined(USE_SSSE3)
   2406 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VMLS.I8 q0,q0,q0
   2407 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)         // VMLS.I8 q0,q0,q0
   2408 {         //solution may be not optimal
   2409       // no 8 bit simd multiply, need to go to 16 bits
   2410     __m128i b16, c16, r16_1, a_2, r16_2;
   2411     _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
   2412     b16 = _MM_CVTEPU8_EPI16 (b);         // SSE 4.1
   2413     c16 = _MM_CVTEPU8_EPI16 (c);         // SSE 4.1
   2414     r16_1 = _mm_mullo_epi16 (b16, c16);
   2415     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);         //return to 8 bits
   2416     r16_1 = _mm_sub_epi8 (a, r16_1);
   2417     //swap hi and low part of a, b and c to process the remaining data
   2418     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   2419     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   2420     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   2421     b16 = _MM_CVTEPU8_EPI16 (b16);         // SSE 4.1
   2422     c16 = _MM_CVTEPU8_EPI16 (c16);         // SSE 4.1
   2423 
   2424     r16_2 = _mm_mullo_epi16 (b16, c16);
   2425     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   2426     r16_2 = _mm_sub_epi8(a_2, r16_2);
   2427     return _mm_unpacklo_epi64(r16_1,r16_2);
   2428 }
   2429 #endif
   2430 
   2431 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VMLS.I16 q0,q0,q0
   2432 #define vmlsq_u16 vmlsq_s16
   2433 
   2434 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VMLS.I32 q0,q0,q0
   2435 #define vmlsq_u32 vmlsq_s32
   2436 
   2437 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
   2438 //*************************************************************************************************************
   2439 
   2440 //******  Vector saturating doubling multiply high **********************
   2441 //*************************************************************************
   2442 //For some ARM implementations if the multiply high result is all 0xffffffff then it is not doubled. We do the same here
   2443 
   2444 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b);         // VQDMULH.S16 q0,q0,q0
   2445 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b)         // VQDMULH.S16 q0,q0,q0
   2446 {
   2447     __m128i res_sat, cffff, mask, res;
   2448     res = _mm_mulhi_epi16 (a, b);
   2449     cffff = _mm_cmpeq_epi16(res,res);         //0xffff
   2450     mask = _mm_cmpeq_epi16(res, cffff);         //if ffff need to saturate
   2451     res_sat = _mm_adds_epi16(res, res);         //res *= 2 and saturate
   2452     return _mm_or_si128(mask, res_sat);
   2453 }
   2454 
   2455 #if defined(USE_SSSE3)
   2456 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b);         // VQDMULH.S32 q0,q0,q0
   2457 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   2458 {         // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
   2459     __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1;
   2460     ab = _mm_unpacklo_epi32 (a, b);         //a0, b0, a1,b1
   2461     ba = _mm_unpacklo_epi32 (b, a);         //b0, a0, b1,a1
   2462     mul = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
   2463     ab = _mm_unpackhi_epi32 (a, b);         //a2, b2, a3,b3
   2464     ba = _mm_unpackhi_epi32 (b, a);         //b2, a2, b3,a3
   2465     mul1 = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
   2466     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
   2467     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
   2468     mul = _mm_unpacklo_epi64(mul, mul1);
   2469     cffffffff = _mm_cmpeq_epi32(mul,mul);         //0xffffffff
   2470     mask = _mm_cmpeq_epi32(mul, cffffffff);         //if ffffffff need to saturate
   2471     res_sat = vqd_s32(mul);
   2472     return _mm_or_si128(mask, res_sat);
   2473 }
   2474 #endif
   2475 
   2476 //********* Vector saturating rounding doubling multiply high ****************
   2477 //****************************************************************************
   2478 //If use _mm_mulhrs_xx functions  the result may differ from NEON one a little  due to different rounding rules and order
   2479 
   2480 #if defined(USE_SSSE3)
   2481 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b);         // VQRDMULH.S16 q0,q0,q0
   2482 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b)         // VQRDMULH.S16 q0,q0,q0
   2483 {
   2484     __m128i res_sat, cffff, mask, res;
   2485     res = _mm_mulhrs_epi16 (a, b);
   2486     cffff = _mm_cmpeq_epi16(res,res);         //0xffff
   2487     mask = _mm_cmpeq_epi16(res, cffff);         //if ffff need to saturate
   2488     res_sat = _mm_adds_epi16(res, res);         //res *= 2 and saturate
   2489     return _mm_or_si128(mask, res_sat);
   2490 }
   2491 #endif
   2492 
   2493 #if defined(USE_SSSE3)
   2494 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b);         // VQRDMULH.S32 q0,q0,q0
   2495 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   2496 {         // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target
   2497     __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1, mask1;
   2498     ab = _mm_unpacklo_epi32 (a, b);         //a0, b0, a1,b1
   2499     ba = _mm_unpacklo_epi32 (b, a);         //b0, a0, b1,a1
   2500     mul = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
   2501     ab = _mm_unpackhi_epi32 (a, b);         //a2, b2, a3,b3
   2502     ba = _mm_unpackhi_epi32 (b, a);         //b2, a2, b3,a3
   2503     mul1 = _MM_MUL_EPI32(ab, ba);         //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
   2504     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
   2505     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6));         //shuffle the data to get 2 32-bits
   2506     mul = _mm_unpacklo_epi64(mul, mul1);
   2507     cffffffff = _mm_cmpeq_epi32(mul,mul);         //0xffffffff
   2508     mask1 = _mm_slli_epi32(mul, 17);         //shift left then back right to
   2509     mask1 = _mm_srli_epi32(mul,31);         //get  15-th bit 1 or zero
   2510     mul = _mm_add_epi32 (mul, mask1);         //actual rounding
   2511     mask = _mm_cmpeq_epi32(mul, cffffffff);         //if ffffffff need to saturate
   2512     res_sat = vqd_s32(mul);
   2513     return _mm_or_si128(mask, res_sat);
   2514 }
   2515 #endif
   2516 
   2517 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
   2518 //*************************************************************************************************************************
   2519 
   2520 //************************************************************************************
   2521 //******************  Vector subtract ***********************************************
   2522 //************************************************************************************
   2523 
   2524 int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b);         // VSUB.I8 q0,q0,q0
   2525 #define vsubq_s8 _mm_sub_epi8
   2526 
   2527 int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b);         // VSUB.I16 q0,q0,q0
   2528 #define vsubq_s16 _mm_sub_epi16
   2529 
   2530 int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b);         // VSUB.I32 q0,q0,q0
   2531 #define vsubq_s32 _mm_sub_epi32
   2532 
   2533 int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b);         // VSUB.I64 q0,q0,q0
   2534 #define vsubq_s64 _mm_sub_epi64
   2535 
   2536 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b);         // VSUB.F32 q0,q0,q0
   2537 #define vsubq_f32 _mm_sub_ps
   2538 
   2539 uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b);         // VSUB.I8 q0,q0,q0
   2540 #define vsubq_u8 _mm_sub_epi8
   2541 
   2542 uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b);         // VSUB.I16 q0,q0,q0
   2543 #define vsubq_u16 _mm_sub_epi16
   2544 
   2545 uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b);         // VSUB.I32 q0,q0,q0
   2546 #define vsubq_u32 _mm_sub_epi32
   2547 
   2548 uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b);         // VSUB.I64 q0,q0,q0
   2549 #define vsubq_u64 _mm_sub_epi64
   2550 
   2551 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ******************
   2552 //***********************************************************************************
   2553 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
   2554 
   2555 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] **********************************
   2556 //*****************************************************************************************************
   2557 
   2558 //************************Vector saturating subtract *********************************
   2559 //*************************************************************************************
   2560 
   2561 int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b);         // VQSUB.S8 q0,q0,q0
   2562 #define vqsubq_s8 _mm_subs_epi8
   2563 
   2564 int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b);         // VQSUB.S16 q0,q0,q0
   2565 #define vqsubq_s16 _mm_subs_epi16
   2566 
   2567 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b);         // VQSUB.S32 q0,q0,q0
   2568 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
   2569 {         //no corresponding x86 SIMD soulution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and sub has opposite sign to a
   2570     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
   2571     c7fffffff = _mm_set1_epi32(0x7fffffff);
   2572     res = _mm_sub_epi32(a, b);
   2573     res_sat = _mm_srli_epi32(a, 31);
   2574     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   2575     res_xor_a = _mm_xor_si128(res, a);
   2576     b_xor_a = _mm_xor_si128(b, a);
   2577     res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
   2578     res_xor_a = _mm_srai_epi32(res_xor_a,31);         //propagate the sigh bit, all ffff if <0 all ones otherwise
   2579     res_sat = _mm_and_si128(res_xor_a, res_sat);
   2580     res = _mm_andnot_si128(res_xor_a, res);
   2581     return _mm_or_si128(res, res_sat);
   2582 }
   2583 
   2584 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b);         // VQSUB.S64 q0,q0,q0
   2585 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)         //no optimal SIMD soulution
   2586 {
   2587     _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
   2588     _NEON2SSE_ALIGN_16 uint64_t res[2];
   2589     _mm_store_si128((__m128i*)atmp, a);
   2590     _mm_store_si128((__m128i*)btmp, b);
   2591     res[0] = atmp[0] - btmp[0];
   2592     res[1] = atmp[1] - btmp[1];
   2593     if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
   2594         res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
   2595     }
   2596     if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
   2597         res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
   2598     }
   2599     return _mm_load_si128((__m128i*)res);
   2600 }
   2601 
   2602 uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b);         // VQSUB.U8 q0,q0,q0
   2603 #define vqsubq_u8 _mm_subs_epu8
   2604 
   2605 uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b);         // VQSUB.s16 q0,q0,q0
   2606 #define vqsubq_u16 _mm_subs_epu16
   2607 
   2608 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b);         // VQSUB.U32 q0,q0,q0
   2609 _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b)         // VQSUB.U32 q0,q0,q0
   2610 {
   2611     __m128i min, mask, sub;
   2612     min = _MM_MIN_EPU32(a, b);         //SSE4.1
   2613     mask = _mm_cmpeq_epi32 (min,  b);
   2614     sub = _mm_sub_epi32 (a, b);
   2615     return _mm_and_si128 ( sub, mask);
   2616 }
   2617 
   2618 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL);         // VQSUB.U64 q0,q0,q0
   2619 #ifdef USE_SSE4
   2620     _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
   2621     {
   2622         __m128i c80000000, subb, suba, cmp, sub;
   2623         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
   2624         sub  = _mm_sub_epi64 (a, b);
   2625         suba = _mm_sub_epi64 (a, c80000000);
   2626         subb = _mm_sub_epi64 (b, c80000000);
   2627         cmp = _mm_cmpgt_epi64 ( suba, subb);         //no unsigned comparison, need to go to signed, SSE4.2!!!
   2628         return _mm_and_si128 (sub, cmp);         //saturation
   2629     }
   2630 #else
   2631     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   2632     {
   2633         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   2634         _mm_store_si128((__m128i*)atmp, a);
   2635         _mm_store_si128((__m128i*)btmp, b);
   2636         res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
   2637         res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
   2638         return _mm_load_si128((__m128i*)(res));
   2639     }
   2640 #endif
   2641 
   2642 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
   2643 //****************************************************************
   2644 
   2645 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b);         // VHSUB.S8 q0,q0,q0
   2646 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b)         // VHSUB.S8 q0,q0,q0
   2647 {         // //need to deal with the possibility of internal overflow
   2648     __m128i c128, au,bu;
   2649     c128 = _mm_set1_epi8 (128);
   2650     au = _mm_add_epi8( a, c128);
   2651     bu = _mm_add_epi8( b, c128);
   2652     return vhsubq_u8(au,bu);
   2653 }
   2654 
   2655 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b);         // VHSUB.S16 q0,q0,q0
   2656 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b)         // VHSUB.S16 q0,q0,q0
   2657 {         //need to deal with the possibility of internal overflow
   2658     __m128i c8000, au,bu;
   2659     c8000 = _mm_set1_epi16(0x8000);
   2660     au = _mm_add_epi16( a, c8000);
   2661     bu = _mm_add_epi16( b, c8000);
   2662     return vhsubq_u16(au,bu);
   2663 }
   2664 
   2665 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b);         // VHSUB.S32 q0,q0,q0
   2666 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b)         // VHSUB.S32 q0,q0,q0
   2667 {//need to deal with the possibility of internal overflow
   2668     __m128i a2, b2,r, b_1;
   2669     a2 = _mm_srai_epi32 (a,1);
   2670     b2 = _mm_srai_epi32 (b,1);
   2671     r = _mm_sub_epi32 (a2, b2);
   2672     b_1 = _mm_andnot_si128(a, b); //!a and b
   2673     b_1 = _mm_slli_epi32 (b_1,31);
   2674     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
   2675     return _mm_sub_epi32(r,b_1);
   2676 }
   2677 
   2678 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b);         // VHSUB.U8 q0,q0,q0
   2679 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b)         // VHSUB.U8 q0,q0,q0
   2680 {
   2681     __m128i avg;
   2682     avg = _mm_avg_epu8 (a, b);
   2683     return _mm_sub_epi8(a, avg);
   2684 }
   2685 
   2686 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b);         // VHSUB.s16 q0,q0,q0
   2687 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b)         // VHSUB.s16 q0,q0,q0
   2688 {
   2689     __m128i avg;
   2690     avg = _mm_avg_epu16 (a, b);
   2691     return _mm_sub_epi16(a, avg);
   2692 }
   2693 
   2694 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b);         // VHSUB.U32 q0,q0,q0
   2695 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b)         // VHSUB.U32 q0,q0,q0
   2696 {//need to deal with the possibility of internal overflow
   2697     __m128i a2, b2,r, b_1;
   2698     a2 = _mm_srli_epi32 (a,1);
   2699     b2 = _mm_srli_epi32 (b,1);
   2700     r = _mm_sub_epi32 (a2, b2);
   2701     b_1 = _mm_andnot_si128(a, b); //!a and b
   2702     b_1 = _mm_slli_epi32 (b_1,31);
   2703     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
   2704     return _mm_sub_epi32(r,b_1);
   2705 }
   2706 
   2707 //******* Vector subtract high half (truncated) ** ************
   2708 //************************************************************
   2709 
   2710 //************ Vector rounding subtract high half *********************
   2711 //*********************************************************************
   2712 
   2713 //*********** Vector saturating doubling multiply subtract long ********************
   2714 //************************************************************************************
   2715 
   2716 //******************  COMPARISON ***************************************
   2717 //******************* Vector compare equal *************************************
   2718 //****************************************************************************
   2719 
   2720 uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b);         // VCEQ.I8 q0, q0, q0
   2721 #define vceqq_s8 _mm_cmpeq_epi8
   2722 
   2723 uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b);         // VCEQ.I16 q0, q0, q0
   2724 #define vceqq_s16 _mm_cmpeq_epi16
   2725 
   2726 uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b);         // VCEQ.I32 q0, q0, q0
   2727 #define vceqq_s32 _mm_cmpeq_epi32
   2728 
   2729 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b);         // VCEQ.F32 q0, q0, q0
   2730 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
   2731 {
   2732     __m128 res;
   2733     res = _mm_cmpeq_ps(a,b);
   2734     return *(__m128i*)&res;
   2735 }
   2736 
   2737 uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b);         // VCEQ.I8 q0, q0, q0
   2738 #define vceqq_u8 _mm_cmpeq_epi8
   2739 
   2740 uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b);         // VCEQ.I16 q0, q0, q0
   2741 #define vceqq_u16 _mm_cmpeq_epi16
   2742 
   2743 uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b);         // VCEQ.I32 q0, q0, q0
   2744 #define vceqq_u32 _mm_cmpeq_epi32
   2745 
   2746 uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b);         // VCEQ.I8 q0, q0, q0
   2747 #define vceqq_p8 _mm_cmpeq_epi8
   2748 
   2749 //******************Vector compare greater-than or equal*************************
   2750 //*******************************************************************************
   2751 //in IA SIMD no greater-than-or-equal comparison for integers,
   2752 // there is greater-than available only, so we need the following tricks
   2753 
   2754 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
   2755 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b)         // VCGE.S8 q0, q0, q0
   2756 {
   2757     __m128i m1, m2;
   2758     m1 = _mm_cmpgt_epi8 ( a, b);
   2759     m2 = _mm_cmpeq_epi8 ( a, b);
   2760     return _mm_or_si128  ( m1, m2);
   2761 }
   2762 
   2763 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
   2764 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b)         // VCGE.S16 q0, q0, q0
   2765 {
   2766     __m128i m1, m2;
   2767     m1 = _mm_cmpgt_epi16 ( a, b);
   2768     m2 = _mm_cmpeq_epi16 ( a, b);
   2769     return _mm_or_si128   ( m1,m2);
   2770 }
   2771 
   2772 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
   2773 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b)         // VCGE.S32 q0, q0, q0
   2774 {
   2775     __m128i m1, m2;
   2776     m1 = _mm_cmpgt_epi32 (a, b);
   2777     m2 = _mm_cmpeq_epi32 (a, b);
   2778     return _mm_or_si128   (m1, m2);
   2779 }
   2780 
   2781 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
   2782 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
   2783 {
   2784     __m128 res;
   2785     res = _mm_cmpge_ps(a,b);         //use only 2 first entries
   2786     return *(__m128i*)&res;
   2787 }
   2788 
   2789 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
   2790 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b)         // VCGE.U8 q0, q0, q0
   2791 {         //no unsigned chars comparison, only signed available,so need the trick
   2792     #ifdef USE_SSE4
   2793         __m128i cmp;
   2794         cmp = _mm_max_epu8(a, b);
   2795         return _mm_cmpeq_epi8(cmp, a);         //a>=b
   2796     #else
   2797         __m128i c128, as, bs, m1, m2;
   2798         c128 = _mm_set1_epi8 (128);
   2799         as = _mm_sub_epi8( a, c128);
   2800         bs = _mm_sub_epi8( b, c128);
   2801         m1 = _mm_cmpgt_epi8( as, bs);
   2802         m2 = _mm_cmpeq_epi8 (as, bs);
   2803         return _mm_or_si128 ( m1,  m2);
   2804     #endif
   2805 }
   2806 
   2807 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.s16 q0, q0, q0
   2808 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b)         // VCGE.s16 q0, q0, q0
   2809 {         //no unsigned shorts comparison, only signed available,so need the trick
   2810     #ifdef USE_SSE4
   2811         __m128i cmp;
   2812         cmp = _mm_max_epu16(a, b);
   2813         return _mm_cmpeq_epi16(cmp, a);         //a>=b
   2814     #else
   2815         __m128i c8000, as, bs, m1, m2;
   2816         c8000 = _mm_set1_epi16 (0x8000);
   2817         as = _mm_sub_epi16(a,c8000);
   2818         bs = _mm_sub_epi16(b,c8000);
   2819         m1 = _mm_cmpgt_epi16(as, bs);
   2820         m2 = _mm_cmpeq_epi16 (as, bs);
   2821         return _mm_or_si128 ( m1, m2);
   2822     #endif
   2823 }
   2824 
   2825 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
   2826 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b)         // VCGE.U32 q0, q0, q0
   2827 {         //no unsigned ints comparison, only signed available,so need the trick
   2828     #ifdef USE_SSE4
   2829         __m128i cmp;
   2830         cmp = _mm_max_epu32(a, b);
   2831         return _mm_cmpeq_epi32(cmp, a);         //a>=b
   2832     #else
   2833         //serial solution may be faster
   2834         __m128i c80000000, as, bs, m1, m2;
   2835         c80000000 = _mm_set1_epi32 (0x80000000);
   2836         as = _mm_sub_epi32(a,c80000000);
   2837         bs = _mm_sub_epi32(b,c80000000);
   2838         m1 = _mm_cmpgt_epi32 (as, bs);
   2839         m2 = _mm_cmpeq_epi32 (as, bs);
   2840         return _mm_or_si128 ( m1,  m2);
   2841     #endif
   2842 }
   2843 
   2844 //**********************Vector compare less-than or equal******************************
   2845 //***************************************************************************************
   2846 //in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks
   2847 
   2848 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b);         // VCGE.S8 q0, q0, q0
   2849 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b)         // VCGE.S8 q0, q0, q0
   2850 {
   2851     __m128i c1, res;
   2852     c1 = _mm_cmpeq_epi8 (a,a);         //all ones 0xff....
   2853     res = _mm_cmpgt_epi8 ( a,  b);
   2854     return _mm_andnot_si128 (res, c1);         //inverse the cmpgt result, get less-than-or-equal
   2855 }
   2856 
   2857 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b);         // VCGE.S16 q0, q0, q0
   2858 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b)         // VCGE.S16 q0, q0, q0
   2859 {
   2860     __m128i c1, res;
   2861     c1 = _mm_cmpeq_epi16 (a,a);         //all ones 0xff....
   2862     res = _mm_cmpgt_epi16 ( a,  b);
   2863     return _mm_andnot_si128 (res, c1);
   2864 }
   2865 
   2866 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b);         // VCGE.S32 q0, q0, q0
   2867 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b)         // VCGE.S32 q0, q0, q0
   2868 {
   2869     __m128i c1, res;
   2870     c1 = _mm_cmpeq_epi32 (a,a);         //all ones 0xff....
   2871     res = _mm_cmpgt_epi32 ( a,  b);
   2872     return _mm_andnot_si128 (res, c1);
   2873 }
   2874 
   2875 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b);         // VCGE.F32 q0, q0, q0
   2876 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
   2877 {
   2878     __m128 res;
   2879     res = _mm_cmple_ps(a,b);
   2880     return *(__m128i*)&res;
   2881 }
   2882 
   2883 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b);         // VCGE.U8 q0, q0, q0
   2884 #ifdef USE_SSE4
   2885     _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b)         // VCGE.U8 q0, q0, q0
   2886     {         //no unsigned chars comparison in SSE, only signed available,so need the trick
   2887 
   2888         __m128i cmp;
   2889         cmp = _mm_min_epu8(a, b);
   2890         return _mm_cmpeq_epi8(cmp, a);         //a<=b
   2891     }
   2892 #else
   2893     #define vcleq_u8(a,b) vcgeq_u8(b,a)
   2894 #endif
   2895 
   2896 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b);         // VCGE.s16 q0, q0, q0
   2897 #ifdef USE_SSE4
   2898     _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b)         // VCGE.s16 q0, q0, q0
   2899     {         //no unsigned shorts comparison in SSE, only signed available,so need the trick
   2900         __m128i cmp;
   2901         cmp = _mm_min_epu16(a, b);
   2902         return _mm_cmpeq_epi16(cmp, a);         //a<=b
   2903     }
   2904 #else
   2905     #define vcleq_u16(a,b) vcgeq_u16(b,a)
   2906 #endif
   2907 
   2908 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b);         // VCGE.U32 q0, q0, q0
   2909 #ifdef USE_SSE4
   2910     _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b)         // VCGE.U32 q0, q0, q0
   2911     {         //no unsigned chars comparison in SSE, only signed available,so need the trick
   2912         __m128i cmp;
   2913         cmp = _mm_min_epu32(a, b);
   2914         return _mm_cmpeq_epi32(cmp, a);         //a<=b
   2915     }
   2916 #else
   2917 //solution may be not optimal compared with the serial one
   2918     #define vcleq_u32(a,b) vcgeq_u32(b,a)
   2919 #endif
   2920 
   2921 //****** Vector compare greater-than ******************************************
   2922 //**************************************************************************
   2923 
   2924 uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
   2925 #define vcgtq_s8 _mm_cmpgt_epi8
   2926 
   2927 uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
   2928 #define vcgtq_s16 _mm_cmpgt_epi16
   2929 
   2930 uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
   2931 #define vcgtq_s32 _mm_cmpgt_epi32
   2932 
   2933 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
   2934 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
   2935 {
   2936     __m128 res;
   2937     res = _mm_cmpgt_ps(a,b);         //use only 2 first entries
   2938     return *(__m128i*)&res;
   2939 }
   2940 
   2941 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
   2942 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b)         // VCGT.U8 q0, q0, q0
   2943 {         //no unsigned chars comparison, only signed available,so need the trick
   2944     __m128i c128, as, bs;
   2945     c128 = _mm_set1_epi8 (128);
   2946     as = _mm_sub_epi8(a,c128);
   2947     bs = _mm_sub_epi8(b,c128);
   2948     return _mm_cmpgt_epi8 (as, bs);
   2949 }
   2950 
   2951 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.s16 q0, q0, q0
   2952 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b)         // VCGT.s16 q0, q0, q0
   2953 {         //no unsigned short comparison, only signed available,so need the trick
   2954     __m128i c8000, as, bs;
   2955     c8000 = _mm_set1_epi16 (0x8000);
   2956     as = _mm_sub_epi16(a,c8000);
   2957     bs = _mm_sub_epi16(b,c8000);
   2958     return _mm_cmpgt_epi16 ( as, bs);
   2959 }
   2960 
   2961 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
   2962 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b)         // VCGT.U32 q0, q0, q0
   2963 {         //no unsigned int comparison, only signed available,so need the trick
   2964     __m128i c80000000, as, bs;
   2965     c80000000 = _mm_set1_epi32 (0x80000000);
   2966     as = _mm_sub_epi32(a,c80000000);
   2967     bs = _mm_sub_epi32(b,c80000000);
   2968     return _mm_cmpgt_epi32 ( as, bs);
   2969 }
   2970 
   2971 //********************* Vector compare less-than **************************
   2972 //*************************************************************************
   2973 
   2974 uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b);         // VCGT.S8 q0, q0, q0
   2975 #define vcltq_s8(a,b) vcgtq_s8(b, a)         //swap the arguments!!
   2976 
   2977 uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b);         // VCGT.S16 q0, q0, q0
   2978 #define vcltq_s16(a,b) vcgtq_s16(b, a)         //swap the arguments!!
   2979 
   2980 uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b);         // VCGT.S32 q0, q0, q0
   2981 #define vcltq_s32(a,b) vcgtq_s32(b, a)         //swap the arguments!!
   2982 
   2983 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b);         // VCGT.F32 q0, q0, q0
   2984 #define vcltq_f32(a,b) vcgtq_f32(b, a)         //swap the arguments!!
   2985 
   2986 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b);         // VCGT.U8 q0, q0, q0
   2987 #define vcltq_u8(a,b) vcgtq_u8(b, a)         //swap the arguments!!
   2988 
   2989 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b);         // VCGT.s16 q0, q0, q0
   2990 #define vcltq_u16(a,b) vcgtq_u16(b, a)         //swap the arguments!!
   2991 
   2992 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b);         // VCGT.U32 q0, q0, q0
   2993 #define vcltq_u32(a,b) vcgtq_u32(b, a)         //swap the arguments!!
   2994 
   2995 //*****************Vector compare absolute greater-than or equal ************
   2996 //***************************************************************************
   2997 
   2998 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
   2999 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b)         // VACGE.F32 q0, q0, q0
   3000 {
   3001     __m128i c7fffffff;
   3002     __m128 a0, b0;
   3003     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   3004     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   3005     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   3006     a0 = _mm_cmpge_ps ( a0, b0);
   3007     return (*(__m128i*)&a0);
   3008 }
   3009 
   3010 //********Vector compare absolute less-than or equal ******************
   3011 //********************************************************************
   3012 
   3013 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b);         // VACGE.F32 q0, q0, q0
   3014 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b)         // VACGE.F32 q0, q0, q0
   3015 {
   3016     __m128i c7fffffff;
   3017     __m128 a0, b0;
   3018     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   3019     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   3020     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   3021     a0 = _mm_cmple_ps (a0, b0);
   3022     return (*(__m128i*)&a0);
   3023 }
   3024 
   3025 //********  Vector compare absolute greater-than    ******************
   3026 //******************************************************************
   3027 
   3028 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
   3029 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b)         // VACGT.F32 q0, q0, q0
   3030 {
   3031     __m128i c7fffffff;
   3032     __m128 a0, b0;
   3033     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   3034     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   3035     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   3036     a0 = _mm_cmpgt_ps (a0, b0);
   3037     return (*(__m128i*)&a0);
   3038 }
   3039 
   3040 //***************Vector compare absolute less-than  ***********************
   3041 //*************************************************************************
   3042 
   3043 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b);         // VACGT.F32 q0, q0, q0
   3044 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b)         // VACGT.F32 q0, q0, q0
   3045 {
   3046     __m128i c7fffffff;
   3047     __m128 a0, b0;
   3048     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   3049     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   3050     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   3051     a0 = _mm_cmplt_ps (a0, b0);
   3052     return (*(__m128i*)&a0);
   3053 
   3054 }
   3055 
   3056 //*************************Vector test bits************************************
   3057 //*****************************************************************************
   3058 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
   3059 with the corresponding element of a second vector. If the result is not zero, the
   3060 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
   3061 all zeros. */
   3062 
   3063 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b);         // VTST.8 q0, q0, q0
   3064 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b)         // VTST.8 q0, q0, q0
   3065 {
   3066     __m128i zero, one, res;
   3067     zero = _mm_setzero_si128 ();
   3068     one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
   3069     res = _mm_and_si128 (a, b);
   3070     res =  _mm_cmpeq_epi8 (res, zero);
   3071     return _mm_xor_si128(res, one);         //invert result
   3072 }
   3073 
   3074 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b);         // VTST.16 q0, q0, q0
   3075 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b)         // VTST.16 q0, q0, q0
   3076 {
   3077     __m128i zero, one, res;
   3078     zero = _mm_setzero_si128 ();
   3079     one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
   3080     res = _mm_and_si128 (a, b);
   3081     res =  _mm_cmpeq_epi16 (res, zero);
   3082     return _mm_xor_si128(res, one);         //invert result
   3083 }
   3084 
   3085 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b);         // VTST.32 q0, q0, q0
   3086 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b)         // VTST.32 q0, q0, q0
   3087 {
   3088     __m128i zero, one, res;
   3089     zero = _mm_setzero_si128 ();
   3090     one = _mm_cmpeq_epi8(zero,zero);         //0xfff..ffff
   3091     res = _mm_and_si128 (a, b);
   3092     res =  _mm_cmpeq_epi32 (res, zero);
   3093     return _mm_xor_si128(res, one);         //invert result
   3094 }
   3095 
   3096 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b);         // VTST.8 q0, q0, q0
   3097 #define vtstq_u8 vtstq_s8
   3098 
   3099 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b);         // VTST.16 q0, q0, q0
   3100 #define vtstq_u16 vtstq_s16
   3101 
   3102 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b);         // VTST.32 q0, q0, q0
   3103 #define vtstq_u32 vtstq_s32
   3104 
   3105 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b);         // VTST.8 q0, q0, q0
   3106 #define vtstq_p8 vtstq_u8
   3107 
   3108 //****************** Absolute difference ********************
   3109 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
   3110 //************************************************************
   3111 #if defined(USE_SSSE3)
   3112 
   3113 #endif
   3114 
   3115 #if defined(USE_SSSE3)
   3116 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b);         // VABD.S8 q0,q0,q0
   3117 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b)         // VABD.S8 q0,q0,q0
   3118 {
   3119     __m128i res;
   3120     res = _mm_sub_epi8 (a, b);
   3121     return _mm_abs_epi8 (res);
   3122 }
   3123 #endif
   3124 
   3125 #if defined(USE_SSSE3)
   3126 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b);         // VABD.S16 q0,q0,q0
   3127 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b)         // VABD.S16 q0,q0,q0
   3128 {
   3129     __m128i res;
   3130     res = _mm_sub_epi16 (a,b);
   3131     return _mm_abs_epi16 (res);
   3132 }
   3133 #endif
   3134 
   3135 #if defined(USE_SSSE3)
   3136 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b);         // VABD.S32 q0,q0,q0
   3137 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b)         // VABD.S32 q0,q0,q0
   3138 {
   3139     __m128i res;
   3140     res = _mm_sub_epi32 (a,b);
   3141     return _mm_abs_epi32 (res);
   3142 }
   3143 #endif
   3144 
   3145 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b);         // VABD.U8 q0,q0,q0
   3146 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b)         //no abs for unsigned
   3147 {
   3148     __m128i cmp, difab, difba;
   3149     cmp = vcgtq_u8(a,b);
   3150     difab = _mm_sub_epi8(a,b);
   3151     difba = _mm_sub_epi8 (b,a);
   3152     difab = _mm_and_si128(cmp, difab);
   3153     difba = _mm_andnot_si128(cmp, difba);
   3154     return _mm_or_si128(difab, difba);
   3155 }
   3156 
   3157 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b);         // VABD.s16 q0,q0,q0
   3158 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
   3159 {
   3160     __m128i cmp, difab, difba;
   3161     cmp = vcgtq_u16(a,b);
   3162     difab = _mm_sub_epi16(a,b);
   3163     difba = _mm_sub_epi16 (b,a);
   3164     difab = _mm_and_si128(cmp, difab);
   3165     difba = _mm_andnot_si128(cmp, difba);
   3166     return _mm_or_si128(difab, difba);
   3167 }
   3168 
   3169 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b);         // VABD.U32 q0,q0,q0
   3170 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
   3171 {
   3172     __m128i cmp, difab, difba;
   3173     cmp = vcgtq_u32(a,b);
   3174     difab = _mm_sub_epi32(a,b);
   3175     difba = _mm_sub_epi32 (b,a);
   3176     difab = _mm_and_si128(cmp, difab);
   3177     difba = _mm_andnot_si128(cmp, difba);
   3178     return _mm_or_si128(difab, difba);
   3179 }
   3180 
   3181 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b);         // VABD.F32 q0,q0,q0
   3182 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b)         // VABD.F32 q0,q0,q0
   3183 {
   3184     __m128i c1;
   3185     __m128 res;
   3186     c1 =  _mm_set1_epi32(0x7fffffff);
   3187     res = _mm_sub_ps (a, b);
   3188     return _mm_and_ps (res, *(__m128*)&c1);
   3189 }
   3190 
   3191 //************  Absolute difference - long **************************
   3192 //********************************************************************
   3193 
   3194 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
   3195 //*********************************************************************************************
   3196 
   3197 #if defined(USE_SSSE3)
   3198 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c);         // VABA.S8 q0,q0,q0
   3199 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c)         // VABA.S8 q0,q0,q0
   3200 {
   3201     int8x16_t sub;
   3202     sub = vabdq_s8(b, c);
   3203     return vaddq_s8( a, sub);
   3204 }
   3205 #endif
   3206 
   3207 #if defined(USE_SSSE3)
   3208 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c);         // VABA.S16 q0,q0,q0
   3209 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c)         // VABA.S16 q0,q0,q0
   3210 {
   3211     int16x8_t sub;
   3212     sub = vabdq_s16(b, c);
   3213     return vaddq_s16( a, sub);
   3214 }
   3215 #endif
   3216 
   3217 #if defined(USE_SSSE3)
   3218 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c);         // VABA.S32 q0,q0,q0
   3219 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c)         // VABA.S32 q0,q0,q0
   3220 {
   3221     int32x4_t sub;
   3222     sub = vabdq_s32(b, c);
   3223     return vaddq_s32( a, sub);
   3224 }
   3225 #endif
   3226 
   3227 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VABA.U8 q0,q0,q0
   3228 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
   3229 {
   3230     uint8x16_t sub;
   3231     sub = vabdq_u8(b, c);
   3232     return vaddq_u8( a, sub);
   3233 }
   3234 
   3235 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VABA.s16 q0,q0,q0
   3236 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
   3237 {
   3238     uint16x8_t sub;
   3239     sub = vabdq_u16(b, c);
   3240     return vaddq_u16( a, sub);
   3241 }
   3242 
   3243 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VABA.U32 q0,q0,q0
   3244 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
   3245 {
   3246     uint32x4_t sub;
   3247     sub = vabdq_u32(b, c);
   3248     return vaddq_u32( a, sub);
   3249 }
   3250 
   3251 //************** Absolute difference and accumulate - long ********************************
   3252 //*************************************************************************************
   3253 
   3254 //***********************************************************************************
   3255 //****************  Maximum and minimum operations **********************************
   3256 //***********************************************************************************
   3257 //************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
   3258 //***********************************************************************************
   3259 
   3260 int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b);         // VMAX.S8 q0,q0,q0
   3261 #define vmaxq_s8 _MM_MAX_EPI8         //SSE4.1
   3262 
   3263 int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b);         // VMAX.S16 q0,q0,q0
   3264 #define vmaxq_s16 _mm_max_epi16
   3265 
   3266 int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b);         // VMAX.S32 q0,q0,q0
   3267 #define vmaxq_s32 _MM_MAX_EPI32         //SSE4.1
   3268 
   3269 uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b);         // VMAX.U8 q0,q0,q0
   3270 #define vmaxq_u8 _mm_max_epu8
   3271 
   3272 uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b);         // VMAX.s16 q0,q0,q0
   3273 #define vmaxq_u16 _MM_MAX_EPU16         //SSE4.1
   3274 
   3275 uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b);         // VMAX.U32 q0,q0,q0
   3276 #define vmaxq_u32 _MM_MAX_EPU32         //SSE4.1
   3277 
   3278 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b);         // VMAX.F32 q0,q0,q0
   3279 #define vmaxq_f32 _mm_max_ps
   3280 
   3281 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
   3282 //***********************************************************************************************************
   3283 
   3284 int8x16_t   vminq_s8(int8x16_t a, int8x16_t b);         // VMIN.S8 q0,q0,q0
   3285 #define vminq_s8 _MM_MIN_EPI8         //SSE4.1
   3286 
   3287 int16x8_t   vminq_s16(int16x8_t a, int16x8_t b);         // VMIN.S16 q0,q0,q0
   3288 #define vminq_s16 _mm_min_epi16
   3289 
   3290 int32x4_t   vminq_s32(int32x4_t a, int32x4_t b);         // VMIN.S32 q0,q0,q0
   3291 #define vminq_s32 _MM_MIN_EPI32         //SSE4.1
   3292 
   3293 uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b);         // VMIN.U8 q0,q0,q0
   3294 #define vminq_u8 _mm_min_epu8
   3295 
   3296 uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b);         // VMIN.s16 q0,q0,q0
   3297 #define vminq_u16 _MM_MIN_EPU16         //SSE4.1
   3298 
   3299 uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b);         // VMIN.U32 q0,q0,q0
   3300 #define vminq_u32 _MM_MIN_EPU32         //SSE4.1
   3301 
   3302 float32x4_t vminq_f32(float32x4_t a, float32x4_t b);         // VMIN.F32 q0,q0,q0
   3303 #define vminq_f32 _mm_min_ps
   3304 
   3305 //*************  Pairwise addition operations. **************************************
   3306 //************************************************************************************
   3307 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
   3308 
   3309 //**************************  Long pairwise add  **********************************
   3310 //*********************************************************************************
   3311 //Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width,
   3312 // and places the final results in the destination vector.
   3313 
   3314 #if defined(USE_SSSE3)
   3315 int16x8_t vpaddlq_s8(int8x16_t a);         // VPADDL.S8 q0,q0
   3316 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a)         // VPADDL.S8 q0,q0
   3317 {         //no 8 bit hadd in IA32, need to go to 16 bit
   3318     __m128i r16_1, r16_2;
   3319     r16_1 = _MM_CVTEPI8_EPI16 (a);         // SSE 4.1
   3320     //swap hi and low part of r to process the remaining data
   3321     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3322     r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
   3323     return _mm_hadd_epi16 (r16_1, r16_2);
   3324 }
   3325 #endif
   3326 
   3327 #if defined(USE_SSSE3)
   3328 int32x4_t vpaddlq_s16(int16x8_t a);         // VPADDL.S16 q0,q0
   3329 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a)         // VPADDL.S16 q0,q0
   3330 {         //no 8 bit hadd in IA32, need to go to 16 bit
   3331     __m128i r32_1, r32_2;
   3332     r32_1 = _MM_CVTEPI16_EPI32(a);
   3333     //swap hi and low part of r to process the remaining data
   3334     r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3335     r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
   3336     return _mm_hadd_epi32 (r32_1, r32_2);
   3337 }
   3338 #endif
   3339 
   3340 int64x2_t vpaddlq_s32(int32x4_t a);         // VPADDL.S32 q0,q0
   3341 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)         // VPADDL.S32 q0,q0
   3342 {
   3343     _NEON2SSE_ALIGN_16 int32_t atmp[4];
   3344     _NEON2SSE_ALIGN_16 int64_t res[2];
   3345     _mm_store_si128((__m128i*)atmp, a);
   3346     res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
   3347     res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
   3348     return _mm_load_si128((__m128i*)res);
   3349 }
   3350 
   3351 #if defined(USE_SSSE3)
   3352 uint16x8_t vpaddlq_u8(uint8x16_t a);         // VPADDL.U8 q0,q0
   3353 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a)         // VPADDL.U8 q0,q0
   3354 {         //no 8 bit hadd in IA32, need to go to 16 bit
   3355     __m128i r16_1, r16_2;
   3356     r16_1 = _MM_CVTEPU8_EPI16(a);
   3357     //swap hi and low part of r to process the remaining data
   3358     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3359     r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
   3360     return _mm_hadd_epi16 (r16_1, r16_2);
   3361 }
   3362 #endif
   3363 
   3364 uint32x4_t vpaddlq_u16(uint16x8_t a);         // VPADDL.s16 q0,q0
   3365 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
   3366 {         //serial solution looks faster than a SIMD one
   3367     _NEON2SSE_ALIGN_16 uint16_t atmp[8];
   3368     _NEON2SSE_ALIGN_16 uint32_t res[4];
   3369     _mm_store_si128((__m128i*)atmp, a);
   3370     res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
   3371     res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
   3372     res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
   3373     res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
   3374     return _mm_load_si128((__m128i*)res);
   3375 }
   3376 
   3377 uint64x2_t vpaddlq_u32(uint32x4_t a);         // VPADDL.U32 q0,q0
   3378 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   3379 {
   3380     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
   3381     _NEON2SSE_ALIGN_16 uint64_t res[2];
   3382     _mm_store_si128((__m128i*)atmp, a);
   3383     res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
   3384     res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
   3385     return _mm_load_si128((__m128i*)res);
   3386 }
   3387 
   3388 //************************  Long pairwise add and accumulate **************************
   3389 //****************************************************************************************
   3390 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
   3391 // and accumulates the  values of the results into the elements of the destination (wide) vector
   3392 
   3393 #if defined(USE_SSSE3)
   3394 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b);         // VPADAL.S8 q0,q0
   3395 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b)         // VPADAL.S8 q0,q0
   3396 {
   3397     int16x8_t pad;
   3398     pad = vpaddlq_s8(b);
   3399     return _mm_add_epi16 (a, pad);
   3400 }
   3401 #endif
   3402 
   3403 #if defined(USE_SSSE3)
   3404 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b);         // VPADAL.S16 q0,q0
   3405 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b)         // VPADAL.S16 q0,q0
   3406 {
   3407     int32x4_t pad;
   3408     pad = vpaddlq_s16(b);
   3409     return _mm_add_epi32(a, pad);
   3410 }
   3411 #endif
   3412 
   3413 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b);         // VPADAL.S32 q0,q0
   3414 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
   3415 {
   3416     int64x2_t pad;
   3417     pad = vpaddlq_s32(b);
   3418     return _mm_add_epi64 (a, pad);
   3419 }
   3420 
   3421 #if defined(USE_SSSE3)
   3422 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b);         // VPADAL.U8 q0,q0
   3423 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b)         // VPADAL.U8 q0,q0
   3424 {
   3425     uint16x8_t pad;
   3426     pad = vpaddlq_u8(b);
   3427     return _mm_add_epi16 (a, pad);
   3428 }
   3429 #endif
   3430 
   3431 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b);         // VPADAL.s16 q0,q0
   3432 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3433 {
   3434     uint32x4_t pad;
   3435     pad = vpaddlq_u16(b);
   3436     return _mm_add_epi32(a, pad);
   3437 }         //no optimal SIMD solution, serial is faster
   3438 
   3439 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b);         // VPADAL.U32 q0,q0
   3440 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3441 {         //no optimal SIMD solution, serial is faster
   3442     uint64x2_t pad;
   3443     pad = vpaddlq_u32(b);
   3444     return _mm_add_epi64(a, pad);
   3445 }         //no optimal SIMD solution, serial is faster
   3446 
   3447 //**********  Folding maximum   *************************************
   3448 //*******************************************************************
   3449 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
   3450 //and copies the larger of each pair into the corresponding element in the destination
   3451 //    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
   3452 
   3453 // ***************** Folding minimum  ****************************
   3454 // **************************************************************
   3455 //vpmin -> takes minimum of adjacent pairs
   3456 
   3457 //***************************************************************
   3458 //***********  Reciprocal/Sqrt ************************************
   3459 //***************************************************************
   3460 //****************** Reciprocal estimate *******************************
   3461 
   3462 //the ARM NEON and x86 SIMD results may be slightly different
   3463 
   3464 float32x4_t vrecpeq_f32(float32x4_t a);         // VRECPE.F32 q0,q0
   3465 //the ARM NEON and x86 SIMD results may be slightly different
   3466 #define vrecpeq_f32 _mm_rcp_ps
   3467 
   3468 uint32x4_t vrecpeq_u32(uint32x4_t a);         // VRECPE.U32 q0,q0
   3469 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   3470 {         //no reciprocal for ints in IA32 available, neither for  unsigned int to float 4 lanes conversion, so serial solution looks faster
   3471     _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
   3472     _mm_store_si128((__m128i*)atmp, a);
   3473     res[0] = (atmp[0]) ? 1 / atmp[0] : 0xffffffff;
   3474     res[1] = (atmp[1]) ? 1 / atmp[1] : 0xffffffff;
   3475     return _mm_load_si128((__m128i*)res);
   3476 }
   3477 
   3478 //**********Reciprocal square root estimate ****************
   3479 //**********************************************************
   3480 //no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster
   3481 
   3482 float32x4_t vrsqrteq_f32(float32x4_t a);         // VRSQRTE.F32 q0,q0
   3483 //the ARM NEON and x86 SIMD results may be slightly different
   3484 #define vrsqrteq_f32 _mm_rsqrt_ps
   3485 
   3486 uint32x4_t vrsqrteq_u32(uint32x4_t a);         // VRSQRTE.U32 q0,q0
   3487 #define vrsqrteq_u32(a) _mm_castps_si128(_mm_rsqrt_ps(_M128(a)) )
   3488 
   3489 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
   3490 //******************************************************************************************
   3491 //******VRECPS (Vector Reciprocal Step) ***************************************************
   3492 //multiplies the elements of one vector by the corresponding elements of another vector,
   3493 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
   3494 
   3495 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b);         // VRECPS.F32 q0, q0, q0
   3496 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b)         // VRECPS.F32 q0, q0, q0
   3497 {
   3498     __m128 f2, mul;
   3499     f2 =  _mm_set1_ps(2.);
   3500     mul = _mm_mul_ps(a,b);
   3501     return _mm_sub_ps(f2,mul);
   3502 }
   3503 
   3504 //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
   3505 //multiplies the elements of one vector by the corresponding elements of another vector,
   3506 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
   3507 
   3508 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b);         // VRSQRTS.F32 q0, q0, q0
   3509 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b)         // VRSQRTS.F32 q0, q0, q0
   3510 {
   3511     __m128 f3, f05, mul;
   3512     f3 =  _mm_set1_ps(3.);
   3513     f05 =  _mm_set1_ps(0.5);
   3514     mul = _mm_mul_ps(a,b);
   3515     f3 = _mm_sub_ps(f3,mul);
   3516     return _mm_mul_ps (f3, f05);
   3517 }
   3518 //********************************************************************************************
   3519 //***************************** Shifts by signed variable ***********************************
   3520 //********************************************************************************************
   3521 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
   3522 //********************************************************************************************
   3523 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
   3524 //helper macro. It matches ARM implementation for big shifts
   3525 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
   3526         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
   3527         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   3528         for (i = 0; i<LEN; i++) { \
   3529         if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
   3530         else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
   3531         return _mm_load_si128((__m128i*)res);
   3532 
   3533 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b);         // VSHL.S8 q0,q0,q0
   3534 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3535 {
   3536     SERIAL_SHIFT(int8_t, int8_t, 16, 16)
   3537 }
   3538 
   3539 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b);         // VSHL.S16 q0,q0,q0
   3540 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3541 {
   3542     SERIAL_SHIFT(int16_t, int16_t, 8, 8)
   3543 }
   3544 
   3545 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b);         // VSHL.S32 q0,q0,q0
   3546 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3547 {
   3548     SERIAL_SHIFT(int32_t, int32_t, 4, 4)
   3549 }
   3550 
   3551 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b);         // VSHL.S64 q0,q0,q0
   3552 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3553 {
   3554     SERIAL_SHIFT(int64_t, int64_t, 2, 2)
   3555 }
   3556 
   3557 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b);         // VSHL.U8 q0,q0,q0
   3558 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3559 {
   3560     SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
   3561 }
   3562 
   3563 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b);         // VSHL.s16 q0,q0,q0
   3564 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3565 {
   3566     SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
   3567 }
   3568 
   3569 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b);         // VSHL.U32 q0,q0,q0
   3570 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3571 {
   3572     SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
   3573 }
   3574 
   3575 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b);         // VSHL.U64 q0,q0,q0
   3576 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3577 {
   3578     SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
   3579 }
   3580 
   3581 //*********** Vector saturating shift left: (negative values shift right) **********************
   3582 //********************************************************************************************
   3583 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
   3584 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
   3585         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
   3586         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
   3587         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   3588         for (i = 0; i<LEN; i++) { \
   3589         if (atmp[i] ==0) res[i] = 0; \
   3590         else{ \
   3591             if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
   3592             else{ \
   3593                 if (btmp[i]>lanesize_1) { \
   3594                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   3595                 }else{ \
   3596                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
   3597                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
   3598                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   3599                     else res[i] = atmp[i] << btmp[i]; }}}} \
   3600         return _mm_load_si128((__m128i*)res);
   3601 
   3602 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
   3603         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
   3604         TYPE lanesize = (sizeof(TYPE) << 3); \
   3605         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   3606         for (i = 0; i<LEN; i++) { \
   3607         if (atmp[i] ==0) {res[i] = 0; \
   3608         }else{ \
   3609             if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
   3610             else{ \
   3611                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
   3612                 else{ \
   3613                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
   3614                     res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
   3615         return _mm_load_si128((__m128i*)res);
   3616 
   3617 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b);         // VQSHL.S8 q0,q0,q0
   3618 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3619 {
   3620     SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
   3621 }
   3622 
   3623 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b);         // VQSHL.S16 q0,q0,q0
   3624 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3625 {
   3626     SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
   3627 }
   3628 
   3629 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b);         // VQSHL.S32 q0,q0,q0
   3630 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3631 {
   3632     SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
   3633 }
   3634 
   3635 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b);         // VQSHL.S64 q0,q0,q0
   3636 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3637 {
   3638     SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
   3639 }
   3640 
   3641 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b);         // VQSHL.U8 q0,q0,q0
   3642 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3643 {
   3644     SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
   3645 }
   3646 
   3647 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b);         // VQSHL.s16 q0,q0,q0
   3648 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3649 {
   3650     SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
   3651 }
   3652 
   3653 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b);         // VQSHL.U32 q0,q0,q0
   3654 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3655 {
   3656     SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
   3657 }
   3658 
   3659 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b);         // VQSHL.U64 q0,q0,q0
   3660 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3661 {
   3662     SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
   3663 }
   3664 
   3665 //******** Vector rounding shift left: (negative values shift right) **********
   3666 //****************************************************************************
   3667 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution
   3668 //rounding makes sense for right shifts only.
   3669 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
   3670         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
   3671         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   3672         for (i = 0; i<LEN; i++) { \
   3673         if( btmp[i] >= 0) { \
   3674             if(btmp[i] >= lanesize) res[i] = 0; \
   3675             else res[i] = (atmp[i] << btmp[i]); \
   3676         }else{ \
   3677             res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \
   3678                             (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
   3679                             (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
   3680         return _mm_load_si128((__m128i*)res);
   3681 
   3682 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b);         // VRSHL.S8 q0,q0,q0
   3683 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3684 {
   3685     SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
   3686 }
   3687 
   3688 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b);         // VRSHL.S16 q0,q0,q0
   3689 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3690 {
   3691     SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
   3692 }
   3693 
   3694 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b);         // VRSHL.S32 q0,q0,q0
   3695 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3696 {
   3697     SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
   3698 }
   3699 
   3700 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b);         // VRSHL.S64 q0,q0,q0
   3701 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3702 {
   3703     SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
   3704 }
   3705 
   3706 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b);         // VRSHL.U8 q0,q0,q0
   3707 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3708 {
   3709     SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
   3710 }
   3711 
   3712 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b);         // VRSHL.s16 q0,q0,q0
   3713 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3714 {
   3715     SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
   3716 }
   3717 
   3718 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b);         // VRSHL.U32 q0,q0,q0
   3719 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3720 {
   3721     SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
   3722 }
   3723 
   3724 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b);         // VRSHL.U64 q0,q0,q0
   3725 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3726 {
   3727     SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
   3728 }
   3729 
   3730 //********** Vector saturating rounding shift left: (negative values shift right) ****************
   3731 //*************************************************************************************************
   3732 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
   3733 //Saturation happens for left shifts only while rounding makes sense for right shifts only.
   3734 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
   3735         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
   3736         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
   3737         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   3738         for (i = 0; i<LEN; i++) { \
   3739         if (atmp[i] ==0) res[i] = 0; \
   3740         else{ \
   3741             if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
   3742             else{ \
   3743                 if (btmp[i]>lanesize_1) { \
   3744                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   3745                 }else{ \
   3746                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
   3747                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
   3748                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   3749                     else res[i] = atmp[i] << btmp[i]; }}}} \
   3750         return _mm_load_si128((__m128i*)res);
   3751 
   3752 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
   3753         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
   3754         int lanesize = (sizeof(TYPE) << 3); \
   3755         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   3756         for (i = 0; i<LEN; i++) { \
   3757         if (atmp[i] ==0) {res[i] = 0; \
   3758         }else{ \
   3759             if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
   3760             else{ \
   3761                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
   3762                 else{ \
   3763                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
   3764                     res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
   3765         return _mm_load_si128((__m128i*)res);
   3766 
   3767 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b);         // VQRSHL.S8 q0,q0,q0
   3768 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3769 {
   3770     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
   3771 }
   3772 
   3773 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b);         // VQRSHL.S16 q0,q0,q0
   3774 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3775 {
   3776     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
   3777 }
   3778 
   3779 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b);         // VQRSHL.S32 q0,q0,q0
   3780 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3781 {
   3782     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
   3783 }
   3784 
   3785 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b);         // VQRSHL.S64 q0,q0,q0
   3786 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3787 {
   3788     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
   3789 }
   3790 
   3791 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b);         // VQRSHL.U8 q0,q0,q0
   3792 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3793 {
   3794     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
   3795 }
   3796 
   3797 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b);         // VQRSHL.s16 q0,q0,q0
   3798 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3799 {
   3800     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
   3801 }
   3802 
   3803 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b);         // VQRSHL.U32 q0,q0,q0
   3804 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3805 {
   3806     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
   3807 }
   3808 
   3809 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b);         // VQRSHL.U64 q0,q0,q0
   3810 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   3811 {
   3812     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
   3813 }
   3814 
   3815 // *********************************************************************************
   3816 // *****************************  Shifts by a constant *****************************
   3817 // *********************************************************************************
   3818 //**************** Vector shift right by constant*************************************
   3819 //************************************************************************************
   3820 
   3821 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VSHR.S8 q0,q0,#8
   3822 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b)         // VSHR.S8 q0,q0,#8
   3823 {         //no 8 bit shift available, go to 16 bit trick
   3824     __m128i zero, mask0, a_sign, r, a_sign_mask;
   3825     _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
   3826     zero = _mm_setzero_si128();
   3827     mask0 = _mm_set1_epi16(mask0_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
   3828     a_sign =  _mm_cmpgt_epi8 (zero, a);         //ff if a<0 or zero if a>0
   3829     r = _mm_srai_epi16 (a, b);
   3830     a_sign_mask =  _mm_and_si128 (mask0, a_sign);
   3831     r =  _mm_andnot_si128 (mask0, r);
   3832     return _mm_or_si128 (r, a_sign_mask);
   3833 }
   3834 
   3835 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VSHR.S16 q0,q0,#16
   3836 #define vshrq_n_s16 _mm_srai_epi16
   3837 
   3838 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VSHR.S32 q0,q0,#32
   3839 #define vshrq_n_s32 _mm_srai_epi32
   3840 
   3841 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VSHR.S64 q0,q0,#64
   3842 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
   3843 {         //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD
   3844     __m128i c1, signmask,a0,  res64;
   3845     _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
   3846     c1 =  _mm_cmpeq_epi32(a,a);         //0xffffffffffffffff
   3847     signmask  =  _mm_slli_epi64 (c1, (64 - b));
   3848     a0 = _mm_or_si128(a, *(__m128i*)mask);         //get the first bit
   3849     #ifdef USE_SSE4
   3850         a0 = _mm_cmpeq_epi64 (a, a0);         //SSE4.1
   3851     #else
   3852         a0 = _mm_cmpeq_epi32 (a, a0);
   3853         a0 = _mm_shuffle_epi32 (a0, 1 | (1 << 2) | (3 << 4) | (3 << 6));         //copy the information from hi to low part of the 64 bit data
   3854     #endif
   3855     signmask = _mm_and_si128(a0, signmask);
   3856     res64 = _mm_srli_epi64 (a, b);
   3857     return _mm_or_si128(res64, signmask);
   3858 }
   3859 
   3860 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VSHR.U8 q0,q0,#8
   3861 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b)         // VSHR.U8 q0,q0,#8
   3862 {         //no 8 bit shift available, need the special trick
   3863     __m128i mask0, r;
   3864     _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
   3865     mask0 = _mm_set1_epi16(mask10_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
   3866     r = _mm_srli_epi16 ( a, b);
   3867     return _mm_and_si128 (r,  mask0);
   3868 }
   3869 
   3870 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VSHR.s16 q0,q0,#16
   3871 #define vshrq_n_u16 _mm_srli_epi16
   3872 
   3873 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VSHR.U32 q0,q0,#32
   3874 #define vshrq_n_u32 _mm_srli_epi32
   3875 
   3876 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VSHR.U64 q0,q0,#64
   3877 #define vshrq_n_u64 _mm_srli_epi64
   3878 
   3879 //*************************** Vector shift left by constant *************************
   3880 //*********************************************************************************
   3881 
   3882 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
   3883 #define vshlq_n_s8 vshlq_n_u8
   3884 
   3885 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
   3886 #define vshlq_n_s16 _mm_slli_epi16
   3887 
   3888 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
   3889 #define vshlq_n_s32 _mm_slli_epi32
   3890 
   3891 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
   3892 #define vshlq_n_s64 _mm_slli_epi64
   3893 
   3894 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VSHL.I8 q0,q0,#0
   3895 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
   3896 {         //no 8 bit shift available, need the special trick
   3897     __m128i mask0, r;
   3898     _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
   3899     mask0 = _mm_set1_epi16(mask10_16[b]);         //to mask the bits to be "spoiled"  by 16 bit shift
   3900     r = _mm_slli_epi16 ( a, b);
   3901     return _mm_and_si128 (r,  mask0);
   3902 }
   3903 
   3904 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VSHL.I16 q0,q0,#0
   3905 #define vshlq_n_u16 vshlq_n_s16
   3906 
   3907 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VSHL.I32 q0,q0,#0
   3908 #define vshlq_n_u32 vshlq_n_s32
   3909 
   3910 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VSHL.I64 q0,q0,#0
   3911 #define vshlq_n_u64 vshlq_n_s64
   3912 
   3913 //************* Vector rounding shift right by constant ******************
   3914 //*************************************************************************
   3915 //No corresponding  x86 intrinsics exist, need to do some tricks
   3916 
   3917 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b);         // VRSHR.S8 q0,q0,#8
   3918 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b)         // VRSHR.S8 q0,q0,#8
   3919 {         //no 8 bit shift available, go to 16 bit trick
   3920     __m128i r, mask1, maskb;
   3921     _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};         // 2^b-th bit set to 1
   3922     r = vshrq_n_s8 (a, b);
   3923     mask1 = _mm_set1_epi16(mask2b[b]);         // 2^b-th bit set to 1 for 16bit, need it for rounding
   3924     maskb = _mm_and_si128(a, mask1);         //get b or 0 for rounding
   3925     maskb =  _mm_srli_epi16 (maskb, b - 1);         // to add 1
   3926     return _mm_add_epi8(r, maskb);         //actual rounding
   3927 }
   3928 
   3929 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b);         // VRSHR.S16 q0,q0,#16
   3930 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b)         // VRSHR.S16 q0,q0,#16
   3931 {
   3932     __m128i maskb, r;
   3933     maskb =  _mm_slli_epi16(a, (16 - b));         //to get rounding (b-1)th bit
   3934     maskb = _mm_srli_epi16(maskb, 15);         //1 or 0
   3935     r = _mm_srai_epi16 (a, b);
   3936     return _mm_add_epi16 (r, maskb);         //actual rounding
   3937 }
   3938 
   3939 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b);         // VRSHR.S32 q0,q0,#32
   3940 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b)         // VRSHR.S32 q0,q0,#32
   3941 {
   3942     __m128i maskb,  r;
   3943     maskb = _mm_slli_epi32 (a, (32 - b));         //to get rounding (b-1)th bit
   3944     maskb = _mm_srli_epi32 (maskb,31);         //1 or 0
   3945     r = _mm_srai_epi32(a, b);
   3946     return _mm_add_epi32 (r, maskb);         //actual rounding
   3947 }
   3948 
   3949 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b);         // VRSHR.S64 q0,q0,#64
   3950 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
   3951 {         //solution may be not optimal compared with a serial one
   3952     __m128i maskb;
   3953     int64x2_t r;
   3954     maskb = _mm_slli_epi64 (a, (64 - b));         //to get rounding (b-1)th bit
   3955     maskb = _mm_srli_epi64 (maskb,63);         //1 or 0
   3956     r = vshrq_n_s64(a, b);
   3957     return _mm_add_epi64 (r, maskb);         //actual rounding
   3958 }
   3959 
   3960 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b);         // VRSHR.U8 q0,q0,#8
   3961 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b)         // VRSHR.U8 q0,q0,#8
   3962 {         //no 8 bit shift available, go to 16 bit trick
   3963     __m128i r, mask1, maskb;
   3964     _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080};         // 2^b-th bit set to 1
   3965     r = vshrq_n_u8 (a, b);
   3966     mask1 = _mm_set1_epi16(mask2b[b]);         // 2^b-th bit set to 1 for 16bit, need it for rounding
   3967     maskb = _mm_and_si128(a, mask1);         //get b or 0 for rounding
   3968     maskb =  _mm_srli_epi16 (maskb, b - 1);         // to add 1
   3969     return _mm_add_epi8(r, maskb);         //actual rounding
   3970 }
   3971 
   3972 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b);         // VRSHR.s16 q0,q0,#16
   3973 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b)         // VRSHR.S16 q0,q0,#16
   3974 {
   3975     __m128i maskb, r;
   3976     maskb =  _mm_slli_epi16(a, (16 - b));         //to get rounding (b-1)th bit
   3977     maskb = _mm_srli_epi16(maskb, 15);         //1 or 0
   3978     r = _mm_srli_epi16 (a, b);
   3979     return _mm_add_epi16 (r, maskb);         //actual rounding
   3980 }
   3981 
   3982 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b);         // VRSHR.U32 q0,q0,#32
   3983 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b)         // VRSHR.S32 q0,q0,#32
   3984 {
   3985     __m128i maskb,  r;
   3986     maskb = _mm_slli_epi32 (a, (32 - b));         //to get rounding (b-1)th bit
   3987     maskb = _mm_srli_epi32 (maskb,31);         //1 or 0
   3988     r = _mm_srli_epi32(a, b);
   3989     return _mm_add_epi32 (r, maskb);         //actual rounding
   3990 }
   3991 
   3992 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b);         // VRSHR.U64 q0,q0,#64
   3993 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
   3994 {         //solution may be not optimal compared with a serial one
   3995     __m128i maskb,  r;
   3996     maskb = _mm_slli_epi64 (a, (64 - b));         //to get rounding (b-1)th bit
   3997     maskb = _mm_srli_epi64 (maskb,63);         //1 or 0
   3998     r = _mm_srli_epi64(a, b);
   3999     return _mm_add_epi64 (r, maskb);         //actual rounding
   4000 }
   4001 
   4002 //************* Vector shift right by constant and accumulate *********
   4003 //*********************************************************************
   4004 
   4005 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRA.S8 q0,q0,#8
   4006 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VSRA.S8 q0,q0,#8
   4007 {
   4008     int8x16_t shift;
   4009     shift = vshrq_n_s8(b, c);
   4010     return vaddq_s8(a, shift);
   4011 }
   4012 
   4013 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRA.S16 q0,q0,#16
   4014 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VSRA.S16 q0,q0,#16
   4015 {
   4016     int16x8_t shift;
   4017     shift = vshrq_n_s16(b, c);
   4018     return vaddq_s16(a, shift);
   4019 }
   4020 
   4021 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRA.S32 q0,q0,#32
   4022 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VSRA.S32 q0,q0,#32
   4023 {
   4024     int32x4_t shift;
   4025     shift = vshrq_n_s32(b, c);
   4026     return vaddq_s32(a, shift);
   4027 }
   4028 
   4029 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRA.S64 q0,q0,#64
   4030 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)         // VSRA.S64 q0,q0,#64
   4031 {
   4032     int64x2_t shift;
   4033     shift = vshrq_n_s64(b, c);
   4034     return vaddq_s64( a, shift);
   4035 }
   4036 
   4037 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRA.U8 q0,q0,#8
   4038 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c)         // VSRA.U8 q0,q0,#8
   4039 {
   4040     uint8x16_t shift;
   4041     shift = vshrq_n_u8(b, c);
   4042     return vaddq_u8(a, shift);
   4043 }
   4044 
   4045 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRA.s16 q0,q0,#16
   4046 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c)         // VSRA.s16 q0,q0,#16
   4047 {
   4048     uint16x8_t shift;
   4049     shift = vshrq_n_u16(b, c);
   4050     return vaddq_u16(a,  shift);
   4051 }
   4052 
   4053 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRA.U32 q0,q0,#32
   4054 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c)         // VSRA.U32 q0,q0,#32
   4055 {
   4056     uint32x4_t shift;
   4057     shift = vshrq_n_u32(b, c);
   4058     return vaddq_u32(a, shift);
   4059 }
   4060 
   4061 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRA.U64 q0,q0,#64
   4062 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)         // VSRA.U64 q0,q0,#64
   4063 {
   4064     uint64x2_t shift;
   4065     shift = vshrq_n_u64(b, c);
   4066     return vaddq_u64(a, shift);
   4067 }
   4068 
   4069 //************* Vector rounding shift right by constant and accumulate ****************************
   4070 //************************************************************************************************
   4071 
   4072 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VRSRA.S8 q0,q0,#8
   4073 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VRSRA.S8 q0,q0,#8
   4074 {
   4075     int8x16_t shift;
   4076     shift = vrshrq_n_s8(b, c);
   4077     return vaddq_s8(a, shift);
   4078 }
   4079 
   4080 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VRSRA.S16 q0,q0,#16
   4081 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VRSRA.S16 q0,q0,#16
   4082 {
   4083     int16x8_t shift;
   4084     shift = vrshrq_n_s16(b, c);
   4085     return vaddq_s16(a, shift);
   4086 }
   4087 
   4088 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VRSRA.S32 q0,q0,#32
   4089 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VRSRA.S32 q0,q0,#32
   4090 {
   4091     int32x4_t shift;
   4092     shift = vrshrq_n_s32(b, c);
   4093     return vaddq_s32(a, shift);
   4094 }
   4095 
   4096 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VRSRA.S64 q0,q0,#64
   4097 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
   4098 {
   4099     int64x2_t shift;
   4100     shift = vrshrq_n_s64(b, c);
   4101     return vaddq_s64(a, shift);
   4102 }
   4103 
   4104 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VRSRA.U8 q0,q0,#8
   4105 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c)         // VRSRA.U8 q0,q0,#8
   4106 {
   4107     uint8x16_t shift;
   4108     shift = vrshrq_n_u8(b, c);
   4109     return vaddq_u8(a, shift);
   4110 }
   4111 
   4112 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VRSRA.s16 q0,q0,#16
   4113 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c)         // VRSRA.s16 q0,q0,#16
   4114 {
   4115     uint16x8_t shift;
   4116     shift = vrshrq_n_u16(b, c);
   4117     return vaddq_u16(a,  shift);
   4118 }
   4119 
   4120 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VRSRA.U32 q0,q0,#32
   4121 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c)         // VRSRA.U32 q0,q0,#32
   4122 {
   4123     uint32x4_t shift;
   4124     shift = vrshrq_n_u32(b, c);
   4125     return vaddq_u32(a, shift);
   4126 }
   4127 
   4128 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VRSRA.U64 q0,q0,#64
   4129 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
   4130 {
   4131     uint64x2_t shift;
   4132     shift = vrshrq_n_u64(b, c);
   4133     return vaddq_u64(a, shift);
   4134 }
   4135 
   4136 //**********************Vector saturating shift left by constant *****************************
   4137 //********************************************************************************************
   4138 //we don't check const ranges  assuming they are met
   4139 
   4140 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHL.S8 q0,q0,#0
   4141 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b)         // VQSHL.S8 q0,q0,#0
   4142 {         // go to 16 bit to get the auto saturation (in packs function)
   4143     __m128i a128, r128_1, r128_2;
   4144     a128 = _MM_CVTEPI8_EPI16 (a);         //SSE 4.1
   4145     r128_1 = _mm_slli_epi16 (a128, b);
   4146     //swap hi and low part of a128 to process the remaining data
   4147     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4148     a128 = _MM_CVTEPI8_EPI16 (a128);
   4149     r128_2 = _mm_slli_epi16 (a128, b);
   4150     return _mm_packs_epi16 (r128_1, r128_2);         //saturated s8
   4151 }
   4152 
   4153 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHL.S16 q0,q0,#0
   4154 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b)         // VQSHL.S16 q0,q0,#0
   4155 {         // manual saturation solution looks LESS optimal than 32 bits conversion one
   4156       // go to 32 bit to get the auto saturation (in packs function)
   4157     __m128i a128, r128_1, r128_2;
   4158     a128 = _MM_CVTEPI16_EPI32 (a);         //SSE 4.1
   4159     r128_1 = _mm_slli_epi32 (a128, b);         //shift_res
   4160     //swap hi and low part of a128 to process the remaining data
   4161     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4162     a128 = _MM_CVTEPI16_EPI32 (a128);
   4163     r128_2 = _mm_slli_epi32 (a128, b);
   4164     return _mm_packs_epi32 (r128_1, r128_2);         //saturated s16
   4165 }
   4166 
   4167 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHL.S32 q0,q0,#0
   4168 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b)         // VQSHL.S32 q0,q0,#0
   4169 {         // no 64 bit saturation option available, special tricks necessary
   4170     __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
   4171     c1 = _mm_cmpeq_epi32(a,a);         //0xff..ff
   4172     maskA = _mm_srli_epi32(c1, b + 1);         //mask for positive numbers (32-b+1) zeros and b-1 ones
   4173     saturation_mask = _mm_cmpgt_epi32 (a, maskA);         //0xff...ff if we need saturation, 0  otherwise
   4174     c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1);         //saturated to 0x7f..ff when needed and zeros if not
   4175     shift_res = _mm_slli_epi32 (a, b);
   4176     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
   4177     //result with positive numbers saturated
   4178     shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
   4179     //treat negative numbers
   4180     maskA = _mm_slli_epi32(c1, 31 - b);         //mask for negative numbers b-1 ones  and (32-b+1)  zeros
   4181     saturation_mask = _mm_cmpgt_epi32 (maskA,a);         //0xff...ff if we need saturation, 0  otherwise
   4182     c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31);         //saturated to 0x80..00 when needed and zeros if not
   4183     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
   4184     return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
   4185 }
   4186 
   4187 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHL.S64 q0,q0,#0
   4188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   4189 {         // no effective SIMD solution here
   4190     _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
   4191     int64_t bmask;
   4192     int i;
   4193     bmask = ( int64_t)1 << (63 - b);         //positive
   4194     _mm_store_si128((__m128i*)atmp, a);
   4195     for (i = 0; i<2; i++) {
   4196         if (atmp[i] >= bmask) {
   4197             res[i] = ~(_SIGNBIT64);
   4198         } else {
   4199             res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
   4200         }
   4201     }
   4202     return _mm_load_si128((__m128i*)res);
   4203 }
   4204 
   4205 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b);         // VQSHL.U8 q0,q0,#0
   4206 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)         // VQSHL.U8 q0,q0,#0
   4207 {         // go to 16 bit to get the auto saturation (in packs function)
   4208     __m128i a128, r128_1, r128_2;
   4209     a128 = _MM_CVTEPU8_EPI16 (a);         //SSE 4.1
   4210     r128_1 = _mm_slli_epi16 (a128, b);
   4211     //swap hi and low part of a128 to process the remaining data
   4212     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4213     a128 = _MM_CVTEPU8_EPI16 (a128);
   4214     r128_2 = _mm_slli_epi16 (a128, b);
   4215     return _mm_packus_epi16 (r128_1, r128_2);         //saturated u8
   4216 }
   4217 
   4218 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b);         // VQSHL.s16 q0,q0,#0
   4219 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b)         // VQSHL.s16 q0,q0,#0
   4220 {         // manual saturation solution looks more optimal than 32 bits conversion one
   4221     __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
   4222     cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
   4223     c8000 = _mm_set1_epi16 (0x8000);
   4224 //no unsigned shorts comparison in SSE, only signed available, so need the trick
   4225     a_signed = _mm_sub_epi16(a, c8000);         //go to signed
   4226     saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
   4227     shift_res = _mm_slli_epi16 (a, b);
   4228     return _mm_or_si128 (shift_res, saturation_mask);
   4229 }
   4230 
   4231 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b);         // VQSHL.U32 q0,q0,#0
   4232 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b)         // VQSHL.U32 q0,q0,#0
   4233 {         // manual saturation solution, no 64 bit saturation option, the serial version may be faster
   4234     __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
   4235     cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
   4236     c80000000 = _mm_set1_epi32 (0x80000000);
   4237 //no unsigned ints comparison in SSE, only signed available, so need the trick
   4238     a_signed = _mm_sub_epi32(a, c80000000);         //go to signed
   4239     saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
   4240     shift_res = _mm_slli_epi32 (a, b);
   4241     return _mm_or_si128 (shift_res, saturation_mask);
   4242 }
   4243 
   4244 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b);         // VQSHL.U64 q0,q0,#0
   4245 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   4246 {         // no effective SIMD solution here
   4247     _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
   4248     uint64_t bmask;
   4249     int i;
   4250     bmask = ( uint64_t)1 << (64 - b);
   4251     _mm_store_si128((__m128i*)atmp, a);
   4252     for (i = 0; i<2; i++) {
   4253         res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b;         //if b=0 we are fine with any a
   4254     }
   4255     return _mm_load_si128((__m128i*)res);
   4256 }
   4257 
   4258 //**************Vector signed->unsigned saturating shift left by constant *************
   4259 //*************************************************************************************
   4260 
   4261 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b);         // VQSHLU.S8 q0,q0,#0
   4262 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b)         // VQSHLU.S8 q0,q0,#0
   4263 {
   4264     __m128i a128, r128_1, r128_2;
   4265     a128 = _MM_CVTEPI8_EPI16 (a);         //SSE 4.1
   4266     r128_1 = _mm_slli_epi16 (a128, b);
   4267     //swap hi and low part of a128 to process the remaining data
   4268     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4269     a128 = _MM_CVTEPI8_EPI16 (a128);
   4270     r128_2 = _mm_slli_epi16 (a128, b);
   4271     return _mm_packus_epi16 (r128_1, r128_2);         //saturated u8
   4272 }
   4273 
   4274 #if defined(USE_SSSE3)
   4275 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b);         // VQSHLU.S16 q0,q0,#0
   4276 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b)         // VQSHLU.S16 q0,q0,#0
   4277 {         // manual saturation solution looks LESS optimal than 32 bits conversion one
   4278     __m128i a128, r128_1, r128_2;
   4279     a128 = _MM_CVTEPI16_EPI32 (a);         //SSE 4.1
   4280     r128_1 = _mm_slli_epi32 (a128, b);         //shift_res
   4281     //swap hi and low part of a128 to process the remaining data
   4282     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4283     a128 = _MM_CVTEPI16_EPI32 (a128);
   4284     r128_2 = _mm_slli_epi32 (a128, b);
   4285     return _MM_PACKUS_EPI32 (r128_1, r128_2);         //saturated s16
   4286 }
   4287 #endif
   4288 
   4289 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b);         // VQSHLU.S32 q0,q0,#0
   4290 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b)         // VQSHLU.S32 q0,q0,#0
   4291 {         //solution may be  not optimal compared with the serial one
   4292     __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
   4293     zero = _mm_setzero_si128();
   4294     maskA = _mm_cmpeq_epi32(a, a);
   4295     maskA = _mm_slli_epi32(maskA,(32 - b));         // b ones and (32-b)zeros
   4296     //saturate negative numbers to zero
   4297     maskGT0   = _mm_cmpgt_epi32 (a, zero);         // //0xffffffff if positive number and zero otherwise (negative numbers)
   4298     a0 = _mm_and_si128 (a,  maskGT0);         //negative are zeros now
   4299     //saturate positive to 0xffffffff
   4300     a_masked = _mm_and_si128 (a0, maskA);
   4301     a_masked = _mm_cmpgt_epi32 (a_masked, zero);         //0xffffffff if saturation necessary 0 otherwise
   4302     a_shift = _mm_slli_epi32 (a0, b);
   4303     return _mm_or_si128 (a_shift, a_masked);         //actual saturation
   4304 }
   4305 
   4306 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b);         // VQSHLU.S64 q0,q0,#0
   4307 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
   4308 {         // no effective SIMD solution here, serial execution looks faster
   4309     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   4310     _NEON2SSE_ALIGN_16 uint64_t res[2];
   4311     uint64_t limit;
   4312     int i;
   4313     _mm_store_si128((__m128i*)atmp, a);
   4314     for (i = 0; i<2; i++) {
   4315         if (atmp[i]<=0) {
   4316             res[i] = 0;
   4317         } else {
   4318             limit = (uint64_t) 1 << (64 - b);
   4319             res[i] = ( ((uint64_t)atmp[i]) >= limit) ? res[i] = ~((uint64_t)0) : atmp[i] << b;
   4320         }
   4321     }
   4322     return _mm_load_si128((__m128i*)res);
   4323 }
   4324 
   4325 //************** Vector narrowing  shift right by constant **************
   4326 //**********************************************************************
   4327 
   4328 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
   4329 //*********************************************************************************************
   4330 
   4331 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
   4332 
   4333 //***** Vector narrowing saturating shift right by constant ******
   4334 //*****************************************************************
   4335 
   4336 //********* Vector rounding narrowing shift right by constant *************************
   4337 //****************************************************************************************
   4338 
   4339 //************* Vector rounding narrowing saturating shift right by constant ************
   4340 //****************************************************************************************
   4341 
   4342 //************** Vector widening shift left by constant ****************
   4343 //************************************************************************
   4344 
   4345 //************************************************************************************
   4346 //**************************** Shifts with insert ************************************
   4347 //************************************************************************************
   4348 //takes each element in a vector,  shifts them by an immediate value,
   4349 //and inserts the results in the destination vector. Bits shifted out of the each element are lost.
   4350 
   4351 //**************** Vector shift right and insert ************************************
   4352 //Actually the "c" left bits from "a" are the only bits remained from "a"  after the shift.
   4353 //All other bits are taken from b shifted.
   4354 
   4355 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
   4356 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c)         // VSRI.8 q0,q0,#8
   4357 {
   4358     __m128i maskA, a_masked;
   4359     uint8x16_t b_shift;
   4360     _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff};         //"a" bits mask, 0 bit not used
   4361     maskA = _mm_set1_epi8(maskLeft[c]);         // c ones and (8-c)zeros
   4362     a_masked = _mm_and_si128 (a, maskA);
   4363     b_shift = vshrq_n_u8( b, c);         // c zeros on the left in b due to logical shift
   4364     return _mm_or_si128 (a_masked, b_shift);         //combine (insert b into a)
   4365 }
   4366 
   4367 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
   4368 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c)         // VSRI.16 q0,q0,#16
   4369 {         //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
   4370     uint16x8_t b_shift;
   4371     uint16x8_t a_c;
   4372     b_shift = vshrq_n_u16( b, c);         // c zeros on the left in b due to logical shift
   4373     a_c = vshrq_n_u16( a, (16 - c));
   4374     a_c  = _mm_slli_epi16(a_c, (16 - c));         //logical shift provides right "c" bits zeros in a
   4375     return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
   4376 }
   4377 
   4378 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
   4379 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c)         // VSRI.32 q0,q0,#32
   4380 {         //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
   4381     uint32x4_t b_shift;
   4382     uint32x4_t a_c;
   4383     b_shift = vshrq_n_u32( b, c);         // c zeros on the left in b due to logical shift
   4384     a_c = vshrq_n_u32( a, (32 - c));
   4385     a_c  = _mm_slli_epi32(a_c, (32 - c));         //logical shift provides right "c" bits zeros in a
   4386     return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
   4387 }
   4388 
   4389 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
   4390 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
   4391 {         //serial solution may be faster
   4392     uint64x2_t b_shift;
   4393     uint64x2_t a_c;
   4394     b_shift = _mm_srli_epi64(b, c);         // c zeros on the left in b due to logical shift
   4395     a_c = _mm_srli_epi64(a, (64 - c));
   4396     a_c  = _mm_slli_epi64(a_c, (64 - c));         //logical shift provides right "c" bits zeros in a
   4397     return _mm_or_si128 (a_c, b_shift);         //combine (insert b into a)
   4398 }
   4399 
   4400 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
   4401 #define vsriq_n_u8 vsriq_n_s8
   4402 
   4403 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
   4404 #define vsriq_n_u16 vsriq_n_s16
   4405 
   4406 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c);         // VSRI.32 q0,q0,#32
   4407 #define vsriq_n_u32 vsriq_n_s32
   4408 
   4409 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c);         // VSRI.64 q0,q0,#64
   4410 #define vsriq_n_u64 vsriq_n_s64
   4411 
   4412 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c);         // VSRI.8 q0,q0,#8
   4413 #define vsriq_n_p8 vsriq_n_u8
   4414 
   4415 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c);         // VSRI.16 q0,q0,#16
   4416 #define vsriq_n_p16 vsriq_n_u16
   4417 
   4418 //***** Vector shift left and insert *********************************************
   4419 //*********************************************************************************
   4420 //Actually the "c" right bits from "a" are the only bits remained from "a"  after the shift.
   4421 //All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted".
   4422 
   4423 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
   4424 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c)         // VSLI.8 q0,q0,#0
   4425 {
   4426     __m128i maskA, a_masked;
   4427     int8x16_t b_shift;
   4428     _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f};         //"a" bits mask
   4429     maskA = _mm_set1_epi8(maskRight[c]);         // (8-c)zeros and c ones
   4430     b_shift = vshlq_n_s8( b, c);
   4431     a_masked = _mm_and_si128 (a, maskA);
   4432     return _mm_or_si128 (b_shift, a_masked);         //combine (insert b into a)
   4433 }
   4434 
   4435 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
   4436 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c)         // VSLI.16 q0,q0,#0
   4437 {         //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
   4438     int16x8_t b_shift;
   4439     int16x8_t a_c;
   4440     b_shift = vshlq_n_s16( b, c);
   4441     a_c = vshlq_n_s16( a, (16 - c));
   4442     a_c  = _mm_srli_epi16(a_c, (16 - c));
   4443     return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
   4444 }
   4445 
   4446 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
   4447 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c)         // VSLI.32 q0,q0,#0
   4448 {         //solution may be  not optimal compared with the serial one
   4449       //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
   4450     int32x4_t b_shift;
   4451     int32x4_t a_c;
   4452     b_shift = vshlq_n_s32( b, c);
   4453     a_c = vshlq_n_s32( a, (32 - c));
   4454     a_c  = _mm_srli_epi32(a_c, (32 - c));
   4455     return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
   4456 }
   4457 
   4458 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
   4459 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c)         // VSLI.64 q0,q0,#0
   4460 {         //solution may be  not optimal compared with the serial one
   4461       //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
   4462     int64x2_t b_shift;
   4463     int64x2_t a_c;
   4464     b_shift = vshlq_n_s64( b, c);
   4465     a_c = vshlq_n_s64( a, (64 - c));
   4466     a_c  = _mm_srli_epi64(a_c, (64 - c));
   4467     return _mm_or_si128 (b_shift, a_c);         //combine (insert b into a)
   4468 }
   4469 
   4470 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
   4471 #define vsliq_n_u8 vsliq_n_s8
   4472 
   4473 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
   4474 #define vsliq_n_u16 vsliq_n_s16
   4475 
   4476 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c);         // VSLI.32 q0,q0,#0
   4477 #define vsliq_n_u32 vsliq_n_s32
   4478 
   4479 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c);         // VSLI.64 q0,q0,#0
   4480 #define vsliq_n_u64 vsliq_n_s64
   4481 
   4482 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c);         // VSLI.8 q0,q0,#0
   4483 #define vsliq_n_p8 vsliq_n_u8
   4484 
   4485 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c);         // VSLI.16 q0,q0,#0
   4486 #define vsliq_n_p16 vsliq_n_u16
   4487 
   4488 // ***********************************************************************************************
   4489 // ****************** Loads and stores of a single vector ***************************************
   4490 // ***********************************************************************************************
   4491 //Performs loads and stores of a single vector of some type.
   4492 //*******************************  Loads ********************************************************
   4493 // ***********************************************************************************************
   4494 //We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);.
   4495 //also for SSE3  supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous.
   4496 // it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access
   4497 //If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead;
   4498 #define LOAD_SI128(ptr) \
   4499         ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr));
   4500 
   4501 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
   4502 #define vld1q_u8 LOAD_SI128
   4503 
   4504 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
   4505 #define vld1q_u16 LOAD_SI128
   4506 
   4507 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
   4508 #define vld1q_u32 LOAD_SI128
   4509 
   4510 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
   4511 #define vld1q_u64 LOAD_SI128
   4512 
   4513 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
   4514 #define vld1q_s8 LOAD_SI128
   4515 
   4516 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
   4517 #define vld1q_s16 LOAD_SI128
   4518 
   4519 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
   4520 #define vld1q_s32 LOAD_SI128
   4521 
   4522 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
   4523 #define vld1q_s64 LOAD_SI128
   4524 
   4525 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr);         // VLD1.16 {d0, d1}, [r0]
   4526 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
   4527 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
   4528 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
   4529 __m128 f2;
   4530 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
   4531 }*/
   4532 
   4533 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr);         // VLD1.32 {d0, d1}, [r0]
   4534 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
   4535 {
   4536     if( (((unsigned long)(ptr)) & 15 ) == 0 )         //16 bits aligned
   4537         return _mm_load_ps(ptr);
   4538     else
   4539         return _mm_loadu_ps(ptr);
   4540 }
   4541 
   4542 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr);         // VLD1.8 {d0, d1}, [r0]
   4543 #define vld1q_p8  LOAD_SI128
   4544 
   4545 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr);         // VLD1.16 {d0, d1}, [r0]
   4546 #define vld1q_p16 LOAD_SI128
   4547 
   4548 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
   4549 
   4550 //***********************************************************************************************************
   4551 //******* Lane load functions - insert the data at  vector's given position (lane) *************************
   4552 //***********************************************************************************************************
   4553 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
   4554 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   4555 
   4556 uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
   4557 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   4558 
   4559 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
   4560 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
   4561 
   4562 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
   4563 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane);         // _p;
   4564 
   4565 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
   4566 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   4567 
   4568 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
   4569 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   4570 
   4571 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
   4572 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
   4573 
   4574 //current IA SIMD doesn't support float16
   4575 
   4576 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane);         // VLD1.32 {d0[0]}, [r0]
   4577 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
   4578 {         //we need to deal with  ptr  16bit NOT aligned case
   4579     __m128 p;
   4580     p = _mm_set1_ps(*(ptr));
   4581     return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
   4582 }
   4583 
   4584 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane);         // VLD1.64 {d0}, [r0]
   4585 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
   4586 
   4587 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane);         // VLD1.8 {d0[0]}, [r0]
   4588 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   4589 
   4590 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane);         // VLD1.16 {d0[0]}, [r0]
   4591 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   4592 
   4593 //serial solution may be faster
   4594 
   4595 //current IA SIMD doesn't support float16
   4596 
   4597 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
   4598 // ******************************************************************************************************************
   4599 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
   4600 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
   4601 
   4602 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
   4603 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
   4604 
   4605 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
   4606 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
   4607 
   4608 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr);         // VLD1.64 {d0}, [r0]
   4609 _NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
   4610 {
   4611     _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
   4612     return LOAD_SI128(val);
   4613 }
   4614 
   4615 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
   4616 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
   4617 
   4618 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
   4619 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
   4620 
   4621 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
   4622 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
   4623 
   4624 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr);         // VLD1.64 {d0}, [r0]
   4625 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
   4626 
   4627 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr);         // VLD1.16 {d0[]}, [r0]
   4628 //current IA SIMD doesn't support float16, need to go to 32 bits
   4629 
   4630 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr);         // VLD1.32 {d0[]}, [r0]
   4631 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
   4632 
   4633 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr);         // VLD1.8 {d0[]}, [r0]
   4634 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
   4635 
   4636 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr);         // VLD1.16 {d0[]}, [r0]
   4637 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
   4638 
   4639 //current IA SIMD doesn't support float16
   4640 
   4641 //*************************************************************************************
   4642 //********************************* Store **********************************************
   4643 //*************************************************************************************
   4644 // If ptr is 16bit aligned and you  need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
   4645 //here we assume the case of  NOT 16bit aligned ptr possible. If it is aligned we could to use _mm_store_si128 like shown in the following macro
   4646 #define STORE_SI128(ptr, val) \
   4647         (((unsigned long)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
   4648 
   4649 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val);         // VST1.8 {d0, d1}, [r0]
   4650 #define vst1q_u8 STORE_SI128
   4651 
   4652 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val);         // VST1.16 {d0, d1}, [r0]
   4653 #define vst1q_u16 STORE_SI128
   4654 
   4655 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val);         // VST1.32 {d0, d1}, [r0]
   4656 #define vst1q_u32 STORE_SI128
   4657 
   4658 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val);         // VST1.64 {d0, d1}, [r0]
   4659 #define vst1q_u64 STORE_SI128
   4660 
   4661 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val);         // VST1.8 {d0, d1}, [r0]
   4662 #define vst1q_s8 STORE_SI128
   4663 
   4664 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val);         // VST1.16 {d0, d1}, [r0]
   4665 #define vst1q_s16 STORE_SI128
   4666 
   4667 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val);         // VST1.32 {d0, d1}, [r0]
   4668 #define vst1q_s32 STORE_SI128
   4669 
   4670 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val);         // VST1.64 {d0, d1}, [r0]
   4671 #define vst1q_s64 STORE_SI128
   4672 
   4673 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val);         // VST1.16 {d0, d1}, [r0]
   4674 // IA32 SIMD doesn't work with 16bit floats currently
   4675 
   4676 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val);         // VST1.32 {d0, d1}, [r0]
   4677 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
   4678 {
   4679     if( ((unsigned long)(ptr) & 15)  == 0 )         //16 bits aligned
   4680         _mm_store_ps (ptr, val);
   4681     else
   4682         _mm_storeu_ps (ptr, val);
   4683 }
   4684 
   4685 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val);         // VST1.8 {d0, d1}, [r0]
   4686 #define vst1q_p8  vst1q_u8
   4687 
   4688 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val);         // VST1.16 {d0, d1}, [r0]
   4689 #define vst1q_p16 vst1q_u16
   4690 
   4691 //current IA SIMD doesn't support float16
   4692 
   4693 //***********Store a lane of a vector into memory (extract given lane) *********************
   4694 //******************************************************************************************
   4695 void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
   4696 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
   4697 
   4698 void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
   4699 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
   4700 
   4701 void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
   4702 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
   4703 
   4704 void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane);         // VST1.64 {d0}, [r0]
   4705 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
   4706 
   4707 void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
   4708 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane)
   4709 
   4710 void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
   4711 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane)
   4712 
   4713 void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
   4714 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
   4715 
   4716 void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane);         // VST1.64 {d0}, [r0]
   4717 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
   4718 
   4719 void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
   4720 //current IA SIMD doesn't support float16
   4721 
   4722 void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane);         // VST1.32 {d0[0]}, [r0]
   4723 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
   4724 {
   4725     int32_t ilane;
   4726     ilane = _MM_EXTRACT_PS(val,lane);
   4727     *(ptr) =  *((float*)&ilane);
   4728 }
   4729 
   4730 void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane);         // VST1.8 {d0[0]}, [r0]
   4731 #define vst1q_lane_p8   vst1q_lane_u8
   4732 
   4733 void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane);         // VST1.16 {d0[0]}, [r0]
   4734 #define vst1q_lane_p16   vst1q_lane_s16
   4735 
   4736 //current IA SIMD doesn't support float16
   4737 
   4738 //***********************************************************************************************
   4739 //**************** Loads and stores of an N-element structure **********************************
   4740 //***********************************************************************************************
   4741 //These intrinsics load or store an n-element structure. The array structures are defined in the beginning
   4742 //We assume ptr is NOT aligned in general case, for more details see  "Loads and stores of a single vector functions"
   4743 //****************** 2 elements load  *********************************************
   4744 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
   4745 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr)         // VLD2.8 {d0, d2}, [r0]
   4746 {
   4747     uint8x16x2_t v;
   4748     v.val[0] = vld1q_u8(ptr);
   4749     v.val[1] = vld1q_u8((ptr + 16));
   4750     v = vuzpq_s8(v.val[0], v.val[1]);
   4751     return v;
   4752 }
   4753 
   4754 #if defined(USE_SSSE3)
   4755 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
   4756 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr)         // VLD2.16 {d0, d2}, [r0]
   4757 {
   4758     uint16x8x2_t v;
   4759     v.val[0] = vld1q_u16( ptr);
   4760     v.val[1] = vld1q_u16( (ptr + 8));
   4761     v = vuzpq_s16(v.val[0], v.val[1]);
   4762     return v;
   4763 }
   4764 #endif
   4765 
   4766 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
   4767 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr)         // VLD2.32 {d0, d2}, [r0]
   4768 {
   4769     uint32x4x2_t v;
   4770     v.val[0] = vld1q_u32 ( ptr);
   4771     v.val[1] = vld1q_u32 ( (ptr + 4));
   4772     v = vuzpq_s32(v.val[0], v.val[1]);
   4773     return v;
   4774 }
   4775 
   4776 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
   4777 #define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
   4778 
   4779 #if defined(USE_SSSE3)
   4780 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
   4781 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
   4782 #endif
   4783 
   4784 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
   4785 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
   4786 
   4787 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr);         // VLD2.16 {d0, d2}, [r0]
   4788 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   4789 
   4790 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr);         // VLD2.32 {d0, d2}, [r0]
   4791 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr)         // VLD2.32 {d0, d2}, [r0]
   4792 {
   4793     float32x4x2_t v;
   4794     v.val[0] =  vld1q_f32 (ptr);
   4795     v.val[1] =  vld1q_f32 ((ptr + 4));
   4796     v = vuzpq_f32(v.val[0], v.val[1]);
   4797     return v;
   4798 }
   4799 
   4800 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr);         // VLD2.8 {d0, d2}, [r0]
   4801 #define  vld2q_p8 vld2q_u8
   4802 
   4803 #if defined(USE_SSSE3)
   4804 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr);         // VLD2.16 {d0, d2}, [r0]
   4805 #define vld2q_p16 vld2q_u16
   4806 #endif
   4807 
   4808 #if defined(USE_SSSE3)
   4809 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
   4810 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
   4811 {
   4812     uint8x8x2_t v;
   4813     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15};
   4814     __m128i ld128;
   4815     ld128 = vld1q_u8(ptr);         //merge two 64-bits in 128 bit
   4816     v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd);
   4817     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   4818     return v;
   4819 }
   4820 #endif
   4821 
   4822 #if defined(USE_SSSE3)
   4823 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
   4824 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
   4825 {
   4826     uint16x4x2_t v;
   4827     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
   4828     __m128i ld128;
   4829     ld128 = vld1q_u16(ptr);         //merge two 64-bits in 128 bit
   4830     v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd);
   4831     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   4832     return v;
   4833 }
   4834 #endif
   4835 
   4836 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
   4837 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
   4838 {
   4839     uint32x2x2_t v;
   4840     __m128i ld128;
   4841     ld128 = vld1q_u32(ptr);         //merge two 64-bits in 128 bit
   4842     v.val[0] = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
   4843     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   4844     return v;
   4845 }
   4846 
   4847 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
   4848 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
   4849 {
   4850     uint64x1x2_t v;
   4851     v.val[0] = vld1q_u64(ptr);
   4852     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   4853     return v;
   4854 }
   4855 
   4856 #if defined(USE_SSSE3)
   4857 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
   4858 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
   4859 
   4860 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
   4861 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
   4862 #endif
   4863 
   4864 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
   4865 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
   4866 
   4867 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
   4868 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
   4869 
   4870 float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr);         // VLD2.16 {d0, d1}, [r0]
   4871 
   4872 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr);         // VLD2.32 {d0, d1}, [r0]
   4873 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
   4874 {
   4875     float32x2x2_t v;
   4876     v.val[0] = vld1q_f32(ptr);
   4877     v.val[0] = _mm_shuffle_ps(v.val[0], v.val[0], _MM_SHUFFLE(3,1, 2, 0));
   4878     v.val[1] = _mm_movehl_ps(v.val[0],v.val[0]);
   4879     return v;
   4880 }
   4881 
   4882 #if defined(USE_SSSE3)
   4883 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr);         // VLD2.8 {d0, d1}, [r0]
   4884 #define vld2_p8 vld2_u8
   4885 
   4886 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr);         // VLD2.16 {d0, d1}, [r0]
   4887 #define vld2_p16 vld2_u16
   4888 #endif
   4889 
   4890 //******************** Triplets ***************************************
   4891 //*********************************************************************
   4892 #if defined(USE_SSSE3)
   4893 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
   4894 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr)         // VLD3.8 {d0, d2, d4}, [r0]
   4895 {  //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
   4896    //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
   4897    //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
   4898    //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
   4899     uint8x16x3_t v;
   4900     __m128i tmp0, tmp1,tmp2, tmp3;
   4901     _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
   4902     _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
   4903     _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
   4904 
   4905     v.val[0] =  vld1q_u8 (ptr);        //a0,a1,a2,a3,...a7, ...a15
   4906     v.val[1] =  vld1q_u8 ((ptr + 16));	//b0,b1,b2,b3...b7, ...b15
   4907     v.val[2] =  vld1q_u8 ((ptr + 32));  //c0,c1,c2,c3,...c7,...c15
   4908 
   4909     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11
   4910     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
   4911     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15
   4912 
   4913     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
   4914     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
   4915     tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
   4916     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
   4917     v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
   4918     v.val[0] = _mm_or_si128(v.val[0],tmp3) ;//a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
   4919 
   4920     tmp3 = _mm_slli_si128(tmp0, 5);//0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
   4921     tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
   4922     v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0
   4923     v.val[1] = _mm_slli_si128(v.val[1], 5);//0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13,
   4924     v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13,
   4925     v.val[1] =	_mm_slli_si128(v.val[1],5);//0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
   4926     v.val[1] = _mm_srli_si128(v.val[1], 5);//a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
   4927     tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
   4928     tmp3 = _mm_slli_si128(tmp3,11);//0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
   4929     v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
   4930 
   4931     tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
   4932     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
   4933     v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
   4934     v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
   4935     v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
   4936     tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
   4937     v.val[2] = _mm_or_si128(v.val[2],tmp0);//a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
   4938     return v;
   4939 }
   4940 #endif
   4941 
   4942 #if defined(USE_SSSE3)
   4943 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
   4944 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr)         // VLD3.16 {d0, d2, d4}, [r0]
   4945 {  //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
   4946     uint16x8x3_t v;
   4947     __m128i tmp0, tmp1,tmp2, tmp3;
   4948     _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
   4949     _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
   4950     _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
   4951 
   4952     v.val[0] =  vld1q_u16 (ptr);        //a0,a1,a2,a3,...a7,
   4953     v.val[1] =  vld1q_u16 ((ptr + 8));	//b0,b1,b2,b3...b7
   4954     v.val[2] =  vld1q_u16 ((ptr + 16));  //c0,c1,c2,c3,...c7
   4955 
   4956     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
   4957     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
   4958     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
   4959 
   4960     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
   4961     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
   4962     tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
   4963     tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
   4964     v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
   4965     v.val[0] = _mm_or_si128(v.val[0],tmp3);//a0,a3,a6,b1,b4,b7,c2,c5
   4966 
   4967     tmp3 = _mm_slli_si128(tmp0, 4);//0,0,a0,a3,a6,a1,a4,a7
   4968     tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
   4969     v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
   4970     v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
   4971     v.val[1] = _mm_or_si128(v.val[1],tmp3);//a1,a4,a7,b2,b5,b0,b3,b6,
   4972     v.val[1] =	_mm_slli_si128(v.val[1],6);//0,0,0,a1,a4,a7,b2,b5,
   4973     v.val[1] = _mm_srli_si128(v.val[1], 6);//a1,a4,a7,b2,b5,0,0,0,
   4974     tmp3 = _mm_srli_si128(tmp2,4);  //c0,c3,c6, c1,c4,c7,0,0
   4975     tmp3 = _mm_slli_si128(tmp3,10);  //0,0,0,0,0,c0,c3,c6,
   4976     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
   4977 
   4978     tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
   4979     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
   4980     v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
   4981     v.val[2] = _mm_slli_si128(v.val[2],4);//0,0, b0,b3,b6,0,0,0
   4982     v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0, b0,b3,b6,c1,c4,c7,
   4983     tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
   4984     v.val[2] = _mm_or_si128(v.val[2],tmp0);//a2,a5,b0,b3,b6,c1,c4,c7,
   4985     return v;
   4986 }
   4987 #endif
   4988 
   4989 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
   4990 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr)         // VLD3.32 {d0, d2, d4}, [r0]
   4991 {//a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   4992     uint32x4x3_t v;
   4993     __m128i tmp0, tmp1,tmp2, tmp3;
   4994     v.val[0] =  vld1q_u32 (ptr);        //a0,a1,a2,a3,
   4995     v.val[1] =  vld1q_u32 ((ptr + 4));	//b0,b1,b2,b3
   4996     v.val[2] =  vld1q_u32 ((ptr + 8));  //c0,c1,c2,c3,
   4997 
   4998     tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
   4999     tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
   5000     tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
   5001 
   5002     tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
   5003     v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
   5004     tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
   5005     v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
   5006     v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
   5007     v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
   5008     return v;
   5009 }
   5010 
   5011 #if defined(USE_SSSE3)
   5012 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
   5013 #define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
   5014 
   5015 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
   5016 #define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
   5017 #endif
   5018 
   5019 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
   5020 #define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
   5021 
   5022 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
   5023 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5024 
   5025 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr);         // VLD3.32 {d0, d2, d4}, [r0]
   5026 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr)         // VLD3.32 {d0, d2, d4}, [r0]
   5027 { //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   5028     float32x4x3_t v;
   5029     __m128 tmp0, tmp1,tmp2, tmp3;
   5030     v.val[0] =  vld1q_f32 (ptr);        //a0,a1,a2,a3,
   5031     v.val[1] =  vld1q_f32 ((ptr + 4));	//b0,b1,b2,b3
   5032     v.val[2] =  vld1q_f32 ((ptr + 8));  //c0,c1,c2,c3,
   5033 
   5034     tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
   5035     tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
   5036     tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
   5037     tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
   5038 
   5039     v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
   5040     tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
   5041     v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
   5042     v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
   5043     v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
   5044     return v;
   5045 }
   5046 
   5047 #if defined(USE_SSSE3)
   5048 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr);         // VLD3.8 {d0, d2, d4}, [r0]
   5049 #define vld3q_p8 vld3q_u8
   5050 
   5051 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr);         // VLD3.16 {d0, d2, d4}, [r0]
   5052 #define vld3q_p16 vld3q_u16
   5053 #endif
   5054 
   5055 #if defined(USE_SSSE3)
   5056 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
   5057 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr)         // VLD3.8 {d0, d1, d2}, [r0]
   5058 { //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
   5059     uint8x8x3_t v;
   5060     __m128i tmp0, tmp1;
   5061     _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
   5062     _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
   5063     v.val[0] =  vld1q_u8 (ptr);        //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
   5064 
   5065     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
   5066     tmp1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
   5067     v.val[0] = _mm_slli_si128(tmp0,10);
   5068     v.val[0] = _mm_srli_si128(v.val[0],10);  //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
   5069     v.val[2] = _mm_slli_si128(tmp1,6);//0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
   5070     v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
   5071 
   5072     v.val[1] = _mm_slli_si128(tmp0,5);  //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
   5073     v.val[1] = _mm_srli_si128(v.val[1],11);  //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
   5074     v.val[2] = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
   5075     v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
   5076     v.val[1] = _mm_or_si128(v.val[1],v.val[2]) ;//a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
   5077 
   5078     tmp0 = _mm_srli_si128(tmp0,11);  //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
   5079     v.val[2] = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
   5080     v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c1,c4,c7,
   5081     v.val[2] = _mm_or_si128(tmp0, v.val[2]) ;//a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
   5082     return v;
   5083 }
   5084 #endif
   5085 
   5086 #if defined(USE_SSSE3)
   5087 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
   5088 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr)         // VLD3.16 {d0, d1, d2}, [r0]
   5089 { //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   5090     uint16x4x3_t v;
   5091     __m128i tmp0, tmp1;
   5092     _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
   5093     v.val[0] =  vld1q_u16 (ptr);        //a0,a1,a2,a3,  b0,b1,b2,b3
   5094 
   5095     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
   5096     tmp1 = _mm_shufflelo_epi16(v.val[2], 201); //11 00 10 01     : c1, c2, c0, c3,
   5097     v.val[0] = _mm_slli_si128(tmp0,10);
   5098     v.val[0] = _mm_srli_si128(v.val[0],10);  //a0, a3, b2, 0,0, 0,0,
   5099     v.val[2] = _mm_slli_si128(tmp1,14);//0,0,0,0,0,0,0,c1
   5100     v.val[2] = _mm_srli_si128(v.val[2],8);//0,0,0,c1,0,0,0,0
   5101     v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0, a3, b2, c1, x,x,x,x
   5102 
   5103     v.val[1] = _mm_slli_si128(tmp0,4);  //0,0,0,0,0,a1, b0, b3
   5104     v.val[1] = _mm_srli_si128(v.val[1],10);  //a1, b0, b3, 0,0, 0,0,
   5105     v.val[2] = _mm_srli_si128(tmp1,2);//c2, 0,0,0,0,0,0,0,
   5106     v.val[2] = _mm_slli_si128(v.val[2],6);//0,0,0,c2,0,0,0,0
   5107     v.val[1] = _mm_or_si128(v.val[1],v.val[2]); //a1, b0, b3, c2, x,x,x,x
   5108 
   5109     tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
   5110     tmp1 = _mm_srli_si128(tmp1,4);
   5111     tmp1 = _mm_slli_si128(tmp1,4);  //0,0,c0, c3,
   5112     v.val[2] = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
   5113     return v;
   5114 }
   5115 #endif
   5116 
   5117 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
   5118 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr)         // VLD3.32 {d0, d1, d2}, [r0]
   5119 { //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
   5120     uint32x2x3_t v;
   5121     v.val[0] =  vld1q_u32 (ptr);        //a0,a1,  b0,b1,
   5122 
   5123     v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6));  //a0,b1, a1, b0
   5124     v.val[2] =  _mm_slli_si128(v.val[2], 8);  //x, x,c0,c1,
   5125     v.val[1] =  _mm_unpackhi_epi32(v.val[0],v.val[2]); //a1,c0, b0, c1
   5126     v.val[2] =  _mm_srli_si128(v.val[1], 8);  //b0, c1, x, x,
   5127     return v;
   5128 }
   5129 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
   5130 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2}, [r0]
   5131 {
   5132     uint64x1x3_t v;
   5133     v.val[0] = vld1q_u64 (ptr);
   5134     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   5135     return v;
   5136 }
   5137 
   5138 #if defined(USE_SSSE3)
   5139 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
   5140 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
   5141 
   5142 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
   5143 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
   5144 #endif
   5145 
   5146 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
   5147 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
   5148 
   5149 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
   5150 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
   5151 
   5152 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
   5153 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5154 
   5155 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr);         // VLD3.32 {d0, d1, d2}, [r0]
   5156 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
   5157 { //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
   5158     float32x2x3_t v;
   5159     v.val[0] =  vld1q_f32 (ptr);        //a0,a1,  b0,b1,
   5160 
   5161     v.val[0] = _mm_shuffle_ps(v.val[0],v.val[0], _MM_SHUFFLE(2,1, 3, 0));  //a0,b1, a1, b0
   5162     v.val[2] =  _mm_movelh_ps(v.val[2], v.val[2]);  //x, x,c0,c1,
   5163     v.val[1] =  _mm_unpackhi_ps(v.val[0],v.val[2]); //a1,c0, b0, c1
   5164     v.val[2] =  _mm_movehl_ps(v.val[1], v.val[1]);  //b0, c1, x, x,
   5165     return v;
   5166 }
   5167 
   5168 #if defined(USE_SSSE3)
   5169 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr);         // VLD3.8 {d0, d1, d2}, [r0]
   5170 #define vld3_p8 vld3_u8
   5171 
   5172 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr);         // VLD3.16 {d0, d1, d2}, [r0]
   5173 #define vld3_p16 vld3_u16
   5174 #endif
   5175 
   5176 //***************  Quadruples load ********************************
   5177 //*****************************************************************
   5178 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
   5179 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr)         // VLD4.8 {d0, d2, d4, d6}, [r0]
   5180 {
   5181     uint8x16x4_t v;
   5182    __m128i tmp3, tmp2, tmp1, tmp0;
   5183 
   5184     v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
   5185     v.val[1] = vld1q_u8 ( (ptr + 16));//b0, b1,b2,...b7.... b15
   5186     v.val[2] = vld1q_u8 ( (ptr + 32));//c0, c1,c2,...c7....c15
   5187     v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
   5188 
   5189     tmp0= _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
   5190     tmp1= _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
   5191     tmp2= _mm_unpackhi_epi8(v.val[0],v.val[1]);//a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
   5192     tmp3= _mm_unpackhi_epi8(v.val[2],v.val[3]);//c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
   5193 
   5194     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
   5195     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
   5196     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
   5197     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
   5198 
   5199     tmp0 =  _mm_unpacklo_epi32(v.val[0] , v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
   5200     tmp1 =  _mm_unpackhi_epi32(v.val[0] , v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
   5201     tmp2 =  _mm_unpacklo_epi32(v.val[1] , v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
   5202     tmp3 =  _mm_unpackhi_epi32(v.val[1] , v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
   5203 
   5204     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
   5205     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
   5206     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
   5207     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
   5208     return v;
   5209 }
   5210 
   5211 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
   5212 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr)         // VLD4.16 {d0, d2, d4, d6}, [r0]
   5213 {
   5214     uint16x8x4_t v;
   5215     __m128i tmp3, tmp2, tmp1, tmp0;
   5216     tmp0  =  vld1q_u16 (ptr);       //a0,a1,a2,...a7
   5217     tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
   5218     tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
   5219     tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
   5220     v.val[0]= _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
   5221     v.val[1]= _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
   5222     v.val[2]= _mm_unpackhi_epi16(tmp0,tmp1);//a4,b4, a5,b5, a6,b6, a7,b7
   5223     v.val[3]= _mm_unpackhi_epi16(tmp2,tmp3);//c4,d4, c5,d5, c6,d6, c7,d7
   5224     tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]);//a0,a4, b0,b4, a1,a5, b1,b5
   5225     tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
   5226     tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
   5227     tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]);//c2,c6, d2,d6, c3,c7, d3,d7
   5228     v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
   5229     v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
   5230     v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
   5231     v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
   5232     return v;
   5233 }
   5234 
   5235 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
   5236 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr)         // VLD4.32 {d0, d2, d4, d6}, [r0]
   5237 {
   5238     uint32x4x4_t v;
   5239     __m128i tmp3, tmp2, tmp1, tmp0;
   5240     v.val[0] =  vld1q_u32 (ptr);
   5241     v.val[1] =  vld1q_u32 ((ptr + 4));
   5242     v.val[2] =  vld1q_u32 ((ptr + 8));
   5243     v.val[3] =  vld1q_u32 ((ptr + 12));
   5244     tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
   5245     tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
   5246     tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
   5247     tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
   5248     v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
   5249     v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
   5250     v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
   5251     v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
   5252     return v;
   5253 }
   5254 
   5255 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
   5256 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
   5257 
   5258 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
   5259 #define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
   5260 
   5261 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
   5262 #define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
   5263 
   5264 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
   5265 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5266 
   5267 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr);         // VLD4.32 {d0, d2, d4, d6}, [r0]
   5268 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr)         // VLD4.32 {d0, d2, d4, d6}, [r0]
   5269 {
   5270     float32x4x4_t v;
   5271     __m128 tmp3, tmp2, tmp1, tmp0;
   5272 
   5273     v.val[0] =  vld1q_f32 ((float*) ptr);
   5274     v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
   5275     v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
   5276     v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
   5277     tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
   5278     tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
   5279     tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
   5280     tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
   5281     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
   5282     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
   5283     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
   5284     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
   5285     return v;
   5286 }
   5287 
   5288 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr);         // VLD4.8 {d0, d2, d4, d6}, [r0]
   5289 #define vld4q_p8 vld4q_u8
   5290 
   5291 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr);         // VLD4.16 {d0, d2, d4, d6}, [r0]
   5292 #define vld4q_p16 vld4q_s16
   5293 
   5294 #if defined(USE_SSSE3)
   5295 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
   5296 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr)         // VLD4.8 {d0, d1, d2, d3}, [r0]
   5297 {
   5298     uint8x8x4_t v;
   5299     __m128i sh0, sh1;
   5300     _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
   5301 
   5302     v.val[0] = vld1q_u8(( ptr));         //load first 64-bits in val[0] and val[1]
   5303     v.val[1] = vld1q_u8(( ptr + 16));         //load third and forth 64-bits in val[2], val[3]
   5304 
   5305     sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_8);
   5306     sh1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask4_8);
   5307     v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
   5308     v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
   5309     v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
   5310     v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
   5311 
   5312     return v;
   5313 }
   5314 #endif
   5315 
   5316 #if defined(USE_SSSE3)
   5317 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
   5318 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr)         // VLD4.16 {d0, d1, d2, d3}, [r0]
   5319 {
   5320     uint16x4x4_t v;
   5321     __m128i sh0, sh1;
   5322     _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15};         //0, 4, 1, 5, 2, 6, 3, 7
   5323     v.val[0] = vld1q_u16 ( (ptr));         //load first 64-bits in val[0] and val[1]
   5324     v.val[2] = vld1q_u16 ( (ptr + 8));         //load third and forth 64-bits in val[2], val[3]
   5325     sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_16);
   5326     sh1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask4_16);
   5327     v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         //0,4,8,12, 1,5,9,13
   5328     v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         //2,6,10,14, 3,7,11,15
   5329     v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32);
   5330     v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32);
   5331     return v;
   5332 }
   5333 #endif
   5334 
   5335 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
   5336 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
   5337 {   //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   5338      uint32x4x4_t v, res;
   5339     v.val[0] =  vld1q_u32 (ptr);        //a0,a1,  b0,b1,
   5340     v.val[2] =  vld1q_u32 ((ptr + 4));  //c0,c1, d0,d1
   5341     res.val[0] = _mm_unpacklo_epi32(v.val[0],v.val[2]);  //a0, c0, a1,c1,
   5342     res.val[2] = _mm_unpackhi_epi32(v.val[0],v.val[2]);  //b0,d0, b1, d1
   5343     res.val[1] = _mm_shuffle_epi32(res.val[0],_SWAP_HI_LOW32); //a1,c1, a0, c0,
   5344     res.val[3] = _mm_shuffle_epi32(res.val[2],_SWAP_HI_LOW32);//b1, d1,b0,d0,
   5345     return res;
   5346 }
   5347 
   5348 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
   5349 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2, d3}, [r0]
   5350 {
   5351     uint64x1x4_t v;
   5352     v.val[0] = vld1q_u64( (ptr));         //load first 64-bits in val[0] and val[1]
   5353     v.val[2] = vld1q_u64( (ptr + 2));         //load third and forth 64-bits in val[2], val[3]
   5354     return v;
   5355 }
   5356 
   5357 #if defined(USE_SSSE3)
   5358 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
   5359 #define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
   5360 
   5361 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
   5362 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
   5363 #endif
   5364 
   5365 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
   5366 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
   5367 
   5368 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
   5369 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
   5370 
   5371 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
   5372 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5373 
   5374 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr);         // VLD4.32 {d0, d1, d2, d3}, [r0]
   5375 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr)         // VLD4.32 {d0, d1, d2, d3}, [r0]
   5376 {         //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   5377     float32x2x4_t v, res;
   5378     v.val[0] =  vld1q_f32 ((float*) ptr);         //a0,a1,  b0,b1,
   5379     v.val[2] =  vld1q_f32 ((float*) (ptr + 4));         //c0,c1, d0,d1
   5380     res.val[0] = _mm_unpacklo_ps(v.val[0],v.val[2]);         //a0, c0, a1,c1,
   5381     res.val[2] = _mm_unpackhi_ps(v.val[0],v.val[2]);         //b0,d0, b1, d1
   5382     res.val[1] = _mm_movehl_ps(res.val[0],res.val[0]);          // a1,c1, a0, c0,
   5383     res.val[3] = _mm_movehl_ps(res.val[2],res.val[2]);         // b1, d1, b0,d0,
   5384     return res;
   5385 }
   5386 
   5387 #if defined(USE_SSSE3)
   5388 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr);         // VLD4.8 {d0, d1, d2, d3}, [r0]
   5389 #define vld4_p8 vld4_u8
   5390 
   5391 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr);         // VLD4.16 {d0, d1, d2, d3}, [r0]
   5392 #define vld4_p16 vld4_u16
   5393 #endif
   5394 
   5395 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
   5396 //*******************************************************************************************************************
   5397 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
   5398 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr)         // VLD2.8 {d0[], d1[]}, [r0]
   5399 {
   5400     uint8x8x2_t v;
   5401     v.val[0] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
   5402     v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
   5403     v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
   5404     v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   5405     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   5406     return v;
   5407 }
   5408 
   5409 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
   5410 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr)         // VLD2.16 {d0[], d1[]}, [r0]
   5411 {
   5412     uint16x4x2_t v;
   5413     v.val[1] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
   5414     v.val[0] = _mm_shufflelo_epi16(v.val[1], 0); //00 00 00 00 (all 0)
   5415     v.val[1] = _mm_shufflelo_epi16(v.val[1], 85);//01 01 01 01 (all 1)
   5416     return v;
   5417 }
   5418 
   5419 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
   5420 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr)         // VLD2.32 {d0[], d1[]}, [r0]
   5421 {
   5422     uint32x2x2_t v;
   5423     v.val[0] = LOAD_SI128(ptr); //0,1,x,x
   5424     v.val[0] = _mm_shuffle_epi32(v.val[0],   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
   5425     v.val[1] = _mm_srli_si128(v.val[0], 8); //1,1,0x0,0x0
   5426     return v;
   5427 }
   5428 
   5429 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
   5430 #define vld2_dup_u64 vld2_u64
   5431 
   5432 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
   5433 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
   5434 
   5435 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
   5436 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
   5437 
   5438 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
   5439 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
   5440 
   5441 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr);         // VLD1.64 {d0, d1}, [r0]
   5442 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
   5443 
   5444 float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
   5445 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5446 
   5447 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr);         // VLD2.32 {d0[], d1[]}, [r0]
   5448 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr)         // VLD2.32 {d0[], d1[]}, [r0]
   5449 {
   5450     float32x2x2_t v;
   5451     v.val[0] = vld1q_f32(ptr);  //0,1,x,x
   5452     v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
   5453     v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,x,x
   5454     return v;
   5455 }
   5456 
   5457 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr);         // VLD2.8 {d0[], d1[]}, [r0]
   5458 #define vld2_dup_p8 vld2_dup_u8
   5459 
   5460 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr);         // VLD2.16 {d0[], d1[]}, [r0]
   5461 #define vld2_dup_p16 vld2_dup_s16
   5462 
   5463 //************* Duplicate (or propagate)triplets: *******************
   5464 //********************************************************************
   5465 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
   5466 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
   5467 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr)         // VLD3.8 {d0[], d1[], d2[]}, [r0]
   5468 {
   5469     uint8x8x3_t v;
   5470     v.val[0] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
   5471     v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
   5472     v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
   5473     v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   5474     v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
   5475     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   5476     return v;
   5477 }
   5478 
   5479 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
   5480 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr)         // VLD3.16 {d0[], d1[], d2[]}, [r0]
   5481 {
   5482     uint16x4x3_t v;
   5483     v.val[2] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
   5484     v.val[0] = _mm_shufflelo_epi16(v.val[2], 0); //00 00 00 00 (all 0)
   5485     v.val[1] = _mm_shufflelo_epi16(v.val[2], 85);//01 01 01 01 (all 1)
   5486     v.val[2] = _mm_shufflelo_epi16(v.val[2], 170);//10 10 10 10 (all 2)
   5487     return v;
   5488 }
   5489 
   5490 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
   5491 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr)         // VLD3.32 {d0[], d1[], d2[]}, [r0]
   5492 {
   5493     uint32x2x3_t v;
   5494     v.val[2] = LOAD_SI128(ptr); //0,1,2,x
   5495     v.val[0] = _mm_shuffle_epi32(v.val[2],   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
   5496     v.val[1] = _mm_shuffle_epi32(v.val[2],   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
   5497     v.val[2] = _mm_srli_si128(v.val[0], 8); //2,2,0x0,0x0
   5498     return v;
   5499 }
   5500 
   5501 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
   5502 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2}, [r0]
   5503 {
   5504     uint64x1x3_t v;
   5505     v.val[0] = LOAD_SI128(ptr);//0,1,
   5506     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
   5507     v.val[2] = LOAD_SI128((ptr + 2));  //2,x
   5508     return v;
   5509 }
   5510 
   5511 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
   5512 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
   5513 
   5514 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
   5515 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
   5516 
   5517 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
   5518 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
   5519 
   5520 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr);         // VLD1.64 {d0, d1, d2}, [r0]
   5521 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
   5522 
   5523 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
   5524 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5525 
   5526 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr);         // VLD3.32 {d0[], d1[], d2[]}, [r0]
   5527 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr)         // VLD3.32 {d0[], d1[], d2[]}, [r0]
   5528 {
   5529     float32x2x3_t v;
   5530     v.val[0] = vld1q_f32(ptr);  //0,1,2,x
   5531     v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x
   5532     v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
   5533     v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
   5534     return v;
   5535 }
   5536 
   5537 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr);         // VLD3.8 {d0[], d1[], d2[]}, [r0]
   5538 #define vld3_dup_p8 vld3_dup_u8
   5539 
   5540 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr);         // VLD3.16 {d0[], d1[], d2[]}, [r0]
   5541 #define vld3_dup_p16 vld3_dup_s16
   5542 
   5543 //************* Duplicate (or propagate) quadruples: *******************
   5544 //***********************************************************************
   5545 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
   5546 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   5547 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr)         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   5548 {
   5549     uint8x8x4_t v;
   5550     v.val[0] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
   5551     v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
   5552     v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
   5553     v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   5554     v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
   5555     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   5556     v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32);
   5557     return v;
   5558 }
   5559 
   5560 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   5561 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr)         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   5562 {
   5563     uint16x4x4_t v;
   5564     v.val[3] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
   5565     v.val[0] = _mm_shufflelo_epi16(v.val[3], 0); //00 00 00 00 (all 0)
   5566     v.val[1] = _mm_shufflelo_epi16(v.val[3], 85);//01 01 01 01 (all 1)
   5567     v.val[2] = _mm_shufflelo_epi16(v.val[3], 170);//10 10 10 10 (all 2)
   5568     v.val[3] = _mm_shufflelo_epi16(v.val[3], 255);//11 11 11 11 (all 3)
   5569     return v;
   5570 }
   5571 
   5572 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   5573 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr)         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   5574 {
   5575     uint32x2x4_t v;
   5576     v.val[3] = LOAD_SI128(ptr) ; //0,1,2,3
   5577     v.val[0] = _mm_shuffle_epi32(v.val[3],   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
   5578     v.val[1] = _mm_shuffle_epi32(v.val[3],   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
   5579     v.val[2] = _mm_shuffle_epi32(v.val[3],   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
   5580     v.val[3] = _mm_shuffle_epi32(v.val[3],   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2
   5581     return v;
   5582 }
   5583 
   5584 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
   5585 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr)         // VLD1.64 {d0, d1, d2, d3}, [r0]
   5586 {
   5587     uint64x1x4_t v;
   5588     v.val[0] = LOAD_SI128(ptr); //0,1,
   5589     v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0
   5590     v.val[2] = LOAD_SI128((ptr + 2)); //2,3
   5591     v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32); //3,2
   5592     return v;
   5593 }
   5594 
   5595 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   5596 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
   5597 
   5598 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   5599 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
   5600 
   5601 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   5602 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
   5603 
   5604 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr);         // VLD1.64 {d0, d1, d2, d3}, [r0]
   5605 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
   5606 
   5607 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   5608 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   5609 
   5610 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr);         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   5611 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr)         // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   5612 {
   5613     float32x2x4_t v;
   5614     v.val[0] = vld1q_f32(ptr);  //0,1,2,3
   5615     v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,3,3
   5616     v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2
   5617     v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0,
   5618     v.val[3] = _mm_movehl_ps(v.val[1], v.val[1]); //3,3,1,1,
   5619     return v;
   5620 }
   5621 
   5622 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr);         // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   5623 #define vld4_dup_p8 vld4_dup_u8
   5624 
   5625 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr);         // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   5626 #define vld4_dup_p16 vld4_dup_u16
   5627 
   5628 //**********************************************************************************
   5629 //*******************Lane loads for  an N-element structures ***********************
   5630 //**********************************************************************************
   5631 //********************** Lane pairs  ************************************************
   5632 //does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon
   5633 //we assume  src is 16 bit aligned
   5634 
   5635 //!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error
   5636 //to fix it the all functions below work with  xxxxxx_2t pointers and the corresponding original functions are redefined
   5637 
   5638 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   5639 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane)         // VLD2.16 {d0[0], d2[0]}, [r0]
   5640 {
   5641     uint16x8x2_t v;
   5642     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
   5643     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
   5644     return v;
   5645 }
   5646 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
   5647 
   5648 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   5649 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane)         // VLD2.32 {d0[0], d2[0]}, [r0]
   5650 {
   5651     uint32x4x2_t v;
   5652     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
   5653     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
   5654     return v;
   5655 }
   5656 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
   5657 
   5658 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   5659 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
   5660 {
   5661     int16x8x2_t v;
   5662     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
   5663     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
   5664     return v;
   5665 }
   5666 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
   5667 
   5668 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   5669 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
   5670 {
   5671     int32x4x2_t v;
   5672     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
   5673     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
   5674     return v;
   5675 }
   5676 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
   5677 
   5678 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   5679 //current IA SIMD doesn't support float16
   5680 
   5681 //float32x4x2_t vld2q_lane_f32(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   5682 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane)         // VLD2.32 {d0[0], d2[0]}, [r0]
   5683 {
   5684     float32x4x2_t v;
   5685     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
   5686     v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
   5687     return v;
   5688 }
   5689 #define vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32_ptr(ptr, &src, lane)
   5690 
   5691 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   5692 #define vld2q_lane_p16 vld2q_lane_u16
   5693 
   5694 //uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   5695 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane)         // VLD2.8 {d0[0], d1[0]}, [r0]
   5696 {
   5697     uint8x8x2_t val;
   5698     val.val[0] = _MM_INSERT_EPI8 (src->val[0], (int)ptr[0], lane);
   5699     val.val[1] = _MM_INSERT_EPI8 (src->val[1], (int)ptr[1], lane);
   5700     return val;
   5701 }
   5702 #define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane)
   5703 
   5704 //uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   5705 #define vld2_lane_u16 vld2q_lane_u16
   5706 
   5707 //uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
   5708 #define vld2_lane_u32 vld2q_lane_u32
   5709 
   5710 //int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   5711 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane);         // VLD2.8 {d0[0], d1[0]}, [r0]
   5712 #define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
   5713 
   5714 //int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   5715 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
   5716 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
   5717 
   5718 //int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
   5719 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
   5720 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
   5721 
   5722 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   5723 //current IA SIMD doesn't support float16
   5724 
   5725 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane);         // VLD2.32 {d0[0], d1[0]}, [r0]
   5726 #define vld2_lane_f32 vld2q_lane_f32
   5727 
   5728 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   5729 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane);         // VLD2.8 {d0[0], d1[0]}, [r0]
   5730 #define vld2_lane_p8 vld2_lane_u8
   5731 
   5732 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   5733 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane);         // VLD2.16 {d0[0], d1[0]}, [r0]
   5734 #define vld2_lane_p16 vld2_lane_u16
   5735 
   5736 //*********** Lane triplets **********************
   5737 //*************************************************
   5738 //does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon
   5739 //we assume src is 16 bit aligned
   5740 
   5741 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   5742 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane)         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   5743 {
   5744     uint16x8x3_t v;
   5745     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   5746     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   5747     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   5748     return v;
   5749 }
   5750 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
   5751 
   5752 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   5753 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   5754 {
   5755     uint32x4x3_t v;
   5756     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   5757     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   5758     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   5759     return v;
   5760 }
   5761 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
   5762 
   5763 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   5764 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane)         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   5765 {
   5766     int16x8x3_t v;
   5767     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   5768     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   5769     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   5770     return v;
   5771 }
   5772 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
   5773 
   5774 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   5775 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   5776 {
   5777     int32x4x3_t v;
   5778     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   5779     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   5780     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   5781     return v;
   5782 }
   5783 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
   5784 
   5785 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   5786 //current IA SIMD doesn't support float16
   5787 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
   5788 
   5789 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   5790 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane)         // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   5791 {
   5792     float32x4x3_t v;
   5793     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
   5794     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
   5795     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
   5796     return v;
   5797 }
   5798 #define vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32_ptr(ptr, &src, lane)
   5799 
   5800 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane);         // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   5801 #define vld3q_lane_p16 vld3q_lane_u16
   5802 
   5803 //uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   5804 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane)         // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   5805 {
   5806     uint8x8x3_t v;
   5807     v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
   5808     v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane);
   5809     v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane);
   5810     return v;
   5811 }
   5812 #define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane)
   5813 
   5814 //uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   5815 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane)         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   5816 {
   5817     uint16x4x3_t v;
   5818     v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
   5819     v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane);
   5820     v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane);
   5821     return v;
   5822 }
   5823 #define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane)
   5824 
   5825 //uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   5826 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane)         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   5827 {         //need to merge into 128 bit anyway
   5828     uint32x2x3_t v;
   5829     v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
   5830     v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
   5831     v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane);
   5832     return v;
   5833 }
   5834 #define vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
   5835 
   5836 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane);         // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   5837 #define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
   5838 
   5839 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   5840 #define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
   5841 
   5842 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane);         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   5843 #define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
   5844 
   5845 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane);         // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   5846 //current IA SIMD doesn't support float16
   5847 
   5848 //float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   5849 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane)         // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   5850 {
   5851     float32x2x3_t v;
   5852     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
   5853     return v;
   5854 }
   5855 #define vld3_lane_f32(ptr, src, lane) vld3_lane_f32_ptr(ptr, &src, lane)
   5856 
   5857 //poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   5858 #define vld3_lane_p8 vld3_lane_u8
   5859 
   5860 //poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   5861 #define vld3_lane_p16 vld3_lane_u16
   5862 
   5863 //******************* Lane Quadruples  load ***************************
   5864 //*********************************************************************
   5865 //does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane positon
   5866 //we assume src is 16 bit aligned
   5867 
   5868 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5869 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
   5870 {
   5871     uint16x8x4_t v;
   5872     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   5873     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   5874     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   5875     v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
   5876     return v;
   5877 }
   5878 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
   5879 
   5880 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5881 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
   5882 {
   5883     uint32x4x4_t v;
   5884     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   5885     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   5886     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   5887     v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
   5888     return v;
   5889 }
   5890 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
   5891 
   5892 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5893 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5894 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
   5895 
   5896 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5897 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane);         // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5898 #define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
   5899 
   5900 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5901 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5902 //current IA SIMD doesn't support float16
   5903 
   5904 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5905 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
   5906 {
   5907     float32x4x4_t v;
   5908     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
   5909     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
   5910     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
   5911     v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
   5912     return v;
   5913 }
   5914 #define vld4q_lane_f32(ptr, src, lane) vld4q_lane_f32_ptr(ptr, &src, lane)
   5915 
   5916 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5917 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane);         // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   5918 #define vld4q_lane_p16 vld4q_lane_u16
   5919 
   5920 //uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5921 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane)
   5922 {
   5923     uint8x8x4_t v;
   5924     v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane);
   5925     v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane );
   5926     v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane );
   5927     v.val[3] = _MM_INSERT_EPI8 (src->val[3], ptr[3], lane );
   5928     return v;
   5929 }
   5930 #define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane)
   5931 
   5932 //uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5933 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane)
   5934 {
   5935     uint16x4x4_t v;
   5936     v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane);
   5937     v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane );
   5938     v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane );
   5939     v.val[3] = _MM_INSERT_EPI16 (src->val[3], ptr[3], lane );
   5940     return v;
   5941 }
   5942 #define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
   5943 
   5944 //uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5945 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
   5946 {
   5947     uint32x2x4_t v;
   5948     v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
   5949     v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane );
   5950     v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane );
   5951     v.val[3] = _MM_INSERT_EPI32 (src->val[3], ptr[3], lane );
   5952     return v;
   5953 }
   5954 #define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
   5955 
   5956 //int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5957 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
   5958 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
   5959 
   5960 //int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5961 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
   5962 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
   5963 
   5964 //int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5965 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
   5966 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
   5967 
   5968 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5969 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
   5970 //current IA SIMD doesn't support float16
   5971 
   5972 //float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5973 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
   5974 {         //serial solution may be faster
   5975     float32x2x4_t v;
   5976     return v;
   5977 }
   5978 #define vld4_lane_f32(ptr, src, lane) vld4_lane_f32_ptr(ptr, &src, lane)
   5979 
   5980 //poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5981 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
   5982 #define vld4_lane_p8 vld4_lane_u8
   5983 
   5984 //poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   5985 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
   5986 #define vld4_lane_p16 vld4_lane_u16
   5987 
   5988 //******************* Store duplets *********************************************
   5989 //********************************************************************************
   5990 //here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function
   5991 //If necessary you need to modify all store functions accordingly. See more comments to "Store single" functions
   5992 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
   5993 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
   5994 {
   5995     uint8x16x2_t v;
   5996     v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
   5997     v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
   5998     vst1q_u8 (ptr, v.val[0]);
   5999     vst1q_u8 ((ptr + 16),  v.val[1]);
   6000 }
   6001 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
   6002 
   6003 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
   6004 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
   6005 {
   6006     uint16x8x2_t v;
   6007     v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
   6008     v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
   6009     vst1q_u16 (ptr, v.val[0]);
   6010     vst1q_u16 ((ptr + 8),  v.val[1]);
   6011 }
   6012 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
   6013 
   6014 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
   6015 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
   6016 {
   6017     uint32x4x2_t v;
   6018     v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
   6019     v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
   6020     vst1q_u32 (ptr, v.val[0]);
   6021     vst1q_u32 ((ptr + 4),  v.val[1]);
   6022 }
   6023 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
   6024 
   6025 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
   6026 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
   6027 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
   6028 
   6029 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   6030 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
   6031 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
   6032 
   6033 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
   6034 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
   6035 #define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
   6036 
   6037 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   6038 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
   6039 // IA32 SIMD doesn't work with 16bit floats currently
   6040 
   6041 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
   6042 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
   6043 {
   6044     float32x4x2_t v;
   6045     v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
   6046     v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
   6047     vst1q_f32 (ptr, v.val[0]);
   6048     vst1q_f32 ((ptr + 4),  v.val[1]);
   6049 }
   6050 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
   6051 
   6052 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
   6053 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
   6054 #define vst2q_p8 vst2q_u8
   6055 
   6056 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   6057 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
   6058 #define vst2q_p16 vst2q_u16
   6059 
   6060 //void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
   6061 _NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val)
   6062 {
   6063     uint8x8x2_t v;
   6064     v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
   6065     vst1q_u8 (ptr, v.val[0]);
   6066 }
   6067 #define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val)
   6068 
   6069 //void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
   6070 _NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val)
   6071 {
   6072     uint16x4x2_t v;
   6073     v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
   6074     vst1q_u16 (ptr, v.val[0]);
   6075 }
   6076 #define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val)
   6077 
   6078 //void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
   6079 _NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val)
   6080 {
   6081     uint32x2x2_t v;
   6082     v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
   6083     vst1q_u32 (ptr, v.val[0]);
   6084 }
   6085 #define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val)
   6086 
   6087 //void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
   6088 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val);
   6089 _NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val)
   6090 {
   6091     uint64x1x2_t v;
   6092     v.val[0] = _mm_unpacklo_epi64(val->val[0], val->val[1]);
   6093     vst1q_u64(ptr, v.val[0]);
   6094 }
   6095 #define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val)
   6096 
   6097 //void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
   6098 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
   6099 
   6100 //void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   6101 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
   6102 
   6103 //void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   6104 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
   6105 
   6106 //void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
   6107 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
   6108 
   6109 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   6110 //current IA SIMD doesn't support float16
   6111 
   6112 void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val);         // VST2.32 {d0, d1}, [r0]
   6113 _NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val)
   6114 {
   6115     float32x4x2_t v;
   6116     v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
   6117     vst1q_f32 (ptr, v.val[0]);
   6118 }
   6119 #define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val)
   6120 
   6121 //void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0]
   6122 #define vst2_p8 vst2_u8
   6123 
   6124 //void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   6125 #define vst2_p16 vst2_u16
   6126 
   6127 //******************** Triplets store  *****************************************
   6128 //******************************************************************************
   6129 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
   6130 #if defined(USE_SSSE3)
   6131 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
   6132 {
   6133     uint8x16x3_t v;
   6134     __m128i v0,v1,v2, cff, bldmask;
   6135     _NEON2SSE_ALIGN_16 uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
   6136     _NEON2SSE_ALIGN_16 uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
   6137     _NEON2SSE_ALIGN_16 uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
   6138     _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
   6139     _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
   6140     _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
   6141 
   6142     v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]);         //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
   6143     v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]);         //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
   6144     v1 =  _mm_alignr_epi8(v2, v0, 11);         //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
   6145     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0);         //make holes for the v.val[2] data embedding
   6146     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo);         //make plugs for the v.val[2] data embedding
   6147     cff = _mm_cmpeq_epi8(v0, v0);         //all ff
   6148     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
   6149     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
   6150     vst1q_u8(ptr,   v.val[0]);
   6151     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1);         //make holes for the v.val[2] data embedding
   6152     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med);         //make plugs for the v.val[2] data embedding
   6153     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
   6154     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
   6155     vst1q_u8((ptr + 16),  v.val[1]);
   6156     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2);         //make holes for the v.val[2] data embedding
   6157     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi);         //make plugs for the v.val[2] data embedding
   6158     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
   6159     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
   6160     vst1q_u8((ptr + 32),  v.val[2]);
   6161 }
   6162 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
   6163 #endif
   6164 
   6165 #if defined(USE_SSSE3)
   6166 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
   6167 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
   6168 {
   6169     uint16x8x3_t v;
   6170     __m128i v0,v1,v2, cff, bldmask;
   6171     _NEON2SSE_ALIGN_16 uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
   6172     _NEON2SSE_ALIGN_16 uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
   6173     _NEON2SSE_ALIGN_16 uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
   6174     _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
   6175     _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
   6176     _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
   6177 
   6178     v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]);         //0,1, 3,4, 6,7, 9,10
   6179     v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]);         //12,13, 15,16, 18,19, 21,22,
   6180     v1 =  _mm_alignr_epi8(v2, v0, 12);         //9,10, 12,13, 15,16, 18,19
   6181     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0);         //make holes for the v.val[2] data embedding
   6182     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo);         //make plugs for the v.val[2] data embedding
   6183     cff = _mm_cmpeq_epi16(v0, v0);         //all ff
   6184     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
   6185     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
   6186     vst1q_u16(ptr,      v.val[0]);
   6187     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1);         //make holes for the v.val[2] data embedding
   6188     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med);         //make plugs for the v.val[2] data embedding
   6189     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
   6190     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
   6191     vst1q_u16((ptr + 8),  v.val[1]);
   6192     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2);         //make holes for the v.val[2] data embedding
   6193     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi);         //make plugs for the v.val[2] data embedding
   6194     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
   6195     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
   6196     vst1q_u16((ptr + 16), v.val[2]);
   6197 }
   6198 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
   6199 #endif
   6200 
   6201 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
   6202 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
   6203 {   //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
   6204     uint32x4x3_t v;
   6205     __m128i tmp0, tmp1,tmp2;
   6206     tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
   6207     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
   6208     tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
   6209     v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
   6210     v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
   6211     v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
   6212     tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
   6213     v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
   6214 
   6215     vst1q_u32(ptr,      v.val[0]);
   6216     vst1q_u32((ptr + 4),  v.val[1]);
   6217     vst1q_u32((ptr + 8),  v.val[2]);
   6218 }
   6219 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
   6220 
   6221 #if defined(USE_SSSE3)
   6222 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
   6223 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
   6224 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
   6225 
   6226 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
   6227 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
   6228 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
   6229 #endif
   6230 
   6231 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
   6232 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
   6233 #define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
   6234 
   6235 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
   6236 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
   6237 // IA32 SIMD doesn't work with 16bit floats currently
   6238 
   6239 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
   6240 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
   6241 {
   6242      float32x4x3_t  v;
   6243     __m128 tmp0, tmp1,tmp2;
   6244     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
   6245     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
   6246     tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
   6247     v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
   6248     v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
   6249     v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
   6250     tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
   6251     v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
   6252 
   6253     vst1q_f32( ptr,    v.val[0]);
   6254     vst1q_f32( (ptr + 4),  v.val[1]);
   6255     vst1q_f32( (ptr + 8),  v.val[2]);
   6256 }
   6257 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
   6258 
   6259 #if defined(USE_SSSE3)
   6260 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
   6261 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
   6262 #define vst3q_p8 vst3q_u8
   6263 
   6264 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
   6265 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
   6266 #define vst3q_p16 vst3q_u16
   6267 #endif
   6268 
   6269 //void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0]
   6270 #if defined(USE_SSSE3)
   6271 _NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val)
   6272 {
   6273     uint8x8x3_t v;
   6274     __m128i tmp, sh0, sh1;
   6275     _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
   6276     _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
   6277     _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
   6278     _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
   6279     tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
   6280     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0);         //for bi>15 bi is wrapped (bi-=15)
   6281     sh1 =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
   6282     v.val[0] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
   6283     vst1q_u8(ptr,   v.val[0]);         //store as 128 bit structure
   6284     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1);         //for bi>15 bi is wrapped (bi-=15)
   6285     sh1 =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
   6286     v.val[1] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
   6287 }
   6288 #define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val)
   6289 #endif
   6290 
   6291 //void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0]
   6292 #if defined(USE_SSSE3)
   6293 _NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val)
   6294 {
   6295     uint16x4x3_t v;
   6296     __m128i tmp;
   6297     _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
   6298     _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
   6299     _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff};         //if all ones we take the result from v.val[0]  otherwise from v.val[1]
   6300     _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff};         //if all ones we take the result from v.val[1]  otherwise from v.val[0]
   6301     tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]);
   6302     v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
   6303     v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0);
   6304     v.val[0] = _MM_BLENDV_EPI8(v.val[1], v.val[0], *(__m128i*)mask0f);
   6305     vst1q_u16(ptr,     v.val[0]);         //store as 128 bit structure
   6306     v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
   6307     v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1);
   6308     v.val[1] = _MM_BLENDV_EPI8(v.val[0], v.val[1],  *(__m128i*)mask1f);         //change the operands order
   6309 }
   6310 #define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val)
   6311 #endif
   6312 
   6313 //void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
   6314 _NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val)
   6315 {         //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
   6316     uint32x2x3_t res;
   6317     res.val[0] = _mm_unpacklo_epi64(val->val[1], val->val[2]);         //val[0]: 1,4,2,5
   6318     res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (2 << 2) | (1 << 4) | (3 << 6));         //1,2,4,5
   6319     res.val[1] = _mm_srli_si128(res.val[0], 8);         //4,5, x,x
   6320     res.val[0] = _mm_unpacklo_epi32(val->val[0], res.val[0]);         //0,1,3,2
   6321     res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (1 << 2) | (3 << 4) | (2 << 6));         //0,1,2, 3
   6322     vst1q_u32(ptr, res.val[0]);         //store as 128 bit structure
   6323 }
   6324 #define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val)
   6325 
   6326 //void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0]
   6327 _NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val)
   6328 {
   6329     __m128i tmp;
   6330     tmp =  _mm_unpacklo_epi64(val->val[0], val->val[1]);
   6331     vst1q_u64(ptr, tmp);         //store as 128 bit structure
   6332 }
   6333 #define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val)
   6334 
   6335 #if defined(USE_SSSE3)
   6336 //void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val)  // VST3.8 {d0, d1, d2}, [r0]
   6337 #define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val)
   6338 
   6339 //void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val)  // VST3.16 {d0, d1, d2}, [r0]
   6340 #define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val)
   6341 #endif
   6342 
   6343 //void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
   6344 #define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val)
   6345 
   6346 //void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0]
   6347 #define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val)
   6348 
   6349 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   6350 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val);         // VST3.16 {d0, d1, d2}, [r0]
   6351 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   6352 
   6353 //void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0]
   6354 _NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val)
   6355 {         //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x;
   6356     float32x2x3_t res;
   6357     res.val[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(val->val[1]), _mm_castps_si128(val->val[2])) );
   6358     res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(3,1,2,0));         //1,2,4,5
   6359     res.val[1] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(1,0,3,2));         //4,5, 1,2
   6360     res.val[0] = _mm_unpacklo_ps(val->val[0], res.val[0]);         //0,1,3, 2
   6361     res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(2,3,1,0));         //0,1,2, 3
   6362     vst1q_f32(ptr, res.val[0]);         //store as 128 bit structure
   6363 }
   6364 #define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val)
   6365 
   6366 #if defined(USE_SSSE3)
   6367 //void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
   6368 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val);
   6369 #define vst3_p8 vst3_u8
   6370 
   6371 //void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   6372 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val);
   6373 #define vst3_p16 vst3_s16
   6374 #endif
   6375 
   6376 //***************  Quadruples store ********************************
   6377 //*********************************************************************
   6378 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
   6379 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
   6380 {
   6381     __m128i tmp1, tmp2, res;
   6382     tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]);         //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
   6383     tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]);         //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
   6384     res = _mm_unpacklo_epi16(tmp1, tmp2);         //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
   6385     vst1q_u8(ptr,  res);
   6386     res = _mm_unpackhi_epi16(tmp1, tmp2);         //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
   6387     vst1q_u8((ptr + 16), res);
   6388     tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]);         //
   6389     tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]);         //
   6390     res = _mm_unpacklo_epi16(tmp1, tmp2);         //
   6391     vst1q_u8((ptr + 32), res);
   6392     res = _mm_unpackhi_epi16(tmp1, tmp2);         //
   6393     vst1q_u8((ptr + 48), res);
   6394 }
   6395 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
   6396 
   6397 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
   6398 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
   6399 {
   6400     uint16x8x4_t v;
   6401     __m128i tmp1, tmp2;
   6402     tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
   6403     tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
   6404     v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
   6405     v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
   6406     tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
   6407     tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
   6408     v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
   6409     v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
   6410     vst1q_u16(ptr,     v.val[0]);
   6411     vst1q_u16((ptr + 8), v.val[1]);
   6412     vst1q_u16((ptr + 16),v.val[2]);
   6413     vst1q_u16((ptr + 24), v.val[3]);
   6414 }
   6415 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
   6416 
   6417 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
   6418 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
   6419 {
   6420     uint16x8x4_t v;
   6421     __m128i tmp1, tmp2;
   6422     tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
   6423     tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
   6424     v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2);
   6425     v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2);
   6426     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]);         //0,1, 4,5, 8,9, 12,13
   6427     tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]);         //2,3, 6,7 , 10,11, 14,15
   6428     v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
   6429     v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
   6430     vst1q_u32(ptr,      v.val[0]);
   6431     vst1q_u32((ptr + 4),  v.val[1]);
   6432     vst1q_u32((ptr + 8),  v.val[2]);
   6433     vst1q_u32((ptr + 12), v.val[3]);
   6434 }
   6435 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
   6436 
   6437 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
   6438 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
   6439 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
   6440 
   6441 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
   6442 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
   6443 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
   6444 
   6445 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
   6446 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
   6447 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
   6448 
   6449 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
   6450 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
   6451 // IA32 SIMD doesn't work with 16bit floats currently
   6452 
   6453 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
   6454 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
   6455 {
   6456     __m128 tmp3, tmp2, tmp1, tmp0;
   6457     float32x4x4_t v;
   6458     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
   6459     tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
   6460     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
   6461     tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
   6462     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
   6463     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
   6464     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
   6465     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
   6466     vst1q_f32(ptr,   v.val[0]);
   6467     vst1q_f32((ptr + 4), v.val[1]);
   6468     vst1q_f32((ptr + 8), v.val[2]);
   6469     vst1q_f32((ptr + 12), v.val[3]);
   6470 }
   6471 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
   6472 
   6473 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
   6474 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
   6475 #define vst4q_p8 vst4q_u8
   6476 
   6477 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
   6478 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
   6479 #define vst4q_p16 vst4q_s16
   6480 
   6481 //void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0]
   6482 _NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val)
   6483 {
   6484     uint8x8x4_t v;
   6485     __m128i sh0, sh1;
   6486     sh0 = _mm_unpacklo_epi8(val->val[0],val->val[1]);         // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
   6487     sh1 = _mm_unpacklo_epi8(val->val[2],val->val[3]);         // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
   6488     v.val[0] = _mm_unpacklo_epi16(sh0,sh1);         // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
   6489     v.val[2] = _mm_unpackhi_epi16(sh0,sh1);         //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
   6490     vst1q_u8(ptr,      v.val[0]);
   6491     vst1q_u8((ptr + 16),  v.val[2]);
   6492 }
   6493 #define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val)
   6494 
   6495 //void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0]
   6496 _NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val)
   6497 {
   6498     uint16x4x4_t v;
   6499     __m128i sh0, sh1;
   6500     sh0 = _mm_unpacklo_epi16(val->val[0],val->val[1]);         //a0,a1,b0,b1,c0,c1,d0,d1,
   6501     sh1 = _mm_unpacklo_epi16(val->val[2],val->val[3]);         //a2,a3,b2,b3,c2,c3,d2,d3
   6502     v.val[0] = _mm_unpacklo_epi32(sh0,sh1);         // a0,a1,a2,a3,b0,b1,b2,b3
   6503     v.val[2] = _mm_unpackhi_epi32(sh0,sh1);         // c0,c1,c2,c3,d0,d1,d2,d3
   6504     vst1q_u16(ptr,      v.val[0]);         //store as 128 bit structure
   6505     vst1q_u16((ptr + 8),  v.val[2]);
   6506 }
   6507 #define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val)
   6508 
   6509 //void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
   6510 _NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val)
   6511 {         //0,4,   1,5,  2,6,  3,7
   6512     uint32x2x4_t v;
   6513     __m128i sh0, sh1;
   6514     sh0 = _mm_unpacklo_epi32(val->val[0], val->val[1]);         //0,1,4,5
   6515     sh1 = _mm_unpacklo_epi32(val->val[2], val->val[3]);         //2,3,6,7
   6516     v.val[0] = _mm_unpacklo_epi64(sh0,sh1);         //
   6517     v.val[1] = _mm_unpackhi_epi64(sh0,sh1);         //
   6518     vst1q_u32(ptr,     v.val[0]);         //store as 128 bit structure
   6519     vst1q_u32((ptr + 4),  v.val[1]);
   6520 }
   6521 #define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val)
   6522 
   6523 //void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0]
   6524 _NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val)
   6525 {
   6526     vst1q_u64(ptr,    val->val[0]);
   6527     vst1q_u64((ptr + 2), val->val[2]);
   6528 }
   6529 #define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val)
   6530 
   6531 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
   6532 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
   6533 
   6534 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
   6535 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
   6536 
   6537 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
   6538 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
   6539 
   6540 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
   6541 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
   6542 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
   6543 
   6544 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   6545 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
   6546 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   6547 
   6548 //void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0]
   6549 _NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val)
   6550 {         //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   6551     float32x2x4_t v;
   6552     v.val[0] = _mm_unpacklo_ps(val->val[0],val->val[1]);
   6553     v.val[2] = _mm_unpacklo_ps(val->val[2],val->val[3]);
   6554     v.val[1] = _mm_movelh_ps (v.val[0], v.val[2]);         //a0, c0, a1,c1,
   6555     v.val[3] = _mm_movehl_ps (v.val[2],v.val[0]);         //b0,d0, b1, d1
   6556     vst1q_f32(ptr,     v.val[1]);         //store as 128 bit structure
   6557     vst1q_f32((ptr + 4),  v.val[3]);
   6558 }
   6559 #define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val)
   6560 
   6561 //void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
   6562 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val);
   6563 #define vst4_p8 vst4_u8
   6564 
   6565 //void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   6566 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val);
   6567 #define vst4_p16 vst4_u16
   6568 
   6569 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors  *********************
   6570 //********************************************************************************************************************
   6571 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
   6572 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
   6573 {
   6574     vst1q_lane_s16(ptr, val->val[0], lane);
   6575     vst1q_lane_s16((ptr + 1), val->val[1], lane);
   6576 }
   6577 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
   6578 
   6579 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
   6580 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
   6581 {
   6582     vst1q_lane_u32(ptr, val->val[0], lane);
   6583     vst1q_lane_u32((ptr + 1), val->val[1], lane);
   6584 }
   6585 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
   6586 
   6587 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   6588 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
   6589 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
   6590 
   6591 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
   6592 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
   6593 #define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
   6594 
   6595 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   6596 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
   6597 //current IA SIMD doesn't support float16
   6598 
   6599 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
   6600 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
   6601 {
   6602     vst1q_lane_f32(ptr, val->val[0], lane);
   6603     vst1q_lane_f32((ptr + 1), val->val[1], lane);
   6604 }
   6605 #define vst2q_lane_f32(ptr, val, lane) vst2q_lane_f32_ptr(ptr, &val, lane)
   6606 
   6607 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   6608 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
   6609 #define vst2q_lane_p16 vst2q_lane_s16
   6610 
   6611 //void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   6612 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane);         // VST2.16 {d0[0], d1[0]}, [r0]
   6613 #define vst2_lane_u16 vst2q_lane_u16
   6614 
   6615 //void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
   6616 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32 {d0[0], d1[0]}, [r0]
   6617 #define vst2_lane_u32 vst2q_lane_u32
   6618 
   6619 //void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   6620 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane);
   6621 #define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
   6622 
   6623 //void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   6624 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane);
   6625 #define vst2_lane_s16 vst2q_lane_s16
   6626 
   6627 //void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
   6628 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane);
   6629 #define vst2_lane_s32 vst2q_lane_s32
   6630 
   6631 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
   6632 //current IA SIMD doesn't support float16
   6633 
   6634 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane);         // VST2.32 {d0[0], d1[0]}, [r0]
   6635 #define vst2_lane_f32 vst2q_lane_f32
   6636 
   6637 //void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   6638 #define vst2_lane_p8 vst2_lane_u8
   6639 
   6640 //void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   6641 #define vst2_lane_p16 vst2_lane_u16
   6642 
   6643 //************************* Triple lanes  stores *******************************************************
   6644 //*******************************************************************************************************
   6645 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   6646 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
   6647 {
   6648     vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
   6649     vst1q_lane_u16((ptr + 2), val->val[2], lane);
   6650 }
   6651 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
   6652 
   6653 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   6654 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
   6655 {
   6656     vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
   6657     vst1q_lane_u32((ptr + 2), val->val[2], lane);
   6658 }
   6659 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
   6660 
   6661 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   6662 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
   6663 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
   6664 
   6665 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   6666 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
   6667 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
   6668 
   6669 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   6670 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
   6671 //current IA SIMD doesn't support float16
   6672 
   6673 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   6674 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
   6675 {
   6676     vst1q_lane_f32(ptr,   val->val[0], lane);
   6677     vst1q_lane_f32((ptr + 1),   val->val[1], lane);
   6678     vst1q_lane_f32((ptr + 2), val->val[2], lane);
   6679 }
   6680 #define vst3q_lane_f32(ptr, val, lane) vst3q_lane_f32_ptr(ptr, &val, lane)
   6681 
   6682 //void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   6683 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane);
   6684 #define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
   6685 
   6686 //void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   6687 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane);
   6688 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
   6689 
   6690 //void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   6691 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane);
   6692 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
   6693 
   6694 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   6695 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
   6696 //current IA SIMD doesn't support float16
   6697 
   6698 //void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   6699 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane);
   6700 #define vst3_lane_f32 vst3q_lane_f32
   6701 
   6702 //void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   6703 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane);
   6704 #define vst3_lane_p8 vst3_lane_u8
   6705 
   6706 //void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   6707 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane);
   6708 #define vst3_lane_p16 vst3_lane_s16
   6709 
   6710 //******************************** Quadruple lanes stores ***********************************************
   6711 //*******************************************************************************************************
   6712 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6713 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
   6714 {
   6715     vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
   6716     vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
   6717 }
   6718 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
   6719 
   6720 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6721 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
   6722 {
   6723     vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
   6724     vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
   6725 }
   6726 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
   6727 
   6728 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6729 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
   6730 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
   6731 
   6732 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6733 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
   6734 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
   6735 
   6736 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6737 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
   6738 //current IA SIMD doesn't support float16
   6739 
   6740 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6741 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
   6742 {
   6743     vst1q_lane_f32(ptr,   val->val[0], lane);
   6744     vst1q_lane_f32((ptr + 1), val->val[1], lane);
   6745     vst1q_lane_f32((ptr + 2), val->val[2], lane);
   6746     vst1q_lane_f32((ptr + 3), val->val[3], lane);
   6747 }
   6748 #define vst4q_lane_f32(ptr, val, lane) vst4q_lane_f32_ptr(ptr, &val, lane)
   6749 
   6750 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   6751 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
   6752 #define vst4q_lane_p16 vst4q_lane_u16
   6753 
   6754 //void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6755 _NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane)
   6756 {
   6757     vst1q_lane_u8(ptr,   val->val[0], lane);
   6758     vst1q_lane_u8((ptr + 1),  val->val[1], lane);
   6759     vst1q_lane_u8((ptr + 2), val->val[2], lane);
   6760     vst1q_lane_u8((ptr + 3), val->val[3], lane);
   6761 }
   6762 #define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane)
   6763 
   6764 //void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6765 _NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane)
   6766 {
   6767     vst1q_lane_u16(ptr,   val->val[0], lane);
   6768     vst1q_lane_u16((ptr + 1),val->val[1], lane);
   6769     vst1q_lane_u16((ptr + 2), val->val[2], lane);
   6770     vst1q_lane_u16((ptr + 3), val->val[3], lane);
   6771 }
   6772 #define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane)
   6773 
   6774 //void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6775 _NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane)
   6776 {
   6777     vst1q_lane_u32(ptr,   val->val[0], lane);
   6778     vst1q_lane_u32((ptr + 1), val->val[1], lane);
   6779     vst1q_lane_u32((ptr + 2), val->val[2], lane);
   6780     vst1q_lane_u32((ptr + 3), val->val[3], lane);
   6781 
   6782 }
   6783 #define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane)
   6784 
   6785 //void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6786 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
   6787 
   6788 //void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6789 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
   6790 
   6791 //void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6792 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
   6793 
   6794 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6795 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
   6796 //current IA SIMD doesn't support float16
   6797 
   6798 //void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6799 #define vst4_lane_f32 vst4q_lane_f32
   6800 
   6801 //void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6802 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane);
   6803 #define vst4_lane_p8 vst4_lane_u8
   6804 
   6805 //void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   6806 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane);
   6807 #define vst4_lane_p16 vst4_lane_u16
   6808 
   6809 //**************************************************************************************************
   6810 //************************ Extract lanes from a vector ********************************************
   6811 //**************************************************************************************************
   6812 //These intrinsics extract a single lane (element) from a vector.
   6813 
   6814 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
   6815 #define vgetq_lane_u8 _MM_EXTRACT_EPI8
   6816 
   6817 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.s16 r0, d0[0]
   6818 #define  vgetq_lane_u16 _MM_EXTRACT_EPI16
   6819 
   6820 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
   6821 #define vgetq_lane_u32 _MM_EXTRACT_EPI32
   6822 
   6823 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane);         // VMOV.S8 r0, d0[0]
   6824 #define vgetq_lane_s8 vgetq_lane_u8
   6825 
   6826 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane);         // VMOV.S16 r0, d0[0]
   6827 #define vgetq_lane_s16 vgetq_lane_u16
   6828 
   6829 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
   6830 #define vgetq_lane_s32 vgetq_lane_u32
   6831 
   6832 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.U8 r0, d0[0]
   6833 #define vgetq_lane_p8 vgetq_lane_u8
   6834 
   6835 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.s16 r0, d0[0]
   6836 #define vgetq_lane_p16 vgetq_lane_u16
   6837 
   6838 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 r0, d0[0]
   6839 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
   6840 {
   6841     int32_t ilane;
   6842     ilane = _MM_EXTRACT_PS(vec,lane);
   6843     return *(float*)&ilane;
   6844 }
   6845 
   6846 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
   6847 #define vgetq_lane_s64 (int64_t) vgetq_lane_u64
   6848 
   6849 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane);         // VMOV r0,r0,d0
   6850 #define vgetq_lane_u64 _MM_EXTRACT_EPI64
   6851 
   6852 // ***************** Set lanes within a vector ********************************************
   6853 // **************************************************************************************
   6854 //These intrinsics set a single lane (element) within a vector.
   6855 //same functions as vld1_lane_xx ones, but take the value to be set directly.
   6856 
   6857 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
   6858 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
   6859 {
   6860     uint8_t val;
   6861     val = value;
   6862     return vld1q_lane_u8(&val, vec,  lane);
   6863 }
   6864 
   6865 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
   6866 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
   6867 {
   6868     uint16_t val;
   6869     val = value;
   6870     return vld1q_lane_u16(&val, vec,  lane);
   6871 }
   6872 
   6873 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
   6874 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
   6875 {
   6876     uint32_t val;
   6877     val = value;
   6878     return vld1q_lane_u32(&val, vec,  lane);
   6879 }
   6880 
   6881 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
   6882 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
   6883 {
   6884     int8_t val;
   6885     val = value;
   6886     return vld1q_lane_s8(&val, vec,  lane);
   6887 }
   6888 
   6889 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
   6890 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
   6891 {
   6892     int16_t val;
   6893     val = value;
   6894     return vld1q_lane_s16(&val, vec,  lane);
   6895 }
   6896 
   6897 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
   6898 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
   6899 {
   6900     int32_t val;
   6901     val = value;
   6902     return vld1q_lane_s32(&val, vec,  lane);
   6903 }
   6904 
   6905 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane);         // VMOV.8 d0[0],r0
   6906 #define vsetq_lane_p8 vsetq_lane_u8
   6907 
   6908 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane);         // VMOV.16 d0[0],r0
   6909 #define vsetq_lane_p16 vsetq_lane_u16
   6910 
   6911 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane);         // VMOV.32 d0[0],r0
   6912 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
   6913 {
   6914     float32_t val;
   6915     val = value;
   6916 }
   6917 
   6918 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
   6919 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
   6920 {
   6921     uint64_t val;
   6922     val = value;
   6923     return vld1q_lane_s64(&val, vec,  lane);
   6924 }
   6925 
   6926 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane);         // VMOV d0,r0,r0
   6927 #define vsetq_lane_u64 vsetq_lane_s64
   6928 
   6929 // *******************************************************************************
   6930 // **************** Initialize a vector from bit pattern ***************************
   6931 // *******************************************************************************
   6932 //These intrinsics create a vector from a literal bit pattern.
   6933 
   6934 //no IA32 SIMD avalilable
   6935 
   6936 //********************* Set all lanes to same value ********************************
   6937 //*********************************************************************************
   6938 //These intrinsics set all lanes to the same value.
   6939 
   6940 uint8x16_t   vdupq_n_u8(uint8_t value);         // VDUP.8 q0,r0
   6941 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
   6942 
   6943 uint16x8_t   vdupq_n_u16(uint16_t value);         // VDUP.16 q0,r0
   6944 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
   6945 
   6946 uint32x4_t   vdupq_n_u32(uint32_t value);         // VDUP.32 q0,r0
   6947 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
   6948 
   6949 int8x16_t   vdupq_n_s8(int8_t value);         // VDUP.8 q0,r0
   6950 #define vdupq_n_s8 _mm_set1_epi8
   6951 
   6952 int16x8_t   vdupq_n_s16(int16_t value);         // VDUP.16 q0,r0
   6953 #define vdupq_n_s16 _mm_set1_epi16
   6954 
   6955 int32x4_t   vdupq_n_s32(int32_t value);         // VDUP.32 q0,r0
   6956 #define vdupq_n_s32 _mm_set1_epi32
   6957 
   6958 poly8x16_t vdupq_n_p8(poly8_t value);         // VDUP.8 q0,r0
   6959 #define  vdupq_n_p8 vdupq_n_u8
   6960 
   6961 poly16x8_t vdupq_n_p16(poly16_t value);         // VDUP.16 q0,r0
   6962 #define  vdupq_n_p16 vdupq_n_u16
   6963 
   6964 float32x4_t vdupq_n_f32(float32_t value);         // VDUP.32 q0,r0
   6965 #define vdupq_n_f32 _mm_set1_ps
   6966 
   6967 int64x2_t   vdupq_n_s64(int64_t value);         // VMOV d0,r0,r0
   6968 _NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
   6969 {
   6970     _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value};         //value may be an immediate
   6971     return LOAD_SI128(value2);
   6972 }
   6973 
   6974 uint64x2_t   vdupq_n_u64(uint64_t value);         // VMOV d0,r0,r0
   6975 _NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
   6976 {
   6977     _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value};         //value may be an immediate
   6978     return LOAD_SI128(val);
   6979 }
   6980 
   6981 //****  Set all lanes to same value  ************************
   6982 //Same functions as above - just aliaces.********************
   6983 //Probably they reflect the fact that 128-bit functions versions use VMOV instruction **********
   6984 
   6985 uint8x16_t vmovq_n_u8(uint8_t value);         // VDUP.8 q0,r0
   6986 #define vmovq_n_u8 vdupq_n_u8
   6987 
   6988 uint16x8_t vmovq_n_u16(uint16_t value);         // VDUP.16 q0,r0
   6989 #define vmovq_n_u16 vdupq_n_s16
   6990 
   6991 uint32x4_t vmovq_n_u32(uint32_t value);         // VDUP.32 q0,r0
   6992 #define vmovq_n_u32 vdupq_n_u32
   6993 
   6994 int8x16_t vmovq_n_s8(int8_t value);         // VDUP.8 q0,r0
   6995 #define vmovq_n_s8 vdupq_n_s8
   6996 
   6997 int16x8_t vmovq_n_s16(int16_t value);         // VDUP.16 q0,r0
   6998 #define vmovq_n_s16 vdupq_n_s16
   6999 
   7000 int32x4_t vmovq_n_s32(int32_t value);         // VDUP.32 q0,r0
   7001 #define vmovq_n_s32 vdupq_n_s32
   7002 
   7003 poly8x16_t vmovq_n_p8(poly8_t value);         // VDUP.8 q0,r0
   7004 #define vmovq_n_p8 vdupq_n_u8
   7005 
   7006 poly16x8_t vmovq_n_p16(poly16_t value);         // VDUP.16 q0,r0
   7007 #define vmovq_n_p16 vdupq_n_s16
   7008 
   7009 float32x4_t vmovq_n_f32(float32_t value);         // VDUP.32 q0,r0
   7010 #define vmovq_n_f32 vdupq_n_f32
   7011 
   7012 int64x2_t vmovq_n_s64(int64_t value);         // VMOV d0,r0,r0
   7013 #define vmovq_n_s64 vdupq_n_s64
   7014 
   7015 uint64x2_t vmovq_n_u64(uint64_t value);         // VMOV d0,r0,r0
   7016 #define vmovq_n_u64 vdupq_n_u64
   7017 
   7018 //**************Set all lanes to the value of one lane of a vector *************
   7019 //****************************************************************************
   7020 //here shuffle is better solution than lane extraction followed by set1 function
   7021 
   7022 // ********************************************************************
   7023 // ********************  Combining vectors *****************************
   7024 // ********************************************************************
   7025 //These intrinsics join two 64 bit vectors into a single 128bit vector.
   7026 
   7027 //current IA SIMD doesn't support float16
   7028 
   7029 //**********************************************************************
   7030 //************************* Splitting vectors **************************
   7031 //**********************************************************************
   7032 //**************** Get high part ******************************************
   7033 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
   7034 
   7035 // IA32 SIMD doesn't work with 16bit floats currently
   7036 
   7037 //********************** Get low part **********************
   7038 //**********************************************************
   7039 
   7040 // IA32 SIMD doesn't work with 16bit floats currently
   7041 
   7042 //**************************************************************************
   7043 //************************ Converting vectors **********************************
   7044 //**************************************************************************
   7045 //************* Convert from float ***************************************
   7046 // need to set _MM_SET_ROUNDING_MODE ( x) accordingly
   7047 
   7048 int32x4_t   vcvtq_s32_f32(float32x4_t a);         // VCVT.S32.F32 q0, q0
   7049 #define vcvtq_s32_f32 _mm_cvtps_epi32
   7050 
   7051 uint32x4_t vcvtq_u32_f32(float32x4_t a);         // VCVT.U32.F32 q0, q0
   7052 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a)         // VCVT.U32.F32 q0, q0
   7053 {         //No single instruction SSE solution  but we could implement it as following:
   7054     __m128i resi;
   7055     __m128 zero,  mask, a_pos, mask_f_max_si, res;
   7056     _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
   7057     zero = _mm_setzero_ps();
   7058     mask = _mm_cmpgt_ps(a, zero);
   7059     a_pos = _mm_and_ps(a, mask);
   7060     mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff);
   7061     res =  _mm_sub_ps(a_pos, mask_f_max_si);         //if the input fits to signed we don't subtract anything
   7062     resi = _mm_cvtps_epi32(res);
   7063     return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si);
   7064 }
   7065 
   7066 // ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
   7067 //*************************************************************************************************
   7068 //Intel SIMD doesn't support fixed point
   7069 
   7070 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.S32.F32 q0, q0, #32
   7071 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b);         // VCVT.U32.F32 q0, q0, #32
   7072 
   7073 //***************** Convert to float *************************
   7074 //*************************************************************
   7075 
   7076 float32x4_t vcvtq_f32_s32(int32x4_t a);         // VCVT.F32.S32 q0, q0
   7077 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
   7078 
   7079 float32x4_t vcvtq_f32_u32(uint32x4_t a);         // VCVT.F32.U32 q0, q0
   7080 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a)         // VCVT.F32.U32 q0, q0
   7081 {         //solution may be not optimal
   7082     __m128 two16, fHi, fLo;
   7083     __m128i hi, lo;
   7084     two16 = _mm_set1_ps((float)0x10000);         //2^16
   7085     // Avoid double rounding by doing two exact conversions
   7086     // of high and low 16-bit segments
   7087     hi = _mm_srli_epi32(a, 16);
   7088     lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
   7089     fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
   7090     fLo = _mm_cvtepi32_ps(lo);
   7091     // do single rounding according to current rounding mode
   7092     return _mm_add_ps(fHi, fLo);
   7093 }
   7094 
   7095 //**************Convert between floats ***********************
   7096 //************************************************************
   7097 
   7098 //Intel SIMD doesn't support 16bits floats curently
   7099 
   7100 //Intel SIMD doesn't support 16bits floats curently, the only solution is to store 16bit floats and load as 32 bits
   7101 
   7102 //************Vector narrow integer conversion (truncation) ******************
   7103 //****************************************************************************
   7104 
   7105 //**************** Vector long move   ***********************
   7106 //***********************************************************
   7107 
   7108 //*************Vector saturating narrow integer*****************
   7109 //**************************************************************
   7110 
   7111 //************* Vector saturating narrow integer signed->unsigned **************
   7112 //*****************************************************************************
   7113 
   7114 // ********************************************************
   7115 // **************** Table look up **************************
   7116 // ********************************************************
   7117 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
   7118 //in a table and generate a new vector. Indexes out of range return 0.
   7119 //for Intel SIMD we need to set the MSB to 1 for zero return
   7120 
   7121 //Special trick to avoid __declspec(align('8')) won't be aligned" error
   7122 
   7123 //Special trick to avoid __declspec(align('16')) won't be aligned" error
   7124 
   7125 //****************** Extended table look up intrinsics ***************************
   7126 //**********************************************************************************
   7127 //VTBX (Vector Table Extension) works in the same way as VTBL do,
   7128 // except that indexes out of range leave the destination element unchanged.
   7129 
   7130 //Special trick to avoid __declspec(align('8')) won't be aligned" error
   7131 
   7132 //*************************************************************************************************
   7133 // *************************** Operations with a scalar value *********************************
   7134 //*************************************************************************************************
   7135 
   7136 //******* Vector multiply accumulate by scalar *************************************************
   7137 //**********************************************************************************************
   7138 
   7139 //***************** Vector widening multiply accumulate by scalar **********************
   7140 //***************************************************************************************
   7141 
   7142 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
   7143 // ************************************************************************************************
   7144 
   7145 // ****** Vector multiply subtract by scalar *****************
   7146 // *************************************************************
   7147 
   7148 // **** Vector widening multiply subtract by scalar ****
   7149 // ****************************************************
   7150 
   7151 //********* Vector widening saturating doubling multiply subtract by scalar **************************
   7152 //******************************************************************************************************
   7153 
   7154 //********** Vector multiply with scalar *****************************
   7155 
   7156 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b);         // VMUL.I16 q0,q0,d0[0]
   7157 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b)         // VMUL.I16 q0,q0,d0[0]
   7158 {
   7159     int16x8_t b16x8;
   7160     b16x8 = vdupq_n_s16(b);
   7161     return vmulq_s16(a, b16x8);
   7162 }
   7163 
   7164 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b);         // VMUL.I32 q0,q0,d0[0]
   7165 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b)         // VMUL.I32 q0,q0,d0[0]
   7166 {
   7167     int32x4_t b32x4;
   7168     b32x4 = vdupq_n_s32(b);
   7169     return vmulq_s32(a, b32x4);
   7170 }
   7171 
   7172 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b);         // VMUL.F32 q0,q0,d0[0]
   7173 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b)         // VMUL.F32 q0,q0,d0[0]
   7174 {
   7175     float32x4_t b32x4;
   7176     b32x4 = vdupq_n_f32(b);
   7177     return vmulq_f32(a, b32x4);
   7178 }
   7179 
   7180 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b);         // VMUL.I16 q0,q0,d0[0]
   7181 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b)         // VMUL.I16 q0,q0,d0[0]
   7182 {
   7183     uint16x8_t b16x8;
   7184     b16x8 = vdupq_n_s16(b);
   7185     return vmulq_s16(a, b16x8);
   7186 }
   7187 
   7188 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b);         // VMUL.I32 q0,q0,d0[0]
   7189 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b)         // VMUL.I32 q0,q0,d0[0]
   7190 {
   7191     uint32x4_t b32x4;
   7192     b32x4 = vdupq_n_u32(b);
   7193     return vmulq_u32(a, b32x4);
   7194 }
   7195 
   7196 //**** Vector long multiply with scalar ************
   7197 
   7198 //**** Vector long multiply by scalar ****
   7199 
   7200 //********* Vector saturating doubling long multiply with scalar  *******************
   7201 
   7202 //************* Vector saturating doubling long multiply by scalar ***********************************************
   7203 
   7204 // *****Vector saturating doubling multiply high with scalar *****
   7205 
   7206 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2);         //  VQDMULH.S16 q0,q0,d0[0]
   7207 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2)         //  VQDMULH.S16 q0,q0,d0[0]
   7208 {         //solution may be not optimal
   7209     int16x8_t scalar;
   7210     scalar = vdupq_n_s16(val2);
   7211     return vqdmulhq_s16(vec1, scalar);
   7212 }
   7213 
   7214 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2);         //  VQDMULH.S32 q0,q0,d0[0]
   7215 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   7216 {
   7217     int32x4_t scalar;
   7218     scalar = vdupq_n_s32(val2);
   7219     return vqdmulhq_s32(vec1, scalar);
   7220 }
   7221 
   7222 //***** Vector saturating doubling multiply high by scalar ****************
   7223 
   7224 //******** Vector saturating rounding doubling multiply high with scalar ***
   7225 
   7226 #if defined(USE_SSSE3)
   7227 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2);         // VQRDMULH.S16 q0,q0,d0[0]
   7228 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2)         // VQRDMULH.S16 q0,q0,d0[0]
   7229 {         //solution may be not optimal
   7230     int16x8_t scalar;
   7231     scalar = vdupq_n_s16(val2);
   7232     return vqrdmulhq_s16(vec1, scalar);
   7233 }
   7234 #endif
   7235 
   7236 #if defined(USE_SSSE3)
   7237 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2);         // VQRDMULH.S32 q0,q0,d0[0]
   7238 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   7239 {
   7240     int32x4_t scalar;
   7241     scalar = vdupq_n_s32(val2);
   7242     return vqrdmulhq_s32(vec1, scalar);
   7243 }
   7244 #endif
   7245 
   7246 //********* Vector rounding saturating doubling multiply high by scalar  ****
   7247 
   7248 //**************Vector multiply accumulate with scalar *******************
   7249 
   7250 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLA.I16 q0, q0, d0[0]
   7251 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c)         // VMLA.I16 q0, q0, d0[0]
   7252 {
   7253     int16x8_t scalar;
   7254     scalar = vdupq_n_s16(c);
   7255     return vmlaq_s16(a,b,scalar);
   7256 }
   7257 
   7258 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLA.I32 q0, q0, d0[0]
   7259 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c)         // VMLA.I32 q0, q0, d0[0]
   7260 {
   7261     int32x4_t scalar;
   7262     scalar = vdupq_n_s32(c);
   7263     return vmlaq_s32(a,b,scalar);
   7264 }
   7265 
   7266 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLA.I16 q0, q0, d0[0]
   7267 #define vmlaq_n_u16 vmlaq_n_s16
   7268 
   7269 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLA.I32 q0, q0, d0[0]
   7270 #define vmlaq_n_u32 vmlaq_n_s32
   7271 
   7272 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLA.F32 q0, q0, d0[0]
   7273 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c)         // VMLA.F32 q0, q0, d0[0]
   7274 {
   7275     float32x4_t scalar;
   7276     scalar = vdupq_n_f32(c);
   7277     return vmlaq_f32(a,b,scalar);
   7278 }
   7279 
   7280 //************Vector widening multiply accumulate with scalar****************************
   7281 
   7282 //************ Vector widening saturating doubling multiply accumulate with scalar **************
   7283 
   7284 //******** Vector multiply subtract with scalar **************
   7285 
   7286 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c);         // VMLS.I16 q0, q0, d0[0]
   7287 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c)         // VMLS.I16 q0, q0, d0[0]
   7288 {
   7289     int16x8_t vc;
   7290     vc = vdupq_n_s16(c);
   7291     return vmlsq_s16(a, b,vc);
   7292 }
   7293 
   7294 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c);         // VMLS.I32 q0, q0, d0[0]
   7295 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c)         // VMLS.I32 q0, q0, d0[0]
   7296 {
   7297     int32x4_t vc;
   7298     vc = vdupq_n_s32(c);
   7299     return vmlsq_s32(a,b,vc);
   7300 }
   7301 
   7302 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c);         // VMLS.I16 q0, q0, d0[0]
   7303 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c)         // VMLS.I16 q0, q0, d0[0]
   7304 {
   7305     uint32x4_t vc;
   7306     vc = vdupq_n_u32(c);
   7307     return vmlsq_u32(a,b,vc);
   7308 }
   7309 
   7310 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c);         // VMLS.I32 q0, q0, d0[0]
   7311 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c)         // VMLS.I32 q0, q0, d0[0]
   7312 {
   7313     uint32x4_t vc;
   7314     vc = vdupq_n_u32(c);
   7315     return vmlsq_u32(a,b,vc);
   7316 }
   7317 
   7318 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c);         // VMLS.F32 q0, q0, d0[0]
   7319 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
   7320 {
   7321     float32x4_t vc;
   7322     vc = vdupq_n_f32(c);
   7323     return vmlsq_f32(a,b,vc);
   7324 }
   7325 
   7326 //**** Vector widening multiply subtract with scalar ******
   7327 
   7328 //***** Vector widening saturating doubling multiply subtract with scalar *********
   7329 //**********************************************************************************
   7330 
   7331 //*******************  Vector extract ***********************************************
   7332 //*************************************************************************************
   7333 //VEXT (Vector Extract) extracts  elements from the bottom end of the second operand
   7334 //vector and the top end of the first, concatenates them, and places the result in the destination vector
   7335 //c elements from the bottom end of the second operand and (8-c) from the top end of the first
   7336 
   7337 #if defined(USE_SSSE3)
   7338 //same result tested
   7339 
   7340 #endif
   7341 
   7342 #if defined(USE_SSSE3)
   7343 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
   7344 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
   7345 
   7346 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
   7347 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
   7348 
   7349 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c);         // VEXT.8 q0,q0,q0,#0
   7350 #define vextq_p8 vextq_s8
   7351 
   7352 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
   7353 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
   7354 
   7355 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
   7356 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
   7357 
   7358 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c);         // VEXT.16 q0,q0,q0,#0
   7359 #define vextq_p16 vextq_s16
   7360 #endif
   7361 
   7362 #if defined(USE_SSSE3)
   7363 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
   7364 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
   7365 
   7366 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c);         // VEXT.32 q0,q0,q0,#0
   7367 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
   7368 
   7369 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
   7370 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
   7371 
   7372 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c);         // VEXT.64 q0,q0,q0,#0
   7373 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
   7374 #endif
   7375 
   7376 //************ Reverse vector elements (swap endianness)*****************
   7377 //*************************************************************************
   7378 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
   7379 
   7380 #if defined(USE_SSSE3)
   7381 int8x16_t vrev64q_s8(int8x16_t vec);         // VREV64.8 q0,q0
   7382 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec)         // VREV64.8 q0,q0
   7383 {
   7384     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
   7385     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   7386 }
   7387 #endif
   7388 
   7389 #if defined(USE_SSSE3)
   7390 int16x8_t vrev64q_s16(int16x8_t vec);         // VREV64.16 q0,q0
   7391 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec)         // VREV64.16 q0,q0
   7392 {         //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
   7393     _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
   7394     return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
   7395 }
   7396 #endif
   7397 
   7398 int32x4_t vrev64q_s32(int32x4_t vec);         // VREV64.32 q0,q0
   7399 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec)         // VREV64.32 q0,q0
   7400 {
   7401     return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
   7402 }
   7403 
   7404 #if defined(USE_SSSE3)
   7405 uint8x16_t vrev64q_u8(uint8x16_t vec);         // VREV64.8 q0,q0
   7406 #define vrev64q_u8 vrev64q_s8
   7407 
   7408 uint16x8_t vrev64q_u16(uint16x8_t vec);         // VREV64.16 q0,q0
   7409 #define vrev64q_u16 vrev64q_s16
   7410 #endif
   7411 
   7412 uint32x4_t vrev64q_u32(uint32x4_t vec);         // VREV64.32 q0,q0
   7413 #define vrev64q_u32 vrev64q_s32
   7414 
   7415 #if defined(USE_SSSE3)
   7416 poly8x16_t vrev64q_p8(poly8x16_t vec);         // VREV64.8 q0,q0
   7417 #define vrev64q_p8 vrev64q_u8
   7418 
   7419 poly16x8_t vrev64q_p16(poly16x8_t vec);         // VREV64.16 q0,q0
   7420 #define vrev64q_p16 vrev64q_s16
   7421 #endif
   7422 
   7423 float32x4_t vrev64q_f32(float32x4_t vec);         // VREV64.32 q0,q0
   7424 #define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
   7425 
   7426 //********************  32 bit shuffles **********************
   7427 //************************************************************
   7428 
   7429 #if defined(USE_SSSE3)
   7430 int8x16_t vrev32q_s8(int8x16_t vec);         // VREV32.8 q0,q0
   7431 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec)         // VREV32.8 q0,q0
   7432 {
   7433     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
   7434     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   7435 }
   7436 #endif
   7437 
   7438 #if defined(USE_SSSE3)
   7439 int16x8_t vrev32q_s16(int16x8_t vec);         // VREV32.16 q0,q0
   7440 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec)         // VREV32.16 q0,q0
   7441 {
   7442     _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
   7443     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   7444 }
   7445 #endif
   7446 
   7447 #if defined(USE_SSSE3)
   7448 uint8x16_t vrev32q_u8(uint8x16_t vec);         // VREV32.8 q0,q0
   7449 #define vrev32q_u8 vrev32q_s8
   7450 
   7451 uint16x8_t vrev32q_u16(uint16x8_t vec);         // VREV32.16 q0,q0
   7452 #define vrev32q_u16 vrev32q_s16
   7453 
   7454 poly8x16_t vrev32q_p8(poly8x16_t vec);         // VREV32.8 q0,q0
   7455 #define vrev32q_p8 vrev32q_u8
   7456 #endif
   7457 
   7458 //*************  16 bit shuffles **********************
   7459 //******************************************************
   7460 
   7461 #if defined(USE_SSSE3)
   7462 int8x16_t vrev16q_s8(int8x16_t vec);         // VREV16.8 q0,q0
   7463 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec)         // VREV16.8 q0,q0
   7464 {
   7465     _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
   7466     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
   7467 }
   7468 #endif
   7469 
   7470 #if defined(USE_SSSE3)
   7471 uint8x16_t vrev16q_u8(uint8x16_t vec);         // VREV16.8 q0,q0
   7472 #define vrev16q_u8 vrev16q_s8
   7473 
   7474 poly8x16_t vrev16q_p8(poly8x16_t vec);         // VREV16.8 q0,q0
   7475 #define vrev16q_p8 vrev16q_u8
   7476 #endif
   7477 
   7478 //*********************************************************************
   7479 //**************** Other single operand arithmetic *******************
   7480 //*********************************************************************
   7481 
   7482 //*********** Absolute: Vd[i] = |Va[i]| **********************************
   7483 //************************************************************************
   7484 
   7485 int8x16_t   vabsq_s8(int8x16_t a);         // VABS.S8 q0,q0
   7486 #define vabsq_s8 _mm_abs_epi8
   7487 
   7488 int16x8_t   vabsq_s16(int16x8_t a);         // VABS.S16 q0,q0
   7489 #define vabsq_s16 _mm_abs_epi16
   7490 
   7491 int32x4_t   vabsq_s32(int32x4_t a);         // VABS.S32 q0,q0
   7492 #define vabsq_s32 _mm_abs_epi32
   7493 
   7494 float32x4_t vabsq_f32(float32x4_t a);         // VABS.F32 q0,q0
   7495 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a)         // VABS.F32 q0,q0
   7496 {
   7497     _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
   7498     return _mm_and_ps (a, *(__m128*)c7fffffff);
   7499 }
   7500 
   7501 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
   7502 //**********************************************************************
   7503 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place
   7504 
   7505 #if defined(USE_SSSE3)
   7506 int8x16_t vqabsq_s8(int8x16_t a);         // VQABS.S8 q0,q0
   7507 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a)         // VQABS.S8 q0,q0
   7508 {
   7509     __m128i c_128, abs, abs_cmp;
   7510     c_128 = _mm_set1_epi8 (0x80);         //-128
   7511     abs = _mm_abs_epi8 (a);
   7512     abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
   7513     return _mm_xor_si128 (abs,  abs_cmp);
   7514 }
   7515 #endif
   7516 
   7517 #if defined(USE_SSSE3)
   7518 int16x8_t vqabsq_s16(int16x8_t a);         // VQABS.S16 q0,q0
   7519 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a)         // VQABS.S16 q0,q0
   7520 {
   7521     __m128i c_32768, abs, abs_cmp;
   7522     c_32768 = _mm_set1_epi16 (0x8000);         //-32768
   7523     abs = _mm_abs_epi16 (a);
   7524     abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
   7525     return _mm_xor_si128 (abs,  abs_cmp);
   7526 }
   7527 #endif
   7528 
   7529 #if defined(USE_SSSE3)
   7530 int32x4_t vqabsq_s32(int32x4_t a);         // VQABS.S32 q0,q0
   7531 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a)         // VQABS.S32 q0,q0
   7532 {
   7533     __m128i c80000000, abs, abs_cmp;
   7534     c80000000 = _mm_set1_epi32 (0x80000000);         //most negative value
   7535     abs = _mm_abs_epi32 (a);
   7536     abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
   7537     return _mm_xor_si128 (abs,  abs_cmp);
   7538 }
   7539 #endif
   7540 
   7541 //*************** Negate: Vd[i] = - Va[i] *************************************
   7542 //*****************************************************************************
   7543 //several Negate implementations possible for SIMD.
   7544 //e.//function _mm_sign function(a, negative numbers vector), but the following one gives good performance:
   7545 
   7546 int8x16_t vnegq_s8(int8x16_t a);         // VNE//q0,q0
   7547 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a)         // VNE//q0,q0
   7548 {
   7549     __m128i zero;
   7550     zero = _mm_setzero_si128 ();
   7551     return _mm_sub_epi8 (zero, a);
   7552 }         //or _mm_sign_epi8 (a, negative numbers vector)
   7553 
   7554 int16x8_t vnegq_s16(int16x8_t a);         // VNE//q0,q0
   7555 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a)         // VNE//q0,q0
   7556 {
   7557     __m128i zero;
   7558     zero = _mm_setzero_si128 ();
   7559     return _mm_sub_epi16 (zero, a);
   7560 }         //or _mm_sign_epi16 (a, negative numbers vector)
   7561 
   7562 int32x4_t vnegq_s32(int32x4_t a);         // VNE//q0,q0
   7563 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a)         // VNE//q0,q0
   7564 {
   7565     __m128i zero;
   7566     zero = _mm_setzero_si128 ();
   7567     return _mm_sub_epi32 (zero, a);
   7568 }         //or _mm_sign_epi32 (a, negative numbers vector)
   7569 
   7570 float32x4_t vnegq_f32(float32x4_t a);         // VNE//q0,q0
   7571 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a)         // VNE//q0,q0
   7572 {
   7573     _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   7574     return _mm_xor_ps (a, *(__m128*) c80000000);
   7575 }
   7576 
   7577 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
   7578 //***************************************************************************************
   7579 //For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive
   7580 
   7581 int8x16_t vqnegq_s8(int8x16_t a);         // VQNE//q0,q0
   7582 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a)         // VQNE//q0,q0
   7583 {
   7584     __m128i zero;
   7585     zero = _mm_setzero_si128 ();
   7586     return _mm_subs_epi8 (zero, a);         //saturating substraction
   7587 }
   7588 
   7589 int16x8_t vqnegq_s16(int16x8_t a);         // VQNE//q0,q0
   7590 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a)         // VQNE//q0,q0
   7591 {
   7592     __m128i zero;
   7593     zero = _mm_setzero_si128 ();
   7594     return _mm_subs_epi16 (zero, a);         //saturating substraction
   7595 }
   7596 
   7597 int32x4_t vqnegq_s32(int32x4_t a);         // VQNE//q0,q0
   7598 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a)         // VQNE//q0,q0
   7599 {         //solution may be not optimal compared with a serial
   7600     __m128i c80000000, zero, sub, cmp;
   7601     c80000000 = _mm_set1_epi32 (0x80000000);         //most negative value
   7602     zero = _mm_setzero_si128 ();
   7603     sub =  _mm_sub_epi32 (zero, a);         //substraction
   7604     cmp = _mm_cmpeq_epi32 (a, c80000000);
   7605     return _mm_xor_si128 (sub,  cmp);
   7606 }
   7607 
   7608 //****************** Count leading zeros ********************************
   7609 //**************************************************************************
   7610 //no corresponding vector intrinsics in IA32, need to implement it.  While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
   7611 
   7612 #if defined(USE_SSSE3)
   7613 int8x16_t vclzq_s8(int8x16_t a);         // VCLZ.I8 q0,q0
   7614 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
   7615 {
   7616     _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
   7617                                        /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
   7618                                        /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
   7619                                        /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0};
   7620     __m128i maskLOW, c4, lowclz, mask, hiclz;
   7621     maskLOW = _mm_set1_epi8(0x0f);         //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
   7622     c4 = _mm_set1_epi8(4);
   7623     lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a);         //uses low 4 bits anyway
   7624     mask =  _mm_srli_epi16(a, 4);         //get high 4 bits as low bits
   7625     mask = _mm_and_si128(mask, maskLOW);         //low 4 bits, need masking to avoid zero if MSB is set
   7626     hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask);         //uses low 4 bits anyway
   7627     mask = _mm_cmpeq_epi8(hiclz, c4);         // shows the need to add lowclz zeros
   7628     lowclz = _mm_and_si128(lowclz,mask);
   7629     return _mm_add_epi8(lowclz, hiclz);
   7630 }
   7631 #endif
   7632 
   7633 #if defined(USE_SSSE3)
   7634 int16x8_t vclzq_s16(int16x8_t a);         // VCLZ.I16 q0,q0
   7635 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
   7636 {
   7637     __m128i c7, res8x16, res8x16_swap;
   7638     _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   7639     _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
   7640     c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5);         //7
   7641     res8x16 = vclzq_s8(a);
   7642     res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab);         //horisontal pairs swap
   7643     res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit);         //lowclz
   7644     res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit);         //hiclz
   7645     c7 = _mm_cmpgt_epi16(res8x16_swap, c7);         // shows the need to add lowclz zeros
   7646     res8x16 = _mm_and_si128(res8x16, c7);         //lowclz
   7647     return _mm_add_epi16(res8x16_swap, res8x16);
   7648 }
   7649 #endif
   7650 
   7651 int32x4_t vclzq_s32(int32x4_t a);         // VCLZ.I32 q0,q0
   7652 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
   7653 {
   7654     __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
   7655     c55555555 = _mm_set1_epi32(0x55555555);
   7656     c33333333 = _mm_set1_epi32(0x33333333);
   7657     c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
   7658     c3f = _mm_set1_epi32(0x3f);
   7659     c32 = _mm_set1_epi32(32);
   7660     tmp = _mm_srli_epi32(a, 1);
   7661     res = _mm_or_si128(tmp, a);         //atmp[i] |= (atmp[i] >> 1);
   7662     tmp = _mm_srli_epi32(res, 2);
   7663     res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 2);
   7664     tmp = _mm_srli_epi32(res, 4);
   7665     res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 4);
   7666     tmp = _mm_srli_epi32(res, 8);
   7667     res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 8);
   7668     tmp = _mm_srli_epi32(res, 16);
   7669     res = _mm_or_si128(tmp, res);         //atmp[i] |= (atmp[i] >> 16);
   7670 
   7671     tmp = _mm_srli_epi32(res, 1);
   7672     tmp = _mm_and_si128(tmp, c55555555);
   7673     res = _mm_sub_epi32(res, tmp);         //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
   7674 
   7675     tmp = _mm_srli_epi32(res, 2);
   7676     tmp = _mm_and_si128(tmp, c33333333);
   7677     tmp1 = _mm_and_si128(res, c33333333);
   7678     res = _mm_add_epi32(tmp, tmp1);         //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
   7679 
   7680     tmp = _mm_srli_epi32(res, 4);
   7681     tmp = _mm_add_epi32(tmp, res);
   7682     res = _mm_and_si128(tmp, c0f0f0f0f);         //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
   7683 
   7684     tmp = _mm_srli_epi32(res, 8);
   7685     res = _mm_add_epi32(tmp, res);         //atmp[i] += (atmp[i] >> 8);
   7686 
   7687     tmp = _mm_srli_epi32(res, 16);
   7688     res = _mm_add_epi32(tmp, res);         //atmp[i] += (atmp[i] >> 16);
   7689 
   7690     res = _mm_and_si128(res, c3f);         //atmp[i] = atmp[i] & 0x0000003f;
   7691 
   7692     return _mm_sub_epi32(c32, res);         //res[i] = 32 - atmp[i];
   7693 }
   7694 
   7695 #if defined(USE_SSSE3)
   7696 uint8x16_t vclzq_u8(uint8x16_t a);         // VCLZ.I8 q0,q0
   7697 #define vclzq_u8 vclzq_s8
   7698 
   7699 uint16x8_t vclzq_u16(uint16x8_t a);         // VCLZ.I16 q0,q0
   7700 #define vclzq_u16 vclzq_s16
   7701 #endif
   7702 
   7703 uint32x4_t vclzq_u32(uint32x4_t a);         // VCLZ.I32 q0,q0
   7704 #define vclzq_u32 vclzq_s32
   7705 
   7706 //************** Count leading sign bits **************************
   7707 //********************************************************************
   7708 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
   7709 // the topmost bit, that are the same as the topmost bit, in each element in a vector
   7710 //No corresponding vector intrinsics in IA32, need to implement it.
   7711 //While the implementation is effective for 8 bits, it may be not for 16 and 32 bits
   7712 
   7713 #if defined(USE_SSSE3)
   7714 int8x16_t vclsq_s8(int8x16_t a);         // VCLS.S8 q0,q0
   7715 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
   7716 {
   7717     __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
   7718     cff = _mm_cmpeq_epi8 (a,a);         //0xff
   7719     c80 = _mm_set1_epi8(0x80);
   7720     c1 = _mm_set1_epi8(1);
   7721     a_mask = _mm_and_si128(a, c80);
   7722     a_mask = _mm_cmpeq_epi8(a_mask, c80);         //0xff if negative input and 0 if positive
   7723     a_neg = _mm_xor_si128(a, cff);
   7724     a_neg = _mm_and_si128(a_mask, a_neg);
   7725     a_pos = _mm_andnot_si128(a_mask, a);
   7726     a_comb = _mm_or_si128(a_pos, a_neg);
   7727     a_comb = vclzq_s8(a_comb);
   7728     return _mm_sub_epi8(a_comb, c1);
   7729 }
   7730 #endif
   7731 
   7732 #if defined(USE_SSSE3)
   7733 int16x8_t vclsq_s16(int16x8_t a);         // VCLS.S16 q0,q0
   7734 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
   7735 {
   7736     __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
   7737     cffff = _mm_cmpeq_epi16(a,a);
   7738     c8000 =  _mm_slli_epi16(cffff, 15);         //0x8000
   7739     c1 = _mm_srli_epi16(cffff,15);         //0x1
   7740     a_mask = _mm_and_si128(a, c8000);
   7741     a_mask = _mm_cmpeq_epi16(a_mask, c8000);         //0xffff if negative input and 0 if positive
   7742     a_neg = _mm_xor_si128(a, cffff);
   7743     a_neg = _mm_and_si128(a_mask, a_neg);
   7744     a_pos = _mm_andnot_si128(a_mask, a);
   7745     a_comb = _mm_or_si128(a_pos, a_neg);
   7746     a_comb = vclzq_s16(a_comb);
   7747     return _mm_sub_epi16(a_comb, c1);
   7748 }
   7749 #endif
   7750 
   7751 int32x4_t vclsq_s32(int32x4_t a);         // VCLS.S32 q0,q0
   7752 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
   7753 {
   7754     __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
   7755     cffffffff = _mm_cmpeq_epi32(a,a);
   7756     c80000000 =  _mm_slli_epi32(cffffffff, 31);         //0x80000000
   7757     c1 = _mm_srli_epi32(cffffffff,31);         //0x1
   7758     a_mask = _mm_and_si128(a, c80000000);
   7759     a_mask = _mm_cmpeq_epi32(a_mask, c80000000);         //0xffffffff if negative input and 0 if positive
   7760     a_neg = _mm_xor_si128(a, cffffffff);
   7761     a_neg = _mm_and_si128(a_mask, a_neg);
   7762     a_pos = _mm_andnot_si128(a_mask, a);
   7763     a_comb = _mm_or_si128(a_pos, a_neg);
   7764     a_comb = vclzq_s32(a_comb);
   7765     return _mm_sub_epi32(a_comb, c1);
   7766 }
   7767 
   7768 //************************* Count number of set bits   ********************************
   7769 //*************************************************************************************
   7770 //No corresponding SIMD solution. One option is to get a elements, convert it to 32 bits and then use SSE4.2  _mm_popcnt__u32 (unsigned int v) for each element
   7771 //another option is to do the following algorithm:
   7772 
   7773 #if defined(USE_SSSE3)
   7774 uint8x16_t vcntq_u8(uint8x16_t a);         // VCNT.8 q0,q0
   7775 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
   7776 {
   7777     _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
   7778                                             /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
   7779                                             /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
   7780                                             /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
   7781     __m128i maskLOW, mask, lowpopcnt, hipopcnt;
   7782     maskLOW = _mm_set1_epi8(0x0f);         //low 4 bits, need masking to avoid zero if MSB is set
   7783     mask = _mm_and_si128(a, maskLOW);
   7784     lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask);         //uses low 4 bits anyway
   7785     mask =  _mm_srli_epi16(a, 4);         //get high 4 bits as low bits
   7786     mask = _mm_and_si128(mask, maskLOW);         //low 4 bits, need masking to avoid zero if MSB is set
   7787     hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask);         //uses low 4 bits anyway
   7788     return _mm_add_epi8(lowpopcnt, hipopcnt);
   7789 }
   7790 #endif
   7791 
   7792 #if defined(USE_SSSE3)
   7793 int8x16_t vcntq_s8(int8x16_t a);         // VCNT.8 q0,q0
   7794 #define vcntq_s8 vcntq_u8
   7795 
   7796 poly8x16_t vcntq_p8(poly8x16_t a);         // VCNT.8 q0,q0
   7797 #define vcntq_p8 vcntq_u8
   7798 #endif
   7799 
   7800 //**************************************************************************************
   7801 //*********************** Logical operations ****************************************
   7802 //**************************************************************************************
   7803 //************************** Bitwise not ***********************************
   7804 //several Bitwise not implementations possible for SIMD. Eg "xor" with all ones, but the following one gives good performance
   7805 
   7806 int8x16_t vmvnq_s8(int8x16_t a);         // VMVN q0,q0
   7807 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a)         // VMVN q0,q0
   7808 {
   7809     __m128i c1;
   7810     c1 = _mm_cmpeq_epi8 (a,a);         //0xff
   7811     return _mm_andnot_si128 (a, c1);
   7812 }
   7813 
   7814 int16x8_t vmvnq_s16(int16x8_t a);         // VMVN q0,q0
   7815 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a)         // VMVN q0,q0
   7816 {
   7817     __m128i c1;
   7818     c1 = _mm_cmpeq_epi16 (a,a);         //0xffff
   7819     return _mm_andnot_si128 (a, c1);
   7820 }
   7821 
   7822 int32x4_t vmvnq_s32(int32x4_t a);         // VMVN q0,q0
   7823 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a)         // VMVN q0,q0
   7824 {
   7825     __m128i c1;
   7826     c1 = _mm_cmpeq_epi32 (a,a);         //0xffffffff
   7827     return _mm_andnot_si128 (a, c1);
   7828 }
   7829 
   7830 uint8x16_t vmvnq_u8(uint8x16_t a);         // VMVN q0,q0
   7831 #define vmvnq_u8 vmvnq_s8
   7832 
   7833 uint16x8_t vmvnq_u16(uint16x8_t a);         // VMVN q0,q0
   7834 #define vmvnq_u16 vmvnq_s16
   7835 
   7836 uint32x4_t vmvnq_u32(uint32x4_t a);         // VMVN q0,q0
   7837 #define vmvnq_u32 vmvnq_s32
   7838 
   7839 poly8x16_t vmvnq_p8(poly8x16_t a);         // VMVN q0,q0
   7840 #define vmvnq_p8 vmvnq_u8
   7841 
   7842 //****************** Bitwise and ***********************
   7843 //******************************************************
   7844 
   7845 int8x16_t   vandq_s8(int8x16_t a, int8x16_t b);         // VAND q0,q0,q0
   7846 #define vandq_s8 _mm_and_si128
   7847 
   7848 int16x8_t   vandq_s16(int16x8_t a, int16x8_t b);         // VAND q0,q0,q0
   7849 #define vandq_s16 _mm_and_si128
   7850 
   7851 int32x4_t   vandq_s32(int32x4_t a, int32x4_t b);         // VAND q0,q0,q0
   7852 #define vandq_s32 _mm_and_si128
   7853 
   7854 int64x2_t   vandq_s64(int64x2_t a, int64x2_t b);         // VAND q0,q0,q0
   7855 #define vandq_s64 _mm_and_si128
   7856 
   7857 uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b);         // VAND q0,q0,q0
   7858 #define vandq_u8 _mm_and_si128
   7859 
   7860 uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b);         // VAND q0,q0,q0
   7861 #define vandq_u16 _mm_and_si128
   7862 
   7863 uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b);         // VAND q0,q0,q0
   7864 #define vandq_u32 _mm_and_si128
   7865 
   7866 uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b);         // VAND q0,q0,q0
   7867 #define vandq_u64 _mm_and_si128
   7868 
   7869 //******************** Bitwise or *********************************
   7870 //******************************************************************
   7871 
   7872 int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b);         // VORR q0,q0,q0
   7873 #define vorrq_s8 _mm_or_si128
   7874 
   7875 int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b);         // VORR q0,q0,q0
   7876 #define vorrq_s16 _mm_or_si128
   7877 
   7878 int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b);         // VORR q0,q0,q0
   7879 #define vorrq_s32 _mm_or_si128
   7880 
   7881 int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b);         // VORR q0,q0,q0
   7882 #define vorrq_s64 _mm_or_si128
   7883 
   7884 uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b);         // VORR q0,q0,q0
   7885 #define vorrq_u8 _mm_or_si128
   7886 
   7887 uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b);         // VORR q0,q0,q0
   7888 #define vorrq_u16 _mm_or_si128
   7889 
   7890 uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b);         // VORR q0,q0,q0
   7891 #define vorrq_u32 _mm_or_si128
   7892 
   7893 uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b);         // VORR q0,q0,q0
   7894 #define vorrq_u64 _mm_or_si128
   7895 
   7896 //************* Bitwise exclusive or (EOR or XOR) ******************
   7897 //*******************************************************************
   7898 
   7899 int8x16_t   veorq_s8(int8x16_t a, int8x16_t b);         // VEOR q0,q0,q0
   7900 #define veorq_s8 _mm_xor_si128
   7901 
   7902 int16x8_t   veorq_s16(int16x8_t a, int16x8_t b);         // VEOR q0,q0,q0
   7903 #define veorq_s16 _mm_xor_si128
   7904 
   7905 int32x4_t   veorq_s32(int32x4_t a, int32x4_t b);         // VEOR q0,q0,q0
   7906 #define veorq_s32 _mm_xor_si128
   7907 
   7908 int64x2_t   veorq_s64(int64x2_t a, int64x2_t b);         // VEOR q0,q0,q0
   7909 #define veorq_s64 _mm_xor_si128
   7910 
   7911 uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b);         // VEOR q0,q0,q0
   7912 #define veorq_u8 _mm_xor_si128
   7913 
   7914 uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b);         // VEOR q0,q0,q0
   7915 #define veorq_u16 _mm_xor_si128
   7916 
   7917 uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b);         // VEOR q0,q0,q0
   7918 #define veorq_u32 _mm_xor_si128
   7919 
   7920 uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b);         // VEOR q0,q0,q0
   7921 #define veorq_u64 _mm_xor_si128
   7922 
   7923 //********************** Bit Clear **********************************
   7924 //*******************************************************************
   7925 //Logical AND complement (AND negation or AND NOT)
   7926 
   7927 //notice arguments "swap"
   7928 
   7929 //notice arguments "swap"
   7930 
   7931 //notice arguments "swap"
   7932 
   7933 //notice arguments "swap"
   7934 
   7935 //notice arguments "swap"
   7936 
   7937 //notice arguments "swap"
   7938 
   7939 //notice arguments "swap"
   7940 
   7941 //notice arguments "swap"
   7942 
   7943 int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b);         // VBIC q0,q0,q0
   7944 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7945 
   7946 int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b);         // VBIC q0,q0,q0
   7947 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7948 
   7949 int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b);         // VBIC q0,q0,q0
   7950 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7951 
   7952 int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b);         // VBIC q0,q0,q0
   7953 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7954 
   7955 uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b);         // VBIC q0,q0,q0
   7956 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7957 
   7958 uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b);         // VBIC q0,q0,q0
   7959 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7960 
   7961 uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b);         // VBIC q0,q0,q0
   7962 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7963 
   7964 uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b);         // VBIC q0,q0,q0
   7965 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a)         //notice arguments "swap"
   7966 
   7967 //**************** Bitwise OR complement ********************************
   7968 //**************************************** ********************************
   7969 //no exact IA 32 match, need to implement it as following
   7970 
   7971 int8x16_t vornq_s8(int8x16_t a, int8x16_t b);         // VORN q0,q0,q0
   7972 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b)         // VORN q0,q0,q0
   7973 {
   7974     __m128i b1;
   7975     b1 = vmvnq_s8( b);         //bitwise not for b
   7976     return _mm_or_si128 (a, b1);
   7977 }
   7978 
   7979 int16x8_t vornq_s16(int16x8_t a, int16x8_t b);         // VORN q0,q0,q0
   7980 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b)         // VORN q0,q0,q0
   7981 {
   7982     __m128i b1;
   7983     b1 = vmvnq_s16( b);         //bitwise not for b
   7984     return _mm_or_si128 (a, b1);
   7985 }
   7986 
   7987 int32x4_t vornq_s32(int32x4_t a, int32x4_t b);         // VORN q0,q0,q0
   7988 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b)         // VORN q0,q0,q0
   7989 {
   7990     __m128i b1;
   7991     b1 = vmvnq_s32( b);         //bitwise not for b
   7992     return _mm_or_si128 (a, b1);
   7993 }
   7994 
   7995 int64x2_t vornq_s64(int64x2_t a, int64x2_t b);         // VORN q0,q0,q0
   7996 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
   7997 {
   7998     __m128i c1, b1;
   7999     c1 = _mm_cmpeq_epi8 (a, a);         //all ones 0xfffffff...fffff
   8000     b1 = _mm_andnot_si128 (b, c1);
   8001     return _mm_or_si128 (a, b1);
   8002 }
   8003 
   8004 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b);         // VORN q0,q0,q0
   8005 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b)         // VORN q0,q0,q0
   8006 {
   8007     __m128i b1;
   8008     b1 = vmvnq_u8( b);         //bitwise not for b
   8009     return _mm_or_si128 (a, b1);
   8010 }
   8011 
   8012 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b);         // VORN q0,q0,q0
   8013 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b)         // VORN q0,q0,q0
   8014 {
   8015     __m128i b1;
   8016     b1 = vmvnq_s16( b);         //bitwise not for b
   8017     return _mm_or_si128 (a, b1);
   8018 }
   8019 
   8020 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b);         // VORN q0,q0,q0
   8021 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b)         // VORN q0,q0,q0
   8022 {
   8023     __m128i b1;
   8024     b1 = vmvnq_u32( b);         //bitwise not for b
   8025     return _mm_or_si128 (a, b1);
   8026 }
   8027 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b);         // VORN q0,q0,q0
   8028 #define vornq_u64 vornq_s64
   8029 
   8030 //********************* Bitwise Select *****************************
   8031 //******************************************************************
   8032 //Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????)
   8033 
   8034 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
   8035 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
   8036 
   8037 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
   8038 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
   8039 
   8040 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
   8041 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
   8042 
   8043 //VBSL only is implemented for SIMD
   8044 
   8045 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c);         // VBSL q0,q0,q0
   8046 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c)         // VBSL q0,q0,q0
   8047 {
   8048     __m128i sel1, sel2;
   8049     sel1 = _mm_and_si128   (a, b);
   8050     sel2 = _mm_andnot_si128 (a, c);
   8051     return _mm_or_si128 (sel1, sel2);
   8052 }
   8053 
   8054 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c);         // VBSL q0,q0,q0
   8055 #define vbslq_s16 vbslq_s8
   8056 
   8057 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c);         // VBSL q0,q0,q0
   8058 #define vbslq_s32 vbslq_s8
   8059 
   8060 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c);         // VBSL q0,q0,q0
   8061 #define vbslq_s64 vbslq_s8
   8062 
   8063 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c);         // VBSL q0,q0,q0
   8064 #define vbslq_u8 vbslq_s8
   8065 
   8066 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c);         // VBSL q0,q0,q0
   8067 #define vbslq_u16 vbslq_s8
   8068 
   8069 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c);         // VBSL q0,q0,q0
   8070 #define vbslq_u32 vbslq_s8
   8071 
   8072 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c);         // VBSL q0,q0,q0
   8073 #define vbslq_u64 vbslq_s8
   8074 
   8075 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c);         // VBSL q0,q0,q0
   8076 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c)         // VBSL q0,q0,q0
   8077 {
   8078     __m128 sel1, sel2;
   8079     sel1 = _mm_and_ps   (*(__m128*)&a, b);
   8080     sel2 = _mm_andnot_ps (*(__m128*)&a, c);
   8081     return _mm_or_ps (sel1, sel2);
   8082 }
   8083 
   8084 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c);         // VBSL q0,q0,q0
   8085 #define vbslq_p8 vbslq_u8
   8086 
   8087 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c);         // VBSL q0,q0,q0
   8088 #define vbslq_p16 vbslq_s8
   8089 
   8090 //************************************************************************************
   8091 //**************** Transposition operations ****************************************
   8092 //************************************************************************************
   8093 //*****************  Vector Transpose ************************************************
   8094 //************************************************************************************
   8095 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
   8096 // making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....)
   8097 
   8098 #if defined(USE_SSSE3)
   8099 //int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
   8100 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b)         // VTRN.8 q0,q0
   8101 {
   8102     int8x16x2_t r8x16;
   8103     __m128i a_sh, b_sh;
   8104     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
   8105     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd);         //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
   8106     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd);         //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
   8107 
   8108     r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh);         //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
   8109     r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh);         // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
   8110     return r8x16;
   8111 }
   8112 #endif
   8113 
   8114 #if defined(USE_SSSE3)
   8115 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b);         // VTRN.16 q0,q0
   8116 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b)         // VTRN.16 q0,q0
   8117 {
   8118     int16x8x2_t v16x8;
   8119     __m128i a_sh, b_sh;
   8120     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
   8121     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd);         //a0, a2, a4, a6,  a1, a3, a5, a7
   8122     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd);         //b0, b2, b4, b6,  b1, b3, b5, b7
   8123     v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh);         //a0, b0, a2, b2, a4, b4, a6, b6
   8124     v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh);         //a1, b1, a3, b3, a5, b5, a7, b7
   8125     return v16x8;
   8126 }
   8127 #endif
   8128 
   8129 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b);         // VTRN.32 q0,q0
   8130 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b)         // VTRN.32 q0,q0
   8131 {         //may be not optimal solution compared with serial
   8132     int32x4x2_t v32x4;
   8133     __m128i a_sh, b_sh;
   8134     a_sh = _mm_shuffle_epi32 (a, 216);         //a0, a2, a1, a3
   8135     b_sh = _mm_shuffle_epi32 (b, 216);         //b0, b2, b1, b3
   8136 
   8137     v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh);         //a0, b0, a2, b2
   8138     v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh);         //a1, b1, a3,  b3
   8139     return v32x4;
   8140 }
   8141 
   8142 #if defined(USE_SSSE3)
   8143 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b);         // VTRN.8 q0,q0
   8144 #define vtrnq_u8 vtrnq_s8
   8145 
   8146 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b);         // VTRN.16 q0,q0
   8147 #define vtrnq_u16 vtrnq_s16
   8148 #endif
   8149 
   8150 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b);         // VTRN.32 q0,q0
   8151 #define vtrnq_u32 vtrnq_s32
   8152 
   8153 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b);         // VTRN.32 q0,q0
   8154 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b)         // VTRN.32 q0,q0
   8155 {         //may be not optimal solution compared with serial
   8156     float32x4x2_t f32x4;
   8157     __m128 a_sh, b_sh;
   8158     a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0));         //a0, a2, a1, a3, need to check endiness
   8159     b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0));         //b0, b2, b1, b3, need to check endiness
   8160 
   8161     f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh);         //a0, b0, a2, b2
   8162     f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh);         //a1, b1, a3,  b3
   8163     return f32x4;
   8164 }
   8165 
   8166 #if defined(USE_SSSE3)
   8167 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b);         // VTRN.8 q0,q0
   8168 #define vtrnq_p8 vtrnq_s8
   8169 
   8170 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b);         // VTRN.16 q0,q0
   8171 #define vtrnq_p16 vtrnq_s16
   8172 #endif
   8173 
   8174 //***************** Interleave elements ***************************
   8175 //*****************************************************************
   8176 //output has (a0,b0,a1,b1, a2,b2,.....)
   8177 
   8178 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b);         // VZIP.8 q0,q0
   8179 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b)         // VZIP.8 q0,q0
   8180 {
   8181     int8x16x2_t r8x16;
   8182     r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
   8183     r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
   8184     return r8x16;
   8185 }
   8186 
   8187 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b);         // VZIP.16 q0,q0
   8188 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b)         // VZIP.16 q0,q0
   8189 {
   8190     int16x8x2_t r16x8;
   8191     r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
   8192     r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
   8193     return r16x8;
   8194 }
   8195 
   8196 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b);         // VZIP.32 q0,q0
   8197 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b)         // VZIP.32 q0,q0
   8198 {
   8199     int32x4x2_t r32x4;
   8200     r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
   8201     r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
   8202     return r32x4;
   8203 }
   8204 
   8205 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b);         // VZIP.8 q0,q0
   8206 #define vzipq_u8 vzipq_s8
   8207 
   8208 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b);         // VZIP.16 q0,q0
   8209 #define vzipq_u16 vzipq_s16
   8210 
   8211 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b);         // VZIP.32 q0,q0
   8212 #define vzipq_u32 vzipq_s32
   8213 
   8214 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b);         // VZIP.32 q0,q0
   8215 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b)         // VZIP.32 q0,q0
   8216 {
   8217     float32x4x2_t f32x4;
   8218     f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
   8219     f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
   8220     return f32x4;
   8221 }
   8222 
   8223 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b);         // VZIP.8 q0,q0
   8224 #define vzipq_p8 vzipq_u8
   8225 
   8226 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b);         // VZIP.16 q0,q0
   8227 #define vzipq_p16 vzipq_u16
   8228 
   8229 //*********************** De-Interleave elements *************************
   8230 //*************************************************************************
   8231 //As the result of these functions first val  contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...)
   8232 //no such functions in IA32 SIMD, shuffle is required
   8233 
   8234 #if defined(USE_SSSE3)
   8235 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b);         // VUZP.8 q0,q0
   8236 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b)         // VUZP.8 q0,q0
   8237 {
   8238     int8x16x2_t v8x16;
   8239     __m128i a_sh, b_sh;
   8240     _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15};
   8241     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd);         //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
   8242     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd);         //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
   8243     //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
   8244     v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
   8245     v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
   8246     return v8x16;
   8247 }
   8248 #endif
   8249 
   8250 #if defined(USE_SSSE3)
   8251 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b);         // VUZP.16 q0,q0
   8252 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b)         // VUZP.16 q0,q0
   8253 {
   8254     int16x8x2_t v16x8;
   8255     __m128i a_sh, b_sh;
   8256     _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15};
   8257     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd);         //a0, a2, a4, a6,  a1, a3, a5, a7
   8258     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd);         //b0, b2, b4, b6,  b1, b3, b5, b7
   8259     v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         //a0, a2, a4, a6, b0, b2, b4, b6
   8260     v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, a5, a7, b1, b3, b5, b7
   8261     return v16x8;
   8262 }
   8263 #endif
   8264 
   8265 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b);         // VUZP.32 q0,q0
   8266 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b)         // VUZP.32 q0,q0
   8267 {         //may be not optimal solution compared with serial
   8268     int32x4x2_t v32x4;
   8269     __m128i a_sh, b_sh;
   8270     a_sh = _mm_shuffle_epi32 (a, 216);         //a0, a2, a1, a3
   8271     b_sh = _mm_shuffle_epi32 (b, 216);         //b0, b2, b1, b3
   8272 
   8273     v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh);         //a0, a2, b0, b2
   8274     v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh);         //a1, a3, b1, b3
   8275     return v32x4;
   8276 }
   8277 
   8278 #if defined(USE_SSSE3)
   8279 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b);         // VUZP.8 q0,q0
   8280 #define vuzpq_u8 vuzpq_s8
   8281 
   8282 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b);         // VUZP.16 q0,q0
   8283 #define vuzpq_u16 vuzpq_s16
   8284 #endif
   8285 
   8286 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b);         // VUZP.32 q0,q0
   8287 #define vuzpq_u32 vuzpq_s32
   8288 
   8289 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b);         // VUZP.32 q0,q0
   8290 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b)         // VUZP.32 q0,q0
   8291 {
   8292     float32x4x2_t v32x4;
   8293     v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0));         //a0, a2, b0, b2 , need to check endianess however
   8294     v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1));         //a1, a3, b1, b3, need to check endianess however
   8295     return v32x4;
   8296 }
   8297 
   8298 #if defined(USE_SSSE3)
   8299 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b);         // VUZP.8 q0,q0
   8300 #define vuzpq_p8 vuzpq_u8
   8301 
   8302 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b);         // VUZP.16 q0,q0
   8303 #define vuzpq_p16 vuzpq_u16
   8304 #endif
   8305 
   8306 //##############################################################################################
   8307 //*********************** Reinterpret cast intrinsics.******************************************
   8308 //##############################################################################################
   8309 // Not a part of oficial NEON instruction set but available in gcc compiler *********************
   8310 
   8311 poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
   8312 #define vreinterpretq_p8_u32
   8313 
   8314 poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
   8315 #define vreinterpretq_p8_u16
   8316 
   8317 poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
   8318 #define vreinterpretq_p8_u8
   8319 
   8320 poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
   8321 #define vreinterpretq_p8_s32
   8322 
   8323 poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
   8324 #define vreinterpretq_p8_s16
   8325 
   8326 poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
   8327 #define vreinterpretq_p8_s8
   8328 
   8329 poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
   8330 #define vreinterpretq_p8_u64
   8331 
   8332 poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
   8333 #define vreinterpretq_p8_s64
   8334 
   8335 poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
   8336 #define vreinterpretq_p8_f32(t) _M128i(t)
   8337 
   8338 poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
   8339 #define vreinterpretq_p8_p16
   8340 
   8341 poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
   8342 #define vreinterpretq_p16_u32
   8343 
   8344 poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
   8345 #define vreinterpretq_p16_u16
   8346 
   8347 poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
   8348 #define vreinterpretq_p16_s32
   8349 
   8350 poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
   8351 #define vreinterpretq_p16_s16
   8352 
   8353 poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
   8354 #define vreinterpretq_p16_s8
   8355 
   8356 poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
   8357 #define vreinterpretq_p16_u64
   8358 
   8359 poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
   8360 #define vreinterpretq_p16_s64
   8361 
   8362 poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
   8363 #define vreinterpretq_p16_f32(t) _M128i(t)
   8364 
   8365 poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
   8366 #define vreinterpretq_p16_p8  vreinterpretq_s16_p8
   8367 
   8368 //****  Integer to float  ******
   8369 
   8370 float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
   8371 #define  vreinterpretq_f32_u32(t) *(__m128*)&(t)
   8372 
   8373 float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
   8374 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32
   8375 
   8376 float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
   8377 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32
   8378 
   8379 float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
   8380 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32
   8381 
   8382 float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
   8383 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32
   8384 
   8385 float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
   8386 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32
   8387 
   8388 float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
   8389 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32
   8390 
   8391 float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
   8392 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32
   8393 
   8394 float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
   8395 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32
   8396 
   8397 float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
   8398 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32
   8399 
   8400 //*** Integer type conversions ******************
   8401 //no conversion necessary for the following functions because it is same data type
   8402 
   8403 int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
   8404 #define vreinterpretq_s64_u32
   8405 
   8406 int64x2_t vreinterpretq_s64_s16 (uint16x8_t t);
   8407 #define vreinterpretq_s64_s16
   8408 
   8409 int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
   8410 #define vreinterpretq_s64_u8
   8411 
   8412 int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
   8413 #define vreinterpretq_s64_s32
   8414 
   8415 int64x2_t vreinterpretq_s64_u16 (int16x8_t t);
   8416 #define vreinterpretq_s64_u16
   8417 
   8418 int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
   8419 #define vreinterpretq_s64_s8
   8420 
   8421 int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
   8422 #define vreinterpretq_s64_u64
   8423 
   8424 int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
   8425 #define vreinterpretq_s64_f32(t) _M128i(t)
   8426 
   8427 int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
   8428 #define vreinterpretq_s64_p16
   8429 
   8430 int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
   8431 #define vreinterpretq_s64_p8
   8432 
   8433 uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
   8434 #define vreinterpretq_u64_u32
   8435 
   8436 uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
   8437 #define vreinterpretq_u64_u16
   8438 
   8439 uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
   8440 #define vreinterpretq_u64_u8
   8441 
   8442 uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
   8443 #define vreinterpretq_u64_s32
   8444 
   8445 uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
   8446 #define vreinterpretq_u64_s16
   8447 
   8448 uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
   8449 #define vreinterpretq_u64_s8
   8450 
   8451 uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
   8452 #define vreinterpretq_u64_s64
   8453 
   8454 uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
   8455 #define vreinterpretq_u64_f32(t) _M128i(t)
   8456 
   8457 uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
   8458 #define vreinterpretq_u64_p16
   8459 
   8460 uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
   8461 #define vreinterpretq_u64_p8
   8462 
   8463 int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
   8464 #define vreinterpretq_s8_u32
   8465 
   8466 int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
   8467 #define vreinterpretq_s8_u16
   8468 
   8469 int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
   8470 #define vreinterpretq_s8_u8
   8471 
   8472 int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
   8473 #define vreinterpretq_s8_s32
   8474 
   8475 int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
   8476 #define vreinterpretq_s8_s16
   8477 
   8478 int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
   8479 #define vreinterpretq_s8_u64
   8480 
   8481 int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
   8482 #define vreinterpretq_s8_s64
   8483 
   8484 int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
   8485 #define vreinterpretq_s8_f32(t) _M128i(t)
   8486 
   8487 int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
   8488 #define vreinterpretq_s8_p16
   8489 
   8490 int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
   8491 #define vreinterpretq_s8_p8
   8492 
   8493 int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
   8494 #define vreinterpretq_s16_u32
   8495 
   8496 int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
   8497 #define vreinterpretq_s16_u16
   8498 
   8499 int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
   8500 #define vreinterpretq_s16_u8
   8501 
   8502 int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
   8503 #define vreinterpretq_s16_s32
   8504 
   8505 int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
   8506 #define vreinterpretq_s16_s8
   8507 
   8508 int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
   8509 #define vreinterpretq_s16_u64
   8510 
   8511 int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
   8512 #define vreinterpretq_s16_s64
   8513 
   8514 int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
   8515 #define vreinterpretq_s16_f32(t) _M128i(t)
   8516 
   8517 int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
   8518 #define vreinterpretq_s16_p16
   8519 
   8520 int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
   8521 #define vreinterpretq_s16_p8
   8522 
   8523 int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
   8524 #define vreinterpretq_s32_u32
   8525 
   8526 int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
   8527 #define vreinterpretq_s32_u16
   8528 
   8529 int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
   8530 #define vreinterpretq_s32_u8
   8531 
   8532 int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
   8533 #define vreinterpretq_s32_s16
   8534 
   8535 int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
   8536 #define vreinterpretq_s32_s8
   8537 
   8538 int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
   8539 #define vreinterpretq_s32_u64
   8540 
   8541 int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
   8542 #define vreinterpretq_s32_s64
   8543 
   8544 int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
   8545 #define vreinterpretq_s32_f32(t)  _mm_castps_si128(t)         //(*(__m128i*)&(t))
   8546 
   8547 int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
   8548 #define vreinterpretq_s32_p16
   8549 
   8550 int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
   8551 #define vreinterpretq_s32_p8
   8552 
   8553 uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
   8554 #define vreinterpretq_u8_u32
   8555 
   8556 uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
   8557 #define vreinterpretq_u8_u16
   8558 
   8559 uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
   8560 #define vreinterpretq_u8_s32
   8561 
   8562 uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
   8563 #define vreinterpretq_u8_s16
   8564 
   8565 uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
   8566 #define vreinterpretq_u8_s8
   8567 
   8568 uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
   8569 #define vreinterpretq_u8_u64
   8570 
   8571 uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
   8572 #define vreinterpretq_u8_s64
   8573 
   8574 uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
   8575 #define vreinterpretq_u8_f32(t) _M128i(t)
   8576 
   8577 uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
   8578 #define vreinterpretq_u8_p16
   8579 
   8580 uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
   8581 #define vreinterpretq_u8_p8
   8582 
   8583 uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
   8584 #define vreinterpretq_u16_u32
   8585 
   8586 uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
   8587 #define vreinterpretq_u16_u8
   8588 
   8589 uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
   8590 #define vreinterpretq_u16_s32
   8591 
   8592 uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
   8593 #define vreinterpretq_u16_s16
   8594 
   8595 uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
   8596 #define vreinterpretq_u16_s8
   8597 
   8598 uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
   8599 #define vreinterpretq_u16_u64
   8600 
   8601 uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
   8602 #define vreinterpretq_u16_s64
   8603 
   8604 uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
   8605 #define vreinterpretq_u16_f32(t) _M128i(t)
   8606 
   8607 uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
   8608 #define vreinterpretq_u16_p16
   8609 
   8610 uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
   8611 #define vreinterpretq_u16_p8
   8612 
   8613 uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
   8614 #define vreinterpretq_u32_u16
   8615 
   8616 uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
   8617 #define vreinterpretq_u32_u8
   8618 
   8619 uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
   8620 #define vreinterpretq_u32_s32
   8621 
   8622 uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
   8623 #define vreinterpretq_u32_s16
   8624 
   8625 uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
   8626 #define vreinterpretq_u32_s8
   8627 
   8628 uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
   8629 #define vreinterpretq_u32_u64
   8630 
   8631 uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
   8632 #define vreinterpretq_u32_s64
   8633 
   8634 uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
   8635 #define  vreinterpretq_u32_f32(t) _M128i(t)
   8636 
   8637 uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
   8638 #define vreinterpretq_u32_p16
   8639 
   8640 uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
   8641 #define vreinterpretq_u32_p8
   8642 
   8643 #endif /* NEON2SSE_H */
   8644