      1 //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation,  victoria.zhislina (at) intel.com
      2 
      3 //*** Copyright (C) 2012-2018 Intel Corporation.  All rights reserved.
      4 
      5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
      6 
      7 //By downloading, copying, installing or using the software you agree to this license.
      8 //If you do not agree to this license, do not download, install, copy or use the software.
      9 
     10 //                              License Agreement
     11 //Redistribution and use in source and binary forms, with or without modification,
     12 //are permitted provided that the following conditions are met:
     13 
     14 //  * Redistributions of source code must retain the above copyright notice,
     15 //    this list of conditions and the following disclaimer.
     16 
     17 //  * The name of the copyright holders may not be used to endorse or promote products
     18 //    derived from this software without specific prior written permission.
     19 
     20 //This software is provided by the copyright holders and contributors "as is" and
     21 //any express or implied warranties, including, but not limited to, the implied
     22 //warranties of merchantability and fitness for a particular purpose are disclaimed.
     23 //In no event shall the Intel Corporation or contributors be liable for any direct,
     24 //indirect, incidental, special, exemplary, or consequential damages
     25 //(including, but not limited to, procurement of substitute goods or services;
     26 //loss of use, data, or profits; or business interruption) however caused
     27 //and on any theory of liability, whether in contract, strict liability,
     28 //or tort (including negligence or otherwise) arising in any way out of
     29 //the use of this software, even if advised of the possibility of such damage.
     30 
     31 //*****************************************************************************************
     32 // This file is intended to simplify ARM->IA32 porting
      33 // It maps ARM NEON intrinsics (as defined in "arm_neon.h")
      34 // to x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files included below.
      35 //The MMX instruction set is not used because it is unavailable on x64 systems, carries a performance
      36 //overhead, and requires the EMMS instruction (_mm_empty()) when switching between MMX and x87 floating point.
     37 //*****************************************************************************************
     38 
      39 //!!!!!!!!!!!!!!  To use this file, just include it (instead of "arm_neon.h") in a project that uses ARM NEON intrinsics and compile as usual,
      40 //!!!!!!!!!!!!!!  but please pay attention to #define USE_SSE4 below - you might need to define it manually on newer Intel Atom or any Intel Core platform for greater performance.
     41 
     42 #ifndef NEON2SSE_H
     43 #define NEON2SSE_H
     44 
     45 /*********************************************************************************************************************/
     46 //!!!!!!!!!!!!!!
      47 //if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, only SIMD instructions up to SSSE3 are used.
      48 //For older devices without SSE4 support it should be left undefined; for newer devices it should be defined, possibly manually if your compiler doesn't set the __SSE4_2__ predefined macro.
     49 #ifndef USE_SSE4
     50 #   if defined(__SSE4_2__)
     51 #       define USE_SSE4
     52 #   endif
     53 #endif
     54 /*********************************************************************************************************************/
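         //An illustrative usage sketch (not part of the header; assumes this file is saved as "NEON_2_SSE.h"):
         //either pass USE_SSE4 on the compiler command line (e.g. -DUSE_SSE4 together with the matching
         //-msse4.2 option for gcc/clang), or define it before the include:
         //
         //    #define USE_SSE4                  //only if the target CPU supports SSE4.x
         //    #include "NEON_2_SSE.h"           //used in place of #include <arm_neon.h>
         //
         //    int8x8_t add_bytes(int8x8_t a, int8x8_t b) { return vadd_s8(a, b); }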
     55 
     56 #include <xmmintrin.h>     //SSE
     57 #include <emmintrin.h>     //SSE2
     58 #include <pmmintrin.h>     //SSE3
     59 #include <tmmintrin.h>     //SSSE3
     60 #ifdef USE_SSE4
     61 #   include <smmintrin.h> //SSE4.1
     62 #   include <nmmintrin.h> //SSE4.2
     63 #endif
     64 
     65 #include <math.h>
     66 
     67 //***************  functions and data attributes, compiler dependent  *********************************
     68 //***********************************************************************************
     69 #ifdef __GNUC__
     70 #   define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
     71 #   define _NEON2SSESTORAGE static
     72 #   define _NEON2SSE_ALIGN_16  __attribute__((aligned(16)))
     73 #   define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     74 #   ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
     75 #       if _GCC_VERSION <  40500
     76 #           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated)) function
     77 #       else
     78 #           define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)   __attribute__((deprecated(explanation))) function
     79 #       endif
     80 #   else
     81 #       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
     82 #   endif
     83 #   if defined(__x86_64__)
     84 #       define _NEON2SSE_64BIT  __x86_64__
     85 #   endif
     86 #else
     87 #   define _NEON2SSESTORAGE static
     88 #   define _NEON2SSE_ALIGN_16  __declspec(align(16))
     89 #   define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
     90 #   if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
     91 #       define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
     92 #       if defined(_M_X64)
     93 #           define _NEON2SSE_64BIT  _M_X64
     94 #       endif
     95 #   else
     96 #       define _NEON2SSE_PERFORMANCE_WARNING(function, explanation)  function
     97 #   endif
     98 #endif
     99 
    100 #if defined  (_NEON2SSE_64BIT) && defined (USE_SSE4)
    101 #   define _NEON2SSE_64BIT_SSE4
    102 #endif
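         //How the attribute macros above are typically combined (an illustrative sketch; "example_mask",
         //"example_fn" and "example_slow" are hypothetical names, not part of this header):
         //
         //    _NEON2SSE_ALIGN_16 static const int8_t example_mask[16] =      //16-byte aligned lookup table
         //        { 0,2,4,6,8,10,12,14, 1,3,5,7,9,11,13,15 };
         //
         //    _NEON2SSE_INLINE __m128i example_fn(__m128i a)                 //always-inlined static helper
         //    {
         //        return _mm_shuffle_epi8(a, *(__m128i*)example_mask);       //SSSE3 byte shuffle
         //    }
         //
         //    _NEON2SSE_PERFORMANCE_WARNING(__m128i example_slow(__m128i a),
         //        "serial emulation, try to avoid");                         //deprecation-based warning on use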
    103 
    104 /*********************************************************************************************************************/
    105 //    data types conversion
    106 /*********************************************************************************************************************/
    107 #if defined(_MSC_VER) && (_MSC_VER < 1300)
    108     typedef signed char int8_t;
    109     typedef unsigned char uint8_t;
    110     typedef signed short int16_t;
    111     typedef unsigned short uint16_t;
    112     typedef signed int int32_t;
    113     typedef unsigned int uint32_t;
    114     typedef signed long long int64_t;
    115     typedef unsigned long long uint64_t;
    116 #elif defined(_MSC_VER)
    117     typedef signed __int8 int8_t;
    118     typedef unsigned __int8 uint8_t;
    119     typedef signed __int16 int16_t;
    120     typedef unsigned __int16 uint16_t;
    121     typedef signed __int32 int32_t;
    122     typedef unsigned __int32 uint32_t;
    123 
    124     typedef signed long long int64_t;
    125     typedef unsigned long long uint64_t;
    126 #else
    127 #   include <stdint.h>
    128 #   include <limits.h>
    129 #endif
    130 
    131 typedef union   __m64_128 {
    132     uint64_t m64_u64[1];
    133     float m64_f32[2];
    134     int8_t m64_i8[8];
    135     int16_t m64_i16[4];
    136     int32_t m64_i32[2];
    137     int64_t m64_i64[1];
    138     uint8_t m64_u8[8];
    139     uint16_t m64_u16[4];
    140     uint32_t m64_u32[2];
    141 } __m64_128;
    142 
    143 typedef __m64_128 int8x8_t;
    144 typedef __m64_128 uint8x8_t;
    145 typedef __m64_128 int16x4_t;
    146 typedef __m64_128 uint16x4_t;
    147 typedef __m64_128 int32x2_t;
    148 typedef __m64_128 uint32x2_t;
    149 typedef __m64_128 int64x1_t;
    150 typedef __m64_128 uint64x1_t;
    151 typedef __m64_128 poly8x8_t;
    152 typedef __m64_128 poly16x4_t;
    153 
    154 typedef __m64_128 float32x2_t;
    155 typedef __m128 float32x4_t;
    156 
     157 typedef __m128 float16x4_t; //not supported by IA, provided for compatibility
     158 typedef __m128 float16x8_t; //not supported by IA, provided for compatibility
    159 
    160 typedef __m64_128 float64x1_t;
    161 typedef __m128d float64x2_t;
    162 
    163 typedef __m128i int8x16_t;
    164 typedef __m128i int16x8_t;
    165 typedef __m128i int32x4_t;
    166 typedef __m128i int64x2_t;
    167 typedef __m128i uint8x16_t;
    168 typedef __m128i uint16x8_t;
    169 typedef __m128i uint32x4_t;
    170 typedef __m128i uint64x2_t;
    171 typedef __m128i poly8x16_t;
    172 typedef __m128i poly16x8_t;
    173 
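         //Summary of the mapping above (illustrative note): the 64-bit NEON "d-register" types are emulated by
         //the __m64_128 union, whose m64_xx arrays give per-lane access, while the 128-bit "q-register" types
         //alias the native SSE types (__m128, __m128i, __m128d) directly. A minimal sketch:
         //
         //    int16x4_t d;                        //emulated 64-bit vector
         //    d.m64_i16[0] = 1;                   //lane 0
         //    d.m64_i16[3] = -1;                  //lane 3
         //    int16x8_t q = _mm_set1_epi16(7);    //128-bit vector is just an __m128i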
    174 #if defined(_MSC_VER)
    175 #   define SINT_MIN     (-2147483647 - 1) /* min signed int value */
    176 #   define SINT_MAX       2147483647 /* max signed int value */
    177 #else
    178 #   define SINT_MIN     INT_MIN /* min signed int value */
    179 #   define SINT_MAX     INT_MAX /* max signed int value */
    180 #endif
    181 
    182 typedef   float float32_t;
    183 #if !defined(__clang__)
    184 typedef   float __fp16;
    185 #endif
    186 
    187 typedef   double float64_t;
    188 
    189 
    190 typedef  uint8_t poly8_t;
    191 typedef  uint16_t poly16_t;
    192 
    193 
     194 //MSVC compilers (tested up to VS 2012) don't allow passing structures or arrays of __m128x types as function arguments, resulting in
     195 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it, a special trick is needed for the functions that use these types (see the access sketch after the 2-element definitions below).
    196 struct int8x16x2_t {
    197     int8x16_t val[2];
    198 };
    199 struct int16x8x2_t {
    200     int16x8_t val[2];
    201 };
    202 struct int32x4x2_t {
    203     int32x4_t val[2];
    204 };
    205 struct int64x2x2_t {
    206     int64x2_t val[2];
    207 };
     208 //Unfortunately we cannot merge the two 64-bit halves into one 128-bit register because the user must be able to access the val[n] members explicitly!!!
    209 struct int8x8x2_t {
    210     int8x8_t val[2];
    211 };
    212 struct int16x4x2_t {
    213     int16x4_t val[2];
    214 };
    215 struct int32x2x2_t {
    216     int32x2_t val[2];
    217 };
    218 struct int64x1x2_t {
    219     int64x1_t val[2];
    220 };
    221 
    222 typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
    223 typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
    224 typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
    225 typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy
    226 
    227 typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
    228 typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
    229 typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
    230 typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy
    231 
     232 /* to avoid pointer conversions, the following unsigned integer structures are defined via the corresponding signed integer structures above */
    233 typedef struct int8x16x2_t uint8x16x2_t;
    234 typedef struct int16x8x2_t uint16x8x2_t;
    235 typedef struct int32x4x2_t uint32x4x2_t;
    236 typedef struct int64x2x2_t uint64x2x2_t;
    237 typedef struct int8x16x2_t poly8x16x2_t;
    238 typedef struct int16x8x2_t poly16x8x2_t;
    239 
    240 typedef struct int8x8x2_t uint8x8x2_t;
    241 typedef struct int16x4x2_t uint16x4x2_t;
    242 typedef struct int32x2x2_t uint32x2x2_t;
    243 typedef struct int64x1x2_t uint64x1x2_t;
    244 typedef struct int8x8x2_t poly8x8x2_t;
    245 typedef struct int16x4x2_t poly16x4x2_t;
    246 
    247 //float
    248 struct float32x4x2_t {
    249     float32x4_t val[2];
    250 };
    251 struct float16x8x2_t {
    252     float16x8_t val[2];
    253 };
    254 struct float32x2x2_t {
    255     float32x2_t val[2];
    256 };
    257 
    258 typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
    259 typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
    260 typedef struct  float32x2x2_t float32x2x2_t; //for C compilers to make them happy
    261 typedef  float16x8x2_t float16x4x2_t;
    262 
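         //Accessing the 2-element structures defined above (an illustrative sketch; this per-member access is
         //exactly why the two 64-bit halves cannot be merged into one 128-bit register):
         //
         //    int16x8x2_t pair;
         //    pair.val[0] = _mm_set1_epi16(1);                           //first vector of the pair
         //    pair.val[1] = _mm_set1_epi16(2);                           //second vector of the pair
         //    int16x8_t sum = _mm_add_epi16(pair.val[0], pair.val[1]);   //use the members like any __m128i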
    263 //4
    264 struct int8x16x4_t {
    265     int8x16_t val[4];
    266 };
    267 struct int16x8x4_t {
    268     int16x8_t val[4];
    269 };
    270 struct int32x4x4_t {
    271     int32x4_t val[4];
    272 };
    273 struct int64x2x4_t {
    274     int64x2_t val[4];
    275 };
    276 
    277 struct int8x8x4_t {
    278     int8x8_t val[4];
    279 };
    280 struct int16x4x4_t {
    281     int16x4_t val[4];
    282 };
    283 struct int32x2x4_t {
    284     int32x2_t val[4];
    285 };
    286 struct int64x1x4_t {
    287     int64x1_t val[4];
    288 };
    289 
    290 typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
    291 typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
    292 typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
    293 typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy
    294 
    295 typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
    296 typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
    297 typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
    298 typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy
    299 
     300 /* to avoid pointer conversions, the following unsigned integer structures are defined via the corresponding signed integer structures above: */
    301 typedef struct int8x8x4_t uint8x8x4_t;
    302 typedef struct int16x4x4_t uint16x4x4_t;
    303 typedef struct int32x2x4_t uint32x2x4_t;
    304 typedef struct int64x1x4_t uint64x1x4_t;
    305 typedef struct int8x8x4_t poly8x8x4_t;
    306 typedef struct int16x4x4_t poly16x4x4_t;
    307 
    308 typedef struct int8x16x4_t uint8x16x4_t;
    309 typedef struct int16x8x4_t uint16x8x4_t;
    310 typedef struct int32x4x4_t uint32x4x4_t;
    311 typedef struct int64x2x4_t uint64x2x4_t;
    312 typedef struct int8x16x4_t poly8x16x4_t;
    313 typedef struct int16x8x4_t poly16x8x4_t;
    314 
    315 struct float32x4x4_t {
    316     float32x4_t val[4];
    317 };
    318 struct float16x8x4_t {
    319     float16x8_t val[4];
    320 };
    321 struct float32x2x4_t {
    322     float32x2_t val[4];
    323 };
    324 
    325 typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
    326 typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
    327 typedef struct  float32x2x4_t float32x2x4_t; //for C compilers to make them happy
    328 typedef  float16x8x4_t float16x4x4_t;
    329 
    330 //3
    331 struct int16x8x3_t {
    332     int16x8_t val[3];
    333 };
    334 struct int32x4x3_t {
    335     int32x4_t val[3];
    336 };
    337 struct int64x2x3_t {
    338     int64x2_t val[3];
    339 };
    340 struct int8x16x3_t {
    341     int8x16_t val[3];
    342 };
    343 
    344 struct int16x4x3_t {
    345     int16x4_t val[3];
    346 };
    347 struct int32x2x3_t {
    348     int32x2_t val[3];
    349 };
    350 struct int64x1x3_t {
    351     int64x1_t val[3];
    352 };
    353 struct int8x8x3_t {
    354     int8x8_t val[3];
    355 };
    356 typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
    357 typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
    358 typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
    359 typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy
    360 
    361 typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
    362 typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
    363 typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
    364 typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy
    365 
    366 
     367 /* to avoid pointer conversions, the following unsigned integer structures are defined via the corresponding signed integer structures above: */
    368 typedef struct int8x16x3_t uint8x16x3_t;
    369 typedef struct int16x8x3_t uint16x8x3_t;
    370 typedef struct int32x4x3_t uint32x4x3_t;
    371 typedef struct int64x2x3_t uint64x2x3_t;
    372 typedef struct int8x16x3_t poly8x16x3_t;
    373 typedef struct int16x8x3_t poly16x8x3_t;
    374 typedef struct  int8x8x3_t uint8x8x3_t;
    375 typedef struct  int16x4x3_t uint16x4x3_t;
    376 typedef struct  int32x2x3_t uint32x2x3_t;
    377 typedef struct  int64x1x3_t uint64x1x3_t;
    378 typedef struct  int8x8x3_t poly8x8x3_t;
    379 typedef struct  int16x4x3_t poly16x4x3_t;
    380 
    381 //float
    382 struct float32x4x3_t {
    383     float32x4_t val[3];
    384 };
    385 struct float32x2x3_t {
    386     float32x2_t val[3];
    387 };
    388 struct float16x8x3_t {
    389     float16x8_t val[3];
    390 };
    391 
    392 typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
    393 typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
    394 typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
    395 typedef  float16x8x3_t float16x4x3_t;
    396 
    397 
    398 //****************************************************************************
    399 //****** Porting auxiliary macros ********************************************
    400 
    401 //** floating point related macros **
    402 #define _M128i(a) _mm_castps_si128(a)
    403 #define _M128(a) _mm_castsi128_ps(a)
     404 //here the most efficient implementation depends on the compiler and on whether the build is 32- or 64-bit
    405 #if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER  >= 1500) )
    406 #   define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
    407 #   define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
    408 #   define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
    409 #else
     410    //for 32-bit gcc and Microsoft compiler builds
    411 #   define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
    412 #   define _M64(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), inp)
    413 #   define _M64f(out, inp)  _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
    414 #endif
    415 #define _pM128(a) _mm_castsi128_ps(_pM128i(a))
    416 
    417 #define return64(a)  _M64(res64,a); return res64;
    418 #define return64f(a)  _M64f(res64,a); return res64;
    419 
    420 #define _Ui64(a) (*(uint64_t*)&(a))
    421 #define _UNSIGNED_T(a) u ## a
    422 
    423 #define _SIGNBIT64 ((uint64_t)1 << 63)
    424 #define _SWAP_HI_LOW32  (2 | (3 << 2) | (0 << 4) | (1 << 6))
    425 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) )
    426 
    427 #define  _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
    428 #define  _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"
    429 
    430 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    431 #define __constrange(min,max)  const
    432 #define __transfersize(size)
    433 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    434 
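         //How the helper macros above fit together for a 64-bit ("d-register") operation - an illustrative
         //sketch only, not necessarily how the actual implementations are written ("example_vadd_s8" is a
         //hypothetical name):
         //
         //    _NEON2SSE_INLINE int8x8_t example_vadd_s8(int8x8_t a, int8x8_t b)
         //    {
         //        int8x8_t res64;                                      //required by the return64 macro
         //        return64(_mm_add_epi8(_pM128i(a), _pM128i(b)));      //widen to __m128i, add, keep low 64 bits
         //    }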
    435 //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
    436 _NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7,  9, 11, 13, 15 };
    437 _NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8,  9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
    438 //&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
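         //These tables are shuffle-control masks for _mm_shuffle_epi8 (SSSE3): mask8_16_even_odd gathers the
         //even-numbered bytes into the low 8 bytes and the odd-numbered bytes into the high 8 bytes, and
         //mask8_32_even_odd does the same at 16-bit granularity - a common de-interleaving step. For example:
         //
         //    __m128i deinterleaved = _mm_shuffle_epi8(v, *(__m128i*)mask8_16_even_odd);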
    439 
    440 //*************************************************************************
    441 //*************************************************************************
    442 //*********  Functions declarations as declared in original arm_neon.h *****
    443 //*************************************************************************
    444 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
    445 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
    446 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
    447 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
    448 _NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
    449 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
    450 _NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
    451 _NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
    452 _NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
    453 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
    454 _NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
    455 _NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
    456 _NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
    457 _NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
    458 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
    459 _NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
    460 _NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
    461 _NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
    462 _NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
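         //Illustrative note: the q-register adds have direct SSE2 counterparts (vaddq_s8/u8 -> _mm_add_epi8,
         //vaddq_s16/u16 -> _mm_add_epi16, vaddq_s32/u32 -> _mm_add_epi32, vaddq_s64/u64 -> _mm_add_epi64,
         //vaddq_f32 -> _mm_add_ps), while the d-register forms reuse the same instructions through the 64-bit
         //emulation sketched earlier. For example ("example_vaddq_s32" is a hypothetical name):
         //
         //    _NEON2SSE_INLINE int32x4_t example_vaddq_s32(int32x4_t a, int32x4_t b)
         //    {
         //        return _mm_add_epi32(a, b);     //wraparound add, identical semantics for signed and unsigned
         //    }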
    463 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
    464 _NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
    465 _NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
    466 _NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
    467 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
    468 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
    469 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
     470 //Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i]
    471 _NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
    472 _NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
    473 _NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
    474 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
    475 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
    476 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
    477 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
    478 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
    479 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
    480 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
    481 _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
    482 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
    483 _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
    484 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
    485 _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
    486 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
    487 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
    488 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
    489 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
    490 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
    491 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
    492 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
    493 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
    494 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
    495 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
    496 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
    497 _NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
    498 _NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
    499 _NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
    500 _NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
    501 _NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
    502 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
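         //Illustrative note: for unsigned 8- and 16-bit lanes the rounding halving add matches the SSE2 average
         //instructions exactly, since _mm_avg_epu8/_mm_avg_epu16 also compute (a + b + 1) >> 1 per lane; the
         //signed and 32-bit variants need additional work. For example ("example_vrhaddq_u8" is hypothetical):
         //
         //    _NEON2SSE_INLINE uint8x16_t example_vrhaddq_u8(uint8x16_t a, uint8x16_t b)
         //    {
         //        return _mm_avg_epu8(a, b);
         //    }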
    503 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
    504 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
    505 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
    506 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
    507 _NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
    508 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
    509 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
    510 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
    511 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
    512 _NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
    513 _NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
    514 _NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
    515 _NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
    516 _NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
    517 _NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
    518 _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
    519 _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
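         //Illustrative note: the 8- and 16-bit saturating adds map directly onto the SSE2 saturating
         //instructions (_mm_adds_epi8, _mm_adds_epi16, _mm_adds_epu8, _mm_adds_epu16); 32- and 64-bit
         //saturation has no single SSE instruction and must be emulated. For example ("example_vqaddq_s16"
         //is a hypothetical name):
         //
         //    _NEON2SSE_INLINE int16x8_t example_vqaddq_s16(int16x8_t a, int16x8_t b)
         //    {
         //        return _mm_adds_epi16(a, b);
         //    }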
     520 //Vector add high half (narrowing): vaddhn -> Vr[i]:=high half of (Va[i]+Vb[i])
    521 _NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
    522 _NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
    523 _NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
    524 _NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
    525 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
    526 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
    527 //Vector rounding add high half: vraddhn
    528 _NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
    529 _NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
    530 _NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
    531 _NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
    532 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
    533 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
    534 //Multiplication
    535 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
    536 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
    537 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
    538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
    539 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
    540 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
    541 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
    542 _NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
    543 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
    544 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
    545 _NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
    546 _NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
    547 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
    548 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
    549 _NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
    550 _NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
    551 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
    552 //multiply lane
    553 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
    554 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
    555 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
    556 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
    557 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
    558 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
    559 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
    560 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
    561 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
    562 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
    563 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
    564 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
    565 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
    566 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
    567 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
    568 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
    569 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
    570 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
    571 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
    572 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
    573 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
    574 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
    575 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
    576 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
    577 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
    578 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
    579 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
    580 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
    581 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
    582 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
    583 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
    584 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
    585 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
    586 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
    587 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
    588 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
    589 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
    590 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
    591 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
    592 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
    593 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
    594 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
    595 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
    596 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
    597 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
    598 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
    599 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
    600 //Vector multiply subtract long
    601 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
    602 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
    603 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
    604 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
    605 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
    606 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
    607 //Vector saturating doubling multiply high
    608 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
    609 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
    610 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
    611 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
    612 //Vector saturating rounding doubling multiply high
    613 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
    614 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
    615 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
    616 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
    617 //Vector saturating doubling multiply accumulate long
    618 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
    619 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
    620 //Vector saturating doubling multiply subtract long
    621 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
    622 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
    623 //Vector long multiply
    624 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
    625 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
    626 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
    627 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
    628 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
    629 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
    630 _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
    631 //Vector saturating doubling long multiply
    632 _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
    633 _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
    634 //Subtraction
    635 //Vector subtract
    636 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
    637 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
    638 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
    639 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
    640 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
    641 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
    642 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
    643 _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
    644 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
    645 _NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
    646 _NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
    647 _NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
    648 _NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
    649 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
    650 _NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
    651 _NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
    652 _NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
    653 _NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
     654 //Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i]
    655 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
    656 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
    657 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
    658 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
    659 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
    660 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
     661 //Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
    662 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
    663 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
    664 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
    665 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
    666 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
    667 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
    668 //Vector saturating subtract
    669 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
    670 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
    671 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
    672 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
    673 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
    674 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
    675 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
    676 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
    677 _NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
    678 _NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
    679 _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
    680 _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
    681 _NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
    682 _NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
    683 _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
    684 _NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
    685 //Vector halving subtract
    686 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
    687 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
    688 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
    689 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
    690 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
    691 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
    692 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
    693 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
    694 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
    695 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
    696 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
    697 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
    698 //Vector subtract high half
    699 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
    700 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
    701 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
    702 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
    703 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
    704 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
    705 //Vector rounding subtract high half
    706 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
    707 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
    708 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
    709 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
    710 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
    711 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
    712 //Comparison
    713 //Vector compare equal
    714 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
    715 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
    716 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
    717 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
    718 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
    719 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
    720 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
    721 _NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
    722 _NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
    723 _NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
    724 _NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
    725 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
    726 _NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
    727 _NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
    728 _NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
    729 _NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
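         //Illustrative note: both NEON and SSE comparisons produce all-ones/all-zeros lane masks, so the
         //q-register equality compares correspond to _mm_cmpeq_epi8/_mm_cmpeq_epi16/_mm_cmpeq_epi32 and, for
         //floats, to _mm_cmpeq_ps with the result reinterpreted as an integer vector. For example
         //("example_vceqq_f32" is a hypothetical name):
         //
         //    _NEON2SSE_INLINE uint32x4_t example_vceqq_f32(float32x4_t a, float32x4_t b)
         //    {
         //        return _M128i(_mm_cmpeq_ps(a, b));   //cast the float mask to the uint32x4_t (__m128i) type
         //    }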
    730 //Vector compare greater-than or equal
    731 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
    732 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
    733 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
    734 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
    735 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
    736 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
    737 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
    738 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
    739 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
    740 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
    741 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
    742 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
    743 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
    744 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
    745 //Vector compare less-than or equal
    746 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
    747 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
    748 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
    749 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
    750 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
    751 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
    752 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
    753 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
    754 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
    755 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
    756 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
    757 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
    758 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
    759 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
    760 //Vector compare greater-than
    761 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
    762 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
    763 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
    764 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
    765 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
    766 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
    767 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
    768 _NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
    769 _NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
    770 _NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
    771 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
    772 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
    773 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
    774 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
    775 //Vector compare less-than
    776 _NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
    777 _NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
    778 _NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
    779 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
    780 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
    781 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
    782 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
    783 _NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
    784 _NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
    785 _NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
    786 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
    787 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
    788 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
    789 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
    790 //Vector compare absolute greater-than or equal
    791 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
    792 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
    793 //Vector compare absolute less-than or equal
    794 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
    795 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
    796 //Vector compare absolute greater-than
    797 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
    798 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
    799 //Vector compare absolute less-than
    800 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
    801 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
    802 //Vector test bits
    803 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
    804 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
    805 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
    806 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
    807 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
    808 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
    809 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
    810 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
    811 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
    812 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
    813 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
    814 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
    815 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
    816 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
    817 //Absolute difference
    818 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
    819 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
    820 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
    821 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
    822 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
    823 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
    824 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
    825 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
    826 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
    827 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
    828 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
    829 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
    830 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
    831 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
    832 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
    833 //Absolute difference - long
    834 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
    835 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
    836 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
    837 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
    838 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
    839 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
    840 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
    841 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
    842 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
    843 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
    844 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
    845 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
    846 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
    847 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
    848 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
    849 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
    850 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
    851 _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
    852 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
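        //Usage sketch (illustrative only): running difference accumulator, acc[i] += |b[i] - c[i]|.
        //"acc", "row_a" and "row_b" are placeholders; note the 8-bit sums wrap modulo 256, so wider types (or vabal) are
        //needed for long accumulations.
        //    uint8x16_t acc = vdupq_n_u8(0);                      // clear the accumulator once before the loop
        //    acc = vabaq_u8(acc, vld1q_u8(row_a), vld1q_u8(row_b)); // acc[i] += |row_a[i] - row_b[i]|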
    853 //Absolute difference and accumulate - long
    854 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
    855 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
    856 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
    857 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
    858 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
    859 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
    860 //Max/Min
    861 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
    862 _NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
    863 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
    864 _NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
    865 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
    866 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
    867 _NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
    868 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
    869 _NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
    870 _NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
    871 _NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
    872 _NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
    873 _NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
    874 _NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
    875 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
    876 
    877 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
    878 
    879 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
    880 _NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
    881 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
    882 _NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
    883 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
    884 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
    885 _NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
    886 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
    887 _NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
    888 _NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
    889 _NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
    890 _NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
    891 _NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
    892 _NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
    893 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
    894 
    895 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
    896 
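        //Usage sketch (illustrative only): clamping four floats to a [lo, hi] range with vmaxq/vminq.
        //"src", "lo" and "hi" are placeholders; vdupq_n_f32 is declared elsewhere in this header.
        //    float32x4_t v       = vld1q_f32(src);
        //    float32x4_t clamped = vminq_f32(vmaxq_f32(v, vdupq_n_f32(lo)), vdupq_n_f32(hi));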
    897 //Pairwise addition
    898 //Pairwise add
    899 _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
    900 _NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
    901 _NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
    902 _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
    903 _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
    904 _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
    905 _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
    906 //Long pairwise add
    907 _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
    908 _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
    909 _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
    910 _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
    911 _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
    912 _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
    913 _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
    914 _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
    915 _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
    916 _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
    917 _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
    918 _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
    919 //Long pairwise add and accumulate
    920 _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0
    921 _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0
    922 _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
    923 _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0
    924 _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.U16 d0,d0
    925 _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
    926 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
    927 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
    928 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
    929 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
    930 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0
    931 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
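        //Usage sketch (illustrative only): widening horizontal accumulation of bytes without overflow, the typical
        //vpaddl/vpadal pattern. "src" is a placeholder pointer and "sum16" a running total of 16-bit lanes.
        //    uint8x16_t bytes  = vld1q_u8(src);
        //    uint16x8_t sum16  = vpaddlq_u8(bytes);               // eight 16-bit pair sums
        //    sum16 = vpadalq_u8(sum16, vld1q_u8(src + 16));       // add the pair sums of the next 16 bytes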
    932 //Folding maximum vpmax -> takes maximum of adjacent pairs
    933 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
    934 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
    935 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
    936 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
    937 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0
    938 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
    939 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
    940 //Folding minimum vpmin -> takes minimum of adjacent pairs
    941 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
    942 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
    943 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
    944 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
    945 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0
    946 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
    947 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
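        //Usage sketch (illustrative only): reducing a float32x4_t "v" to its largest element with two folding steps;
        //vget_low_f32/vget_high_f32 are declared elsewhere in this header.
        //    float32x2_t m = vpmax_f32(vget_low_f32(v), vget_high_f32(v)); // max of lanes {0,1} and of lanes {2,3}
        //    m = vpmax_f32(m, m);                                 // lane 0 now holds the overall maximum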
    948 //Reciprocal/Sqrt
    949 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
    950 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
    951 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
    952 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
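        //Usage sketch (illustrative only): one Newton-Raphson refinement of a reciprocal estimate, the intended use of
        //vrecps. vrecpe_f32 (the initial estimate) and vmul_f32 are declared elsewhere in this header; "x" is a placeholder.
        //    float32x2_t r = vrecpe_f32(x);                       // coarse 1/x estimate
        //    r = vmul_f32(r, vrecps_f32(x, r));                   // r *= (2 - x*r), roughly doubling the precision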
    953 //Shifts by signed variable
    954 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right)
    955 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
    956 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
    957 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
    958 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
    959 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
    960 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0
    961 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
    962 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
    963 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
    964 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
    965 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
    966 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
    967 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
    968 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0
    969 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
    970 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
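        //Usage sketch (illustrative only): per-lane variable shift; negative counts shift right instead of left.
        //"shift_counts" and "values" are placeholder pointers.
        //    int32x4_t shifts = vld1q_s32(shift_counts);          // e.g. { 1, -2, 0, 3 }
        //    int32x4_t scaled = vshlq_s32(vld1q_s32(values), shifts); // values[i] << shifts[i], or >> -shifts[i]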
    971 //Vector saturating shift left: (negative values shift right)
    972 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
    973 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
    974 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
    975 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
    976 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
    977 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0
    978 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
    979 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
    980 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
    981 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
    982 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
    983 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
    984 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
    985 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0
    986 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
    987 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
    988 //Vector rounding shift left: (negative values shift right)
    989 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
    990 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
    991 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
    992 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
    993 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
    994 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0
    995 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
    996 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
    997 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
    998 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
    999 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
   1000 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
   1001 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
   1002 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0
   1003 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
   1004 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
   1005 //Vector saturating rounding shift left: (negative values shift right)
   1006 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
   1007 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
   1008 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
   1009 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
   1010 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
   1011 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0
   1012 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
   1013 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
   1014 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
   1015 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
   1016 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
   1017 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
   1018 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
   1019 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0
   1020 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
   1021 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
   1022 //Shifts by a constant
   1023 //Vector shift right by constant
   1024 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
   1025 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
   1026 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
   1027 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
   1028 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
   1029 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16
   1030 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
   1031 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
   1032 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
   1033 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
   1034 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
   1035 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
   1036 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
   1037 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16
   1038 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
   1039 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
   1040 //Vector shift left by constant
   1041 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   1042 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   1043 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   1044 _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   1045 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   1046 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   1047 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   1048 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   1049 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   1050 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   1051 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   1052 _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   1053 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   1054 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   1055 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   1056 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
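        //Usage sketch (illustrative only): constant shifts as cheap multiplication/division by powers of two.
        //"src" is a placeholder pointer.
        //    int16x8_t v    = vld1q_s16(src);
        //    int16x8_t dbl  = vshlq_n_s16(v, 1);                  // v * 2
        //    int16x8_t half = vshrq_n_s16(v, 1);                  // v / 2, rounding toward minus infinity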
   1057 //Vector rounding shift right by constant
   1058 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
   1059 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
   1060 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
   1061 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
   1062 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
   1063 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16
   1064 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
   1065 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
   1066 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
   1067 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
   1068 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
   1069 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
   1070 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
   1071 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16
   1072 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
   1073 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
   1074 //Vector shift right by constant and accumulate
   1075 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
   1076 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
   1077 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
   1078 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
   1079 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
   1080 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16
   1081 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
   1082 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
   1083 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
   1084 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
   1085 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
   1086 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
   1087 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
   1088 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16
   1089 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
   1090 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
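        //Usage sketch (illustrative only): vsra folds a scaled term into an accumulator, acc[i] += b[i] >> c.
        //"accum" and "delta" are placeholder pointers.
        //    uint32x4_t acc = vld1q_u32(accum);
        //    acc = vsraq_n_u32(acc, vld1q_u32(delta), 4);         // acc[i] += delta[i] / 16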
   1091 //Vector rounding shift right by constant and accumulate
   1092 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
   1093 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
   1094 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
   1095 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
   1096 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
   1097 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16
   1098 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
   1099 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
   1100 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
   1101 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
   1102 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
   1103 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
   1104 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
   1105 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16
   1106 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
   1107 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
   1108 //Vector saturating shift left by constant
   1109 _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
   1110 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
   1111 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
   1112 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
   1113 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
   1114 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0
   1115 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
   1116 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
   1117 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
   1118 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
   1119 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
   1120 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
   1121 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
   1122 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0
   1123 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
   1124 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
   1125 //Vector signed->unsigned saturating shift left by constant
   1126 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
   1127 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
   1128 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
   1129 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
   1130 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
   1131 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
   1132 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
   1133 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
   1134 //Vector narrowing shift right by constant
   1135 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   1136 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   1137 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   1138 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   1139 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   1140 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   1141 //Vector signed->unsigned narrowing saturating shift right by constant
   1142 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
   1143 _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
   1144 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
   1145 //Vector signed->unsigned rounding narrowing saturating shift right by constant
   1146 _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
   1147 _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
   1148 _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
   1149 //Vector narrowing saturating shift right by constant
   1150 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
   1151 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
   1152 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
   1153 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8
   1154 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
   1155 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
   1156 //Vector rounding narrowing shift right by constant
   1157 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   1158 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   1159 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   1160 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   1161 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   1162 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   1163 //Vector rounding narrowing saturating shift right by constant
   1164 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
   1165 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
   1166 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
   1167 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8
   1168 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
   1169 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
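        //Usage sketch (illustrative only): packing a 16-bit intermediate back to bytes with rounding and saturation,
        //e.g. after a widening multiply. "prod" is a placeholder int16x8_t result carrying 6 fractional bits.
        //    uint8x8_t pixels = vqrshrun_n_s16(prod, 6);          // (prod + 32) >> 6, clamped to 0..255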
   1170 //Vector widening shift left by constant
   1171 _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
   1172 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
   1173 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
   1174 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
   1175 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0
   1176 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
   1177 //Shifts with insert
   1178 //Vector shift right and insert
   1179 _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   1180 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   1181 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   1182 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   1183 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   1184 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   1185 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   1186 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   1187 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   1188 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   1189 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   1190 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   1191 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   1192 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   1193 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   1194 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   1195 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   1196 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   1197 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   1198 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   1199 //Vector shift left and insert
   1200 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   1201 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   1202 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   1203 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   1204 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   1205 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   1206 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   1207 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   1208 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   1209 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   1210 _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   1211 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   1212 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   1213 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   1214 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   1215 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   1216 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   1217 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   1218 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   1219 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
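        //Usage sketch (illustrative only): packing RGB888 into RGB565 with shift-and-insert. r8/g8/b8 are placeholder
        //uint8x8_t channel vectors; vshll_n_u8 is declared in the widening-shift group above.
        //    uint16x8_t px = vshll_n_u8(r8, 8);                   // red byte into bits 15..8
        //    px = vsriq_n_u16(px, vshll_n_u8(g8, 8), 5);          // keep the top 5 red bits, insert green below them
        //    px = vsriq_n_u16(px, vshll_n_u8(b8, 8), 11);         // keep red+green, insert the top 5 blue bits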
   1220 //Loads of a single vector or lane. Perform loads and stores of a single vector of some type.
   1221 //Load a single vector from memory
   1222 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   1223 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   1224 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   1225 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1226 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   1227 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   1228 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   1229 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1230 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
   1231 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   1232 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   1233 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   1234 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
   1235 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
   1236 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
   1237 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   1238 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
   1239 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
   1240 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
   1241 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   1242 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
   1243 _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
   1244 _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
   1245 _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
   1246 
   1247 _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1248 
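        //Usage sketch (illustrative only): the usual load / compute / store pattern. "src" and "dst" are placeholder
        //float pointers; vmulq_f32 and vdupq_n_f32 are declared elsewhere in this header, vst1q_f32 in the store group below.
        //    float32x4_t v = vld1q_f32(src);                      // load 4 floats (unaligned is fine)
        //    v = vmulq_f32(v, vdupq_n_f32(0.5f));                 // any lane-wise work
        //    vst1q_f32(dst, v);                                   // write 4 floats back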
   1249 //Load a single lane from memory
   1250 _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
   1251 _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   1252 _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   1253 _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
   1254 _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
   1255 _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
   1256 _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
   1257 _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
   1258 _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   1259 _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
   1260 _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
   1261 _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   1262 _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
   1263 _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1264 _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
   1265 _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
   1266 _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8{d0[0]}, [r0]
   1267 _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1268 _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
   1269 _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
   1270 _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   1271 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
   1272 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
   1273 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
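        //Usage sketch (illustrative only): overwriting a single lane from memory, e.g. patching a tail element;
        //the other lanes of "vec" are preserved. "tail_ptr" is a placeholder pointer.
        //    float32x4_t vec = vdupq_n_f32(0.0f);
        //    vec = vld1q_lane_f32(tail_ptr, vec, 0);              // load one float into lane 0 only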
   1274 //Load all lanes of vector with same value from memory
   1275 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1276 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1277 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1278 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   1279 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1280 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1281 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1282 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   1283 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   1284 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1285 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1286 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1287 _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1288 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1289 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1290 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   1291 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1292 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1293 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1294 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   1295 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   1296 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   1297 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   1298 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   1299 //Store a single vector or lane. Stores all lanes or a single lane of a vector.
   1300 //Store a single vector into memory
   1301 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
   1302 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
   1303 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
   1304 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
   1305 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
   1306 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
   1307 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
   1308 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
   1309 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
   1310 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
   1311 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
   1312 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
   1313 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
   1314 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
   1315 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
   1316 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
   1317 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
   1318 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
   1319 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
   1320 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
   1321 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
   1322 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
   1323 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
   1324 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
   1325 //Store a lane of a vector into memory
   1326 //Loads of an N-element structure
   1327 //Load N-element structure from memory
   1328 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   1329 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   1330 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   1331 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   1332 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   1333 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   1334 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
   1335 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   1336 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   1337 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   1338 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   1339 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   1340 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   1341 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1342 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   1343 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   1344 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   1345 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1346 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
   1347 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   1348 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   1349 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   1350 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   1351 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1352 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   1353 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   1354 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1355 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   1356 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1357 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   1358 _NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   1359 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   1360 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   1361 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1362 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   1363 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1364 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   1365 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1366 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   1367 _NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1368 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1369 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   1370 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   1371 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   1372 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   1373 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1374 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   1375 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   1376 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1377 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   1378 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1379 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   1380 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   1381 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   1382 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   1383 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   1384 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   1385 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1386 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   1387 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   1388 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
    1389 _NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1390 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   1391 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   1392 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   1393 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
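//Usage sketch (editor's illustration, not part of the original header): the vld3/vld4 structure loads de-interleave packed data into per-component vectors.
//The example_* helper below is hypothetical and assumes a caller-supplied buffer of at least 24 RGB bytes.
//    static uint8x8_t example_extract_red(uint8_t const * rgb)
//    {
//        uint8x8x3_t px = vld3_u8(rgb); //val[0]=R, val[1]=G, val[2]=B for 8 pixels
//        return px.val[0];
//    }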
    1394 //Load all lanes of an N-element structure with the same value from memory
   1395 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   1396 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1397 _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   1398 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1399 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   1400 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1401 _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   1402 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   1403 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1404 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   1405 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   1406 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   1407 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   1408 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1409 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   1410 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1411 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   1412 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1413 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
    1414 _NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   1415 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1416 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   1417 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   1418 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   1419 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   1420 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1421 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   1422 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1423 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   1424 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1425 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
    1426 _NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   1427 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   1428 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   1429 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   1430 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
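//Usage sketch (editor's illustration, hypothetical helper): the _dup loads read one element per vector and replicate it across all lanes,
//e.g. to broadcast an interleaved coefficient pair; xy is assumed to point to 2 elements.
//    static uint16x4x2_t example_broadcast_pair(uint16_t const * xy)
//    {
//        return vld2_dup_u16(xy); //val[0] = {x,x,x,x}, val[1] = {y,y,y,y}
//    }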
   1431 //Load a single lane of N-element structure from memory
    1432 //The _ptr functions below take a pointer to the src structure to work around MSVC error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned
   1433 _NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1434 _NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
   1435 _NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1436 _NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
   1437 _NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1438 _NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0]
   1439 _NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0]
   1440 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
   1441 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   1442 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   1443 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
   1444 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0]
   1445 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0]
   1446 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   1447 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   1448 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t  src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0]
   1449 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t  src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   1450 _NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1451 _NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   1452 _NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1453 _NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   1454 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1455 _NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   1456 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   1457 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   1458 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1459 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   1460 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   1461 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1462 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   1463 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1464 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   1465 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   1466 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   1467 _NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1468 _NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1469 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1470 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1471 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1472 _NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1473 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1474 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1475 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1476 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1477 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1478 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1479 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1480 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1481 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1482 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   1483 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
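//Usage sketch (editor's illustration, hypothetical helper): the q-form lane loads above take the src structure by pointer (see the C2719 note);
//the loaded elements replace the selected lane of each member vector, all other lanes are preserved.
//    static uint16x8x2_t example_reload_lane3(uint16_t const * two_u16, uint16x8x2_t cur)
//    {
//        return vld2q_lane_u16_ptr(two_u16, &cur, 3); //lane must be a compile-time constant in 0..7
//    }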
   1484 //Store N-element structure to memory
   1485 _NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
   1486 _NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1487 _NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
   1488 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
   1489 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1490 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
   1491 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1492 _NEON2SSESTORAGE void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0]
   1493 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0]
   1494 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0]
   1495 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0]
   1496 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   1497 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   1498 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0]
   1499 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0]
   1500 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   1501 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   1502 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0]
   1503 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0]
   1504 _NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0]
   1505 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0]
   1506 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   1507 _NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
   1508 _NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1509 _NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
   1510 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
   1511 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1512 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
   1513 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1514 _NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0]
   1515 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0]
   1516 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0]
   1517 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
   1518 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
   1519 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
   1520 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
   1521 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
   1522 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
   1523 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
   1524 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
   1525 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   1526 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
   1527 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0]
   1528 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0]
   1529 _NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
   1530 _NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1531 _NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
   1532 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
   1533 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1534 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
   1535 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1536 _NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0]
   1537 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0]
   1538 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0]
   1539 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
   1540 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1541 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
   1542 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
   1543 _NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
   1544 _NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1545 _NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
   1546 _NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
   1547 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0]
   1548 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0]
   1549 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0]
   1550 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0]
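//Usage sketch (editor's illustration, hypothetical helper): the structure stores interleave the val[] members back into memory;
//here three planar vectors are written as packed RGB, assuming at least 24 bytes are available at rgb.
//    static void example_store_rgb(uint8_t * rgb, uint8x8_t r, uint8x8_t g, uint8x8_t b)
//    {
//        uint8x8x3_t px;
//        px.val[0] = r; px.val[1] = g; px.val[2] = b;
//        vst3_u8(rgb, px); //writes R0,G0,B0,R1,G1,B1,...
//    }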
   1551 //Store a single lane of N-element structure to memory
   1552 _NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1553 _NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
   1554 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1555 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0]
   1556 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1557 _NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0]
   1558 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0]
   1559 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
   1560 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1561 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
   1562 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0]
   1563 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1564 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
   1565 _NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1566 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0]
   1567 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0]
   1568 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0]
   1569 _NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1570 _NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
   1571 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1572 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0]
   1573 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1574 _NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   1575 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0]
   1576 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
   1577 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1578 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1579 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0]
   1580 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1581 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1582 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1583 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0]
   1584 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0]
   1585 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0]
   1586 _NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1587 _NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1588 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1589 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1590 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1591 _NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   1592 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0]
   1593 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1594 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1595 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1596 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0]
   1597 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1598 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1599 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1600 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1601 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0]
   1602 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0]
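//Usage sketch (editor's illustration, hypothetical helper): the lane stores write one element from each val[] member,
//so a single interleaved group goes to memory; dst is assumed to hold 2 bytes.
//    static void example_store_group0(uint8_t * dst, uint8x8x2_t planes)
//    {
//        vst2_lane_u8(dst, planes, 0); //writes planes.val[0][0] then planes.val[1][0]
//    }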
   1603 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector.
   1604 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   1605 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
   1606 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   1607 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
   1608 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
   1609 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   1610 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   1611 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0]
   1612 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   1613 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   1614 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
   1615 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   1616 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
   1617 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
   1618 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   1619 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   1620 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0]
   1621 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   1622 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   1623 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   1624 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
   1625 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
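//Usage sketch (editor's illustration, hypothetical helper): lane extraction moves one element to a scalar; the lane index must be a compile-time constant.
//    static int32_t example_sum_low_lanes(int32x4_t v)
//    {
//        return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1);
//    }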
    1626 //Set lanes within a vector. These intrinsics set a single lane (element) within a vector to the given value.
   1627 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   1628 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   1629 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   1630 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   1631 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   1632 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   1633 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   1634 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   1635 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   1636 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   1637 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   1638 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   1639 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   1640 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   1641 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   1642 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   1643 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   1644 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   1645 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   1646 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   1647 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
   1648 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
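//Usage sketch (editor's illustration, hypothetical helper): lane insertion returns a copy of the vector with the selected lane replaced; the input vector is not modified.
//    static float32x4_t example_zero_lane0(float32x4_t v)
//    {
//        return vsetq_lane_f32(0.0f, v, 0);
//    }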
   1649 //Initialize a vector from a literal bit pattern.
   1650 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
   1651 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
   1652 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
   1653 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
   1654 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
   1655 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
   1656 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
   1657 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
   1658 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
   1659 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
   1660 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
   1661 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
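//Usage sketch (editor's illustration, hypothetical helper): vcreate reinterprets a 64-bit literal as a vector, with lane 0 taken from the least significant bits.
//    static uint8x8_t example_iota(void)
//    {
//        return vcreate_u8(0x0706050403020100ULL); //lanes 0..7 = 0x00..0x07
//    }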
    1662 //Set all lanes to the same value
    1663 //Load all lanes of the vector with the same literal value
   1664 _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
   1665 _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
   1666 _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
   1667 _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0
   1668 _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0
   1669 _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0
   1670 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
   1671 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
   1672 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
   1673 _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
   1674 _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
   1675 _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
   1676 _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
   1677 _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
   1678 _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
   1679 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
   1680 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
   1681 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
   1682 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
   1683 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
   1684 _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
   1685 _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
   1686 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
   1687 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
   1688 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
   1689 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
   1690 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
   1691 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
   1692 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
   1693 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
   1694 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
   1695 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
   1696 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
   1697 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
   1698 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
   1699 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
   1700 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
   1701 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
   1702 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
   1703 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
   1704 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
   1705 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
   1706 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
   1707 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
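//Usage sketch (editor's illustration, hypothetical helper): vdup_n/vmov_n broadcast one scalar to every lane; the vmov_n forms are equivalent to the vdup_n forms.
//    static int16x8_t example_splat(int16_t x)
//    {
//        return vdupq_n_s16(x); //all 8 lanes set to x
//    }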
    1708 //Load all lanes of the vector with the value of a lane of a vector
   1709 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   1710 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   1711 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   1712 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   1713 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   1714 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   1715 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   1716 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   1717 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   1718 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   1719 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   1720 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   1721 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   1722 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   1723 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   1724 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   1725 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   1726 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   1727 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   1728 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   1729 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
   1730 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
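//Usage sketch (editor's illustration, hypothetical helper): vdup_lane broadcasts one lane of an existing 64-bit vector to every lane of the result.
//    static float32x4_t example_broadcast_lane1(float32x2_t v)
//    {
//        return vdupq_lane_f32(v, 1); //all 4 result lanes = v[1]
//    }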
    1731 //Combining vectors. These intrinsics join two 64-bit vectors into a single 128-bit vector.
   1732 _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
   1733 _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
   1734 _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
   1735 _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
   1736 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
   1737 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
   1738 _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
   1739 _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
   1740 _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
   1741 _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
   1742 _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
   1743 _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
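//Usage sketch (editor's illustration, hypothetical helper): vcombine places 'low' in lanes 0..7 and 'high' in lanes 8..15 of the 128-bit result.
//    static uint8x16_t example_join(uint8x8_t low, uint8x8_t high)
//    {
//        return vcombine_u8(low, high);
//    }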
    1744 //Splitting vectors. These intrinsics split a 128-bit vector into two component 64-bit vectors.
   1745 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
   1746 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
   1747 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
   1748 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
   1749 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
   1750 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
   1751 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
   1752 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
   1753 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
   1754 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
   1755 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
   1756 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
   1757 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
   1758 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
   1759 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
   1760 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
   1761 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
   1762 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
   1763 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
   1764 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
   1765 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
   1766 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
   1767 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
   1768 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
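//Usage sketch (editor's illustration, hypothetical helper): vget_low/vget_high split a 128-bit vector; combined with vcombine they can, for example, swap the two halves.
//    static uint32x4_t example_swap_halves(uint32x4_t v)
//    {
//        return vcombine_u32(vget_high_u32(v), vget_low_u32(v));
//    }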
    1769 //Converting vectors. These intrinsics convert vectors between integer, fixed-point and floating-point formats.
   1770 //Convert from float
   1771 _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
   1772 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
   1773 _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
   1774 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
   1775 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
   1776 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
   1777 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
   1778 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
   1779 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
   1780 //Convert to float
   1781 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
   1782 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
   1783 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
   1784 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
   1785 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
   1786 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
   1787 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
   1788 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
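//Usage sketch (editor's illustration, hypothetical helper): the _n_ conversions treat the integer as fixed point with 'b' fractional bits, e.g. Q16.16 data;
//the plain conversions to integer truncate toward zero.
//    static float32x4_t example_q16_16_to_float(int32x4_t q)
//    {
//        return vcvtq_n_f32_s32(q, 16); //scales by 2^-16 during the conversion
//    }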
   1789 //Convert between floats
   1790 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
   1791 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
   1792 //Vector narrow integer
   1793 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
   1794 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
   1795 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
   1796 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
   1797 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
   1798 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
   1799 //Vector long move
   1800 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
   1801 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
   1802 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
   1803 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
   1804 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
   1805 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
   1806 //Vector saturating narrow integer
   1807 _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
   1808 _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
   1809 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
   1810 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
   1811 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
   1812 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
   1813 //Vector saturating narrow integer signed->unsigned
   1814 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
   1815 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
   1816 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
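//Usage sketch (editor's illustration, hypothetical helper): a common pattern is to widen with vmovl, do the arithmetic in the wider type,
//then narrow back with the saturating vqmovn so results clamp instead of wrapping.
//    static uint8x8_t example_add_saturating(uint8x8_t a, uint8x8_t b)
//    {
//        uint16x8_t wide = vaddq_u16(vmovl_u8(a), vmovl_u8(b));
//        return vqmovn_u16(wide); //values above 255 become 255
//    }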
   1817 //Table look up
   1818 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   1819 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0
   1820 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   1821 //Extended table look up intrinsics
   1822 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   1823 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
   1824 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   1825 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   1826 _NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   1827 _NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   1828 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   1829 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   1830 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   1831 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   1832 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   1833 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
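//Usage sketch (editor's illustration, hypothetical helper): vtbl1 uses each byte of the second operand as an index into the first;
//indices outside 0..7 yield 0, while the vtbx forms keep the corresponding byte of their first operand instead.
//    static uint8x8_t example_reverse_bytes(uint8x8_t v)
//    {
//        static const uint8_t idx[8] = {7, 6, 5, 4, 3, 2, 1, 0};
//        return vtbl1_u8(v, vld1_u8(idx));
//    }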
   1834 //Operations with a scalar value
   1835 //Vector multiply accumulate with scalar
   1836 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
   1837 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
   1838 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0]
   1839 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0]
   1840 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0]
   1841 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0]
   1842 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0]
   1843 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0]
   1844 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0]
   1845 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0]
   1846 //Vector widening multiply accumulate with scalar
   1847 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0]
   1848 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0]
   1849 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0]
   1850 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0]
   1851 //Vector widening saturating doubling multiply accumulate with scalar
   1852 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0]
   1853 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0]
   1854 //Vector multiply subtract with scalar
   1855 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
   1856 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
   1857 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0]
   1858 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0]
   1859 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0]
   1860 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0]
   1861 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0]
   1862 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0]
   1863 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0]
   1864 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0]
   1865 //Vector widening multiply subtract with scalar
   1866 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0]
   1867 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0]
   1868 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0]
   1869 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0]
   1870 //Vector widening saturating doubling multiply subtract with scalar
   1871 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0]
   1872 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0]
   1873 //Vector multiply by scalar
   1874 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
   1875 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
   1876 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
   1877 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
   1878 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
   1879 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
   1880 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
   1881 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
   1882 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
   1883 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
   1884 //Vector long multiply with scalar
   1885 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
   1886 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
   1887 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0]
   1888 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
   1889 //Vector long multiply by scalar
   1890 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
   1891 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
   1892 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0]
   1893 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
   1894 //Vector saturating doubling long multiply with scalar
   1895 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
   1896 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
   1897 //Vector saturating doubling long multiply by scalar
   1898 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
   1899 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0]
   1900 //Vector saturating doubling multiply high with scalar
   1901 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0]
   1902 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0]
   1903 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0]
   1904 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0]
   1905 //Vector saturating doubling multiply high by scalar
   1906 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0]
   1907 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0]
   1908 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0]
   1909 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0]
   1910 //Vector saturating rounding doubling multiply high with scalar
   1911 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
   1912 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
   1913 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
   1914 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
   1915 //Vector rounding saturating doubling multiply high by scalar
   1916 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
   1917 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
   1918 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
   1919 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
   1920 //Vector multiply accumulate with scalar
   1921 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
   1922 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
   1923 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
   1924 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
   1925 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
   1926 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
   1927 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
   1928 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
   1929 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
   1930 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
   1931 //Vector widening multiply accumulate with scalar
   1932 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
   1933 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
   1934 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0]
   1935 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
   1936 //Vector widening saturating doubling multiply accumulate with scalar
   1937 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
   1938 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
   1939 //Vector multiply subtract with scalar
   1940 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
   1941 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
   1942 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
   1943 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
   1944 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
   1945 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
   1946 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
   1947 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
   1948 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
   1949 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
   1950 //Vector widening multiply subtract with scalar
   1951 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
   1952 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
   1953 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0]
   1954 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
   1955 //Vector widening saturating doubling multiply subtract with scalar
   1956 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
   1957 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
   1958 //Vector extract
   1959 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   1960 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   1961 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   1962 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   1963 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   1964 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   1965 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   1966 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   1967 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   1968 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   1969 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   1970 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   1971 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   1972 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   1973 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   1974 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   1975 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   1976 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   1977 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   1978 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
   1979 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
    1980 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
    1981 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within each n-bit wide set - see the commented example after these declarations.
   1982 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
   1983 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
   1984 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
   1985 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
   1986 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
   1987 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
   1988 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
   1989 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
   1990 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
   1991 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
   1992 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
   1993 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
   1994 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
   1995 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
   1996 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
   1997 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
   1998 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
   1999 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
   2000 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
   2001 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
   2002 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
   2003 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
   2004 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
   2005 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
   2006 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
   2007 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
   2008 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
   2009 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
   2010 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
   2011 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
   2012 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
   2013 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
   2014 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
   2015 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
   2016 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
   2017 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
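//A commented example of the reversals above (an illustrative sketch, not part of the header; the
//rev64_example name, the include file name and the vld1_u8/vst1_u8 loads/stores provided elsewhere
//in this file are assumptions):
/*
    #include "NEON_2_SSE.h"   //included in place of arm_neon.h

    static void rev64_example(void)
    {
        uint8_t in[8] = {0,1,2,3,4,5,6,7};
        uint8_t out[8];
        uint8x8_t v = vld1_u8(in);   //one 64-bit "set" of eight 8-bit lanes
        v = vrev64_u8(v);            //reverses the 8-bit lanes within the 64-bit set
        vst1_u8(out, v);             //out is expected to hold 7,6,5,4,3,2,1,0
    }
*/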
   2018 //Other single operand arithmetic
   2019 //Absolute: Vd[i] = |Va[i]|
   2020 _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
   2021 _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
   2022 _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
   2023 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
   2024 _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
   2025 _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
   2026 _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
   2027 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
   2028 
   2029 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
   2030 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
   2031 
   2032 //Saturating absolute: Vd[i] = sat(|Va[i]|)
   2033 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
   2034 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
   2035 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
   2036 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
   2037 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
   2038 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
   2039 //Negate: Vd[i] = - Va[i]
    2040 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
    2041 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
    2042 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
    2043 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
    2044 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
    2045 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
    2046 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
    2047 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
   2048 //Saturating Negate: sat(Vd[i] = - Va[i])
    2049 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
    2050 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
    2051 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
    2052 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
    2053 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
    2054 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
   2055 //Count leading sign bits
   2056 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
   2057 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
   2058 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
   2059 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
   2060 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
   2061 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
   2062 //Count leading zeros
   2063 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
   2064 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
   2065 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
   2066 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
   2067 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
   2068 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
   2069 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
   2070 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
   2071 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
   2072 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
   2073 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
   2074 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
   2075 //Count number of set bits
   2076 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
   2077 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
   2078 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
   2079 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
   2080 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
   2081 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
   2082 //Reciprocal estimate
   2083 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
   2084 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
   2085 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
   2086 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
   2087 //Reciprocal square root estimate
   2088 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
   2089 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
   2090 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
   2091 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
   2092 //Logical operations
   2093 //Bitwise not
   2094 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
   2095 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
   2096 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
   2097 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
   2098 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
   2099 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
   2100 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
   2101 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
   2102 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
   2103 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
   2104 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
   2105 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
   2106 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
   2107 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
   2108 //Bitwise and
   2109 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
   2110 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
   2111 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
   2112 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0
   2113 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
   2114 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
   2115 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
   2116 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0
   2117 _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
   2118 _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
   2119 _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
   2120 _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
   2121 _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
   2122 _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
   2123 _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
   2124 _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
   2125 //Bitwise or
   2126 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
   2127 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
   2128 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
   2129 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0
   2130 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
   2131 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
   2132 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
   2133 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0
   2134 _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
   2135 _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
   2136 _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
   2137 _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
   2138 _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
   2139 _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
   2140 _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
   2141 _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
   2142 //Bitwise exclusive or (EOR or XOR)
   2143 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
   2144 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
   2145 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
   2146 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0
   2147 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
   2148 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
   2149 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
   2150 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0
   2151 _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
   2152 _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
   2153 _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
   2154 _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
   2155 _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
   2156 _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
   2157 _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
   2158 _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
   2159 //Bit Clear
   2160 _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
   2161 _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
   2162 _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
   2163 _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
   2164 _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
   2165 _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
   2166 _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
   2167 _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
   2168 _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
   2169 _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
   2170 _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
   2171 _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
   2172 _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
   2173 _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
   2174 _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
   2175 _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
   2176 //Bitwise OR complement
   2177 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0
   2178 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0
   2179 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0
   2180 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
   2181 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0
   2182 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0
   2183 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0
   2184 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
   2185 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
   2186 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
   2187 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
   2188 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
   2189 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
   2190 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
   2191 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
   2192 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
   2193 //Bitwise Select
   2194 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
   2195 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
   2196 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
   2197 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
   2198 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
   2199 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
   2200 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
   2201 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
   2202 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
   2203 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
   2204 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
   2205 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
   2206 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
   2207 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
   2208 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
   2209 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
   2210 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
   2211 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
   2212 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
   2213 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
   2214 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
   2215 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
   2216 //Transposition operations
   2217 //Transpose elements
   2218 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
   2219 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
   2220 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
   2221 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
   2222 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
   2223 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
   2224 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
   2225 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
   2226 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
   2227 _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
   2228 _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
   2229 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
   2230 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
   2231 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
   2232 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
   2233 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
   2234 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
   2235 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
   2236 //Interleave elements
   2237 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
   2238 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
   2239 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
   2240 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
   2241 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
   2242 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
   2243 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
   2244 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
   2245 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
   2246 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
   2247 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
   2248 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
   2249 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
   2250 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
   2251 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
   2252 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
   2253 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
   2254 _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
   2255 //De-Interleave elements
   2256 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
   2257 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
   2258 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
   2259 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
   2260 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
   2261 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
   2262 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
   2263 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
   2264 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
   2265 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
   2266 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
   2267 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
   2268 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
   2269 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
   2270 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
   2271 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
   2272 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
   2273 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
   2274 
   2275 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0
   2276 
   2277 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0
   2278 
   2279 //Sqrt
   2280 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0
   2281 
   2282 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0
   2283 
   2284 
   2285 
   2286 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    2287 // the following macros work around the requirement of some x86 intrinsics that their lane/index parameter be an immediate (compile-time constant).
    2288 // without them, passing a run-time lane index would stop compilation with the "Intrinsic parameter must be an immediate value" error
   2289 //
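//For instance (an illustrative sketch, not part of the header; the get_lane16 name is an assumption):
//on compilers taking the #else branch below, a run-time lane index passed straight to
//_mm_extract_epi16 may be rejected, while the _MM_EXTRACT_EPI16 wrapper expands to a switch
//over all eight lanes and therefore compiles:
/*
    static int16_t get_lane16(__m128i v, int lane)  //lane is known only at run time
    {
        //return _mm_extract_epi16(v, lane);        //may fail: parameter must be an immediate
        return _MM_EXTRACT_EPI16(v, lane);          //OK: dispatches through a switch
    }
*/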
   2290 #if  ( defined (__INTEL_COMPILER)  || defined (__GNUC__) && !defined(__llvm__) )
   2291 #   define _MM_ALIGNR_EPI8 _mm_alignr_epi8
   2292 #   define _MM_EXTRACT_EPI16  (int16_t) _mm_extract_epi16
   2293 #   define _MM_INSERT_EPI16 _mm_insert_epi16
   2294 #   ifdef USE_SSE4
   2295 #       define _MM_EXTRACT_EPI8  _mm_extract_epi8
   2296 #       define _MM_EXTRACT_EPI32  _mm_extract_epi32
   2297 #       define _MM_EXTRACT_PS  _mm_extract_ps
   2298 #       define _MM_INSERT_EPI8  _mm_insert_epi8
   2299 #       define _MM_INSERT_EPI32 _mm_insert_epi32
   2300 #       define _MM_INSERT_PS    _mm_insert_ps
   2301 #       ifdef  _NEON2SSE_64BIT
   2302 #           define _MM_INSERT_EPI64 _mm_insert_epi64
   2303 #           define _MM_EXTRACT_EPI64 _mm_extract_epi64
   2304 #       endif
   2305 #   endif //SSE4
   2306 #else
   2307 #   define _NEON2SSE_COMMA ,
   2308 #   define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \
   2309         switch(LANE)         \
   2310         {                \
   2311         case 0:     return NAME(a b, 0); \
   2312         case 1:     return NAME(a b, 1); \
   2313         case 2:     return NAME(a b, 2); \
   2314         case 3:     return NAME(a b, 3); \
   2315         case 4:     return NAME(a b, 4); \
   2316         case 5:     return NAME(a b, 5); \
   2317         case 6:     return NAME(a b, 6); \
   2318         case 7:     return NAME(a b, 7); \
   2319         case 8:     return NAME(a b, 8); \
   2320         case 9:     return NAME(a b, 9); \
   2321         case 10:    return NAME(a b, 10); \
   2322         case 11:    return NAME(a b, 11); \
   2323         case 12:    return NAME(a b, 12); \
   2324         case 13:    return NAME(a b, 13); \
   2325         case 14:    return NAME(a b, 14); \
   2326         case 15:    return NAME(a b, 15); \
   2327         default:    return NAME(a b, 0); \
   2328         }
   2329 
   2330 #   define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \
   2331         switch(LANE)              \
   2332         {                          \
   2333         case 0:  return NAME(vec p,0); \
   2334         case 1:  return NAME(vec p,1); \
   2335         case 2:  return NAME(vec p,2); \
   2336         case 3:  return NAME(vec p,3); \
   2337         case 4:  return NAME(vec p,4); \
   2338         case 5:  return NAME(vec p,5); \
   2339         case 6:  return NAME(vec p,6); \
   2340         case 7:  return NAME(vec p,7); \
   2341         default: return NAME(vec p,0); \
   2342         }
   2343 
   2344 #   define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \
   2345         switch(LANE)              \
   2346         {                          \
   2347         case case0:  return NAME(vec p,case0); \
   2348         case case1:  return NAME(vec p,case1); \
   2349         case case2:  return NAME(vec p,case2); \
   2350         case case3:  return NAME(vec p,case3); \
   2351         default:     return NAME(vec p,case0); \
   2352         }
   2353 
   2354     _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE)
   2355     {
   2356         _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE)
   2357     }
   2358 
   2359     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI16(__m128i vec, int p, const int LANE)
   2360     {
   2361         _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p)
   2362     }
   2363 
   2364     _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE)
   2365     {
   2366         _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,)
   2367     }
   2368 
   2369 #ifdef USE_SSE4
   2370         _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
   2371         {
   2372             _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,)
   2373         }
   2374 
   2375         _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
   2376         {
   2377             _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,)
   2378         }
   2379 
   2380         _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
   2381         {
   2382             _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE)
   2383         }
   2384 
   2385         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
   2386         {
   2387             _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p)
   2388         }
   2389 
   2390         _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
   2391         {
   2392             _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE)
   2393         }
   2394 
   2395 #ifdef  _NEON2SSE_64BIT
   2396             //the special case of functions available only for SSE4 and 64-bit build.
   2397             _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64(__m128i vec, int p, const int LANE)
   2398             {
   2399                 switch(LANE) {
   2400                 case 0:
   2401                     return _mm_insert_epi64(vec,  p, 0);
   2402                 case 1:
   2403                     return _mm_insert_epi64(vec,  p, 1);
   2404                 default:
   2405                     return _mm_insert_epi64(vec,  p, 0);
   2406                 }
   2407             }
   2408 
   2409             _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE)
   2410             {
   2411                 if (LANE ==0) return _mm_extract_epi64(val, 0);
   2412                 else return _mm_extract_epi64(val, 1);
   2413             }
   2414 #endif
   2415 
   2416         _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
   2417         {
   2418             _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p)
   2419         }
   2420 
   2421 #endif //USE_SSE4
   2422 
    2423 #endif     //#if defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__)
   2424 
   2425 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    2426 // Below are some helper functions used either to "emulate" SSE4 intrinsics on devices limited to SSSE3
    2427 // or to implement some specific, commonly used operations that are missing in SSE
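//A usage sketch (illustrative, not part of the header; the widen_low_bytes name is an assumption):
//the same source works on both paths - with USE_SSE4 defined the helper maps straight to the
//SSE4.1 intrinsic, without it the SSSE3/SSE2 emulation below is used:
/*
    static __m128i widen_low_bytes(__m128i pixels)
    {
        return _MM_CVTEPU8_EPI16(pixels);   //zero-extends the low 8 bytes to eight 16-bit lanes
    }
*/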
   2428 #ifdef USE_SSE4
   2429 #   define _MM_CVTEPU8_EPI16  _mm_cvtepu8_epi16
   2430 #   define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32
   2431 #   define _MM_CVTEPU32_EPI64  _mm_cvtepu32_epi64
   2432 
   2433 #   define _MM_CVTEPI8_EPI16  _mm_cvtepi8_epi16
   2434 #   define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32
   2435 #   define _MM_CVTEPI32_EPI64  _mm_cvtepi32_epi64
   2436 
   2437 #   define _MM_MAX_EPI8  _mm_max_epi8
   2438 #   define _MM_MAX_EPI32 _mm_max_epi32
   2439 #   define _MM_MAX_EPU16 _mm_max_epu16
   2440 #   define _MM_MAX_EPU32 _mm_max_epu32
   2441 
   2442 #   define _MM_MIN_EPI8  _mm_min_epi8
   2443 #   define _MM_MIN_EPI32 _mm_min_epi32
   2444 #   define _MM_MIN_EPU16 _mm_min_epu16
   2445 #   define _MM_MIN_EPU32 _mm_min_epu32
   2446 
   2447 #   define _MM_BLENDV_EPI8 _mm_blendv_epi8
   2448 #   define _MM_PACKUS_EPI32 _mm_packus_epi32
   2449 #   define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a)
   2450 
   2451 #   define _MM_MULLO_EPI32 _mm_mullo_epi32
   2452 #   define _MM_MUL_EPI32  _mm_mul_epi32
   2453 
   2454 #   define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64
   2455 #else     //no SSE4 !!!!!!
   2456     _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a)
   2457     {
   2458         __m128i zero = _mm_setzero_si128();
   2459         return _mm_unpacklo_epi8(a, zero);
   2460     }
   2461 
   2462     _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a)
   2463     {
   2464         __m128i zero = _mm_setzero_si128();
   2465         return _mm_unpacklo_epi16(a, zero);
   2466     }
   2467 
   2468     _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a)
   2469     {
   2470         __m128i zero = _mm_setzero_si128();
   2471         return _mm_unpacklo_epi32(a, zero);
   2472     }
   2473 
   2474     _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a)
   2475     {
   2476         __m128i zero = _mm_setzero_si128();
   2477         __m128i sign = _mm_cmpgt_epi8(zero, a);
   2478         return _mm_unpacklo_epi8(a, sign);
   2479     }
   2480 
   2481     _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a)
   2482     {
   2483         __m128i zero = _mm_setzero_si128();
   2484         __m128i sign = _mm_cmpgt_epi16(zero, a);
   2485         return _mm_unpacklo_epi16(a, sign);
   2486     }
   2487 
   2488     _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a)
   2489     {
   2490         __m128i zero = _mm_setzero_si128();
   2491         __m128i sign = _mm_cmpgt_epi32(zero, a);
   2492         return _mm_unpacklo_epi32(a, sign);
   2493     }
   2494 
   2495     _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE)
   2496     {
   2497         _NEON2SSE_ALIGN_16 int32_t tmp[4];
   2498         _mm_store_si128((__m128i*)tmp, vec);
   2499         return tmp[LANE];
   2500     }
   2501 
   2502     _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE)
   2503     {
   2504         _NEON2SSE_ALIGN_16 int8_t tmp[16];
   2505         _mm_store_si128((__m128i*)tmp, vec);
   2506         return (int)tmp[LANE];
   2507     }
   2508 
   2509     _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE)
   2510     {
   2511         _NEON2SSE_ALIGN_16 int32_t tmp[4];
   2512         _mm_store_si128((__m128i*)tmp, _M128i(vec));
   2513         return tmp[LANE];
   2514     }
   2515 
   2516     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI32(__m128i vec, int p, const int LANE)
   2517     {
   2518         _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0};
   2519         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
   2520         __m128i vec_masked, p_masked;
   2521         pvec[LANE] = p;
   2522         mask[LANE] = 0x0;
   2523         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
   2524         p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
   2525         return _mm_or_si128(vec_masked, p_masked);
   2526     }
   2527 
   2528     _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI8(__m128i vec, int p, const int LANE)
   2529     {
   2530         _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0};
   2531         _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
   2532         __m128i vec_masked, p_masked;
   2533         pvec[LANE] = (int8_t)p;
   2534         mask[LANE] = 0x0;
   2535         vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p
   2536         p_masked = _mm_andnot_si128  (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec
   2537         return _mm_or_si128(vec_masked, p_masked);
   2538     }
   2539 
   2540     _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE)
   2541     {
   2542         _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff};
   2543         __m128 tmp, vec_masked, p_masked;
    2544         mask[LANE >> 4] = 0x0; //LANE here is the _mm_insert_ps immediate, its bits [5:4] hold the actual destination lane
   2545         vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p
   2546         p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec
   2547         tmp = _mm_or_ps(vec_masked, p_masked);
   2548         return tmp;
   2549     }
   2550 
   2551     _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b)
   2552     {
   2553         __m128i cmp, resa, resb;
   2554         cmp = _mm_cmpgt_epi8 (a, b);
   2555         resa = _mm_and_si128 (cmp, a);
   2556         resb = _mm_andnot_si128 (cmp,b);
   2557         return _mm_or_si128(resa, resb);
   2558     }
   2559 
   2560     _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b)
   2561     {
   2562         __m128i cmp, resa, resb;
   2563         cmp = _mm_cmpgt_epi32(a, b);
   2564         resa = _mm_and_si128 (cmp, a);
   2565         resb = _mm_andnot_si128 (cmp,b);
   2566         return _mm_or_si128(resa, resb);
   2567     }
   2568 
   2569     _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b)
   2570     {
   2571         __m128i c8000, b_s, a_s, cmp;
   2572         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
   2573         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
   2574         b_s = _mm_sub_epi16 (b, c8000);
   2575         a_s = _mm_sub_epi16 (a, c8000);
   2576         cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed
   2577         a_s = _mm_and_si128 (cmp,a);
   2578         b_s = _mm_andnot_si128 (cmp,b);
   2579         return _mm_or_si128(a_s, b_s);
   2580     }
   2581 
   2582     _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b)
   2583     {
   2584         __m128i c80000000, b_s, a_s, cmp;
   2585         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
   2586         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
   2587         b_s = _mm_sub_epi32 (b, c80000000);
   2588         a_s = _mm_sub_epi32 (a, c80000000);
   2589         cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed
   2590         a_s = _mm_and_si128 (cmp,a);
   2591         b_s = _mm_andnot_si128 (cmp,b);
   2592         return _mm_or_si128(a_s, b_s);
   2593     }
   2594 
   2595     _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b)
   2596     {
   2597         __m128i cmp, resa, resb;
   2598         cmp = _mm_cmpgt_epi8 (b, a);
   2599         resa = _mm_and_si128 (cmp, a);
   2600         resb = _mm_andnot_si128 (cmp,b);
   2601         return _mm_or_si128(resa, resb);
   2602     }
   2603 
   2604     _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b)
   2605     {
   2606         __m128i cmp, resa, resb;
   2607         cmp = _mm_cmpgt_epi32(b, a);
   2608         resa = _mm_and_si128 (cmp, a);
   2609         resb = _mm_andnot_si128 (cmp,b);
   2610         return _mm_or_si128(resa, resb);
   2611     }
   2612 
   2613     _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b)
   2614     {
   2615         __m128i c8000, b_s, a_s, cmp;
   2616         c8000 = _mm_cmpeq_epi16 (a,a); //0xffff
   2617         c8000 = _mm_slli_epi16 (c8000, 15); //0x8000
   2618         b_s = _mm_sub_epi16 (b, c8000);
   2619         a_s = _mm_sub_epi16 (a, c8000);
   2620         cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed
   2621         a_s = _mm_and_si128 (cmp,a);
   2622         b_s = _mm_andnot_si128 (cmp,b);
   2623         return _mm_or_si128(a_s, b_s);
   2624     }
   2625 
   2626     _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b)
   2627     {
   2628         __m128i c80000000, b_s, a_s, cmp;
   2629         c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff
   2630         c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000
   2631         b_s = _mm_sub_epi32 (b, c80000000);
   2632         a_s = _mm_sub_epi32 (a, c80000000);
   2633         cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed
   2634         a_s = _mm_and_si128 (cmp,a);
   2635         b_s = _mm_andnot_si128 (cmp,b);
   2636         return _mm_or_si128(a_s, b_s);
   2637     }
   2638 
    2639     _NEON2SSE_INLINE __m128i  _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT an exact implementation of _mm_blendv_epi8  !!!!! - please see below
    2640     {
    2641         //it assumes each mask byte is always either 0xff or 0 (as in all the use cases below), while the original _mm_blendv_epi8 looks only at the MSB of each mask byte.
   2642         __m128i a_masked, b_masked;
   2643         b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff
   2644         a_masked = _mm_andnot_si128 (mask,a);
   2645         return _mm_or_si128(a_masked, b_masked);
   2646     }
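//A usage sketch (illustrative, not part of the header; the sel_even_bytes name is an assumption):
//masks produced by byte comparisons are always 0x00 or 0xff per byte, so they are safe for the
//helper above as well as for the real _mm_blendv_epi8:
/*
    static __m128i sel_even_bytes(__m128i a, __m128i b)
    {
        const __m128i idx = _mm_set_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
        __m128i is_even = _mm_cmpeq_epi8(_mm_and_si128(idx, _mm_set1_epi8(1)), _mm_setzero_si128()); //0xff at even byte positions
        return _MM_BLENDV_EPI8(a, b, is_even);   //b at even byte positions, a at odd ones
    }
*/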
   2647 
   2648     _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b)
   2649     {
   2650         __m128i a16, b16, res, reshi,cmp, zero;
   2651         zero = _mm_setzero_si128();
   2652         a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd);
   2653         b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd);
   2654         res = _mm_unpacklo_epi64(a16, b16); //result without saturation
   2655         reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation
    2656         cmp = _mm_cmpgt_epi16(zero, reshi); //if reshi<0 the result should be zero
    2657         res = _mm_andnot_si128(cmp,res); //if cmp is zero - do nothing, otherwise reshi<0 and the result is forced to 0
    2658         cmp = _mm_cmpgt_epi16(reshi,zero); //if reshi is positive
    2659         return _mm_or_si128(res, cmp); //if reshi is positive we are out of 16 bits and need to saturate to 0xffff
   2660     }
   2661 
   2662     _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a)
   2663     {
   2664         __m128i a16, res, reshi,cmp, zero;
   2665         zero = _mm_setzero_si128();
   2666         a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd);
   2667         reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation
    2668         cmp = _mm_cmpgt_epi16(zero, reshi); //if reshi<0 the result should be zero
    2669         res = _mm_andnot_si128(cmp, a16); //if cmp is zero - do nothing, otherwise reshi<0 and the result is forced to 0
    2670         cmp = _mm_cmpgt_epi16(reshi,zero); //if reshi is positive
    2671         return _mm_or_si128(res, cmp); //if reshi is positive we are out of 16 bits and need to saturate to 0xffff
   2672     }
   2673 
   2674 
   2675     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL)
   2676     {
   2677         _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4];
   2678         int64_t res64;
   2679         int i;
   2680         _mm_store_si128((__m128i*)atmp, a);
   2681         _mm_store_si128((__m128i*)btmp, b);
   2682         for (i = 0; i<4; i++) {
    2683             res64 = (int64_t)atmp[i] * btmp[i]; //widen first to avoid 32-bit overflow
   2684             res[i] = (int)(res64 & 0xffffffff);
   2685         }
   2686         return _mm_load_si128((__m128i*)res);
   2687     }
   2688 
   2689     _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b)
   2690     {
   2691         __m128i sign, zero,  mul_us, a_neg, b_neg, mul_us_neg;
   2692         sign = _mm_xor_si128 (a, b);
    2693         sign =  _mm_srai_epi32 (sign, 31); //broadcast the sign bit to the whole lane: all ones if the product will be negative, all zeros otherwise
    2694         sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //replicate the signs of lanes 0 and 2 over their 64-bit result fields
    2695         zero = _mm_setzero_si128();
    2696         a_neg = _mm_abs_epi32 (a); //take the absolute values of a and b
    2697         b_neg = _mm_abs_epi32 (b); //take the absolute values of a and b
    2698         mul_us = _mm_mul_epu32 (a_neg, b_neg); //unsigned multiply of lanes 0 and 2 of the absolute values, each product is a 64 bit result
   2699         mul_us_neg = _mm_sub_epi64(zero, mul_us);
   2700         mul_us_neg = _mm_and_si128(sign, mul_us_neg);
   2701         mul_us = _mm_andnot_si128(sign, mul_us);
   2702         return _mm_or_si128 (mul_us, mul_us_neg);
   2703     }
   2704 
   2705     _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b)
   2706     {
   2707         __m128i res;
   2708         res = _mm_cmpeq_epi32 (a, b);
   2709         return _mm_and_si128 (res, _mm_shuffle_epi32 (res, 1 | (0 << 2) | (3 << 4) | (2 << 6))); //a 64-bit lane is equal only if both of its 32-bit halves compare equal
   2710     }
   2711 #endif     //SSE4
   2712 
   2713 //the special case of functions working only for 32 bits, no SSE4
   2714 _NEON2SSE_INLINE __m128i  _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE)
   2715 {
   2716     _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0};
   2717     _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff};
   2718     __m128i vec_masked, p_masked;
   2719     pvec[LANE] = p;
   2720     mask[LANE] = 0x0;
   2721     vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //zero the target lane of vec
   2722     p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //keep p only in the target lane
   2723     return _mm_or_si128(vec_masked, p_masked);
   2724 }
   2725 
   2726 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE)
   2727 {
   2728     _NEON2SSE_ALIGN_16 int64_t tmp[2];
   2729     _mm_store_si128((__m128i*)tmp, val);
   2730     return tmp[LANE];
   2731 }
   2732 
   2733 #ifndef _NEON2SSE_64BIT_SSE4
   2734 #   define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32
   2735 #   define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32
   2736 #endif
   2737 
   2738 _NEON2SSESTORAGE int32x4_t  vqd_s32(int32x4_t a); //Doubling saturation for signed ints
   2739 _NEON2SSE_INLINE int32x4_t  vqd_s32(int32x4_t a)
   2740 {
   2741     //overflow happens only if a and the doubled result have opposite signs
   2742     __m128i c7fffffff, res, res_sat, res_xor_a;
   2743     c7fffffff = _mm_set1_epi32(0x7fffffff);
   2744     res = _mm_slli_epi32 (a, 1); // res = a*2
   2745     res_sat = _mm_srli_epi32(a, 31);
   2746     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   2747     res_xor_a = _mm_xor_si128(res, a);
   2748     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if the sign changed (overflow), all zeros otherwise
   2749     res_sat = _mm_and_si128(res_xor_a, res_sat);
   2750     res = _mm_andnot_si128(res_xor_a, res);
   2751     return _mm_or_si128(res, res_sat);
   2752 }
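        //Scalar sketch of the same doubling saturation for one lane (illustration only, assuming the stdint.h limit macros):
        //    int64_t d = 2 * (int64_t)x;
        //    int32_t r = (d > INT32_MAX) ? INT32_MAX : (d < INT32_MIN) ? INT32_MIN : (int32_t)d;
        //the vector code builds the saturated value as 0x7fffffff + (sign bit of a), i.e. 0x7fffffff for
        //non-negative a and 0x80000000 for negative a, and selects it only where doubling changed the sign.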
   2753 
   2754 
   2755 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   2756 //*************************************************************************
   2757 //*************************************************************************
   2758 //*****************  Functions redefinition/implementation starts here *****
   2759 //*************************************************************************
   2760 //*************************************************************************
   2761 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
   2762 
   2763 /*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here, as in the following sample:
   2764 #ifdef ARM
   2765 #define vector_addq_s32 _mm_add_epi32
   2766 #else //if we have IA
   2767 #define vector_addq_s32 vadd_s32
   2768 #endif
   2769 
   2770 ********************************************************************************************
   2771 Functions below are organised in the following way:
   2772 
   2773 Each NEON intrinsic function below is handled in one of the following ways:
   2774 1.  it has a fully equivalent x86 SSE intrinsic - in this case the x86 version simply follows the NEON one via the corresponding #define statement
   2775 2.  it is implemented with more than one x86 intrinsic - in this case it is shaped as an inlined C function with a return statement
   2776 3.  it refers to another NEON function that returns the same result and is implemented in x86 as above - in this case it is shaped as a matching NEON function definition
   2777 4.  for about 5% of the functions, because the corresponding x86 SIMD support is unavailable or inefficient in terms of performance,
   2778 a serial implementation is provided along with the corresponding compiler warning. If such functions are on your application's critical path,
   2779 please consider removing them from your code.
   2780 */
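        /* A minimal usage sketch (illustration only; the hypothetical helper name and the include name depend on your project):
            #include "NEON_2_SSE.h"   //instead of <arm_neon.h>
            void add_four_ints(const int32_t* x, const int32_t* y, int32_t* out)
            {
                int32x4_t va = vld1q_s32(x);
                int32x4_t vb = vld1q_s32(y);
                vst1q_s32(out, vaddq_s32(va, vb)); //vaddq_s32 maps to _mm_add_epi32 below
            }
        */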
   2781 
   2782 //***********************************************************************
   2783 //************************      Vector add   *****************************
   2784 //***********************************************************************
   2785 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
   2786 _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b)
   2787 {
   2788     int8x8_t res64;
   2789     return64(_mm_add_epi8(_pM128i(a),_pM128i(b)));
   2790 }
   2791 
   2792 
   2793 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
   2794 _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b)
   2795 {
   2796     int16x4_t res64;
   2797     return64(_mm_add_epi16(_pM128i(a),_pM128i(b)));
   2798 }
   2799 
   2800 
   2801 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
   2802 _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b)
   2803 {
   2804     int32x2_t res64;
   2805     return64(_mm_add_epi32(_pM128i(a),_pM128i(b)));
   2806 }
   2807 
   2808 
   2809 _NEON2SSESTORAGE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b); // VADD.I64 d0,d0,d0
   2810 _NEON2SSE_INLINE int64x1_t  vadd_s64(int64x1_t a,  int64x1_t b)
   2811 {
   2812     int64x1_t res64;
   2813     res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0];
   2814     return res64;
   2815 }
   2816 
   2817 
   2818 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
   2819 _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b)
   2820 {
   2821     __m128 res;
   2822     __m64_128 res64;
   2823     res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits
   2824     _M64f(res64, res);
   2825     return res64;
   2826 }
   2827 
   2828 _NEON2SSESTORAGE uint8x8_t  vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
   2829 #define vadd_u8 vadd_s8
   2830 
   2831 _NEON2SSESTORAGE uint16x4_t  vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
   2832 #define vadd_u16 vadd_s16
   2833 
   2834 _NEON2SSESTORAGE uint32x2_t  vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
   2835 #define vadd_u32 vadd_s32
   2836 
   2837 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b); // VADD.I64 d0,d0,d0
   2838 _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a,  uint64x1_t b)
   2839 {
   2840     uint64x1_t res64;
   2841     res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0];
   2842     return res64;
   2843 }
   2844 
   2845 
   2846 _NEON2SSESTORAGE int8x16_t   vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
   2847 #define vaddq_s8 _mm_add_epi8
   2848 
   2849 _NEON2SSESTORAGE int16x8_t   vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
   2850 #define vaddq_s16 _mm_add_epi16
   2851 
   2852 _NEON2SSESTORAGE int32x4_t   vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
   2853 #define vaddq_s32 _mm_add_epi32
   2854 
   2855 _NEON2SSESTORAGE int64x2_t   vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
   2856 #define vaddq_s64 _mm_add_epi64
   2857 
   2858 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
   2859 #define vaddq_f32 _mm_add_ps
   2860 
   2861 _NEON2SSESTORAGE uint8x16_t   vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
   2862 #define vaddq_u8 _mm_add_epi8
   2863 
   2864 _NEON2SSESTORAGE uint16x8_t   vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
   2865 #define vaddq_u16 _mm_add_epi16
   2866 
   2867 _NEON2SSESTORAGE uint32x4_t   vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
   2868 #define vaddq_u32 _mm_add_epi32
   2869 
   2870 _NEON2SSESTORAGE uint64x2_t   vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
   2871 #define vaddq_u64 _mm_add_epi64
   2872 
   2873 //**************************** Vector long add *****************************:
   2874 //***********************************************************************
   2875 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
   2876 _NEON2SSESTORAGE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
   2877 _NEON2SSE_INLINE int16x8_t  vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0
   2878 {
   2879     __m128i a16, b16;
   2880     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
   2881     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   2882     return _mm_add_epi16 (a16, b16);
   2883 }
   2884 
   2885 _NEON2SSESTORAGE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
   2886 _NEON2SSE_INLINE int32x4_t  vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0
   2887 {
   2888     __m128i a32, b32;
   2889     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
   2890     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1
   2891     return _mm_add_epi32 (a32, b32);
   2892 }
   2893 
   2894 _NEON2SSESTORAGE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
   2895 _NEON2SSE_INLINE int64x2_t  vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0
   2896 {
   2897     //may not be optimal
   2898     __m128i a64, b64;
   2899     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
   2900     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
   2901     return _mm_add_epi64 ( a64, b64);
   2902 }
   2903 
   2904 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
   2905 _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0
   2906 {
   2907     __m128i a16, b16;
   2908     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1
   2909     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
   2910     return _mm_add_epi16 (a16, b16);
   2911 }
   2912 
   2913 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
   2914 _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.U16 q0,d0,d0
   2915 {
   2916     __m128i a32, b32;
   2917     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
   2918     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
   2919     return _mm_add_epi32 (a32, b32);
   2920 }
   2921 
   2922 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
   2923 _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0
   2924 {
   2925     //may not be optimal
   2926     __m128i a64, b64;
   2927     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
   2928     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
   2929     return _mm_add_epi64 (a64, b64);
   2930 }
   2931 
   2932 //***************   Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ******************
   2933 //*************** *********************************************************************
   2934 _NEON2SSESTORAGE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
   2935 _NEON2SSE_INLINE int16x8_t  vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0
   2936 {
   2937     __m128i b16;
   2938     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   2939     return _mm_add_epi16 (a, b16);
   2940 }
   2941 
   2942 _NEON2SSESTORAGE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
   2943 _NEON2SSE_INLINE int32x4_t  vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0
   2944 {
   2945     __m128i b32;
   2946     b32 =  _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1,
   2947     return _mm_add_epi32 (a, b32);
   2948 }
   2949 
   2950 _NEON2SSESTORAGE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
   2951 _NEON2SSE_INLINE int64x2_t  vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0
   2952 {
   2953     __m128i b64;
   2954     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
   2955     return _mm_add_epi64 (a, b64);
   2956 }
   2957 
   2958 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
   2959 _NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0
   2960 {
   2961     __m128i b16;
   2962     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1
   2963     return _mm_add_epi16 (a, b16);
   2964 }
   2965 
   2966 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
   2967 _NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.U16 q0,q0,d0
   2968 {
   2969     __m128i b32;
   2970     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1
   2971     return _mm_add_epi32 (a, b32);
   2972 }
   2973 
   2974 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
   2975 _NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0
   2976 {
   2977     __m128i b64;
   2978     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
   2979     return _mm_add_epi64 (a, b64);
   2980 }
   2981 
   2982 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 ,  result truncated *******************************
   2983 //*************************************************************************************************************************
   2984 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b); // VHADD.S8 d0,d0,d0
   2985 _NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a,  int8x8_t b)
   2986 {
   2987     int8x8_t res64;
   2988     return64(vhaddq_s8(_pM128i(a), _pM128i(b)));
   2989 }
   2990 
   2991 
   2992 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b); // VHADD.S16 d0,d0,d0
   2993 _NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a,  int16x4_t b)
   2994 {
   2995     int16x4_t res64;
   2996     return64( vhaddq_s16(_pM128i(a), _pM128i(b)));
   2997 }
   2998 
   2999 
   3000 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b); // VHADD.S32 d0,d0,d0
   3001 _NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a,  int32x2_t b)
   3002 {
   3003     int32x2_t res64;
   3004     return64( vhaddq_s32(_pM128i(a), _pM128i(b)));
   3005 }
   3006 
   3007 
   3008 _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b); // VHADD.U8 d0,d0,d0
   3009 _NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a,  uint8x8_t b)
   3010 {
   3011     uint8x8_t res64;
   3012     return64( vhaddq_u8(_pM128i(a), _pM128i(b)));
   3013 }
   3014 
   3015 
   3016 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b); // VHADD.U16 d0,d0,d0
   3017 _NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a,  uint16x4_t b)
   3018 {
   3019     uint16x4_t res64;
   3020     return64( vhaddq_u16(_pM128i(a), _pM128i(b)));
   3021 }
   3022 
   3023 
   3024 _NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b); // VHADD.U32 d0,d0,d0
   3025 _NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a,  uint32x2_t b)
   3026 {
   3027     uint32x2_t res64;
   3028     return64( vhaddq_u32(_pM128i(a), _pM128i(b)));
   3029 }
   3030 
   3031 
   3032 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
   3033 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b)
   3034 {
   3035     //need to avoid internal overflow, so use the (x&y)+((x^y)>>1) identity
   3036     __m128i tmp1, tmp2;
   3037     tmp1 = _mm_and_si128(a,b);
   3038     tmp2 = _mm_xor_si128(a,b);
   3039     tmp2 = vshrq_n_s8(tmp2,1);
   3040     return _mm_add_epi8(tmp1,tmp2);
   3041 }
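        //Worked check of the (x & y) + ((x ^ y) >> 1) identity used above (illustration only): for x = 7, y = 4
        //x & y = 4, x ^ y = 3, and 4 + (3 >> 1) = 5 = (7 + 4) >> 1, while no intermediate value can overflow
        //the element range the way the naive (x + y) >> 1 could.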
   3042 
   3043 _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
   3044 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b)
   3045 {
   3046     //need to avoid internal overflow, so use the (x&y)+((x^y)>>1) identity
   3047     __m128i tmp1, tmp2;
   3048     tmp1 = _mm_and_si128(a,b);
   3049     tmp2 = _mm_xor_si128(a,b);
   3050     tmp2 = _mm_srai_epi16(tmp2,1);
   3051     return _mm_add_epi16(tmp1,tmp2);
   3052 }
   3053 
   3054 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
   3055 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0
   3056 {
   3057     //need to avoid internal overflow, so use the (x&y)+((x^y)>>1) identity
   3058     __m128i tmp1, tmp2;
   3059     tmp1 = _mm_and_si128(a,b);
   3060     tmp2 = _mm_xor_si128(a,b);
   3061     tmp2 = _mm_srai_epi32(tmp2,1);
   3062     return _mm_add_epi32(tmp1,tmp2);
   3063 }
   3064 
   3065 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
   3066 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0
   3067 {
   3068     __m128i c1, sum, res;
   3069     c1 = _mm_set1_epi8(1);
   3070     sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it
   3071     res = _mm_xor_si128(a, b); //for rounding compensation
   3072     res = _mm_and_si128(res,c1); //for rounding compensation
   3073     return _mm_sub_epi8 (sum, res); //actual rounding compensation
   3074 }
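        //Worked check of the rounding compensation above (illustration only): for a = 5, b = 2
        //_mm_avg_epu8 gives (5 + 2 + 1) >> 1 = 4 and (a ^ b) & 1 = 1, so 4 - 1 = 3, the truncated (5 + 2) >> 1;
        //when a and b have equal low bits the correction is 0 and the rounded average is already exact.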
   3075 
   3076 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
   3077 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.U16 q0,q0,q0
   3078 {
   3079     __m128i sum, res;
   3080     sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it
   3081     res = _mm_xor_si128(a, b); //for rounding compensation
   3082     res = _mm_slli_epi16 (res,15); //shift left  then back right to
   3083     res = _mm_srli_epi16 (res,15); //get 1 or zero
   3084     return _mm_sub_epi16 (sum, res); //actual rounding compensation
   3085 }
   3086 
   3087 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
   3088 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0
   3089 {
   3090     //need to avoid internal overflow, so use the (x&y)+((x^y)>>1) identity
   3091     __m128i tmp1, tmp2;
   3092     tmp1 = _mm_and_si128(a,b);
   3093     tmp2 = _mm_xor_si128(a,b);
   3094     tmp2 = _mm_srli_epi32(tmp2,1);
   3095     return _mm_add_epi32(tmp1,tmp2);
   3096 }
   3097 
   3098 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1   ***************************
   3099 //*****************************************************************************************************************************
   3100 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b); // VRHADD.S8 d0,d0,d0
   3101 _NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a,  int8x8_t b)
   3102 {
   3103     int8x8_t res64;
   3104     return64(vrhaddq_s8(_pM128i(a), _pM128i(b)));
   3105 }
   3106 
   3107 
   3108 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b); // VRHADD.S16 d0,d0,d0
   3109 _NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a,  int16x4_t b)
   3110 {
   3111     int16x4_t res64;
   3112     return64(vrhaddq_s16(_pM128i(a), _pM128i(b)));
   3113 }
   3114 
   3115 
   3116 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b); // VRHADD.S32 d0,d0,d0
   3117 _NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a,  int32x2_t b)
   3118 {
   3119     int32x2_t res64;
   3120     return64(vrhaddq_s32(_pM128i(a), _pM128i(b)));
   3121 }
   3122 
   3123 
   3124 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
   3125 _NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b)
   3126 {
   3127     uint8x8_t res64;
   3128     return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
   3129 }
   3130 
   3131 
   3132 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
   3133 _NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b)
   3134 {
   3135     uint16x4_t res64;
   3136     return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!!
   3137 }
   3138 
   3139 
   3140 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b); // VRHADD.U32 d0,d0,d0
   3141 _NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a,  uint32x2_t b)
   3142 {
   3143     uint32x2_t res64;
   3144     return64(vrhaddq_u32(_pM128i(a), _pM128i(b)));
   3145 }
   3146 
   3147 
   3148 _NEON2SSESTORAGE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
   3149 _NEON2SSE_INLINE int8x16_t  vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0
   3150 {
   3151     //no signed average in x86 SIMD, go to unsigned
   3152     __m128i c128, au, bu, sum;
   3153     c128 = _mm_set1_epi8((int8_t)0x80); //-128
   3154     au = _mm_sub_epi8(a, c128); //add 128
   3155     bu = _mm_sub_epi8(b, c128); //add 128
   3156     sum = _mm_avg_epu8(au, bu);
   3157     return _mm_add_epi8 (sum, c128); //sub 128
   3158 }
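        //Sketch of the bias trick above for one lane (illustration only): for a = -3, b = 2 the biased values are
        //au = -3 + 128 = 125 and bu = 2 + 128 = 130, _mm_avg_epu8 gives (125 + 130 + 1) >> 1 = 128, and removing
        //the bias yields 128 - 128 = 0, which matches the NEON rounded halving add (-3 + 2 + 1) >> 1 = 0.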
   3159 
   3160 _NEON2SSESTORAGE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
   3161 _NEON2SSE_INLINE int16x8_t  vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0
   3162 {
   3163     //no signed average in x86 SIMD, go to unsigned
   3164     __m128i cx8000, au, bu, sum;
   3165     cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768
   3166     au = _mm_sub_epi16(a, cx8000); //add 32768
   3167     bu = _mm_sub_epi16(b, cx8000); //add 32768
   3168     sum = _mm_avg_epu16(au, bu);
   3169     return _mm_add_epi16 (sum, cx8000); //sub 32768
   3170 }
   3171 
   3172 _NEON2SSESTORAGE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
   3173 _NEON2SSE_INLINE int32x4_t  vrhaddq_s32(int32x4_t a, int32x4_t b)
   3174 {
   3175     //need to avoid overflow
   3176     __m128i a2, b2, res, sum;
   3177     a2 = _mm_srai_epi32(a,1); //a2=a/2;
   3178     b2 = _mm_srai_epi32(b,1); // b2=b/2;
   3179     res = _mm_or_si128(a,b); //for rounding
   3180     res = _mm_slli_epi32 (res,31); //shift left  then back right to
   3181     res = _mm_srli_epi32 (res,31); //get 1 or zero
   3182     sum = _mm_add_epi32(a2,b2);
   3183     return _mm_add_epi32(sum,res);
   3184 }
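        //Worked check of the (a >> 1) + (b >> 1) + ((a | b) & 1) rounding form above (illustration only):
        //for a = 5, b = 2: (5 >> 1) + (2 >> 1) + ((5 | 2) & 1) = 2 + 1 + 1 = 4 = (5 + 2 + 1) >> 1,
        //and the 32-bit intermediate sum can no longer overflow.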
   3185 
   3186 _NEON2SSESTORAGE uint8x16_t   vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
   3187 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded
   3188 
   3189 _NEON2SSESTORAGE uint16x8_t   vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
   3190 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded
   3191 
   3192 
   3193 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
   3194 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0
   3195 {
   3196     //need to avoid overflow
   3197     __m128i a2, b2, res, sum;
   3198     a2 = _mm_srli_epi32(a,1); //a2=a/2;
   3199     b2 = _mm_srli_epi32(b,1); // b2=b/2;
   3200     res = _mm_or_si128(a,b); //for rounding
   3201     res = _mm_slli_epi32 (res,31); //shift left  then back right to
   3202     res = _mm_srli_epi32 (res,31); //get 1 or zero
   3203     sum = _mm_add_epi32(a2,b2);
   3204     return _mm_add_epi32(sum,res);
   3205 }
   3206 
   3207 //****************** VQADD: Vector saturating add ************************
   3208 //************************************************************************
   3209 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
   3210 _NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b)
   3211 {
   3212     int8x8_t res64;
   3213     return64(_mm_adds_epi8(_pM128i(a),_pM128i(b)));
   3214 }
   3215 
   3216 
   3217 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
   3218 _NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b)
   3219 {
   3220     int16x4_t res64;
   3221     return64(_mm_adds_epi16(_pM128i(a),_pM128i(b)));
   3222 }
   3223 
   3224 
   3225 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b); // VQADD.S32 d0,d0,d0
   3226 _NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a,  int32x2_t b)
   3227 {
   3228     int32x2_t res64;
   3229     return64(vqaddq_s32(_pM128i(a), _pM128i(b)));
   3230 }
   3231 
   3232 
   3233 _NEON2SSESTORAGE int64x1_t  vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
   3234 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3235 {
   3236     int64x1_t res;
   3237     uint64_t a64, b64;
   3238     a64 = a.m64_u64[0];
   3239     b64 = b.m64_u64[0];
   3240     res.m64_u64[0] = a64 + b64;
   3241     a64 = (a64 >> 63) + (~_SIGNBIT64);
   3242     if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) {
   3243         res.m64_u64[0] = a64;
   3244     }
   3245     return res;
   3246 }
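        //Sketch of the saturation trick above (illustration only): (a64 >> 63) + (~_SIGNBIT64) is
        //0x7fffffffffffffff when a is non-negative and 0x8000000000000000 when a is negative, i.e. the value the
        //sum must saturate to; since that constant keeps the sign bit of the original a, the test
        //(b ^ a) | ~(res ^ b) is non-negative exactly when a and b share a sign that the wrapped sum does not.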
   3247 
   3248 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
   3249 _NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b)
   3250 {
   3251     uint8x8_t res64;
   3252     return64(_mm_adds_epu8(_pM128i(a),_pM128i(b)));
   3253 }
   3254 
   3255 
   3256 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
   3257 _NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b)
   3258 {
   3259     uint16x4_t res64;
   3260     return64(_mm_adds_epu16(_pM128i(a),_pM128i(b)));
   3261 }
   3262 
   3263 
   3264 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b); // VQADD.U32 d0,d0,d0
   3265 _NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a,  uint32x2_t b)
   3266 {
   3267     uint32x2_t res64;
   3268     return64(vqaddq_u32(_pM128i(a), _pM128i(b)));
   3269 }
   3270 
   3271 
   3272 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
   3273 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3274 {
   3275     _NEON2SSE_ALIGN_16 uint64_t a64, b64;
   3276     uint64x1_t res;
   3277     a64 = a.m64_u64[0];
   3278     b64 = b.m64_u64[0];
   3279     res.m64_u64[0] = a64 + b64;
   3280     if (res.m64_u64[0] < a64) {
   3281         res.m64_u64[0] = ~(uint64_t)0;
   3282     }
   3283     return res;
   3284 }
   3285 
   3286 _NEON2SSESTORAGE int8x16_t   vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
   3287 #define vqaddq_s8 _mm_adds_epi8
   3288 
   3289 _NEON2SSESTORAGE int16x8_t   vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
   3290 #define vqaddq_s16 _mm_adds_epi16
   3291 
   3292 _NEON2SSESTORAGE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
   3293 _NEON2SSE_INLINE int32x4_t  vqaddq_s32(int32x4_t a, int32x4_t b)
   3294 {
   3295     //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
   3296     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
   3297     c7fffffff = _mm_set1_epi32(0x7fffffff);
   3298     res = _mm_add_epi32(a, b);
   3299     res_sat = _mm_srli_epi32(a, 31);
   3300     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   3301     res_xor_a = _mm_xor_si128(res, a);
   3302     b_xor_a_ = _mm_xor_si128(b, a);
   3303     res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
   3304     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if overflow occurred, all zeros otherwise
   3305     res_sat = _mm_and_si128(res_xor_a, res_sat);
   3306     res = _mm_andnot_si128(res_xor_a, res);
   3307     return _mm_or_si128(res, res_sat);
   3308 }
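        //Scalar reference for one lane of the saturating add above (illustration only, assuming the stdint.h limit macros):
        //    int64_t s = (int64_t)x + y;
        //    int32_t r = (s > INT32_MAX) ? INT32_MAX : (s < INT32_MIN) ? INT32_MIN : (int32_t)s;
        //the vector code instead detects "x and y share a sign but the wrapped sum does not" and, where that
        //holds, substitutes 0x7fffffff for non-negative x and 0x80000000 for negative x.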
   3309 
   3310 _NEON2SSESTORAGE int64x2_t  vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
   3311 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3312 {
   3313     _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   3314     _mm_store_si128((__m128i*)atmp, a);
   3315     _mm_store_si128((__m128i*)btmp, b);
   3316     res[0] = atmp[0] + btmp[0];
   3317     res[1] = atmp[1] + btmp[1];
   3318 
   3319     atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
   3320     atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);
   3321 
   3322     if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
   3323         res[0] = atmp[0];
   3324     }
   3325     if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
   3326         res[1] = atmp[1];
   3327     }
   3328     return _mm_load_si128((__m128i*)res);
   3329 }
   3330 
   3331 _NEON2SSESTORAGE uint8x16_t   vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
   3332 #define vqaddq_u8 _mm_adds_epu8
   3333 
   3334 _NEON2SSESTORAGE uint16x8_t   vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
   3335 #define vqaddq_u16 _mm_adds_epu16
   3336 
   3337 _NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
   3338 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
   3339 {
   3340     __m128i c80000000, cmp, subsum, suba, sum;
   3341     c80000000 = _mm_set1_epi32 (0x80000000);
   3342     sum = _mm_add_epi32 (a, b);
   3343     subsum = _mm_sub_epi32 (sum, c80000000);
   3344     suba = _mm_sub_epi32 (a, c80000000);
   3345     cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
   3346     return _mm_or_si128 (sum, cmp); //saturation
   3347 }
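        //Sketch of the signed-compare trick above (illustration only): subtracting 0x80000000 from both values
        //maps unsigned order onto signed order, so (a - 0x80000000) > (sum - 0x80000000) as signed exactly when
        //a > sum as unsigned, i.e. when the addition wrapped; the resulting all-ones mask OR-ed into the sum
        //produces the saturated 0xffffffff.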
   3348 
   3349 _NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
   3350 #ifdef USE_SSE4
   3351     _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
   3352     {
   3353         __m128i c80000000, sum, cmp, suba, subsum;
   3354         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
   3355         sum = _mm_add_epi64 (a, b);
   3356         subsum = _mm_sub_epi64 (sum, c80000000);
   3357         suba = _mm_sub_epi64 (a, c80000000);
   3358         cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
   3359         return _mm_or_si128 (sum, cmp); //saturation
   3360     }
   3361 #else
   3362     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3363     {
   3364         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   3365         _mm_store_si128((__m128i*)atmp, a);
   3366         _mm_store_si128((__m128i*)btmp, b);
   3367         res[0] = atmp[0] + btmp[0];
   3368         res[1] = atmp[1] + btmp[1];
   3369         if (res[0] < atmp[0]) res[0] = ~(uint64_t)0;
   3370         if (res[1] < atmp[1]) res[1] = ~(uint64_t)0;
   3371         return _mm_load_si128((__m128i*)(res));
   3372     }
   3373 #endif
   3374 
   3375 
   3376 //******************* Vector add high half (truncated)  ******************
   3377 //************************************************************************
   3378 _NEON2SSESTORAGE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
   3379 _NEON2SSE_INLINE int8x8_t   vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0
   3380 {
   3381     int8x8_t res64;
   3382     __m128i sum;
   3383     sum = _mm_add_epi16 (a, b);
   3384     sum = _mm_srai_epi16 (sum, 8);
   3385     sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only
   3386     return64(sum);
   3387 }
   3388 
   3389 _NEON2SSESTORAGE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
   3390 _NEON2SSE_INLINE int16x4_t  vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0
   3391 {
   3392     int16x4_t res64;
   3393     __m128i sum;
   3394     sum = _mm_add_epi32 (a, b);
   3395     sum = _mm_srai_epi32(sum, 16);
   3396     sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only
   3397     return64(sum);
   3398 }
   3399 
   3400 _NEON2SSESTORAGE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
   3401 _NEON2SSE_INLINE int32x2_t  vaddhn_s64(int64x2_t a, int64x2_t b)
   3402 {
   3403     int32x2_t res64;
   3404     __m128i sum;
   3405     sum = _mm_add_epi64 (a, b);
   3406     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (0 << 4) | (2 << 6));
   3407     return64(sum);
   3408 }
   3409 
   3410 _NEON2SSESTORAGE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
   3411 _NEON2SSE_INLINE uint8x8_t  vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0
   3412 {
   3413     uint8x8_t res64;
   3414     __m128i sum;
   3415     sum = _mm_add_epi16 (a, b);
   3416     sum = _mm_srli_epi16 (sum, 8);
   3417     sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only
   3418     return64(sum);
   3419 }
   3420 
   3421 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
   3422 _NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0
   3423 {
   3424     uint16x4_t res64;
   3425      __m128i sum;
   3426     sum = _mm_add_epi32 (a, b);
   3427     sum = _mm_srli_epi32 (sum, 16);
   3428 #ifdef USE_SSE4
   3429     sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only
   3430 #else
   3431     sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits
   3432 #endif
   3433     return64(sum);
   3434 }
   3435 
   3436 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
   3437 #define vaddhn_u64 vaddhn_s64
   3438 
   3439 //*********** Vector rounding add high half: vraddhn_<type> ******************.
   3440 //***************************************************************************
   3441 _NEON2SSESTORAGE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
   3442 _NEON2SSE_INLINE int8x8_t   vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0
   3443 {
   3444     int8x8_t res64;
   3445     __m128i sum, mask1;
   3446     sum = _mm_add_epi16 (a, b);
   3447     mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
   3448     mask1 = _mm_srli_epi16(mask1, 15); //get bit 7 (the rounding bit) as 1 or zero
   3449     sum = _mm_srai_epi16 (sum, 8); //get high half
   3450     sum = _mm_add_epi16 (sum, mask1); //actual rounding
   3451     sum = _mm_packs_epi16 (sum, sum);
   3452     return64(sum);
   3453 }
   3454 
   3455 _NEON2SSESTORAGE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
   3456 _NEON2SSE_INLINE int16x4_t  vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0
   3457 {
   3458     //SIMD may not be optimal, serial may be faster
   3459     int16x4_t res64;
   3460     __m128i sum, mask1;
   3461     sum = _mm_add_epi32 (a, b);
   3462     mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
   3463     mask1 = _mm_srli_epi32(mask1,31); //get bit 15 (the rounding bit) as 1 or zero
   3464     sum = _mm_srai_epi32 (sum, 16); //get high half
   3465     sum = _mm_add_epi32 (sum, mask1); //actual rounding
   3466     sum = _mm_packs_epi32 (sum, sum);
   3467     return64(sum);
   3468 }
   3469 
   3470 _NEON2SSESTORAGE int32x2_t  vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
   3471 _NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b)
   3472 {
   3473     //SIMD may not be optimal, serial may be faster
   3474     int32x2_t res64;
   3475     __m128i sum, mask1;
   3476     sum = _mm_add_epi64 (a, b);
   3477     mask1 = _mm_set_epi32(0x0, 0x80000000, 0x0, 0x80000000); //rounding constant 1 << 31 in each 64-bit lane
   3478     //adding it carries into the high 32 bits exactly when bit 31 of the sum is set
   3479     sum = _mm_add_epi64 (sum, mask1); //actual high half rounding
   3480     sum = _mm_shuffle_epi32(sum,  1 | (3 << 2) | (1 << 4) | (3 << 6));
   3481     return64(sum);
   3482 }
   3483 
   3484 _NEON2SSESTORAGE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
   3485 _NEON2SSE_INLINE uint8x8_t  vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0
   3486 {
   3487     uint8x8_t res64;
   3488     __m128i sum, mask1;
   3489     sum = _mm_add_epi16 (a, b);
   3490     mask1 = _mm_slli_epi16(sum, 8); //shift left then back right to
   3491     mask1 = _mm_srli_epi16(mask1, 15); //get bit 7 (the rounding bit) as 1 or zero
   3492     sum = _mm_srai_epi16 (sum, 8); //get high half
   3493     sum = _mm_add_epi16 (sum, mask1); //actual rounding
   3494     sum = _mm_packus_epi16 (sum, sum);
   3495     return64(sum);
   3496 }
   3497 
   3498 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
   3499 _NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b)
   3500 {
   3501     //SIMD may not be optimal, serial may be faster
   3502     uint16x4_t res64;
   3503     __m128i sum, mask1;
   3504     sum = _mm_add_epi32 (a, b);
   3505     mask1 = _mm_slli_epi32(sum, 16); //shift left then back right to
   3506     mask1 = _mm_srli_epi32(mask1,31); //get bit 15 (the rounding bit) as 1 or zero
   3507     sum = _mm_srai_epi32 (sum, 16); //get high half
   3508     sum = _mm_add_epi32 (sum, mask1); //actual rounding
   3509     sum = _MM_PACKUS1_EPI32 (sum);
   3510     return64(sum);
   3511 }
   3512 
   3513 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
   3514 #define vraddhn_u64 vraddhn_s64
   3515 
   3516 //**********************************************************************************
   3517 //*********             Multiplication            *************************************
   3518 //**************************************************************************************
   3519 
   3520 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
   3521 //As we do not widen the result, these functions are equivalent to "multiply low" in x86
   3522 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
   3523 _NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0
   3524 {
   3525     // no 8 bit simd multiply, need to go to 16 bits in SSE
   3526     int8x8_t res64;
   3527     __m128i a128, b128, res;
   3528     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
   3529     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
   3530     res = _mm_mullo_epi16 (a128, b128);
   3531     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only
   3532     return64(res);
   3533 }
   3534 
   3535 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a,  int16x4_t b); // VMUL.I16 d0,d0,d0
   3536 #define vmul_s16 vmul_u16
   3537 
   3538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a,  int32x2_t b); // VMUL.I32 d0,d0,d0
   3539 #define vmul_s32 vmul_u32
   3540 
   3541 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
   3542 _NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b)
   3543 {
   3544     float32x4_t tmp;
   3545     __m64_128 res64;
   3546     tmp =  _mm_mul_ps(_pM128(a),_pM128(b));
   3547     _M64f(res64, tmp); //use low 64 bits
   3548     return res64;
   3549 }
   3550 
   3551 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
   3552 _NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0
   3553 {
   3554     // no 8 bit simd multiply, need to go to 16 bits in SSE
   3555     uint8x8_t res64;
   3556     __m128i mask, a128, b128, res;
   3557     mask = _mm_set1_epi16(0xff);
   3558     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a));
   3559     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b));
   3560     res = _mm_mullo_epi16 (a128, b128);
   3561     res = _mm_and_si128(res, mask); //to avoid saturation
   3562     res = _mm_packus_epi16 (res,res); //use only low 64 bits
   3563     return64(res);
   3564 }
   3565 
   3566 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
   3567 _NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b)
   3568 {
   3569     uint16x4_t res64;
   3570     return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b)));
   3571 }
   3572 
   3573 _NEON2SSESTORAGE uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
   3574 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t   vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   3575 {
   3576     uint32x2_t res;
   3577     res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0];
   3578     res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1];
   3579     return res;
   3580 }
   3581 
   3582 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
   3583 _NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b)
   3584 {
   3585     //may be optimized
   3586     poly8x8_t res64;
   3587     __m128i a64, b64, c1, res, tmp, bmasked;
   3588     int i;
   3589     a64 = _pM128i(a);
   3590     b64 = _pM128i(b);
   3591     c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff....
   3592     c1 = vshrq_n_u8(c1,7); //0x1
   3593     bmasked = _mm_and_si128(b64, c1); //0x1
   3594     res = vmulq_u8(a64, bmasked);
   3595     for(i = 1; i<8; i++) {
   3596         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
   3597         bmasked = _mm_and_si128(b64, c1); //0x1
   3598         tmp = vmulq_u8(a64, bmasked);
   3599         res = _mm_xor_si128(res, tmp);
   3600     }
   3601     return64 (res);
   3602 }
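        //Scalar sketch of the polynomial (carry-less) multiply emulated above for one byte lane (illustration
        //only; poly_mul8 is a hypothetical helper, the vector code performs the same shift-and-XOR for all lanes at once):
        //    uint8_t poly_mul8(uint8_t a, uint8_t b)
        //    {
        //        uint8_t r = 0;
        //        int i;
        //        for (i = 0; i < 8; i++)
        //            if (b & (1 << i)) r = (uint8_t)(r ^ (a << i));
        //        return r;
        //    }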
   3603 
   3604 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
   3605 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0
   3606 {
   3607     // no 8 bit simd multiply, need to go to 16 bits
   3608     //solution may not be optimal
   3609     __m128i a16, b16, r16_1, r16_2;
   3610     a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
   3611     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
   3612     r16_1 = _mm_mullo_epi16 (a16, b16);
   3613     //swap hi and low part of a and b to process the remaining data
   3614     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3615     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3616     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
   3617     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   3618 
   3619     r16_2 = _mm_mullo_epi16 (a16, b16);
   3620     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit
   3621     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit
   3622 
   3623     return _mm_unpacklo_epi64(r16_1,  r16_2);
   3624 }
   3625 
   3626 _NEON2SSESTORAGE int16x8_t   vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
   3627 #define vmulq_s16 _mm_mullo_epi16
   3628 
   3629 _NEON2SSESTORAGE int32x4_t   vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
   3630 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1
   3631 
   3632 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
   3633 #define vmulq_f32 _mm_mul_ps
   3634 
   3635 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
   3636 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0
   3637 {
   3638     // no 8 bit simd multiply, need to go to 16 bits
   3639     //solution may not be optimal
   3640     __m128i maskff, a16, b16, r16_1, r16_2;
   3641     maskff = _mm_set1_epi16(0xff);
   3642     a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1
   3643     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
   3644     r16_1 = _mm_mullo_epi16 (a16, b16);
   3645     r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation
   3646     //swap hi and low part of a and b to process the remaining data
   3647     a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3648     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3649     a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1
   3650     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   3651 
   3652     r16_2 = _mm_mullo_epi16 (a16, b16);
   3653     r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation
   3654     return _mm_packus_epi16 (r16_1,  r16_2);
   3655 }
   3656 
   3657 _NEON2SSESTORAGE uint16x8_t   vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
   3658 #define vmulq_u16 _mm_mullo_epi16
   3659 
   3660 _NEON2SSESTORAGE uint32x4_t   vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
   3661 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1
   3662 
   3663 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
   3664 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b)
   3665 {
   3666     //may be optimized
   3667     __m128i c1, res, tmp, bmasked;
   3668     int i;
   3669     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
   3670     c1 = vshrq_n_u8(c1,7); //0x1
   3671     bmasked = _mm_and_si128(b, c1); //0x1
   3672     res = vmulq_u8(a, bmasked);
   3673     for(i = 1; i<8; i++) {
   3674         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
   3675         bmasked = _mm_and_si128(b, c1); //0x1
   3676         tmp = vmulq_u8(a, bmasked);
   3677         res = _mm_xor_si128(res, tmp);
   3678     }
   3679     return res;
   3680 }
   3681 
   3682 //************************* Vector long multiply ***********************************
   3683 //****************************************************************************
   3684 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
   3685 _NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0
   3686 {
   3687     //no 8 bit simd multiply, need to go to 16 bits
   3688     __m128i a16, b16;
   3689     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
   3690     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
   3691     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
   3692 }
   3693 
   3694 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
   3695 _NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0
   3696 {
   3697 #ifdef USE_SSE4
   3698     __m128i a16, b16;
   3699     a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1
   3700     b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1
   3701     return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
   3702 #else
   3703     __m128i low, hi, a128,b128;
   3704     a128 = _pM128i(a);
   3705     b128 = _pM128i(b);
   3706     low =  _mm_mullo_epi16(a128,b128);
   3707     hi =   _mm_mulhi_epi16(a128,b128);
   3708     return _mm_unpacklo_epi16(low,hi);
   3709 #endif
   3710 }
   3711 
   3712 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
   3713 _NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0
   3714 {
   3715     __m128i ab, ba, a128, b128;
   3716     a128 = _pM128i(a);
   3717     b128 = _pM128i(b);
   3718     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
   3719     ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
   3720     return _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes; the multiplication gives 64-bit results
   3721 }
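        //Sketch of the lane layout above (illustration only): _MM_MUL_EPI32 multiplies lanes 0 and 2 only, so
        //interleaving as ab = {a0, b0, a1, b1} and ba = {b0, a0, b1, a1} places a0*b0 in the low 64 bits and
        //a1*b1 in the high 64 bits of the result, which is exactly the vmull_s32 output layout.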
   3722 
   3723 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
   3724 _NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0
   3725 {
   3726     //no 8 bit simd multiply, need to go to 16 bits
   3727     __m128i a16, b16;
   3728     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
   3729     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
   3730     return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit
   3731 }
   3732 
   3733 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
   3734 _NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.U16 q0,d0,d0
   3735 {
   3736 #ifdef USE_SSE4
   3737     __m128i a16, b16;
   3738     a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1
   3739     b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1
   3740     return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1
   3741 #else
   3742     __m128i a128,b128,low, hi;
   3743     a128 = _pM128i(a);
   3744     b128 = _pM128i(b);
   3745     low =  _mm_mullo_epi16(a128,b128);
   3746     hi =   _mm_mulhi_epu16(a128,b128);
   3747     return _mm_unpacklo_epi16(low,hi);
   3748 #endif
   3749 }
   3750 
   3751 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
   3752 _NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0
   3753 {
   3754     //may not be optimal compared with a serial implementation
   3755     __m128i ab, ba, a128, b128;
   3756     a128 = _pM128i(a);
   3757     b128 = _pM128i(b);
   3758     ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
   3759     return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes; the multiplication gives 64-bit results
   3760     return _mm_mul_epu32 (ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result
   3761 }
   3762 
   3763 _NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
   3764 _NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
   3765 {
   3766     //may be optimized
   3767     __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
   3768     int i;
   3769     a128 = _pM128i(a);
   3770     b128 = _pM128i(b);
   3771     c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
   3772     c1 = vshrq_n_u8(c1,7); //0x1
   3773     bmasked = _mm_and_si128(b128, c1); //0x1
   3774 
   3775     a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
   3776     bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
   3777     res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
   3778     for(i = 1; i<8; i++) {
   3779         c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
   3780         bmasked = _mm_and_si128(b128, c1); //0x1
   3781         bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
   3782         tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
   3783         res = _mm_xor_si128(res, tmp);
   3784     }
   3785     return res;
   3786 }
   3787 
   3788 //****************Vector saturating doubling long multiply **************************
   3789 //*****************************************************************
   3790 _NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
   3791 _NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
   3792 {
   3793     //a serial solution may be faster due to the saturation
   3794     __m128i res;
   3795     res = vmull_s16(a, b);
   3796     return vqd_s32(res);
   3797 }
   3798 
   3799 _NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
   3800 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
   3801 {
   3802     //a serial solution may be faster due to the saturation
   3803     __m128i res;
   3804     res = vmull_s32(a,b);
   3805     return vqaddq_s64(res,res); //slow serial function!!!!
   3806 }
   3807 
   3808 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]  ************************
   3809 //******************************************************************************************
   3810 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
   3811 _NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0
   3812 {
   3813     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
   3814     int8x8_t res64;
   3815     __m128i b128, c128, res;
   3816     b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
   3817     c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
   3818     res = _mm_mullo_epi16 (c128, b128);
   3819     res  =  _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd);
   3820     res  = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
   3821     return64(res);
   3822 }
   3823 
   3824 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
   3825 _NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
   3826 {
   3827     int16x4_t res64;
   3828     return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
   3829 }
   3830 
   3831 
   3832 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
   3833 _NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0
   3834 {
   3835     int32x2_t res64;
   3836     __m128i res;
   3837     res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1
   3838     res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits
   3839     return64(res);
   3840 }
   3841 
   3842 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
   3843 _NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c)
   3844 {
   3845     //fma is coming soon, but right now:
   3846     __m128 res;
   3847     __m64_128 res64;
   3848     res = _mm_mul_ps (_pM128(c), _pM128(b));
   3849     res = _mm_add_ps (_pM128(a), res);
   3850     _M64f(res64, res);
   3851     return res64;
   3852 }
   3853 
   3854 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
   3855 _NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0
   3856 {
   3857     // no 8 bit x86 simd multiply, need to go to 16 bits,  and use the low 64 bits
   3858     uint8x8_t res64;
   3859     __m128i mask, b128, c128, res;
   3860     mask = _mm_set1_epi16(0xff);
   3861     b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits
   3862     c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits
   3863     res = _mm_mullo_epi16 (c128, b128);
   3864     res = _mm_and_si128(res, mask); //to avoid saturation
   3865     res = _mm_packus_epi16 (res, res);
   3866     res =  _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits
   3867     return64(res);
   3868 }
   3869 
   3870 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
   3871 #define vmla_u16 vmla_s16
   3872 
   3873 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
   3874 #define vmla_u32 vmla_s32
   3875 
   3876 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
   3877 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0
   3878 {
   3879     //solution may not be optimal
   3880     // no 8 bit simd multiply, need to go to 16 bits
   3881     __m128i b16, c16, r16_1, a_2,r16_2;
   3882     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
   3883     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
   3884     r16_1 = _mm_mullo_epi16 (b16, c16);
   3885     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
   3886     r16_1 = _mm_add_epi8 (r16_1, a);
   3887     //swap hi and low part of a, b and c to process the remaining data
   3888     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3889     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3890     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   3891     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   3892     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
   3893 
   3894     r16_2 = _mm_mullo_epi16 (b16, c16);
   3895     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   3896     r16_2 = _mm_add_epi8(r16_2, a_2);
   3897     return _mm_unpacklo_epi64(r16_1,r16_2);
   3898 }
   3899 
   3900 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
   3901 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0
   3902 {
   3903     __m128i res;
   3904     res = _mm_mullo_epi16 (c, b);
   3905     return _mm_add_epi16 (res, a);
   3906 }
   3907 
   3908 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
   3909 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0
   3910 {
   3911     __m128i res;
   3912     res = _MM_MULLO_EPI32 (c,  b); //SSE4.1
   3913     return _mm_add_epi32 (res, a);
   3914 }
   3915 
   3916 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
   3917 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0
   3918 {
    3919     //no FMA is used here yet, so use a separate multiply and add:
   3920     __m128 res;
   3921     res = _mm_mul_ps (c, b);
   3922     return _mm_add_ps (a, res);
   3923 }
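//Minimal usage sketch (hypothetical example, not from the original code): NEON source written with
//vmlaq_f32 is mapped by this header onto a plain _mm_mul_ps + _mm_add_ps pair; no FMA instruction is used.
#if 0 //sketch only, excluded from compilation
static float32x4_t example_madd_f32(float32x4_t acc, float32x4_t x, float32x4_t y)
{
    return vmlaq_f32(acc, x, y); //acc[i] + x[i]*y[i] per lane
}
#endif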
   3924 
   3925 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
   3926 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0
   3927 {
    3928     //solution may not be optimal
   3929     // no 8 bit simd multiply, need to go to 16 bits
   3930     __m128i b16, c16, r16_1, a_2, r16_2;
   3931     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
   3932     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
   3933     r16_1 = _mm_mullo_epi16 (b16, c16);
   3934     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
   3935     r16_1 = _mm_add_epi8 (r16_1, a);
   3936     //swap hi and low part of a, b and c to process the remaining data
   3937     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   3938     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   3939     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   3940     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
   3941     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
   3942 
   3943     r16_2 = _mm_mullo_epi16 (b16, c16);
   3944     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   3945     r16_2 = _mm_add_epi8(r16_2, a_2);
   3946     return _mm_unpacklo_epi64(r16_1,r16_2);
   3947 }
   3948 
   3949 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
   3950 #define vmlaq_u16 vmlaq_s16
   3951 
   3952 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
   3953 #define vmlaq_u32 vmlaq_s32
   3954 
   3955 //**********************  Vector widening multiply accumulate (long multiply accumulate):
    3956 //                          vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]  **************
   3957 //********************************************************************************************
   3958 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
   3959 _NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0
   3960 {
   3961     int16x8_t res;
   3962     res = vmull_s8(b, c);
   3963     return _mm_add_epi16 (res, a);
   3964 }
   3965 
   3966 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
   3967 _NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0
   3968 {
    3969     //may not be optimal compared with a serial implementation
   3970     int32x4_t res;
   3971     res = vmull_s16(b,  c);
   3972     return _mm_add_epi32 (res, a);
   3973 }
   3974 
   3975 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
   3976 _NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0
   3977 {
    3978     //may not be optimal compared with a serial implementation
   3979     int64x2_t res;
   3980     res = vmull_s32( b, c);
   3981     return _mm_add_epi64 (res, a);
   3982 }
   3983 
   3984 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
   3985 _NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0
   3986 {
   3987     uint16x8_t res;
   3988     res = vmull_u8(b, c);
   3989     return _mm_add_epi16 (res, a);
   3990 }
   3991 
   3992 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0
   3993 _NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0
   3994 {
    3995     //may not be optimal compared with a serial implementation
   3996     uint32x4_t res;
   3997     res = vmull_u16(b, c);
   3998     return _mm_add_epi32 (res, a);
   3999 }
   4000 
   4001 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
   4002 _NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0
   4003 {
    4004     //may not be optimal compared with a serial implementation
    4005     uint64x2_t res;
   4006     res = vmull_u32( b,c);
   4007     return _mm_add_epi64 (res, a);
   4008 }
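//Scalar reference for the widening (long) multiply accumulate family above (illustrative sketch):
//the products are formed at twice the input width, so they cannot wrap before being accumulated.
#if 0 //sketch only, excluded from compilation
static void vmlal_u8_scalar_reference(uint16_t acc[8], const uint8_t b[8], const uint8_t c[8])
{
    int i;
    for (i = 0; i < 8; i++) acc[i] = (uint16_t)(acc[i] + (uint16_t)b[i] * c[i]);
}
#endif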
   4009 
   4010 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] ***************************************
   4011 //********************************************************************************************
   4012 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
   4013 _NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0
   4014 {
   4015     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
   4016     int8x8_t res64;
   4017     __m128i res;
   4018     res64 = vmul_s8(b,c);
   4019     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
   4020     return64(res);
   4021 }
   4022 
   4023 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
   4024 _NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
   4025 {
   4026     int16x4_t res64;
   4027     return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c)));
   4028 }
   4029 
   4030 
   4031 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
   4032 _NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0
   4033 {
   4034     int32x2_t res64;
   4035     __m128i res;
   4036     res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1
   4037     res =  _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only
   4038     return64(res);
   4039 }
   4040 
   4041 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
   4042 _NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c)
   4043 {
   4044     __m128 res;
   4045     __m64_128 res64;
   4046     res = _mm_mul_ps (_pM128(c), _pM128(b));
   4047     res = _mm_sub_ps (_pM128(a), res);
   4048     _M64f(res64, res);
   4049     return res64;
   4050 }
   4051 
   4052 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
   4053 _NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
   4054 {
   4055     // no 8 bit simd multiply, need to go to 16 bits -  and use the low 64 bits
   4056     uint8x8_t res64;
   4057     __m128i res;
   4058     res64 = vmul_u8(b,c);
   4059     res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64));
   4060     return64(res);
   4061 }
   4062 
   4063 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
   4064 #define vmls_u16 vmls_s16
   4065 
   4066 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
   4067 #define vmls_u32 vmls_s32
   4068 
   4069 
   4070 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
   4071 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0
   4072 {
    4073     //solution may not be optimal
   4074     // no 8 bit simd multiply, need to go to 16 bits
   4075     __m128i b16, c16, r16_1, a_2, r16_2;
   4076     b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1
   4077     c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1
   4078     r16_1 = _mm_mullo_epi16 (b16, c16);
   4079     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd);
   4080     r16_1 = _mm_sub_epi8 (a, r16_1);
   4081     //swap hi and low part of a, b, c to process the remaining data
   4082     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4083     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   4084     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   4085     b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1
   4086     c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1
   4087 
   4088     r16_2 = _mm_mullo_epi16 (b16, c16);
   4089     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   4090     r16_2 = _mm_sub_epi8 (a_2, r16_2);
   4091     return _mm_unpacklo_epi64(r16_1,r16_2);
   4092 }
   4093 
   4094 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
   4095 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0
   4096 {
   4097     __m128i res;
   4098     res = _mm_mullo_epi16 (c, b);
   4099     return _mm_sub_epi16 (a, res);
   4100 }
   4101 
   4102 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
   4103 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0
   4104 {
   4105     __m128i res;
   4106     res = _MM_MULLO_EPI32 (c, b); //SSE4.1
   4107     return _mm_sub_epi32 (a, res);
   4108 }
   4109 
   4110 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
   4111 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0
   4112 {
   4113     __m128 res;
   4114     res = _mm_mul_ps (c, b);
   4115     return _mm_sub_ps (a, res);
   4116 }
   4117 
   4118 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
   4119 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0
   4120 {
    4121     //solution may not be optimal
   4122     // no 8 bit simd multiply, need to go to 16 bits
   4123     __m128i b16, c16, r16_1, a_2, r16_2;
   4124     b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1
   4125     c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1
   4126     r16_1 = _mm_mullo_epi16 (b16, c16);
   4127     r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits
   4128     r16_1 = _mm_sub_epi8 (a, r16_1);
   4129     //swap hi and low part of a, b and c to process the remaining data
   4130     a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   4131     b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32);
   4132     c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32);
   4133     b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1
   4134     c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1
   4135 
   4136     r16_2 = _mm_mullo_epi16 (b16, c16);
   4137     r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd);
   4138     r16_2 = _mm_sub_epi8(a_2, r16_2);
   4139     return _mm_unpacklo_epi64(r16_1,r16_2);
   4140 }
   4141 
   4142 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
   4143 #define vmlsq_u16 vmlsq_s16
   4144 
   4145 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
   4146 #define vmlsq_u32 vmlsq_s32
   4147 
   4148 //******************** Vector multiply subtract long (widening multiply subtract) ************************************
   4149 //*************************************************************************************************************
   4150 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
   4151 _NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0
   4152 {
   4153     int16x8_t res;
   4154     res = vmull_s8(b, c);
   4155     return _mm_sub_epi16 (a, res);
   4156 }
   4157 
   4158 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
   4159 _NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0
   4160 {
    4161     //may not be optimal compared with a serial implementation
   4162     int32x4_t res;
   4163     res = vmull_s16(b,  c);
   4164     return _mm_sub_epi32 (a, res);
   4165 }
   4166 
   4167 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
   4168 _NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0
   4169 {
    4170     //may not be optimal compared with a serial implementation
   4171     int64x2_t res;
   4172     res = vmull_s32( b,c);
   4173     return _mm_sub_epi64 (a, res);
   4174 }
   4175 
   4176 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
   4177 _NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0
   4178 {
   4179     uint16x8_t res;
   4180     res = vmull_u8(b, c);
   4181     return _mm_sub_epi16 (a, res);
   4182 }
   4183 
   4184 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0
   4185 _NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0
   4186 {
    4187     //may not be optimal compared with a serial implementation
   4188     uint32x4_t res;
   4189     res = vmull_u16(b, c);
   4190     return _mm_sub_epi32 (a, res);
   4191 }
   4192 
   4193 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
   4194 _NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0
   4195 {
    4196     //may not be optimal compared with a serial implementation
    4197     uint64x2_t res;
   4198     res = vmull_u32( b,c);
   4199     return _mm_sub_epi64 (a, res);
   4200 }
   4201 
   4202 //******  Vector saturating doubling multiply high **********************
   4203 //*************************************************************************
   4204 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b); // VQDMULH.S16 d0,d0,d0
   4205 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a,  int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   4206 {
   4207     int16x4_t res;
   4208     int32_t a32, b32, i;
   4209     for (i = 0; i<4; i++) {
   4210         a32 = (int32_t) a.m64_i16[i];
   4211         b32 = (int32_t) b.m64_i16[i];
   4212         a32 = (a32 * b32) >> 15;
   4213         res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32;
   4214     }
   4215     return res;
   4216 }
   4217 
   4218 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
   4219 _NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster
   4220 {
    4221     //may not be optimal compared with a serial solution
   4222     int32x2_t res64;
   4223     __m128i mask;
   4224     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4225     int64x2_t mul;
   4226     mul = vmull_s32(a,b);
   4227     mul = _mm_slli_epi64(mul,1); //double the result
   4228     //at this point start treating 2 64-bit numbers as 4 32-bit
   4229     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
   4230     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4231     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4232     return64(mul);
   4233 }
   4234 
   4235 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
   4236 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0
   4237 {
   4238     __m128i res, res_lo, mask;
   4239     _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
   4240     res = _mm_mulhi_epi16 (a, b);
   4241     res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation
   4242     res_lo = _mm_mullo_epi16 (a, b);
   4243     res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit
   4244     res = _mm_add_epi16(res, res_lo); //combine results
   4245     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
   4246     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
   4247 }
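//Worked example for the saturation fix above (illustrative): VQDMULH.S16 returns sat((2*a*b) >> 16),
//and the only lane values that can overflow are a = b = -32768, where the raw high half is 0x8000.
//_mm_cmpeq_epi16 against 0x8000 yields an all-ones mask exactly for such lanes, and xor-ing with that
//mask flips 0x8000 into the saturated 0x7fff while leaving every other lane unchanged.
#if 0 //scalar model of one lane, sketch only
static int16_t vqdmulh_lane16_reference(int16_t a, int16_t b)
{
    int32_t p = ((int32_t)a * b) >> 15;          //same value as (2*a*b) >> 16
    return (p == 0x8000) ? 0x7fff : (int16_t)p;  //saturate the single overflow case
}
#endif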
   4248 
   4249 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
   4250 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   4251 {
    4252     // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
   4253     __m128i ab, ba, mask, mul, mul1;
   4254     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4255     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
   4256     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    4257     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
   4258     mul = _mm_slli_epi64(mul,1); //double the result
   4259     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
   4260     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    4261     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
   4262     mul1 = _mm_slli_epi64(mul1,1); //double the result
   4263     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
   4264     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits
   4265     mul = _mm_unpacklo_epi64(mul, mul1);
   4266     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4267     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4268 }
   4269 
   4270 //********* Vector saturating rounding doubling multiply high ****************
   4271 //****************************************************************************
   4272 //If use _mm_mulhrs_xx functions  the result may differ from NEON one a little  due to different rounding rules and order
   4273 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b); // VQRDMULH.S16 d0,d0,d0
   4274 _NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a,  int16x4_t b)
   4275 {
   4276     int16x4_t res64;
   4277     return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b)));
   4278 }
   4279 
   4280 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
   4281 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   4282 {
    4283     //may not be optimal compared with a serial solution
   4284     int32x2_t res64;
   4285     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4286     __m128i res_sat, mask, mask1;
   4287     int64x2_t mul;
   4288     mul = vmull_s32(a,b);
   4289     res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
   4290     mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to
   4291     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
   4292     mul = _mm_add_epi32 (res_sat, mask1); //actual rounding
   4293     //at this point start treating 2 64-bit numbers as 4 32-bit
   4294     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
   4295     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4296     mul = _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4297     return64(mul);
   4298 }
   4299 
   4300 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
   4301 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0
   4302 {
   4303     __m128i mask, res;
   4304     _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000};
   4305     res = _mm_mulhrs_epi16 (a, b);
   4306     mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask);
   4307     return _mm_xor_si128 (res,  mask); //res saturated for 0x8000
   4308 }
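//Scalar model of one NEON VQRDMULH.S16 lane for reference (illustrative sketch): the SSSE3 path above
//relies on _mm_mulhrs_epi16, a rounded doubling high-half multiply, plus the same 0x8000 -> 0x7fff
//fix-up as vqdmulhq_s16 for the -32768 * -32768 case.
#if 0 //sketch only, excluded from compilation
static int16_t vqrdmulh_lane16_reference(int16_t a, int16_t b)
{
    int64_t p = ((int64_t)a * b * 2 + 0x8000) >> 16; //doubling multiply with rounding
    return (p == 0x8000) ? 0x7fff : (int16_t)p;      //saturate the single overflow case
}
#endif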
   4309 
   4310 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
   4311 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   4312 {
    4313     // no multiply high 32 bit SIMD in IA32, may not be optimal compared with a serial solution for the SSSE3 target
   4314     __m128i ab, ba,  mask, mul, mul1, mask1;
   4315     _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4316     ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1
   4317     ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1
    4318     mul = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
   4319     mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered
   4320     mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to
   4321     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
   4322     mul = _mm_add_epi32 (mul, mask1); //actual rounding
   4323 
   4324     ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3
   4325     ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3
    4326     mul1 = _MM_MUL_EPI32(ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64-bit result
   4327     mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered
   4328     mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to
   4329     mask1 = _mm_srli_epi64(mask1,31); //get  31-th bit 1 or zero
   4330     mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding
   4331     //at this point start treating 2 64-bit numbers as 4 32-bit
   4332     mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
   4333     mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit
   4334     mul = _mm_unpacklo_epi64(mul, mul1);
   4335     mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32);
   4336     return _mm_xor_si128 (mul,  mask); //res saturated for 0x80000000
   4337 }
   4338 
   4339 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) *****
   4340 //*************************************************************************************************************************
   4341 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
   4342 _NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0
   4343 {
    4344     //not an optimal SIMD solution, serial may be faster
   4345     __m128i res32;
   4346     res32 = vmull_s16(b,  c);
    4347     res32 = vqd_s32(res32); //doubling & saturation; without saturation we could simply use _mm_slli_epi32 (res32, 1);
   4348     return vqaddq_s32(res32, a); //saturation
   4349 }
   4350 
   4351 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
   4352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL)
   4353 {
   4354     __m128i res64;
   4355     res64 = vmull_s32(b,c);
    4356     res64 = vqaddq_s64(res64, res64); //doubling & saturation; without saturation we could simply use _mm_slli_epi64 (res64, 1);
   4357     return vqaddq_s64(res64, a); //saturation
   4358 }
   4359 
   4360 //************************************************************************************
   4361 //******************  Vector subtract ***********************************************
   4362 //************************************************************************************
   4363 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
   4364 _NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b)
   4365 {
   4366     int8x8_t res64;
   4367     return64(_mm_sub_epi8(_pM128i(a),_pM128i(b)));
   4368 }
   4369 
   4370 
   4371 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
   4372 _NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b)
   4373 {
   4374     int16x4_t res64;
   4375     return64(_mm_sub_epi16(_pM128i(a),_pM128i(b)));
   4376 }
   4377 
   4378 
   4379 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
   4380 _NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b)
   4381 {
   4382     int32x2_t res64;
   4383     return64(_mm_sub_epi32(_pM128i(a),_pM128i(b)));
   4384 }
   4385 
   4386 
   4387 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b); // VSUB.I64 d0,d0,d0
   4388 _NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a,  int64x1_t b)
   4389 {
   4390     int64x1_t res64;
   4391     res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0];
   4392     return res64;
   4393 }
   4394 
   4395 
   4396 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
   4397 _NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b)
   4398 {
   4399     float32x2_t res;
   4400     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0];
   4401     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1];
   4402     return res;
   4403 }
   4404 
   4405 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
   4406 #define vsub_u8 vsub_s8
   4407 
   4408 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
   4409 #define vsub_u16 vsub_s16
   4410 
   4411 _NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
   4412 #define vsub_u32 vsub_s32
   4413 
   4414 
   4415 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b); // VSUB.I64 d0,d0,d0
   4416 _NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a,  uint64x1_t b)
   4417 {
    4418     uint64x1_t res64;
   4419     res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0];
   4420     return res64;
   4421 }
   4422 
   4423 
   4424 _NEON2SSESTORAGE int8x16_t   vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
   4425 #define vsubq_s8 _mm_sub_epi8
   4426 
   4427 _NEON2SSESTORAGE int16x8_t   vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
   4428 #define vsubq_s16 _mm_sub_epi16
   4429 
   4430 _NEON2SSESTORAGE int32x4_t   vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
   4431 #define vsubq_s32 _mm_sub_epi32
   4432 
   4433 _NEON2SSESTORAGE int64x2_t   vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
   4434 #define vsubq_s64 _mm_sub_epi64
   4435 
   4436 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
   4437 #define vsubq_f32 _mm_sub_ps
   4438 
   4439 _NEON2SSESTORAGE uint8x16_t   vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
   4440 #define vsubq_u8 _mm_sub_epi8
   4441 
   4442 _NEON2SSESTORAGE uint16x8_t   vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
   4443 #define vsubq_u16 _mm_sub_epi16
   4444 
   4445 _NEON2SSESTORAGE uint32x4_t   vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
   4446 #define vsubq_u32 _mm_sub_epi32
   4447 
   4448 _NEON2SSESTORAGE uint64x2_t   vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
   4449 #define vsubq_u64 _mm_sub_epi64
   4450 
    4451 //***************Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i] ******************
   4452 //***********************************************************************************
   4453 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
   4454 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
   4455 _NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0
   4456 {
   4457     __m128i a16, b16;
   4458     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
   4459     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   4460     return _mm_sub_epi16 (a16, b16);
   4461 }
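//Scalar reference for the widening subtract family (illustrative sketch): both operands are sign- or
//zero-extended to twice their width before subtracting, so the difference itself never wraps.
#if 0 //sketch only, excluded from compilation
static void vsubl_s8_scalar_reference(int16_t r[8], const int8_t a[8], const int8_t b[8])
{
    int i;
    for (i = 0; i < 8; i++) r[i] = (int16_t)((int16_t)a[i] - (int16_t)b[i]);
}
#endif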
   4462 
   4463 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
   4464 _NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0
   4465 {
   4466     __m128i a32, b32;
   4467     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
   4468     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
   4469     return _mm_sub_epi32 (a32, b32);
   4470 }
   4471 
   4472 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
   4473 _NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0
   4474 {
    4475     //may not be optimal
   4476     __m128i a64, b64;
   4477     a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1
   4478     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1,
   4479     return _mm_sub_epi64 (a64, b64);
   4480 }
   4481 
   4482 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
   4483 _NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0
   4484 {
   4485     __m128i a16, b16;
   4486     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1,
   4487     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
   4488     return _mm_sub_epi16 (a16, b16);
   4489 }
   4490 
   4491 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0
   4492 _NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0
   4493 {
   4494     __m128i a32, b32;
   4495     a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1
   4496     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
   4497     return _mm_sub_epi32 (a32, b32);
   4498 }
   4499 
   4500 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
   4501 _NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0
   4502 {
    4503     //may not be optimal
   4504     __m128i a64, b64;
   4505     a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1
   4506     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1,
   4507     return _mm_sub_epi64 (a64, b64);
   4508 }
   4509 
    4510 //***************** Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i] **********************************
   4511 //*****************************************************************************************************
   4512 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
   4513 _NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0
   4514 {
   4515     __m128i b16;
   4516     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   4517     return _mm_sub_epi16 (a, b16);
   4518 }
   4519 
   4520 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
   4521 _NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0
   4522 {
   4523     __m128i b32;
   4524     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
   4525     return _mm_sub_epi32 (a, b32);
   4526 }
   4527 
   4528 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
   4529 _NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0
   4530 {
   4531     __m128i b64;
   4532     b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1
   4533     return _mm_sub_epi64 (a, b64);
   4534 }
   4535 
   4536 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
   4537 _NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) // VSUBW.U8 q0,q0,d0
   4538 {
   4539     __m128i b16;
   4540     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
   4541     return _mm_sub_epi16 (a, b16);
   4542 }
   4543 
   4544 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0
   4545 _NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0
   4546 {
   4547     __m128i b32;
   4548     b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1,
   4549     return _mm_sub_epi32 (a, b32);
   4550 }
   4551 
   4552 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
   4553 _NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0
   4554 {
   4555     __m128i b64;
   4556     b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1
   4557     return _mm_sub_epi64 (a, b64);
   4558 }
   4559 
   4560 //************************Vector saturating subtract *********************************
   4561 //*************************************************************************************
   4562 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
   4563 _NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b)
   4564 {
   4565     int8x8_t res64;
   4566     return64(_mm_subs_epi8(_pM128i(a),_pM128i(b)));
   4567 }
   4568 
   4569 
   4570 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
   4571 _NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b)
   4572 {
   4573     int16x4_t res64;
   4574     return64(_mm_subs_epi16(_pM128i(a),_pM128i(b)));
   4575 }
   4576 
   4577 
   4578 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b); // VQSUB.S32 d0,d0,d0
   4579 _NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a,  int32x2_t b)
   4580 {
   4581     int32x2_t res64;
   4582     return64(vqsubq_s32(_pM128i(a), _pM128i(b)));
   4583 }
   4584 
   4585 
   4586 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
    4587 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
   4588 {
   4589     uint64x1_t res;
   4590     uint64_t a64,b64;
   4591     a64 = a.m64_u64[0];
   4592     b64 = b.m64_u64[0];
   4593     res.m64_u64[0] = a64 - b64;
   4594 
   4595     a64 =  (a64 >> 63) + (~_SIGNBIT64);
   4596     if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) {
   4597         res.m64_u64[0] = a64;
   4598     }
   4599     return res;
   4600 }
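//Note on the serial code above (illustrative reasoning): after the raw subtraction, a64 is reused to
//hold the saturation value (a64 >> 63) + ~_SIGNBIT64, i.e. INT64_MAX when a >= 0 and INT64_MIN when
//a < 0. Its sign bit equals that of the original a, and the overflow test ((a ^ b) & (a ^ res)) < 0
//only inspects sign bits, so the test still works after the reuse and the saturated value is ready to store.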
   4601 
   4602 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
   4603 _NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b)
   4604 {
   4605     uint8x8_t res64;
   4606     return64(_mm_subs_epu8(_pM128i(a),_pM128i(b)));
   4607 }
   4608 
   4609 
   4610 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0
   4611 _NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b)
   4612 {
   4613     uint16x4_t res64;
   4614     return64(_mm_subs_epu16(_pM128i(a),_pM128i(b)));
   4615 }
   4616 
   4617 
   4618 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b); // VQSUB.U32 d0,d0,d0
   4619 _NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a,  uint32x2_t b)
   4620 {
   4621     uint32x2_t res64;
   4622     return64(vqsubq_u32(_pM128i(a), _pM128i(b)));
   4623 }
   4624 
   4625 
   4626 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
   4627 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   4628 {
   4629     uint64x1_t res;
   4630     uint64_t a64, b64;
   4631     a64 = _Ui64(a);
   4632     b64 = _Ui64(b);
   4633     if (a64 > b64) {
   4634         res.m64_u64[0] = a64 - b64;
   4635     } else {
   4636         res.m64_u64[0] = 0;
   4637     }
   4638     return res;
   4639 }
   4640 
   4641 _NEON2SSESTORAGE int8x16_t   vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
   4642 #define vqsubq_s8 _mm_subs_epi8
   4643 
   4644 _NEON2SSESTORAGE int16x8_t   vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
   4645 #define vqsubq_s16 _mm_subs_epi16
   4646 
   4647 _NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
   4648 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
   4649 {
    4650     //no corresponding x86 SIMD solution, special tricks are necessary. The overflow is possible only if a and b have opposite signs and the difference has the opposite sign to a
   4651     __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
   4652     c7fffffff = _mm_set1_epi32(0x7fffffff);
   4653     res = _mm_sub_epi32(a, b);
   4654     res_sat = _mm_srli_epi32(a, 31);
   4655     res_sat = _mm_add_epi32(res_sat, c7fffffff);
   4656     res_xor_a = _mm_xor_si128(res, a);
   4657     b_xor_a = _mm_xor_si128(b, a);
   4658     res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
    4659     res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if < 0, all zeros otherwise
   4660     res_sat = _mm_and_si128(res_xor_a, res_sat);
   4661     res = _mm_andnot_si128(res_xor_a, res);
   4662     return _mm_or_si128(res, res_sat);
   4663 }
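//Worked example for the overflow test above (illustrative): with a = INT32_MIN and b = 1 the raw
//difference wraps to INT32_MAX, so (res ^ a) and (b ^ a) both have the sign bit set; the arithmetic
//shift turns that into an all-ones lane mask, and res_sat (0x80000000 here, because a < 0) is
//selected instead of the wrapped value.
#if 0 //scalar model of one lane, sketch only
static int32_t vqsub_lane32_reference(int32_t a, int32_t b)
{
    int64_t d = (int64_t)a - b;
    if (d >  0x7fffffffLL) return 0x7fffffff;
    if (d < -0x80000000LL) return (int32_t)0x80000000u;
    return (int32_t)d;
}
#endif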
   4664 
   4665 _NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
    4666 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
   4667 {
   4668     _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
   4669     _NEON2SSE_ALIGN_16 uint64_t res[2];
   4670     _mm_store_si128((__m128i*)atmp, a);
   4671     _mm_store_si128((__m128i*)btmp, b);
   4672     res[0] = atmp[0] - btmp[0];
   4673     res[1] = atmp[1] - btmp[1];
   4674     if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
   4675         res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
   4676     }
   4677     if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
   4678         res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
   4679     }
   4680     return _mm_load_si128((__m128i*)res);
   4681 }
   4682 
   4683 _NEON2SSESTORAGE uint8x16_t   vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
   4684 #define vqsubq_u8 _mm_subs_epu8
   4685 
   4686 _NEON2SSESTORAGE uint16x8_t   vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0
   4687 #define vqsubq_u16 _mm_subs_epu16
   4688 
   4689 _NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
   4690 _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
   4691 {
   4692     __m128i min, mask, sub;
   4693     min = _MM_MIN_EPU32(a, b); //SSE4.1
   4694     mask = _mm_cmpeq_epi32 (min,  b);
   4695     sub = _mm_sub_epi32 (a, b);
   4696     return _mm_and_si128 ( sub, mask);
   4697 }
   4698 
   4699 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
   4700 #ifdef USE_SSE4
   4701     _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
   4702     {
   4703         __m128i c80000000, subb, suba, cmp, sub;
   4704         c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
   4705         sub  = _mm_sub_epi64 (a, b);
   4706         suba = _mm_sub_epi64 (a, c80000000);
   4707         subb = _mm_sub_epi64 (b, c80000000);
   4708         cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!!
   4709         return _mm_and_si128 (sub, cmp); //saturation
   4710     }
   4711 #else
   4712     _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   4713     {
   4714         _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
   4715         _mm_store_si128((__m128i*)atmp, a);
   4716         _mm_store_si128((__m128i*)btmp, b);
   4717         res[0] = (atmp[0] > btmp[0]) ? atmp[0] -  btmp[0] : 0;
   4718         res[1] = (atmp[1] > btmp[1]) ? atmp[1] -  btmp[1] : 0;
   4719         return _mm_load_si128((__m128i*)(res));
   4720     }
   4721 #endif
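//Note on the SSE4 branch above (illustrative reasoning): x86 has no unsigned 64-bit compare, so both
//operands are biased by 0x8000000000000000 and compared with the signed _mm_cmpgt_epi64 (SSE4.2);
//the saturating behaviour itself is simply:
#if 0 //scalar model of one lane, sketch only
static uint64_t vqsub_lane_u64_reference(uint64_t a, uint64_t b)
{
    return (a > b) ? a - b : 0; //VQSUB.U64 saturates to zero on underflow
}
#endif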
   4722 
   4723 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1  ******************************************************
   4724 //****************************************************************
   4725 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
   4726 _NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0
   4727 {
   4728     //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit,
   4729     int8x8_t res64;
   4730     __m128i r16;
   4731     int8x8_t r;
   4732     r = vsub_s8 (a, b);
   4733     r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1
   4734     r16 = _mm_srai_epi16 (r16, 1); //SSE2
   4735     r16 =  _mm_packs_epi16 (r16,r16); //use low 64 bits
   4736     return64(r16);
   4737 }
   4738 
   4739 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b); // VHSUB.S16 d0,d0,d0
   4740 _NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a,  int16x4_t b)
   4741 {
   4742     int16x4_t res64;
   4743     return64(vhsubq_s16(_pM128i(a), _pM128i(b)));
   4744 }
   4745 
   4746 
   4747 
   4748 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b); // VHSUB.S32 d0,d0,d0
   4749 _NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a,  int32x2_t b)
   4750 {
   4751     int32x2_t res64;
   4752     return64(vhsubq_s32(_pM128i(a), _pM128i(b)));
   4753 }
   4754 
   4755 
   4756 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b); // VHSUB.U8 d0,d0,d0
   4757 _NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a,  uint8x8_t b)
   4758 {
   4759     uint8x8_t res64;
   4760     return64(vhsubq_u8(_pM128i(a), _pM128i(b)));
   4761 }
   4762 
   4763 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b); // VHSUB.s16 d0,d0,d0
   4764 _NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a,  uint16x4_t b)
   4765 {
   4766     uint16x4_t res64;
   4767     return64(vhsubq_u16(_pM128i(a), _pM128i(b)));
   4768 }
   4769 
   4770 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b); // VHSUB.U32 d0,d0,d0
   4771 _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a,  uint32x2_t b)
   4772 {
   4773     uint32x2_t res64;
   4774     return64(vhsubq_u32(_pM128i(a), _pM128i(b)));
   4775 }
   4776 
   4777 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
   4778 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0
   4779 {
    4780     //need to deal with the possibility of internal overflow
   4781     __m128i c128, au,bu;
   4782     c128 = _mm_set1_epi8((int8_t)128);
   4783     au = _mm_add_epi8( a, c128);
   4784     bu = _mm_add_epi8( b, c128);
   4785     return vhsubq_u8(au,bu);
   4786 }
   4787 
   4788 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
   4789 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0
   4790 {
   4791     //need to deal with the possibility of internal overflow
   4792     __m128i c8000, au,bu;
   4793     c8000 = _mm_set1_epi16((int16_t)0x8000);
   4794     au = _mm_add_epi16( a, c8000);
   4795     bu = _mm_add_epi16( b, c8000);
   4796     return vhsubq_u16(au,bu);
   4797 }
   4798 
   4799 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
   4800 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0
   4801 {
   4802     //need to deal with the possibility of internal overflow
   4803     __m128i a2, b2,r, b_1;
   4804     a2 = _mm_srai_epi32 (a,1);
   4805     b2 = _mm_srai_epi32 (b,1);
   4806     r = _mm_sub_epi32 (a2, b2);
   4807     b_1 = _mm_andnot_si128(a, b); //!a and b
   4808     b_1 = _mm_slli_epi32 (b_1,31);
   4809     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
   4810     return _mm_sub_epi32(r,b_1);
   4811 }
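//Note on the technique above (illustrative reasoning): (a>>1) - (b>>1) already equals (a-b)>>1 except
//when the low bit of a is 0 and the low bit of b is 1, where the true halved difference is one smaller;
//(~a & b) & 1 is exactly that missing borrow, so subtracting it restores VHSUB semantics. The unsigned
//32-bit q-form below reuses the same correction with logical shifts.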
   4812 
   4813 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
   4814 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0
   4815 {
   4816     __m128i avg;
   4817     avg = _mm_avg_epu8 (a, b);
   4818     return _mm_sub_epi8(a, avg);
   4819 }
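//Note on the trick above (illustrative reasoning): _mm_avg_epu8 computes (a+b+1)>>1, and the identity
//a - ((a+b+1)>>1) == (a-b)>>1 (arithmetic shift of the full-width difference) holds lane-wise; e.g.
//a=2, b=5 gives avg=4 and a-avg wraps to 0xFE, exactly (2-5)>>1 truncated to 8 bits. The 16-bit q-form
//below uses _mm_avg_epu16 in the same way.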
   4820 
   4821 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0
   4822 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0
   4823 {
   4824     __m128i avg;
   4825     avg = _mm_avg_epu16 (a, b);
   4826     return _mm_sub_epi16(a, avg);
   4827 }
   4828 
   4829 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
   4830 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0
   4831 {
   4832     //need to deal with the possibility of internal overflow
   4833     __m128i a2, b2,r, b_1;
   4834     a2 = _mm_srli_epi32 (a,1);
   4835     b2 = _mm_srli_epi32 (b,1);
   4836     r = _mm_sub_epi32 (a2, b2);
   4837     b_1 = _mm_andnot_si128(a, b); //!a and b
   4838     b_1 = _mm_slli_epi32 (b_1,31);
   4839     b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit
   4840     return _mm_sub_epi32(r,b_1);
   4841 }
   4842 
   4843 //******* Vector subtract high half (truncated) ** ************
   4844 //************************************************************
   4845 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
   4846 _NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0
   4847 {
   4848     int8x8_t res64;
   4849     __m128i sum, sum8;
   4850     sum = _mm_sub_epi16 (a, b);
   4851     sum8 = _mm_srai_epi16 (sum, 8);
   4852     sum8 = _mm_packs_epi16(sum8,sum8);
   4853     return64(sum8);
   4854 }
   4855 
   4856 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
   4857 _NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0
   4858 {
   4859     int16x4_t res64;
   4860     __m128i sum, sum16;
   4861     sum = _mm_sub_epi32 (a, b);
   4862     sum16 = _mm_srai_epi32 (sum, 16);
   4863     sum16 = _mm_packs_epi32(sum16,sum16);
   4864     return64(sum16);
   4865 }
   4866 
   4867 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
   4868 _NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b)
   4869 {
   4870     int32x2_t res64;
   4871     __m128i sub;
   4872     sub = _mm_sub_epi64 (a, b);
   4873     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
   4874     return64(sub);
   4875 }
   4876 
   4877 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
   4878 _NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0
   4879 {
   4880     uint8x8_t res64;
   4881     __m128i sum, sum8;
   4882     sum = _mm_sub_epi16 (a, b);
   4883     sum8 = _mm_srli_epi16 (sum, 8);
   4884     sum8 =  _mm_packus_epi16(sum8,sum8);
   4885     return64(sum8);
   4886 }
   4887 
   4888 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
   4889 _NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0
   4890 {
   4891     uint16x4_t res64;
   4892      __m128i sum, sum16;
   4893     sum = _mm_sub_epi32 (a, b);
   4894     sum16 = _mm_srli_epi32 (sum, 16);
   4895 #ifdef USE_SSE4
   4896     sum16 =  _MM_PACKUS1_EPI32(sum16);
   4897 #else
   4898     sum16  = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits
   4899 #endif
   4900     return64(sum16);
   4901 }
   4902 
   4903 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
   4904 #define vsubhn_u64 vsubhn_s64
   4905 
   4906 //************ Vector rounding subtract high half *********************
   4907 //*********************************************************************
   4908 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
   4909 _NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0
   4910 {
   4911     int8x8_t res64;
   4912     __m128i sub, mask1;
   4913     sub = _mm_sub_epi16 (a, b);
    4914     mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
    4915     mask1 = _mm_srli_epi16(mask1, 15); //get bit 7 (the rounding bit), 1 or zero
   4916     sub = _mm_srai_epi16 (sub, 8); //get high half
   4917     sub = _mm_add_epi16 (sub, mask1); //actual rounding
   4918     sub =  _mm_packs_epi16 (sub, sub);
   4919     return64(sub);
   4920 }
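//Worked rounding example (illustrative): VRSUBHN.I16 computes (a - b + 0x80) >> 8 per lane, so a
//difference of 0x0180 narrows to 0x02 rather than the truncated 0x01; the shift pair above extracts
//that rounding bit (bit 7 of the difference) and adds it to the truncated high half.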
   4921 
   4922 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
   4923 _NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0
   4924 {
    4925     //SIMD may not be optimal, serial may be faster
   4926     int16x4_t res64;
   4927     __m128i sub, mask1;
   4928     sub = _mm_sub_epi32 (a, b);
    4929     mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
    4930     mask1 = _mm_srli_epi32(mask1,31); //get bit 15 (the rounding bit), 1 or zero
   4931     sub = _mm_srai_epi32 (sub, 16); //get high half
   4932     sub = _mm_add_epi32 (sub, mask1); //actual rounding
   4933     sub = _mm_packs_epi32 (sub, sub);
   4934     return64(sub);
   4935 }
   4936 
   4937 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
   4938 _NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b)
   4939 {
    4940     //SIMD may not be optimal, serial may be faster
   4941     int32x2_t res64;
   4942     __m128i sub, mask1;
   4943     sub = _mm_sub_epi64 (a, b);
    4944     mask1 = _mm_slli_epi64(sub, 32); //shift left then back right to
    4945     mask1 = _mm_srli_epi64(mask1,31); //put bit 31 of each difference into the corresponding upper 32-bit lane
    4946     sub = _mm_add_epi32 (sub, mask1); //actual high half rounding; only the odd 32-bit lanes survive the shuffle below
   4947     sub = _mm_shuffle_epi32(sub,  1 | (3 << 2) | (0 << 4) | (2 << 6));
   4948     return64(sub);
   4949 }
   4950 
   4951 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
   4952 _NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0
   4953 {
   4954     uint8x8_t res64;
   4955     __m128i sub, mask1;
   4956     sub = _mm_sub_epi16 (a, b);
    4957     mask1 = _mm_slli_epi16(sub, 8); //shift left then back right to
    4958     mask1 = _mm_srli_epi16(mask1, 15); //get bit 7 (the rounding bit), 1 or zero
   4959     sub = _mm_srai_epi16 (sub, 8); //get high half
   4960     sub = _mm_add_epi16 (sub, mask1); //actual rounding
   4961     sub = _mm_packus_epi16 (sub, sub);
   4962     return64(sub);
   4963 }
   4964 
   4965 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
   4966 _NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0
   4967 {
    4968     //SIMD may not be optimal, serial may be faster
   4969     uint16x4_t res64;
   4970     __m128i sub, mask1;
   4971     sub = _mm_sub_epi32 (a, b);
    4972     mask1 = _mm_slli_epi32(sub, 16); //shift left then back right to
    4973     mask1 = _mm_srli_epi32(mask1,31); //get bit 15 (the rounding bit), 1 or zero
   4974     sub = _mm_srai_epi32 (sub, 16); //get high half
   4975     sub = _mm_add_epi32 (sub, mask1); //actual rounding
   4976 #ifdef USE_SSE4
   4977     sub =  _MM_PACKUS1_EPI32 (sub);
   4978 #else
   4979     sub = _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits
   4980 #endif
   4981     return64(sub);
   4982 }
   4983 
   4984 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
   4985 #define vrsubhn_u64 vrsubhn_s64
   4986 
   4987 //*********** Vector saturating doubling multiply subtract long ********************
   4988 //************************************************************************************
   4989 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
   4990 _NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c)
   4991 {
    4992     //not an optimal SIMD solution, serial may be faster
   4993     __m128i res32, mask;
   4994     int32x4_t res;
   4995     _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   4996     res = vmull_s16(b,  c);
   4997     res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered
   4998     mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask);
   4999     res32 = _mm_xor_si128 (res32,  mask); //res32 saturated for 0x80000000
   5000     return vqsubq_s32(a, res32); //saturation
   5001 }
   5002 
   5003 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
   5004 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   5005 {
   5006     __m128i res64, mask;
   5007     int64x2_t res;
   5008     _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000};
   5009     res = vmull_s32(b,  c);
   5010     res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered
   5011     mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask);
    5012     res64 = _mm_xor_si128 (res64,  mask); //res64 saturated for 0x8000000000000000
   5013     return vqsubq_s64(a, res64); //saturation
   5014 }
   5015 
   5016 //******************  COMPARISON ***************************************
   5017 //******************* Vector compare equal *************************************
   5018 //****************************************************************************
   5019 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
    5020 _NEON2SSE_INLINE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b)
    5021 {
    5022     uint8x8_t res64;
   5023     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
   5024 }
   5025 
   5026 
   5027 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
    5028 _NEON2SSE_INLINE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b)
    5029 {
    5030     uint16x4_t res64;
   5031     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
   5032 }
   5033 
   5034 
   5035 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
    5036 _NEON2SSE_INLINE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b)
    5037 {
    5038     uint32x2_t res64;
   5039     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
   5040 }
   5041 
   5042 
   5043 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
   5044 _NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b)
   5045 {
   5046     uint32x2_t res64;
   5047     __m128 res;
   5048     res = _mm_cmpeq_ps(_pM128(a), _pM128(b) );
   5049     return64f(res);
   5050 }
   5051 
   5052 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
   5053 _NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b)
   5054 {
   5055     uint8x8_t res64;
   5056     return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b)));
   5057 }
   5058 
   5059 
   5060 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
   5061 _NEON2SSE_INLINE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b)
   5062 {
   5063     uint16x4_t res64;
   5064     return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b)));
   5065 }
   5066 
   5067 
   5068 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
   5069 _NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b)
   5070 {
   5071     uint32x2_t res64;
   5072     return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b)));
   5073 }
   5074 
   5075 
   5076 _NEON2SSESTORAGE uint8x8_t   vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
   5077 #define vceq_p8 vceq_u8
   5078 
   5079 
   5080 _NEON2SSESTORAGE uint8x16_t   vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
   5081 #define vceqq_s8 _mm_cmpeq_epi8
   5082 
   5083 _NEON2SSESTORAGE uint16x8_t   vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
   5084 #define vceqq_s16 _mm_cmpeq_epi16
   5085 
   5086 _NEON2SSESTORAGE uint32x4_t   vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
   5087 #define vceqq_s32 _mm_cmpeq_epi32
   5088 
   5089 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
   5090 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b)
   5091 {
   5092     __m128 res;
   5093     res = _mm_cmpeq_ps(a,b);
   5094     return _M128i(res);
   5095 }
   5096 
   5097 _NEON2SSESTORAGE uint8x16_t   vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
   5098 #define vceqq_u8 _mm_cmpeq_epi8
   5099 
   5100 _NEON2SSESTORAGE uint16x8_t   vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
   5101 #define vceqq_u16 _mm_cmpeq_epi16
   5102 
   5103 _NEON2SSESTORAGE uint32x4_t   vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
   5104 #define vceqq_u32 _mm_cmpeq_epi32
   5105 
   5106 _NEON2SSESTORAGE uint8x16_t   vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
   5107 #define vceqq_p8 _mm_cmpeq_epi8
   5108 
   5109 //******************Vector compare greater-than or equal*************************
   5110 //*******************************************************************************
   5111 //IA32 SIMD has no greater-than-or-equal comparison for integers,
   5112 //only greater-than is available, so we need the following tricks
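        //A sketch of the identities the emulation below relies on (illustrative, not exhaustive):
        //    signed:          a >= b   <=>   (a > b) | (a == b)
        //    unsigned, SSE4:  a >= b   <=>   max_u(a, b) == a           (_mm_max_epu8/16/32)
        //    unsigned, SSE2:  bias both operands by the sign bit (e.g. subtract 0x8000 for 16-bit lanes);
        //                     the signed compare of the biased values then gives the unsigned ordering.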
   5113 
   5114 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
   5115 _NEON2SSE_INLINE uint8x8_t vcge_s8(int8x8_t a,  int8x8_t b)
   5116 {
   5117     uint8x8_t res64;
   5118     return64(vcgeq_s8(_pM128i(a), _pM128i(b)));
   5119 }
   5120 
   5121 
   5122 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
   5123 _NEON2SSE_INLINE uint16x4_t vcge_s16(int16x4_t a,  int16x4_t b)
   5124 {
   5125     uint16x4_t res64;
   5126     return64(vcgeq_s16(_pM128i(a), _pM128i(b)));
   5127 }
   5128 
   5129 
   5130 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
   5131 _NEON2SSE_INLINE uint32x2_t vcge_s32(int32x2_t a,  int32x2_t b)
   5132 {
   5133     uint32x2_t res64;
   5134     return64(vcgeq_s32(_pM128i(a), _pM128i(b)));
   5135 }
   5136 
   5137 
   5138 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
   5139 _NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b)
   5140 {
   5141     uint32x2_t res64;
   5142     __m128 res;
   5143     res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //only the lowest 2 entries are actually used
   5144     return64f(res);
   5145 }
   5146 
   5147 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
   5148 _NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a,  uint8x8_t b)
   5149 {
   5150     uint8x8_t res64;
   5151     return64(vcgeq_u8(_pM128i(a), _pM128i(b)));
   5152 }
   5153 
   5154 
   5155 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
   5156 _NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a,  uint16x4_t b)
   5157 {
   5158     uint16x4_t res64;
   5159     return64(vcgeq_u16(_pM128i(a), _pM128i(b)));
   5160 }
   5161 
   5162 
   5163 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
   5164 _NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a,  uint32x2_t b)
   5165 {
   5166     //serial solution looks faster
   5167     uint32x2_t res64;
   5168     return64(vcgeq_u32 (_pM128i(a), _pM128i(b)));
   5169 }
   5170 
   5171 
   5172 
   5173 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
   5174 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
   5175 {
   5176     __m128i m1, m2;
   5177     m1 = _mm_cmpgt_epi8 ( a, b);
   5178     m2 = _mm_cmpeq_epi8 ( a, b);
   5179     return _mm_or_si128  ( m1, m2);
   5180 }
   5181 
   5182 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
   5183 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
   5184 {
   5185     __m128i m1, m2;
   5186     m1 = _mm_cmpgt_epi16 ( a, b);
   5187     m2 = _mm_cmpeq_epi16 ( a, b);
   5188     return _mm_or_si128   ( m1,m2);
   5189 }
   5190 
   5191 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
   5192 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
   5193 {
   5194     __m128i m1, m2;
   5195     m1 = _mm_cmpgt_epi32 (a, b);
   5196     m2 = _mm_cmpeq_epi32 (a, b);
   5197     return _mm_or_si128   (m1, m2);
   5198 }
   5199 
   5200 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
   5201 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b)
   5202 {
   5203     __m128 res;
   5204     res = _mm_cmpge_ps(a,b);
   5205     return *(__m128i*)&res;
   5206 }
   5207 
   5208 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
   5209 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
   5210 {
   5211     //no unsigned byte comparison in SSE2, only signed is available, so we need the trick
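            //_mm_max_epu8(a, b) equals a exactly in the lanes where a >= b (unsigned), so comparing the max with a yields the a >= b mask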
   5212     __m128i cmp;
   5213     cmp = _mm_max_epu8(a, b);
   5214     return _mm_cmpeq_epi8(cmp, a); //a>=b
   5215 }
   5216 
   5217 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
   5218 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
   5219 {
   5220     //no unsigned short comparison in SSE2, only signed is available, so we need the trick
   5221 #ifdef USE_SSE4
   5222     __m128i cmp;
   5223     cmp = _mm_max_epu16(a, b);
   5224     return _mm_cmpeq_epi16(cmp, a); //a>=b
   5225 #else
   5226     //a >= b exactly when the unsigned saturating difference b - a is zero
   5227     __m128i as;
   5228     __m128i zero = _mm_setzero_si128();
   5229     as = _mm_subs_epu16(b,a);
   5230     return _mm_cmpeq_epi16(as, zero);
   5232 #endif
   5233 }
   5234 
   5235 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
   5236 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
   5237 {
   5238     //no unsigned int comparison in SSE2, only signed is available, so we need the trick
   5239 #ifdef USE_SSE4
   5240     __m128i cmp;
   5241     cmp = _mm_max_epu32(a, b);
   5242     return _mm_cmpeq_epi32(cmp, a); //a>=b
   5243 #else
   5244     //serial solution may be faster
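            //subtracting 0x80000000 from both operands flips the sign bit and maps the unsigned ordering onto the signed one, so the signed compares below give the unsigned result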
   5245     __m128i c80000000, as, bs, m1, m2;
   5246     c80000000 = _mm_set1_epi32 (0x80000000);
   5247     as = _mm_sub_epi32(a,c80000000);
   5248     bs = _mm_sub_epi32(b,c80000000);
   5249     m1 = _mm_cmpgt_epi32 (as, bs);
   5250     m2 = _mm_cmpeq_epi32 (as, bs);
   5251     return _mm_or_si128 ( m1,  m2);
   5252 #endif
   5253 }
   5254 
   5255 //**********************Vector compare less-than or equal******************************
   5256 //***************************************************************************************
   5257 //IA32 SIMD has no less-than-or-equal comparison for integers either, so we need the tricks below
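        //The pattern used below (a sketch): for signed types a <= b is computed as NOT(a > b),
        //and for unsigned types either as min_u(a, b) == a (SSE4.1) or by swapping the arguments of the >= emulation.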
   5258 
   5259 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b); // VCGE.S8 d0, d0, d0
   5260 _NEON2SSE_INLINE uint8x8_t vcle_s8(int8x8_t a,  int8x8_t b)
   5261 {
   5262     uint8x8_t res64;
   5263     return64(vcleq_s8(_pM128i(a), _pM128i(b)));
   5264 }
   5265 
   5266 
   5267 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b); // VCGE.S16 d0, d0, d0
   5268 _NEON2SSE_INLINE uint16x4_t vcle_s16(int16x4_t a,  int16x4_t b)
   5269 {
   5270     uint16x4_t res64;
   5271     return64(vcleq_s16(_pM128i(a), _pM128i(b)));
   5272 }
   5273 
   5274 
   5275 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b); // VCGE.S32 d0, d0, d0
   5276 _NEON2SSE_INLINE uint32x2_t vcle_s32(int32x2_t a,  int32x2_t b)
   5277 {
   5278     uint32x2_t res64;
   5279     return64(vcleq_s32(_pM128i(a), _pM128i(b)));
   5280 }
   5281 
   5282 
   5283 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
   5284 _NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b)
   5285 {
   5286     uint32x2_t res64;
   5287     __m128 res;
   5288     res = _mm_cmple_ps(_pM128(a),_pM128(b));
   5289     return64f(res);
   5290 }
   5291 
   5292 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a,  uint8x8_t b); // VCGE.U8 d0, d0, d0
   5293 #define vcle_u8(a,b) vcge_u8(b,a)
   5294 
   5295 
   5296 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a,  uint16x4_t b); // VCGE.s16 d0, d0, d0
   5297 #define vcle_u16(a,b) vcge_u16(b,a)
   5298 
   5299 
   5300 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a,  uint32x2_t b); // VCGE.U32 d0, d0, d0
   5301 #define vcle_u32(a,b) vcge_u32(b,a)
   5302 
   5303 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
   5304 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0
   5305 {
   5306     __m128i c1, res;
   5307     c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff....
   5308     res = _mm_cmpgt_epi8 ( a,  b);
   5309     return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal
   5310 }
   5311 
   5312 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
   5313 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0
   5314 {
   5315     __m128i c1, res;
   5316     c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff....
   5317     res = _mm_cmpgt_epi16 ( a,  b);
   5318     return _mm_andnot_si128 (res, c1);
   5319 }
   5320 
   5321 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
   5322 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0
   5323 {
   5324     __m128i c1, res;
   5325     c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff....
   5326     res = _mm_cmpgt_epi32 ( a,  b);
   5327     return _mm_andnot_si128 (res, c1);
   5328 }
   5329 
   5330 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
   5331 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b)
   5332 {
   5333     __m128 res;
   5334     res = _mm_cmple_ps(a,b);
   5335     return *(__m128i*)&res;
   5336 }
   5337 
   5338 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
   5339 #ifdef USE_SSE4
   5340     _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0
   5341     {
   5342         //no unsigned chars comparison in SSE, only signed available,so need the trick
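                //_mm_min_epu8(a, b) equals a exactly in the lanes where a <= b (unsigned), so comparing the min with a yields the a <= b mask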
   5343         __m128i cmp;
   5344         cmp = _mm_min_epu8(a, b);
   5345         return _mm_cmpeq_epi8(cmp, a); //a<=b
   5346     }
   5347 #else
   5348 #   define vcleq_u8(a,b) vcgeq_u8(b,a)
   5349 #endif
   5350 
   5351 
   5352 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0
   5353 #ifdef USE_SSE4
   5354     _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0
   5355     {
   5356         //no unsigned shorts comparison in SSE, only signed available,so need the trick
   5357         __m128i cmp;
   5358         cmp = _mm_min_epu16(a, b);
   5359         return _mm_cmpeq_epi16(cmp, a); //a<=b
   5360     }
   5361 #else
   5362 #   define vcleq_u16(a,b) vcgeq_u16(b,a)
   5363 #endif
   5364 
   5365 
   5366 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
   5367 #ifdef USE_SSE4
   5368     _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0
   5369     {
   5370         //no unsigned int comparison in SSE, only signed is available, so we need the trick
   5371         __m128i cmp;
   5372         cmp = _mm_min_epu32(a, b);
   5373         return _mm_cmpeq_epi32(cmp, a); //a<=b
   5374     }
   5375 #else
   5376 //this solution may not be optimal compared with a serial one
   5377 #   define vcleq_u32(a,b) vcgeq_u32(b,a)
   5378 #endif
   5379 
   5380 
   5381 //****** Vector compare greater-than ******************************************
   5382 //**************************************************************************
   5383 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
   5384 _NEON2SSE_INLINE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b)
   5385 {
   5386     uint8x8_t res64;
   5387     return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b)));
   5388 }
   5389 
   5390 
   5391 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
   5392 _NEON2SSE_INLINE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b)
   5393 {
   5394     uint16x4_t res64;
   5395     return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b)));
   5396 }
   5397 
   5398 
   5399 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
   5400 _NEON2SSE_INLINE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b)
   5401 {
   5402     uint32x2_t res64;
   5403     return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b)));
   5404 }
   5405 
   5406 
   5407 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
   5408 _NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b)
   5409 {
   5410     uint32x2_t res64;
   5411     __m128 res;
   5412     res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //only the lowest 2 entries are actually used
   5413     return64f(res);
   5414 }
   5415 
   5416 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b); // VCGT.U8 d0, d0, d0
   5417 _NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a,  uint8x8_t b)
   5418 {
   5419     uint8x8_t res64;
   5420     return64(vcgtq_u8(_pM128i(a), _pM128i(b)));
   5421 }
   5422 
   5423 
   5424 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b); // VCGT.s16 d0, d0, d0
   5425 _NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a,  uint16x4_t b)
   5426 {
   5427     uint16x4_t res64;
   5428     return64(vcgtq_u16(_pM128i(a), _pM128i(b)));
   5429 }
   5430 
   5431 
   5432 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b); // VCGT.U32 d0, d0, d0
   5433 _NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a,  uint32x2_t b)
   5434 {
   5435     uint32x2_t res64;
   5436     return64(vcgtq_u32(_pM128i(a), _pM128i(b)));
   5437 }
   5438 
   5439 
   5440 _NEON2SSESTORAGE uint8x16_t   vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
   5441 #define vcgtq_s8 _mm_cmpgt_epi8
   5442 
   5443 _NEON2SSESTORAGE uint16x8_t   vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
   5444 #define vcgtq_s16 _mm_cmpgt_epi16
   5445 
   5446 _NEON2SSESTORAGE uint32x4_t   vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
   5447 #define vcgtq_s32 _mm_cmpgt_epi32
   5448 
   5449 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
   5450 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b)
   5451 {
   5452     __m128 res;
   5453     res = _mm_cmpgt_ps(a,b);
   5454     return *(__m128i*)&res;
   5455 }
   5456 
   5457 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
   5458 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0
   5459 {
   5460     //no unsigned byte comparison in SSE2: bias both arguments by 0x80 so that the signed compare gives the unsigned ordering
   5461     __m128i c128 = _mm_set1_epi8((int8_t)0x80);
   5462     __m128i as = _mm_sub_epi8(a, c128);
   5463     __m128i bs = _mm_sub_epi8(b, c128);
   5464     return _mm_cmpgt_epi8(as, bs);
   5465 }
   5466 
   5467 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
   5468 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0
   5469 {
   5470     //no unsigned short comparison in SSE2: bias both arguments by 0x8000 so that the signed compare gives the unsigned ordering
   5471     __m128i c8000 = _mm_set1_epi16((int16_t)0x8000);
   5472     __m128i as = _mm_sub_epi16(a, c8000);
   5473     __m128i bs = _mm_sub_epi16(b, c8000);
   5474     return _mm_cmpgt_epi16(as, bs);
   5475 }
   5476 
   5477 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
   5478 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0
   5479 {
   5480     //no unsigned int comparison in SSE2: bias both arguments by 0x80000000 so that the signed compare gives the unsigned ordering
   5481     __m128i c80000000, as, bs;
   5482     c80000000 = _mm_set1_epi32 (0x80000000);
   5483     as = _mm_sub_epi32(a,c80000000);
   5484     bs = _mm_sub_epi32(b,c80000000);
   5485     return _mm_cmpgt_epi32 ( as, bs);
   5486 }
   5487 
   5488 //********************* Vector compare less-than **************************
   5489 //*************************************************************************
   5490 _NEON2SSESTORAGE uint8x8_t   vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
   5491 #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!!
   5492 
   5493 
   5494 _NEON2SSESTORAGE uint16x4_t   vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
   5495 #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!!
   5496 
   5497 
   5498 _NEON2SSESTORAGE uint32x2_t   vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
   5499 #define vclt_s32(a,b)  vcgt_s32(b,a) //swap the arguments!!
   5500 
   5501 
   5502 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
   5503 #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!!
   5504 
   5505 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
   5506 #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!!
   5507 
   5508 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0
   5509 #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!!
   5510 
   5511 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
   5512 #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!!
   5513 
   5514 _NEON2SSESTORAGE uint8x16_t   vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
   5515 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!!
   5516 
   5517 _NEON2SSESTORAGE uint16x8_t   vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
   5518 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!!
   5519 
   5520 _NEON2SSESTORAGE uint32x4_t   vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
   5521 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!!
   5522 
   5523 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
   5524 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!!
   5525 
   5526 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
   5527 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!!
   5528 
   5529 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0
   5530 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!!
   5531 
   5532 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
   5533 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!!
   5534 
   5535 //*****************Vector compare absolute greater-than or equal ************
   5536 //***************************************************************************
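        //Implemented by clearing the IEEE-754 sign bit (AND with 0x7fffffff), which gives |a| and |b|,
        //and then reusing the ordinary _mm_cmpge_ps / _mm_cmpgt_ps / _mm_cmple_ps / _mm_cmplt_ps comparisons.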
   5537 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
   5538 _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b)
   5539 {
   5540     uint32x2_t res64;
   5541     __m128i c7fffffff;
   5542     __m128 a0, b0;
   5543     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5544     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5545     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5546     a0 = _mm_cmpge_ps ( a0, b0);
   5547     return64f(a0);
   5548 }
   5549 
   5550 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
   5551 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
   5552 {
   5553     __m128i c7fffffff;
   5554     __m128 a0, b0;
   5555     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5556     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5557     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5558     a0 = _mm_cmpge_ps ( a0, b0);
   5559     return (*(__m128i*)&a0);
   5560 }
   5561 
   5562 //********Vector compare absolute less-than or equal ******************
   5563 //********************************************************************
   5564 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
   5565 _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b)
   5566 {
   5567     uint32x2_t res64;
   5568     __m128i c7fffffff;
   5569     __m128 a0, b0;
   5570     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5571     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5572     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5573     a0 = _mm_cmple_ps (a0, b0);
   5574     return64f(a0);
   5575 }
   5576 
   5577 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
   5578 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0
   5579 {
   5580     __m128i c7fffffff;
   5581     __m128 a0, b0;
   5582     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5583     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5584     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5585     a0 = _mm_cmple_ps (a0, b0);
   5586     return (*(__m128i*)&a0);
   5587 }
   5588 
   5589 //********  Vector compare absolute greater-than    ******************
   5590 //******************************************************************
   5591 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
   5592 _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b)
   5593 {
   5594     uint32x2_t res64;
   5595     __m128i c7fffffff;
   5596     __m128 a0, b0;
   5597     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5598     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5599     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5600     a0 = _mm_cmpgt_ps (a0, b0);
   5601     return64f(a0);
   5602 }
   5603 
   5604 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
   5605 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
   5606 {
   5607     __m128i c7fffffff;
   5608     __m128 a0, b0;
   5609     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5610     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5611     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5612     a0 = _mm_cmpgt_ps (a0, b0);
   5613     return (*(__m128i*)&a0);
   5614 }
   5615 
   5616 //***************Vector compare absolute less-than  ***********************
   5617 //*************************************************************************
   5618 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
   5619 _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b)
   5620 {
   5621     uint32x2_t res64;
   5622     __m128i c7fffffff;
   5623     __m128 a0, b0;
   5624     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5625     a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff);
   5626     b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff);
   5627     a0 = _mm_cmplt_ps (a0, b0);
   5628     return64f(a0);
   5629 }
   5630 
   5631 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
   5632 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0
   5633 {
   5634     __m128i c7fffffff;
   5635     __m128 a0, b0;
   5636     c7fffffff = _mm_set1_epi32 (0x7fffffff);
   5637     a0 = _mm_and_ps (a, *(__m128*)&c7fffffff);
   5638     b0 = _mm_and_ps (b, *(__m128*)&c7fffffff);
   5639     a0 = _mm_cmplt_ps (a0, b0);
   5640     return (*(__m128i*)&a0);
   5641 }
   5642 
   5643 //*************************Vector test bits************************************
   5644 //*****************************************************************************
   5645 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them
   5646 with the corresponding element of a second vector. If the result is not zero, the
   5647 corresponding element in the destination vector is set to all ones. Otherwise, it is set to
   5648 all zeros. */
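        //For example, for one 8-bit lane: a = 0x12, b = 0x24 gives a & b = 0x00, so the result lane is 0x00;
        //a = 0x12, b = 0x32 gives a & b = 0x12 (non-zero), so the result lane is 0xff.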
   5649 
   5650 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b); // VTST.8 d0, d0, d0
   5651 _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a,  int8x8_t b)
   5652 {
   5653     int8x8_t res64;
   5654     return64(vtstq_s8(_pM128i(a), _pM128i(b)));
   5655 }
   5656 
   5657 
   5658 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b); // VTST.16 d0, d0, d0
   5659 _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a,  int16x4_t b)
   5660 {
   5661     int16x4_t res64;
   5662     return64(vtstq_s16(_pM128i(a), _pM128i(b)));
   5663 }
   5664 
   5665 
   5666 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b); // VTST.32 d0, d0, d0
   5667 _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a,  int32x2_t b)
   5668 {
   5669     int32x2_t res64;
   5670     return64(vtstq_s32(_pM128i(a), _pM128i(b)));
   5671 }
   5672 
   5673 
   5674 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a,  uint8x8_t b); // VTST.8 d0, d0, d0
   5675 #define vtst_u8 vtst_s8
   5676 
   5677 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a,  uint16x4_t b); // VTST.16 d0, d0, d0
   5678 #define vtst_u16 vtst_s16
   5679 
   5680 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a,  uint32x2_t b); // VTST.32 d0, d0, d0
   5681 #define vtst_u32 vtst_s32
   5682 
   5683 
   5684 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
   5685 #define vtst_p8 vtst_u8
   5686 
   5687 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
   5688 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0
   5689 {
   5690     __m128i zero, one, res;
   5691     zero = _mm_setzero_si128 ();
   5692     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
   5693     res = _mm_and_si128 (a, b);
   5694     res =  _mm_cmpeq_epi8 (res, zero);
   5695     return _mm_xor_si128(res, one); //invert result
   5696 }
   5697 
   5698 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
   5699 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0
   5700 {
   5701     __m128i zero, one, res;
   5702     zero = _mm_setzero_si128 ();
   5703     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
   5704     res = _mm_and_si128 (a, b);
   5705     res =  _mm_cmpeq_epi16 (res, zero);
   5706     return _mm_xor_si128(res, one); //invert result
   5707 }
   5708 
   5709 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
   5710 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0
   5711 {
   5712     __m128i zero, one, res;
   5713     zero = _mm_setzero_si128 ();
   5714     one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff
   5715     res = _mm_and_si128 (a, b);
   5716     res =  _mm_cmpeq_epi32 (res, zero);
   5717     return _mm_xor_si128(res, one); //invert result
   5718 }
   5719 
   5720 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
   5721 #define vtstq_u8 vtstq_s8
   5722 
   5723 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
   5724 #define vtstq_u16 vtstq_s16
   5725 
   5726 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
   5727 #define vtstq_u32 vtstq_s32
   5728 
   5729 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
   5730 #define vtstq_p8 vtstq_u8
   5731 
   5732 //****************** Absolute difference ********************
   5733 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |*****
   5734 //************************************************************
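        //Note: for signed elements the intermediate a - b may wrap (e.g. INT8_MIN - INT8_MAX), so taking abs()
        //of a single difference is not always correct; the signed vabd(q)_* versions below therefore compute both
        //a - b and b - a and select one of them by the a > b mask, which matches NEON's modulo-2^n behaviour.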
   5735 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b); // VABD.S8 d0,d0,d0
   5736 _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a,  int8x8_t b)
   5737 {
   5738     int8x8_t res64;
   5739     return64(vabdq_s8(_pM128i(a), _pM128i(b)));
   5740 }
   5741 
   5742 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b); // VABD.S16 d0,d0,d0
   5743 _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a,  int16x4_t b)
   5744 {
   5745     int16x4_t res64;
   5746     return64(vabdq_s16(_pM128i(a), _pM128i(b)));
   5747 }
   5748 
   5749 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b); // VABD.S32 d0,d0,d0
   5750 _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a,  int32x2_t b)
   5751 {//need to deal with an intermediate overflow
   5752     int32x2_t res;
   5753     res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] -  b.m64_i32[0]: b.m64_i32[0] -  a.m64_i32[0];
   5754     res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] -  b.m64_i32[1]: b.m64_i32[1] -  a.m64_i32[1];
   5755     return res;
   5756 }
   5757 
   5758 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b); // VABD.U8 d0,d0,d0
   5759 _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a,  uint8x8_t b)
   5760 {
   5761     uint8x8_t res64;
   5762     return64(vabdq_u8(_pM128i(a), _pM128i(b)));
   5763 }
   5764 
   5765 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b); // VABD.s16 d0,d0,d0
   5766 _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a,  uint16x4_t b)
   5767 {
   5768     uint16x4_t res64;
   5769     return64(vabdq_u16(_pM128i(a), _pM128i(b)));
   5770 }
   5771 
   5772 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b); // VABD.U32 d0,d0,d0
   5773 _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a,  uint32x2_t b)
   5774 {
   5775     uint32x2_t res64;
   5776     return64(vabdq_u32(_pM128i(a), _pM128i(b)));
   5777 }
   5778 
   5779 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
   5780 _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b)
   5781 {
   5782     float32x4_t res;
   5783     __m64_128 res64;
   5784     res = vabdq_f32(_pM128(a), _pM128(b));
   5785     _M64f(res64, res);
   5786     return res64;
   5787 }
   5788 
   5789 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
   5790 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0
   5791 { //need to deal with an intermediate overflow
   5792    __m128i cmp, difab, difba;
   5793    cmp = vcgtq_s8(a,b);
   5794    difab = _mm_sub_epi8(a,b);
   5795    difba = _mm_sub_epi8(b,a);
   5796    difab = _mm_and_si128(cmp, difab);
   5797    difba = _mm_andnot_si128(cmp, difba);
   5798    return _mm_or_si128(difab, difba);
   5799 }
   5800 
   5801 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
   5802 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0
   5803 {//need to deal with an intermediate overflow
   5804     __m128i cmp, difab, difba;
   5805     cmp = vcgtq_s16(a,b);
   5806     difab = _mm_sub_epi16(a,b);
   5807     difba = _mm_sub_epi16 (b,a);
   5808     difab = _mm_and_si128(cmp, difab);
   5809     difba = _mm_andnot_si128(cmp, difba);
   5810     return _mm_or_si128(difab, difba);
   5811 }
   5812 
   5813 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
   5814 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0
   5815 {//need to deal with an intermediate overflow
   5816     __m128i cmp, difab, difba;
   5817     cmp = vcgtq_s32(a,b);
   5818     difab = _mm_sub_epi32(a,b);
   5819     difba = _mm_sub_epi32(b,a);
   5820     difab = _mm_and_si128(cmp, difab);
   5821     difba = _mm_andnot_si128(cmp, difba);
   5822     return _mm_or_si128(difab, difba);
   5823 }
   5824 
   5825 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
   5826 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned
   5827 {
   5828     __m128i  difab, difba;
   5829     difab = _mm_subs_epu8(a,b);
   5830     difba = _mm_subs_epu8 (b,a);
   5831     return _mm_or_si128(difab, difba);
   5832 }
   5833 
   5834 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0
   5835 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b)
   5836 {
   5837     __m128i difab, difba;
   5838     difab = _mm_subs_epu16(a,b);
   5839     difba = _mm_subs_epu16 (b,a);
   5840     return _mm_or_si128(difab, difba);
   5841 }
   5842 
   5843 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
   5844 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b)
   5845 {
   5846     __m128i cmp, difab, difba;
   5847     cmp = vcgtq_u32(a,b);
   5848     difab = _mm_sub_epi32(a,b);
   5849     difba = _mm_sub_epi32 (b,a);
   5850     difab = _mm_and_si128(cmp, difab);
   5851     difba = _mm_andnot_si128(cmp, difba);
   5852     return _mm_or_si128(difab, difba);
   5853 }
   5854 
   5855 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
   5856 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0
   5857 {
   5858     __m128i c1;
   5859     __m128 res;
   5860     c1 =  _mm_set1_epi32(0x7fffffff);
   5861     res = _mm_sub_ps (a, b);
   5862     return _mm_and_ps (res, *(__m128*)&c1);
   5863 }
   5864 
   5865 //************  Absolute difference - long **************************
   5866 //********************************************************************
   5867 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
   5868 _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0
   5869 {
   5870     __m128i a16, b16;
   5871     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1,
   5872     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   5873     return vabdq_s16(a16, b16);
   5874 
   5875 }
   5876 
   5877 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
   5878 _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0
   5879 {
   5880     __m128i a32, b32;
   5881     a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1
   5882     b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1,
   5883     return vabdq_s32(a32, b32);
   5884 }
   5885 
   5886 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
   5887 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
   5888 {
   5889     //no optimal SIMD solution, serial looks faster
   5890     _NEON2SSE_ALIGN_16 int64_t res[2];
   5891     if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0];
   5892     else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0];
   5893     if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1];
   5894     else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1];
   5895     return _mm_load_si128((__m128i*)res);
   5896 }
   5897 
   5898 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
   5899 _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b)
   5900 {
   5901     __m128i res;
   5902     res = vsubl_u8(a,b);
   5903     return _mm_abs_epi16(res);
   5904 }
   5905 
   5906 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0
   5907 _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b)
   5908 {
   5909     __m128i res;
   5910     res = vsubl_u16(a,b);
   5911     return _mm_abs_epi32(res);
   5912 }
   5913 
   5914 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
   5915 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   5916 {
   5917     _NEON2SSE_ALIGN_16 uint64_t res[2];
   5918     if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0];
   5919     else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0];
   5920     if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1];
   5921     else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1];
   5922     return _mm_load_si128((__m128i*)res);
   5923 }
   5924 
   5925 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | *************
   5926 //*********************************************************************************************
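        //A typical use (hypothetical caller code): accumulating a sum of absolute differences row by row,
        //    acc = vabaq_u8(acc, row_a, row_b);   // acc[i] += |row_a[i] - row_b[i]|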
   5927 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
   5928 _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a,  int8x8_t b, int8x8_t c)
   5929 {
   5930     int8x8_t res64;
   5931     return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c)));
   5932 }
   5933 
   5934 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
   5935 _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a,  int16x4_t b, int16x4_t c)
   5936 {
   5937     int16x4_t res64;
   5938     return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c)));
   5939 }
   5940 
   5941 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
   5942 _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a,  int32x2_t b, int32x2_t c)
   5943 {
   5944     int32x2_t res64;
   5945     return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c)));
   5946 }
   5947 
   5948 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
   5949 _NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c)
   5950 {
   5951     int8x8_t res64;
   5952     return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c)));
   5953 }
   5954 
   5955 
   5956 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0
   5957 _NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c)
   5958 {
   5959     int16x4_t res64;
   5960     return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c)));
   5961 }
   5962 
   5963 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
   5964 _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c)
   5965 {
   5966     uint32x2_t res64;
   5967     return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c)));
   5968 }
   5969 
   5970 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
   5971 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0
   5972 {
   5973     int8x16_t sub;
   5974     sub = vabdq_s8(b, c);
   5975     return vaddq_s8( a, sub);
   5976 }
   5977 
   5978 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
   5979 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0
   5980 {
   5981     int16x8_t sub;
   5982     sub = vabdq_s16(b, c);
   5983     return vaddq_s16( a, sub);
   5984 }
   5985 
   5986 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
   5987 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0
   5988 {
   5989     int32x4_t sub;
   5990     sub = vabdq_s32(b, c);
   5991     return vaddq_s32( a, sub);
   5992 }
   5993 
   5994 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
   5995 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c)
   5996 {
   5997     uint8x16_t sub;
   5998     sub = vabdq_u8(b, c);
   5999     return vaddq_u8( a, sub);
   6000 }
   6001 
   6002 _NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0
   6003 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c)
   6004 {
   6005     uint16x8_t sub;
   6006     sub = vabdq_u16(b, c);
   6007     return vaddq_u16( a, sub);
   6008 }
   6009 
   6010 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
   6011 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c)
   6012 {
   6013     uint32x4_t sub;
   6014     sub = vabdq_u32(b, c);
   6015     return vaddq_u32( a, sub);
   6016 }
   6017 
   6018 //************** Absolute difference and accumulate - long ********************************
   6019 //*************************************************************************************
   6020 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
   6021 _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0
   6022 {
   6023     __m128i b16, c16, res;
   6024     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1,
   6025     c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1,
   6026     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
   6027     return _mm_add_epi16 (a, res);
   6028 }
   6029 
   6030 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
   6031 _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0
   6032 {
   6033     __m128i b32, c32, res;
   6034     b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1
   6035     c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1
   6036     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
   6037     return _mm_add_epi32 (a, res);
   6038 }
   6039 
   6040 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
   6041 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   6042 {
   6043     __m128i res;
   6044     res = vabdl_s32(b,c);
   6045     return _mm_add_epi64(a, res);
   6046 }
   6047 
   6048 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
   6049 _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c)
   6050 {
   6051     __m128i b16, c16, res;
   6052     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1,
   6053     c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1,
   6054     res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) );
   6055     return _mm_add_epi16 (a, res);
   6056 }
   6057 
   6058 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0
   6059 _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c)
   6060 {
   6061     __m128i b32, c32, res;
   6062     b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1
   6063     c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1
   6064     res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) );
   6065     return _mm_add_epi32 (a, res);
   6066 }
   6067 
   6068 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
   6069 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   6070 {
   6071     __m128i res;
   6072     res = vabdl_u32(b,c);
   6073     return _mm_add_epi64(a, res);
   6074 }
   6075 
   6076 //***********************************************************************************
   6077 //****************  Maximum and minimum operations **********************************
   6078 //***********************************************************************************
   6079 //************* Maximum:  vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]    *******
   6080 //***********************************************************************************
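        //For example: Va = {1, 5, 3, 7}, Vb = {2, 4, 6, 0}  ->  vmax result {2, 5, 6, 7}.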
   6081 _NEON2SSESTORAGE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
   6082 _NEON2SSE_INLINE int8x8_t   vmax_s8(int8x8_t a, int8x8_t b)
   6083 {
   6084     int8x8_t res64;
   6085     __m128i res;
   6086     res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6087     return64(res);
   6088 }
   6089 
   6090 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
   6091 _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b)
   6092 {
   6093     int16x4_t res64;
   6094     return64(_mm_max_epi16(_pM128i(a),_pM128i(b)));
   6095 }
   6096 
   6097 _NEON2SSESTORAGE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
   6098 _NEON2SSE_INLINE int32x2_t   vmax_s32(int32x2_t a, int32x2_t b)
   6099 {
   6100     int32x2_t res64;
   6101     __m128i res;
   6102     res =  _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6103     return64(res);
   6104 }
   6105 
   6106 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
   6107 _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b)
   6108 {
   6109     uint8x8_t res64;
   6110     return64(_mm_max_epu8(_pM128i(a),_pM128i(b)));
   6111 }
   6112 
   6113 
   6114 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0
   6115 _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b)
   6116 {
   6117     uint16x4_t res64;
   6118     return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b)));
   6119 }
   6120 
   6121 
   6122 _NEON2SSESTORAGE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
   6123 _NEON2SSE_INLINE uint32x2_t   vmax_u32(uint32x2_t a, uint32x2_t b)
   6124 {
   6125     uint32x2_t res64;
   6126     __m128i res;
   6127     res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may be less efficient than a serial version
   6128     return64(res);
   6129 }
   6130 
   6131 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
   6132 _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b)
   6133 {
   6134     //the serial solution looks faster than the SIMD one
   6135     float32x2_t res;
   6136     res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
   6137     res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
   6138     return res;
   6139 }
   6140 
   6141 _NEON2SSESTORAGE int8x16_t   vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
   6142 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1
   6143 
   6144 _NEON2SSESTORAGE int16x8_t   vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
   6145 #define vmaxq_s16 _mm_max_epi16
   6146 
   6147 _NEON2SSESTORAGE int32x4_t   vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
   6148 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1
   6149 
   6150 _NEON2SSESTORAGE uint8x16_t   vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
   6151 #define vmaxq_u8 _mm_max_epu8
   6152 
   6153 _NEON2SSESTORAGE uint16x8_t   vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0
   6154 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1
   6155 
   6156 _NEON2SSESTORAGE uint32x4_t   vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
   6157 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1
   6158 
   6159 
   6160 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0
   6161 #define vmaxq_f32 _mm_max_ps
   6162 
   6163 
   6164 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0
   6165 #define vmaxq_f64 _mm_max_pd
   6166 
   6167 
   6168 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ********************************
   6169 //***********************************************************************************************************
   6170 _NEON2SSESTORAGE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
   6171 _NEON2SSE_INLINE int8x8_t   vmin_s8(int8x8_t a, int8x8_t b)
   6172 {
   6173     int8x8_t res64;
   6174     __m128i res;
   6175     res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6176     return64(res);
   6177 }
   6178 
   6179 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
   6180 _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b)
   6181 {
   6182     int16x4_t res64;
   6183     return64(_mm_min_epi16(_pM128i(a),_pM128i(b)));
   6184 }
   6185 
   6186 
   6187 _NEON2SSESTORAGE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
   6188 _NEON2SSE_INLINE int32x2_t   vmin_s32(int32x2_t a, int32x2_t b)
   6189 {
   6190     int32x2_t res64;
   6191     __m128i res;
   6192     res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits
   6193     return64(res);
   6194 }
   6195 
   6196 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
   6197 _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b)
   6198 {
   6199     uint8x8_t res64;
   6200     return64(_mm_min_epu8(_pM128i(a),_pM128i(b)));
   6201 }
   6202 
   6203 
   6204 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0
   6205 _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b)
   6206 {
   6207     uint16x4_t res64;
   6208     return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b)));
   6209 }
   6210 
   6211 
   6212 _NEON2SSESTORAGE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
   6213 _NEON2SSE_INLINE uint32x2_t   vmin_u32(uint32x2_t a, uint32x2_t b)
   6214 {
   6215     uint32x2_t res64;
   6216     __m128i res;
   6217     res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits; may be less efficient than a serial version
   6218     return64(res);
   6219 }
   6220 
   6221 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
   6222 _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b)
   6223 {
   6224     //the serial solution looks faster than the SIMD one
   6225     float32x2_t res;
   6226     res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0];
   6227     res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1];
   6228     return res;
   6229 }
   6230 
   6231 _NEON2SSESTORAGE int8x16_t   vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
   6232 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1
   6233 
   6234 _NEON2SSESTORAGE int16x8_t   vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
   6235 #define vminq_s16 _mm_min_epi16
   6236 
   6237 _NEON2SSESTORAGE int32x4_t   vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
   6238 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1
   6239 
   6240 _NEON2SSESTORAGE uint8x16_t   vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
   6241 #define vminq_u8 _mm_min_epu8
   6242 
   6243 _NEON2SSESTORAGE uint16x8_t   vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0
   6244 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1
   6245 
   6246 _NEON2SSESTORAGE uint32x4_t   vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
   6247 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1
   6248 
   6249 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0
   6250 #define vminq_f32 _mm_min_ps
   6251 
   6252 
   6253 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0
   6254 #define vminq_f64 _mm_min_pd
   6255 
   6256 
   6257 //*************  Pairwise addition operations. **************************************
   6258 //************************************************************************************
   6259 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector
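        //For example: a = {a0, a1, a2, a3}, b = {b0, b1, b2, b3}  ->  result {a0+a1, a2+a3, b0+b1, b2+b3}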
   6260 _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
   6261 _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0
   6262 {
   6263     //no 8 bit hadd in IA32, need to go to 16 bit and then pack
   6264     int8x8_t res64;
   6265     __m128i a16, b16, res;
   6266     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
   6267     b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1
   6268     res = _mm_hadd_epi16 (a16, b16);
   6269     res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits
   6270     return64(res);
   6271 }
   6272 
   6273 _NEON2SSESTORAGE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
   6274 _NEON2SSE_INLINE int16x4_t   vpadd_s16(int16x4_t a, int16x4_t b)
   6275 {
   6276     int16x4_t res64;
   6277     __m128i hadd128;
   6278     hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b));
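            //hadd gives {a0+a1, a2+a3, x, x, b0+b1, b2+b3, x, x}; the shuffle below moves the two valid 32-bit lanes (0 and 2) into the low 64 bits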
   6279     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6280     return64(hadd128);
   6281 }
   6282 
   6283 
   6284 _NEON2SSESTORAGE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
   6285 _NEON2SSE_INLINE int32x2_t   vpadd_s32(int32x2_t a, int32x2_t b)
   6286 {
   6287     int32x2_t res64;
   6288     __m128i hadd128;
   6289     hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b));
   6290     hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6291     return64(hadd128);
   6292 }
   6293 
   6294 
   6295 _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
   6296 _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0
   6297 {
   6298     //  no 8 bit hadd in IA32, need to go to 16 bit and then pack
   6299     uint8x8_t res64;
   6300 //  no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned sums fit into 16-bit signed, so it works
   6301     __m128i mask8, a16, b16, res;
   6302     mask8 = _mm_set1_epi16(0xff);
   6303     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1
   6304     b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1
   6305     res = _mm_hadd_epi16 (a16, b16);
   6306     res = _mm_and_si128(res, mask8); //to avoid saturation
   6307     res = _mm_packus_epi16 (res,res); //use low 64 bits
   6308     return64(res);
   6309 }
   6310 
   6311 _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
   6312 _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0
   6313 {
   6314     // this solution may not be optimal; serial execution may be faster
   6315     // no unsigned _mm_hadd_ functions in IA32, so we need to move from unsigned to signed
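            // (a - 32767) + (b - 32767) + 0xfffe == a + b modulo 2^16, so the bias applied below is undone exactly by the final addition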
   6316     uint16x4_t res64;
   6317     __m128i c32767,  cfffe, as, bs, res;
   6318     c32767 = _mm_set1_epi16 (32767);
   6319     cfffe = _mm_set1_epi16 ((int16_t)0xfffe);
   6320     as = _mm_sub_epi16 (_pM128i(a), c32767);
   6321     bs = _mm_sub_epi16 (_pM128i(b), c32767);
   6322     res = _mm_hadd_epi16 (as, bs);
   6323     res = _mm_add_epi16 (res, cfffe);
   6324     res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6325     return64(res);
   6326 }
   6327 
   6328 _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
   6329 _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster
   6330 {
   6331     //hadd doesn't work for unsigned values
   6332     uint32x2_t res64;
   6333     __m128i ab, ab_sh, res;
   6334     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1
   6335     ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, b0
   6336     res = _mm_add_epi32(ab, ab_sh);
   6337     res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6));
   6338     return64(res);
   6339 }
   6340 
   6341 _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
   6342 _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b)
   6343 {
   6344     __m128 hadd128;
   6345     __m64_128 res64;
   6346     hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b));
   6347     hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits
   6348     _M64f(res64, hadd128);
   6349     return res64;
   6350 }
   6351 
   6352 
   6353 //**************************  Long pairwise add  **********************************
   6354 //*********************************************************************************
//Adds adjacent pairs of elements of a vector, sign- or zero-extends the results to twice their original width,
   6356 // and places the final results in the destination vector.
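//A minimal usage sketch (illustration only; vdup_n_s8 is the usual arm_neon.h initialiser):
//    int8x8_t  v = vdup_n_s8(3);     // {3,3,3,3,3,3,3,3}
//    int16x4_t s = vpaddl_s8(v);     // {6,6,6,6} - each adjacent pair is summed into a 16-bit lane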
   6357 
   6358 _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
   6359 _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0
   6360 {
   6361     //no 8 bit hadd in IA32, need to go to 16 bit anyway
   6362     __m128i a16;
   6363     int16x4_t res64;
   6364     a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1
   6365     a16 = _mm_hadd_epi16 (a16,  a16); //use low 64 bits
   6366     return64(a16);
   6367 }
   6368 
   6369 _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
   6370 _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0
   6371 {
    // this solution may not be optimal; serial execution may be faster
   6373     int32x2_t res64;
   6374     __m128i r32_1;
   6375     r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a));
   6376     r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits
   6377     return64(r32_1);
   6378 }
   6379 
   6380 _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
   6381 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
   6382 {
   6383     int64x1_t res;
   6384     res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1];
   6385     return res;
   6386 }
   6387 
   6388 _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
   6389 _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0
   6390 {
   6391     //  no 8 bit hadd in IA32, need to go to 16 bit
//  no unsigned _mm_hadd_ functions in IA32, but 8-bit unsigned values fit into 16-bit signed ones, so it works
   6393     uint16x4_t res64;
   6394     __m128i a16;
   6395     a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits
   6396     a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits
   6397     return64(a16);
   6398 }
   6399 
   6400 _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0
   6401 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
   6402 {
   6403     //serial solution looks faster than a SIMD one
   6404     uint32x2_t res;
   6405     res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1];
   6406     res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3];
   6407     return res;
   6408 }
   6409 
   6410 _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
   6411 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster
   6412 {
   6413     uint64x1_t res;
   6414     res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1];
   6415     return res;
   6416 }
   6417 
   6418 _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
   6419 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0
   6420 {
   6421     //no 8 bit hadd in IA32, need to go to 16 bit
   6422     __m128i r16_1, r16_2;
   6423     r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1
   6424     //swap hi and low part of r to process the remaining data
   6425     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   6426     r16_2 = _MM_CVTEPI8_EPI16 (r16_2);
   6427     return _mm_hadd_epi16 (r16_1, r16_2);
   6428 }
   6429 
   6430 _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
   6431 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0
   6432 {
    //no widening 16-bit hadd in IA32, need to convert to 32 bit first
   6434     __m128i r32_1, r32_2;
   6435     r32_1 = _MM_CVTEPI16_EPI32(a);
   6436     //swap hi and low part of r to process the remaining data
   6437     r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   6438     r32_2 = _MM_CVTEPI16_EPI32 (r32_2);
   6439     return _mm_hadd_epi32 (r32_1, r32_2);
   6440 }
   6441 
   6442 _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
   6443 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0
   6444 {
   6445     _NEON2SSE_ALIGN_16 int32_t atmp[4];
   6446     _NEON2SSE_ALIGN_16 int64_t res[2];
   6447     _mm_store_si128((__m128i*)atmp, a);
   6448     res[0] = (int64_t)atmp[0] + (int64_t)atmp[1];
   6449     res[1] = (int64_t)atmp[2] + (int64_t)atmp[3];
   6450     return _mm_load_si128((__m128i*)res);
   6451 }
   6452 
   6453 _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
   6454 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0
   6455 {
   6456     //no 8 bit hadd in IA32, need to go to 16 bit
   6457     __m128i r16_1, r16_2;
   6458     r16_1 = _MM_CVTEPU8_EPI16(a);
   6459     //swap hi and low part of r to process the remaining data
   6460     r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   6461     r16_2 = _MM_CVTEPU8_EPI16 (r16_2);
   6462     return _mm_hadd_epi16 (r16_1, r16_2);
   6463 }
   6464 
   6465 _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0
   6466 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a),  _NEON2SSE_REASON_SLOW_SERIAL)
   6467 {
   6468     //serial solution looks faster than a SIMD one
   6469     _NEON2SSE_ALIGN_16 uint16_t atmp[8];
   6470     _NEON2SSE_ALIGN_16 uint32_t res[4];
   6471     _mm_store_si128((__m128i*)atmp, a);
   6472     res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1];
   6473     res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3];
   6474     res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5];
   6475     res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7];
   6476     return _mm_load_si128((__m128i*)res);
   6477 }
   6478 
   6479 _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
   6480 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6481 {
   6482     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
   6483     _NEON2SSE_ALIGN_16 uint64_t res[2];
   6484     _mm_store_si128((__m128i*)atmp, a);
   6485     res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1];
   6486     res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3];
   6487     return _mm_load_si128((__m128i*)res);
   6488 }
   6489 
   6490 //************************  Long pairwise add and accumulate **************************
   6491 //****************************************************************************************
   6492 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector,
// and accumulates the values of the results into the elements of the destination (wide) vector.
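//A minimal usage sketch (illustration only; lane values are assumed):
//    int16x4_t acc = vdup_n_s16(100);    // {100,100,100,100}
//    int8x8_t  v   = vdup_n_s8(2);       // {2,2,2,2,2,2,2,2}
//    acc = vpadal_s8(acc, v);            // {104,104,104,104} - each 16-bit lane accumulates 2+2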
   6494 _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b); // VPADAL.S8 d0,d0
   6495 _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a,  int8x8_t b)
   6496 {
   6497     int16x4_t res64;
   6498     return64(vpadalq_s8(_pM128i(a), _pM128i(b)));
   6499 }
   6500 
   6501 _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b); // VPADAL.S16 d0,d0
   6502 _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a,  int16x4_t b)
   6503 {
   6504     int32x2_t res64;
   6505     return64(vpadalq_s16(_pM128i(a), _pM128i(b)));
   6506 }
   6507 
   6508 
   6509 _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0
   6510 _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b)
   6511 {
   6512     int64x1_t res;
   6513     res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0];
   6514     return res;
   6515 }
   6516 
   6517 _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b); // VPADAL.U8 d0,d0
   6518 _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a,  uint8x8_t b)
   6519 {
   6520     uint16x4_t res64;
   6521     return64(vpadalq_u8(_pM128i(a), _pM128i(b)));
   6522 }
   6523 
   6524 
   6525 _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b); // VPADAL.s16 d0,d0
   6526 _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a,  uint16x4_t b)
   6527 {
   6528     uint32x2_t res64;
   6529     return64(vpadalq_u16(_pM128i(a), _pM128i(b)));
   6530 }
   6531 
   6532 _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0
   6533 _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b)
   6534 {
   6535     uint64x1_t res;
   6536     res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0];
   6537     return res;
   6538 }
   6539 
   6540 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0
   6541 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0
   6542 {
   6543     int16x8_t pad;
   6544     pad = vpaddlq_s8(b);
   6545     return _mm_add_epi16 (a, pad);
   6546 }
   6547 
   6548 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0
   6549 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0
   6550 {
   6551     int32x4_t pad;
   6552     pad = vpaddlq_s16(b);
   6553     return _mm_add_epi32(a, pad);
   6554 }
   6555 
   6556 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0
   6557 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b)
   6558 {
   6559     int64x2_t pad;
   6560     pad = vpaddlq_s32(b);
   6561     return _mm_add_epi64 (a, pad);
   6562 }
   6563 
   6564 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0
   6565 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0
   6566 {
   6567     uint16x8_t pad;
   6568     pad = vpaddlq_u8(b);
   6569     return _mm_add_epi16 (a, pad);
   6570 }
   6571 
   6572 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0
   6573 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6574 {
   6575     uint32x4_t pad;
   6576     pad = vpaddlq_u16(b);
   6577     return _mm_add_epi32(a, pad);
   6578 } //no optimal SIMD solution, serial is faster
   6579 
   6580 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0
   6581 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6582 {
   6583     //no optimal SIMD solution, serial is faster
   6584     uint64x2_t pad;
   6585     pad = vpaddlq_u32(b);
   6586     return _mm_add_epi64(a, pad);
}
   6588 
   6589 //**********  Folding maximum   *************************************
   6590 //*******************************************************************
   6591 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors,
   6592 //and copies the larger of each pair into the corresponding element in the destination
   6593 //    no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison
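//Lane semantics sketch (values are illustrative only): for int32x2_t a = {1, 7} and b = {5, 2},
//    vpmax_s32(a, b) -> {7, 5}    //maximum of each adjacent pair, a-pairs first, then b-pairs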
   6594 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0
   6595 _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0
   6596 {
   6597     int8x8_t res64;
   6598     __m128i ab, ab1, max;
   6599     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
   6600     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6601     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
   6603     max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1
   6604     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
   6605     return64(max); //we need 64 bits only
   6606 }
   6607 
   6608 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0
   6609 _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0
   6610 {
    //this solution may not be optimal compared with the serial one
    int16x4_t res64;
    __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
   6617     max = _mm_max_epi16 (ab, ab1);
   6618     max =  _mm_shuffle_epi8 (max, *(__m128i*)  mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
   6619     return64(max);
   6620 }
   6621 
   6622 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0
   6623 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6624 {
    //serial solution looks faster than a SIMD one
   6626     int32x2_t res;
   6627     res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
   6628     res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
   6629     return res;
   6630 }
   6631 
   6632 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0
   6633 _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0
   6634 {
   6635     uint8x8_t res64;
   6636     __m128i ab, ab1, max;
   6637     _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   6638     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3,  5,  7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6639     ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical max finding
   6641     max = _mm_max_epu8 (ab, ab1); // SSE4.1
   6642     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data
   6643     return64(max);
   6644 }
   6645 
   6646 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0
   6647 _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0
   6648 {
    //this solution may not be optimal compared with the serial one
    uint16x4_t res64;
    __m128i ab, ab1, max;
    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
    ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical max finding, use the 8-bit fn and the corresponding mask
   6655     max = _MM_MAX_EPU16 (ab, ab1);
   6656     max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
   6657     return64(max);
   6658 }
   6659 
   6660 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0
   6661 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6662 {
    //serial solution looks faster than a SIMD one
   6664     uint32x2_t res;
   6665     res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
   6666     res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
   6667     return res;
   6668 }
   6669 
   6670 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0
   6671 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6672 {
    //serial solution looks faster than a SIMD one
   6674     float32x2_t res;
   6675     res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
   6676     res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
   6677     return res;
   6678 }
   6679 
   6680 // ***************** Folding minimum  ****************************
   6681 // **************************************************************
   6682 //vpmin -> takes minimum of adjacent pairs
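//Lane semantics sketch (values are illustrative only): for int32x2_t a = {1, 7} and b = {5, 2},
//    vpmin_s32(a, b) -> {1, 2}    //minimum of each adjacent pair, a-pairs first, then b-pairs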
   6683 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0
   6684 _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0
   6685 {
   6686     int8x8_t res64;
   6687     __m128i ab, ab1, min;
   6688     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
   6689     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6690     ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
   6692     min =  _MM_MIN_EPI8 (ab, ab1); // SSE4.1
   6693     min =  _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
   6694     return64(min);
   6695 }
   6696 
   6697 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0
   6698 _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0
   6699 {
    //this solution may not be optimal compared with the serial one
    int16x4_t res64;
    __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
    ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
   6706     min = _mm_min_epi16 (ab, ab1);
   6707     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
   6708     return64(min);
   6709 }
   6710 
   6711 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0
   6712 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6713 {
    //serial solution looks faster than a SIMD one
   6715     int32x2_t res;
   6716     res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0];
   6717     res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? b.m64_i32[1] : b.m64_i32[0];
   6718     return res;
   6719 }
   6720 
   6721 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0
   6722 _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0
   6723 {
   6724     uint8x8_t res64;
   6725     __m128i ab, ab1, min;
   6726     _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5,  4,  7,  6,    9,    8,   11,   10,   13,   12,   15,   14};
   6727     _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
   6728     ab = _mm_unpacklo_epi64 (  _pM128i(a),  _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horizontal pairs swap for vertical min finding
   6730     min = _mm_min_epu8 (ab, ab1); // SSE4.1
   6731     min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data
   6732     return64(min);
   6733 }
   6734 
   6735 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0
   6736 _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0
   6737 {
    //this solution may not be optimal compared with the serial one
    uint16x4_t res64;
    __m128i ab, ab1, min;
    _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each pair of chars is considered to be a 16-bit number
    ab = _mm_unpacklo_epi64 ( _pM128i(a),  _pM128i(b)); //ab
    ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horizontal pairs swap for vertical min finding, use the 8-bit fn and the corresponding mask
   6744     min = _MM_MIN_EPU16 (ab, ab1);
   6745     min =    _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used
   6746     return64(min);
   6747 }
   6748 
   6749 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0
   6750 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6751 {
    //serial solution looks faster than a SIMD one
   6753     uint32x2_t res;
   6754     res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0];
   6755     res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0];
   6756     return res;
   6757 }
   6758 
   6759 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0
   6760 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
   6761 {
    //serial solution looks faster than a SIMD one
   6763     float32x2_t res;
   6764     res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0];
   6765     res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? b.m64_f32[1] : b.m64_f32[0];
   6766     return res;
   6767 }
   6768 
   6769 //***************************************************************
   6770 //***********  Reciprocal/Sqrt ************************************
   6771 //***************************************************************
   6772 //****************** Reciprocal estimate *******************************
   6773 //the ARM NEON and x86 SIMD results may be slightly different
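//Usage sketch (illustration only): vrecpe_f32 of {2.0f, 4.0f} returns approximately {0.5f, 0.25f};
//here the estimate comes from _mm_rcp_ps, whose precision (about 12 bits) differs from the roughly 8-bit NEON estimate.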
   6774 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0
   6775 _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits
   6776 {
   6777     float32x4_t res;
   6778     __m64_128 res64;
   6779     res = _mm_rcp_ps(_pM128(a));
   6780     _M64f(res64, res);
   6781     return res64;
   6782 }
   6783 
   6784 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0
   6785 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6786 {
    //The input is a fixed-point number. No integer reciprocal is available in IA32.
   6788     uint32x2_t res;
   6789     float resf, r;
   6790     int i, q, s;
   6791     for (i =0; i<2; i++){
   6792         if((a.m64_u32[i] & 0x80000000) == 0) {
   6793             res.m64_u32[i] = 0xffffffff;
   6794         }else{
   6795             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
   6796             q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
   6797             r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */
   6798             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6799             r =  (float)s / 256.0;
   6800             res.m64_u32[i] = r * (uint32_t)(1 << 31);
   6801         }
   6802     }
   6803     return res;
   6804 }
   6805 
   6806 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0
   6807 #define vrecpeq_f32 _mm_rcp_ps
   6808 
   6809 
   6810 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
   6811 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6812 {
    //The input is a fixed-point number.
    //We implement the recip_estimate function as described in the ARMv7 reference manual (VRECPE instruction) but use float instead of double
   6815     _NEON2SSE_ALIGN_16 uint32_t atmp[4];
   6816     _NEON2SSE_ALIGN_16 uint32_t res[4];
   6817     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000};
   6818     float resf, r;
   6819     int i, q, s;
   6820     __m128i res128, mask, zero;
   6821     _mm_store_si128((__m128i*)atmp, a);
   6822     zero = _mm_setzero_si128();
   6823     for (i =0; i<4; i++){
   6824         resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31)));  //  2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31))
   6825         q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */
   6826         r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */
   6827         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6828         r =  (float)s / 256.0;
   6829         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
   6830     }
   6831     res128 = _mm_load_si128((__m128i*)res);
   6832     mask = _mm_and_si128(a, *(__m128i*)c80000000);
   6833     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x7fffffff
   6834     return _mm_or_si128(res128, mask);
   6835 }
   6836 
   6837 //**********Reciprocal square root estimate ****************
   6838 //**********************************************************
//no integer reciprocal square root is available in IA32, nor an unsigned int to float32x4 lanes conversion, so a serial solution looks faster
//(the particular implementation of vrsqrte_u32 may also vary across ARM compilers)
//the ARM NEON and x86 SIMD results may be slightly different
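//Usage sketch (illustration only): vrsqrte_f32 of {4.0f, 16.0f} returns approximately {0.5f, 0.25f},
//computed here with _mm_rsqrt_ps rather than the NEON estimate tables.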
   6842 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0
   6843 _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits
   6844 {
   6845     float32x4_t res;
   6846     __m64_128 res64;
   6847     res = _mm_rsqrt_ps(_pM128(a));
   6848     _M64f(res64, res);
   6849     return res64;
   6850 }
   6851 
   6852 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0
   6853 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6854 {
    //The input is a fixed-point number.
    //We implement the recip_sqrt_estimate function as described in the ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
   6857    uint32x2_t res;
   6858    __m128 tmp;
   6859     float r, resf, coeff;
   6860     int i,q0, s;
   6861     for (i =0; i<2; i++){
   6862         if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff
   6863             res.m64_u32[i] = 0xffffffff;
   6864         }else{
   6865             resf =  (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31)));
   6866             coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
            q0 = (int)(resf * coeff); /* a in units of 1/coeff rounded down */
   6868             r = ((float)q0 + 0.5) / coeff;
   6869             tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
   6870             _mm_store_ss(&r, tmp);
   6871             s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6872             r = (float)(s / 256.0);
   6873             res.m64_u32[i] = r * (((uint32_t)1) << 31);
   6874         }
   6875     }
   6876     return res;
   6877 }
   6878 
   6879 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
   6880 #define vrsqrteq_f32 _mm_rsqrt_ps
   6881 
   6882 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
   6883 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   6884 {
    //The input is a fixed-point number.
    //We implement the recip_sqrt_estimate function as described in the ARMv7 reference manual (VRSQRTE instruction) but use float instead of double
   6887    _NEON2SSE_ALIGN_16 uint32_t  atmp[4], res[4];
   6888    _NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000};
   6889    __m128 tmp;
   6890    __m128i res128, mask, zero;
   6891     float r, resf, coeff;
   6892     int i,q0, s;
   6893     _mm_store_si128((__m128i*)atmp, a);
   6894     zero = _mm_setzero_si128();
   6895     for (i =0; i<4; i++){
   6896         resf =  (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31)));
   6897         coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5  or range 0.5 <= resf < 1.0*/
        q0 = (int)(resf * coeff); /* a in units of 1/coeff rounded down */
   6899         r = ((float)q0 + 0.5) / coeff;
   6900         tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */
   6901         _mm_store_ss(&r, tmp);
   6902         s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */
   6903         r = (float)s / 256.0;
   6904         res[i] = (uint32_t) (r * (((uint32_t)1) << 31) );
   6905     }
   6906     res128 = _mm_load_si128((__m128i*)res);
   6907     mask = _mm_and_si128(a, *(__m128i*)c_c0000000);
   6908     mask = _mm_cmpeq_epi32(zero, mask);  //0xffffffff if atmp[i] <= 0x3fffffff
   6909     return _mm_or_si128(res128, mask);
   6910 }
   6911 //************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
   6912 //******************************************************************************************
   6913 //******VRECPS (Vector Reciprocal Step) ***************************************************
   6914 //multiplies the elements of one vector by the corresponding elements of another vector,
   6915 //subtracts each of the results from 2, and places the final results into the elements of the destination vector.
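//Typical use is one or two Newton-Raphson refinement steps after vrecpeq_f32 (a sketch only, not part of the API):
//    float32x4_t x = vrecpeq_f32(a);         //initial estimate of 1/a
//    x = vmulq_f32(x, vrecpsq_f32(a, x));    //x *= (2 - a*x): first refinement step
//    x = vmulq_f32(x, vrecpsq_f32(a, x));    //second step gives close to full single precision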
   6916 
   6917 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0
   6918 _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b)
   6919 {
   6920     float32x4_t res;
   6921     __m64_128 res64;
   6922     res = vrecpsq_f32(_pM128(a), _pM128(b));
   6923     _M64f(res64, res);
   6924     return res64;
   6925 }
   6926 
   6927 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
   6928 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
   6929 {
   6930     __m128 f2, mul;
   6931     f2 =  _mm_set1_ps(2.);
   6932     mul = _mm_mul_ps(a,b);
   6933     return _mm_sub_ps(f2,mul);
   6934 }
   6935 
   6936 //*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
   6937 //multiplies the elements of one vector by the corresponding elements of another vector,
   6938 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.
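//Typical use is a refinement step after vrsqrteq_f32 (a sketch only, not part of the API):
//    float32x4_t x = vrsqrteq_f32(a);                        //initial estimate of 1/sqrt(a)
//    x = vmulq_f32(x, vrsqrtsq_f32(vmulq_f32(a, x), x));     //x *= (3 - a*x*x)/2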
   6939 
   6940 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0
   6941 _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b)
   6942 {
   6943     float32x2_t res;
   6944     res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2;
   6945     res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2;
   6946     return res;
   6947 }
   6948 
   6949 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
   6950 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
   6951 {
   6952     __m128 f3, f05, mul;
   6953     f3 =  _mm_set1_ps(3.);
   6954     f05 =  _mm_set1_ps(0.5);
   6955     mul = _mm_mul_ps(a,b);
   6956     f3 = _mm_sub_ps(f3,mul);
   6957     return _mm_mul_ps (f3, f05);
   6958 }
   6959 //********************************************************************************************
   6960 //***************************** Shifts by signed variable ***********************************
   6961 //********************************************************************************************
   6962 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
   6963 //********************************************************************************************
//No such operations in IA32 SIMD unfortunately; only constant shifts are available, so a serial solution is needed
//helper macro; it matches the ARM implementation for big shifts
   6966 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
   6967         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
   6968         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   6969         for (i = 0; i<LEN; i++) { \
   6970         if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \
   6971         else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \
   6972         return _mm_load_si128((__m128i*)res);
   6973 
   6974 #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \
   6975         int ## TYPE ## x ## LEN ## _t res;  int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \
   6976         for (i = 0; i<LEN; i++) { \
   6977         if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \
   6978         else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \
   6979         return res;
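//Lane semantics sketch for the variable shifts below (values are illustrative only):
//with int16x4_t a = {16,16,16,16} and b = {1,-1,2,-2},
//    vshl_s16(a, b) -> {32, 8, 64, 4}    //positive counts shift left, negative counts shift right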
   6980 
   6981 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0
   6982 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6983 {
   6984     SERIAL_SHIFT_64(8, i, 8)
   6985 }
   6986 
   6987 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0
   6988 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6989 {
   6990     SERIAL_SHIFT_64(16, i, 4)
   6991 }
   6992 
   6993 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0
   6994 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   6995 {
   6996     SERIAL_SHIFT_64(32, i, 2)
   6997 }
   6998 
   6999 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0
   7000 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7001 {
   7002     SERIAL_SHIFT_64(64, i, 1)
   7003 }
   7004 
   7005 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0
   7006 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7007 {
   7008     SERIAL_SHIFT_64(8, u, 8)
   7009 }
   7010 
   7011 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0
   7012 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7013 {
   7014     SERIAL_SHIFT_64(16, u, 4)
   7015 }
   7016 
   7017 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0
   7018 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7019 {
   7020     SERIAL_SHIFT_64(32, u, 2)
   7021 }
   7022 
   7023 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0
_NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if the SERIAL_SHIFT macro were used, special processing for large shift values would be needed
   7025 {
   7026     SERIAL_SHIFT_64(64, u, 1)
   7027 }
   7028 
   7029 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0
   7030 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7031 {
   7032     SERIAL_SHIFT(int8_t, int8_t, 16, 16)
   7033 }
   7034 
   7035 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0
   7036 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7037 {
   7038     SERIAL_SHIFT(int16_t, int16_t, 8, 8)
   7039 }
   7040 
   7041 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0
   7042 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7043 {
   7044     SERIAL_SHIFT(int32_t, int32_t, 4, 4)
   7045 }
   7046 
   7047 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0
   7048 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7049 {
   7050     SERIAL_SHIFT(int64_t, int64_t, 2, 2)
   7051 }
   7052 
   7053 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0
   7054 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7055 {
   7056     SERIAL_SHIFT(uint8_t, int8_t, 16, 16)
   7057 }
   7058 
   7059 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0
   7060 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7061 {
   7062     SERIAL_SHIFT(uint16_t, int16_t, 8, 8)
   7063 }
   7064 
   7065 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0
   7066 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7067 {
   7068     SERIAL_SHIFT(uint32_t, int32_t, 4, 4)
   7069 }
   7070 
   7071 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0
   7072 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7073 {
   7074     SERIAL_SHIFT(uint64_t, int64_t, 2, 2)
   7075 }
   7076 
   7077 
   7078 //*********** Vector saturating shift left: (negative values shift right) **********************
   7079 //********************************************************************************************
//No such operations are available in IA32 SIMD yet; only constant shifts exist, so a serial solution is needed
   7081 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
   7082         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
   7083         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
   7084         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7085         for (i = 0; i<LEN; i++) { \
   7086         if (atmp[i] ==0) res[i] = 0; \
   7087         else{ \
   7088             if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \
   7089             else{ \
   7090                 if (btmp[i]>lanesize_1) { \
   7091                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7092                 }else{ \
   7093                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
   7094                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
   7095                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7096                     else res[i] = atmp[i] << btmp[i]; }}}} \
   7097         return _mm_load_si128((__m128i*)res);
   7098 
   7099 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
   7100         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
   7101         TYPE lanesize = (sizeof(TYPE) << 3); \
   7102         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7103         for (i = 0; i<LEN; i++) { \
   7104         if (atmp[i] ==0) {res[i] = 0; \
   7105         }else{ \
   7106             if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \
   7107             else{ \
   7108                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
   7109                 else{ \
   7110                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
                    res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
   7112         return _mm_load_si128((__m128i*)res);
   7113 
   7114 #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \
   7115         int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \
   7116         int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \
   7117         for (i = 0; i<LEN; i++) { \
   7118         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
   7119         else{ \
   7120             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
   7121             else{ \
   7122                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
   7123                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
   7124                 }else{ \
   7125                     limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
   7126                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
   7127                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \
   7128                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7129         return res;
   7130 
   7131 #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \
   7132         int ## TYPE ## x ## LEN ## _t res;  _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
   7133         int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \
   7134         for (i = 0; i<LEN; i++) { \
   7135         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
   7136         }else{ \
   7137             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \
   7138             else{ \
   7139                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
   7140                 else{ \
   7141                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
                    res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \
   7143         return res;
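//Saturation sketch (values are illustrative only): with int32x2_t a = {0x40000000, -3} and b = {1, 1},
//    vqshl_s32(a, b) -> {0x7fffffff, -6}    //the first lane overflows on the left shift and saturates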
   7144 
   7145 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0
   7146 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7147 {
   7148     SERIAL_SATURATING_SHIFT_SIGNED_64(8,8)
   7149 }
   7150 
   7151 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0
   7152 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7153 {
   7154     SERIAL_SATURATING_SHIFT_SIGNED_64(16,4)
   7155 }
   7156 
   7157 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0
   7158 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7159 {
   7160     SERIAL_SATURATING_SHIFT_SIGNED_64(32,2)
   7161 }
   7162 
   7163 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0
   7164 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7165 {
   7166     SERIAL_SATURATING_SHIFT_SIGNED_64(64,1)
   7167 }
   7168 
   7169 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0
   7170 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7171 {
   7172     SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8)
   7173 }
   7174 
   7175 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0
   7176 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7177 {
   7178     SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4)
   7179 }
   7180 
   7181 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0
   7182 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7183 {
   7184     SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2)
   7185 }
   7186 
   7187 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0
   7188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7189 {
   7190     SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1)
   7191 }
   7192 
   7193 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0
   7194 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7195 {
   7196     SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16)
   7197 }
   7198 
   7199 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0
   7200 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7201 {
   7202     SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8)
   7203 }
   7204 
   7205 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0
   7206 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7207 {
   7208     SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4)
   7209 }
   7210 
   7211 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0
   7212 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7213 {
   7214     SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2)
   7215 }
   7216 
   7217 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0
   7218 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7219 {
   7220     SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16)
   7221 }
   7222 
   7223 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0
   7224 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7225 {
   7226     SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8)
   7227 }
   7228 
   7229 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0
   7230 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7231 {
   7232     SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4)
   7233 }
   7234 
   7235 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0
   7236 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7237 {
   7238     SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2)
   7239 }
   7240 
   7241 
   7242 //******** Vector rounding shift left: (negative values shift right) **********
   7243 //****************************************************************************
//No such operations are available in IA32 SIMD yet; only constant shifts exist, so a serial solution is needed
   7245 //rounding makes sense for right shifts only.
   7246 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \
   7247         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \
   7248         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7249         for (i = 0; i<LEN; i++) { \
   7250         if( btmp[i] >= 0) { \
   7251             if(btmp[i] >= lanesize) res[i] = 0; \
   7252             else res[i] = (atmp[i] << btmp[i]); \
   7253         }else{ \
            res[i] = (btmp[i] < -lanesize) ? 0 : \
   7255                             (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \
   7256                             (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) );    }} \
   7257         return _mm_load_si128((__m128i*)res);
   7258 
   7259 
   7260 #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \
   7261         int ## TYPE ## x ## LEN ## _t res;  int i;  int lanesize = sizeof(int ## TYPE ## _t) << 3; \
   7262         for (i = 0; i<LEN; i++) { \
   7263         if( b.m64_i ## TYPE[i] >= 0) { \
   7264             if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \
   7265             else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \
   7266         }else{ \
            res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? 0 : \
   7268                             (b.m64_i ## TYPE[i] == -lanesize) ? (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \
   7269                             (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) );    }} \
   7270         return res;
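//Rounding sketch (values are illustrative only): with int16x4_t a = {5,5,5,5} and b = {-1,-1,-1,-1},
//    vrshl_s16(a, b) -> {3,3,3,3}    //right shift by one with rounding: (5 + 1) >> 1 = 3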
   7271 
   7272 
   7273 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0
   7274 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7275 {
   7276     SERIAL_ROUNDING_SHIFT_64(8,i,8)
   7277 }
   7278 
   7279 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0
   7280 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7281 {
   7282     SERIAL_ROUNDING_SHIFT_64(16,i,4)
   7283 }
   7284 
   7285 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0
   7286 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7287 {
   7288     SERIAL_ROUNDING_SHIFT_64(32,i,2)
   7289 }
   7290 
   7291 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0
   7292 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7293 {
   7294     SERIAL_ROUNDING_SHIFT_64(64,i,1)
   7295 }
   7296 
   7297 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0
   7298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7299 {
   7300     SERIAL_ROUNDING_SHIFT_64(8,u,8)
   7301 }
   7302 
   7303 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0
   7304 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7305 {
   7306     SERIAL_ROUNDING_SHIFT_64(16,u,4)
   7307 }
   7308 
   7309 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0
   7310 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7311 {
   7312     SERIAL_ROUNDING_SHIFT_64(32,u,2)
   7313 }
   7314 
   7315 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0
   7316 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7317 {
   7318     SERIAL_ROUNDING_SHIFT_64(64,u,1)
   7319 }
   7320 
   7321 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0
   7322 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7323 {
   7324     SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16)
   7325 }
   7326 
   7327 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0
   7328 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7329 {
   7330     SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8)
   7331 }
   7332 
   7333 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0
   7334 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7335 {
   7336     SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4)
   7337 }
   7338 
   7339 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0
   7340 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7341 {
   7342     SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2)
   7343 }
   7344 
   7345 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0
   7346 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7347 {
   7348     SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16)
   7349 }
   7350 
   7351 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0
   7352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7353 {
   7354     SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8)
   7355 }
   7356 
   7357 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0
   7358 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7359 {
   7360     SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4)
   7361 }
   7362 
   7363 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0
   7364 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7365 {
   7366     SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2)
   7367 }
   7368 
   7369 
   7370 //********** Vector saturating rounding shift left: (negative values shift right) ****************
   7371 //*************************************************************************************************
//No such operations in IA32 SIMD unfortunately; only constant shifts are available, so a serial solution is needed
   7373 //Saturation happens for left shifts only while rounding makes sense for right shifts only.
   7374 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \
   7375         _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \
   7376         int lanesize_1 = (sizeof(TYPE) << 3) - 1; \
   7377         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7378         for (i = 0; i<LEN; i++) { \
   7379         if (atmp[i] ==0) res[i] = 0; \
   7380         else{ \
   7381             if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
   7382             else{ \
   7383                 if (btmp[i]>lanesize_1) { \
   7384                     res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7385                 }else{ \
   7386                     limit = (TYPE)1 << (lanesize_1 - btmp[i]); \
   7387                     if((atmp[i] >= limit)||(atmp[i] <= -limit)) \
   7388                         res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \
   7389                     else res[i] = atmp[i] << btmp[i]; }}}} \
   7390         return _mm_load_si128((__m128i*)res);
   7391 
   7392 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \
   7393         _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \
   7394         int lanesize = (sizeof(TYPE) << 3); \
   7395         _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \
   7396         for (i = 0; i<LEN; i++) { \
   7397         if (atmp[i] ==0) {res[i] = 0; \
   7398         }else{ \
   7399             if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \
   7400             else{ \
   7401                 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \
   7402                 else{ \
   7403                     limit = (TYPE) 1 << (lanesize - btmp[i]); \
    7404                     res[i] = ( atmp[i] >= limit) ? ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \
   7405         return _mm_load_si128((__m128i*)res);
   7406 
   7407 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \
   7408         __m64_128 res; int ## TYPE ## _t limit; int i; \
   7409         int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \
   7410         for (i = 0; i<LEN; i++) { \
   7411         if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \
   7412         else{ \
   7413             if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
   7414             else{ \
   7415                 if (b.m64_i ## TYPE[i]>lanesize_1) { \
   7416                     res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
   7417                 }else{ \
   7418                     limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \
   7419                     if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \
   7420                         res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \
   7421                     else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7422         return res;
   7423 
   7424 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \
   7425         __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \
   7426         int lanesize = (sizeof(int ## TYPE ## _t) << 3); \
   7427         for (i = 0; i<LEN; i++) { \
   7428         if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \
   7429         }else{ \
   7430             if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \
   7431             else{ \
   7432                 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \
   7433                 else{ \
   7434                     limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \
   7435                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
    7436                     res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \
   7437 
   7438 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0
   7439 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7440 {
   7441     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8)
   7442 }
   7443 
   7444 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0
   7445 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7446 {
   7447     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4)
   7448 }
   7449 
   7450 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0
   7451 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7452 {
   7453     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2)
   7454 }
   7455 
   7456 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0
   7457 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7458 {
   7459     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1)
   7460 }
   7461 
   7462 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0
   7463 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7464 {
   7465     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8)
   7466 }
   7467 
   7468 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0
   7469 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7470 {
   7471     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4)
   7472 }
   7473 
   7474 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0
   7475 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7476 {
   7477     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2)
   7478 }
   7479 
   7480 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0
   7481 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7482 {
   7483     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1)
   7484 }
   7485 
   7486 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0
   7487 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7488 {
   7489     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16)
   7490 }
   7491 
   7492 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0
   7493 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7494 {
   7495     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8)
   7496 }
   7497 
   7498 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0
   7499 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7500 {
   7501     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4)
   7502 }
   7503 
   7504 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0
   7505 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7506 {
   7507     SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2)
   7508 }
   7509 
   7510 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0
   7511 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7512 {
   7513     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16)
   7514 }
   7515 
   7516 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0
   7517 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7518 {
   7519     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8)
   7520 }
   7521 
   7522 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0
   7523 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7524 {
   7525     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4)
   7526 }
   7527 
   7528 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0
   7529 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b),  _NEON2SSE_REASON_SLOW_SERIAL)
   7530 {
   7531     SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2)
   7532 }
   7533 
   7534 // *********************************************************************************
   7535 // *****************************  Shifts by a constant *****************************
   7536 // *********************************************************************************
   7537 //**************** Vector shift right by constant*************************************
   7538 //************************************************************************************
   7539 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8
   7540 _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8
   7541 {
   7542     //no 8 bit shift available, go to 16 bit
   7543     int8x8_t res64;
   7544     __m128i r;
   7545     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   7546     r = _mm_srai_epi16 (r, b); //SSE2
   7547     r = _mm_packs_epi16 (r,r); //we need 64 bits only
   7548     return64(r);
   7549 }
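//Usage sketch, illustrative only (names are arbitrary):
//    int8x8_t v   = vdup_n_s8(-100);
//    int8x8_t res = vshr_n_s8(v, 3);      // every lane becomes -100 >> 3 == -13 (arithmetic shift)
//The widen->shift->pack sequence above is safe because a right-shifted int8_t always fits back into 8 bits,
//so the saturating pack cannot change the value.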
   7550 
   7551 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VSHR.S16 d0,d0,#16
   7552 _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
   7553 {
   7554     int16x4_t res64;
   7555     return64(_mm_srai_epi16(_pM128i(a), b));
   7556 }
   7557 
   7558 
   7559 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VSHR.S32 d0,d0,#32
   7560 _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
   7561 {
   7562     int32x2_t res64;
   7563     return64(_mm_srai_epi32(_pM128i(a), b));
   7564 }
   7565 
   7566 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64
   7567 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   7568 {
   7569     //no arithmetic shift for 64bit values, serial solution used
   7570     int64x1_t res;
   7571     if(b>=64) res.m64_i64[0] = 0;
   7572     else res.m64_i64[0] = (*(int64_t*)&a) >> b;
   7573     return res;
   7574 }
   7575 
   7576 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8
   7577 _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8
   7578 {
   7579     //no 8 bit shift available, go to 16 bit
   7580     uint8x8_t res64;
   7581     __m128i r;
   7582     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   7583     r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one
   7584     r = _mm_packus_epi16 (r,r); //we need 64 bits only
   7585     return64(r);
   7586 }
   7587 
   7588 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VSHR.s16 d0,d0,#16
   7589 _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
   7590 {
   7591     uint16x4_t res64;
   7592     return64(_mm_srli_epi16(_pM128i(a), b));
   7593 }
   7594 
   7595 
   7596 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VSHR.U32 d0,d0,#32
   7597 _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
   7598 {
   7599     uint32x2_t res64;
   7600     return64(_mm_srli_epi32(_pM128i(a), b));
   7601 }
   7602 
   7603 
   7604 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b); // VSHR.U64 d0,d0,#64
   7605 _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a,  __constrange(1,64) int b)
   7606 {
   7607     uint64x1_t res64;
   7608     return64(_mm_srli_epi64(_pM128i(a), b));
   7609 }
   7610 
   7611 
   7612 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8
   7613 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8
   7614 {
   7615     //no 8 bit shift available, go to 16 bit trick
   7616     __m128i zero, mask0, a_sign, r, a_sign_mask;
   7617     _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0,  0x00f8, 0x00fc, 0x00fe, 0x00ff};
   7618     zero = _mm_setzero_si128();
   7619     mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
   7620     a_sign =  _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0
   7621     r = _mm_srai_epi16 (a, b);
   7622     a_sign_mask =  _mm_and_si128 (mask0, a_sign);
   7623     r =  _mm_andnot_si128 (mask0, r);
   7624     return _mm_or_si128 (r, a_sign_mask);
   7625 }
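//Illustrative note, not part of the original header: the 16-bit arithmetic shift lets bits of the odd (high) byte
//leak into the top b bits of the even (low) byte; mask0_16[b] marks exactly those b bits (e.g. 0x00e0 for b = 3),
//and they are then re-filled from the low byte's own sign via a_sign.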
   7626 
   7627 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16
   7628 #define vshrq_n_s16 _mm_srai_epi16
   7629 
   7630 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32
   7631 #define vshrq_n_s32 _mm_srai_epi32
   7632 
   7633 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64
   7634 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
   7635 {
    7636     //the SIMD implementation may not be optimal due to the absence of a 64-bit arithmetic shift in x86 SIMD
   7637     __m128i c1, signmask,a0,  res64;
   7638     _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000};
   7639     c1 =  _mm_cmpeq_epi32(a,a); //0xffffffffffffffff
   7640     signmask  =  _mm_slli_epi64 (c1, (64 - b));
   7641     a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit
   7642     a0 = _MM_CMPEQ_EPI64 (a, a0);
   7643     signmask = _mm_and_si128(a0, signmask);
   7644     res64 = _mm_srli_epi64 (a, b);
   7645     return _mm_or_si128(res64, signmask);
   7646 }
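//Illustrative note, not part of the original header: SSE has no 64-bit arithmetic shift, so negative lanes are
//detected through the sign bit (a | 0x8000000000000000 equals a only when the sign bit was already set) and the
//top b bits lost by the logical shift are OR-ed back in from signmask = ~0 << (64 - b).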
   7647 
   7648 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8
   7649 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8
   7650 {
   7651     //no 8 bit shift available, need the special trick
   7652     __m128i mask0, r;
   7653     _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f,  0xff07, 0xff03, 0xff01, 0xff00};
   7654     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
   7655     r = _mm_srli_epi16 ( a, b);
   7656     return _mm_and_si128 (r,  mask0);
   7657 }
   7658 
   7659 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16
   7660 #define vshrq_n_u16 _mm_srli_epi16
   7661 
   7662 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32
   7663 #define vshrq_n_u32 _mm_srli_epi32
   7664 
   7665 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64
   7666 #define vshrq_n_u64 _mm_srli_epi64
   7667 
   7668 //*************************** Vector shift left by constant *************************
   7669 //*********************************************************************************
   7670 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   7671 _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0
   7672 {
   7673     //no 8 bit shift available, go to 16 bit
   7674     int8x8_t res64;
   7675     __m128i r;
   7676     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   7677     r = _mm_slli_epi16 (r, b); //SSE2
   7678     r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only
   7679     return64(r);
   7680 }
   7681 
   7682 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   7683 _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a,  __constrange(0,15) int b)
   7684 {
   7685     int16x4_t res64;
   7686     return64(_mm_slli_epi16(_pM128i(a), b));
   7687 }
   7688 
   7689 
   7690 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   7691 _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
   7692 {
   7693     int32x2_t res64;
   7694     return64(_mm_slli_epi32(_pM128i(a), b));
   7695 }
   7696 
   7697 
   7698 _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   7699 _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a,  __constrange(0,63) int b)
   7700 {
   7701     int64x1_t res64;
   7702     return64(_mm_slli_epi64(_pM128i(a), b));
   7703 }
   7704 
   7705 
   7706 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0
   7707 _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b)
   7708 {
   7709     //no 8 bit shift available, go to 16 bit
   7710     uint8x8_t res64;
   7711     __m128i mask8;
   7712     __m128i r;
   7713     mask8 = _mm_set1_epi16(0xff);
   7714     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   7715     r = _mm_slli_epi16 (r, b); //SSE2
   7716     r = _mm_and_si128(r, mask8); //to avoid saturation
   7717     r = _mm_packus_epi16 (r,r); //we need 64 bits only
   7718     return64(r);
   7719 }
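//Illustrative note, not part of the original header: VSHL.I8 truncates rather than saturates, so the AND with
//0x00ff drops the bits shifted above bit 7 before _mm_packus_epi16; otherwise e.g. (uint8_t)0x80 << 1 would
//saturate to 0xff instead of wrapping to 0x00.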
   7720 
   7721 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a,  __constrange(0,15) int b); // VSHL.I16 d0,d0,#0
   7722 #define vshl_n_u16 vshl_n_s16
   7723 
   7724 
   7725 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VSHL.I32 d0,d0,#0
   7726 #define vshl_n_u32 vshl_n_s32
   7727 
   7728 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0
   7729 #define vshl_n_u64 vshl_n_s64
   7730 
   7731 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   7732 #define vshlq_n_s8 vshlq_n_u8
   7733 
   7734 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   7735 #define vshlq_n_s16 _mm_slli_epi16
   7736 
   7737 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   7738 #define vshlq_n_s32 _mm_slli_epi32
   7739 
   7740 _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   7741 #define vshlq_n_s64 _mm_slli_epi64
   7742 
   7743 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0
   7744 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b)
   7745 {
   7746     //no 8 bit shift available, need the special trick
   7747     __m128i mask0, r;
   7748     _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff,  0xe0ff, 0xc0ff, 0x80ff, 0xff};
   7749     mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled"  by 16 bit shift
   7750     r = _mm_slli_epi16 ( a, b);
   7751     return _mm_and_si128 (r,  mask0);
   7752 }
   7753 
   7754 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0
   7755 #define vshlq_n_u16 vshlq_n_s16
   7756 
   7757 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0
   7758 #define vshlq_n_u32 vshlq_n_s32
   7759 
   7760 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0
   7761 #define vshlq_n_u64 vshlq_n_s64
   7762 
   7763 //************* Vector rounding shift right by constant ******************
   7764 //*************************************************************************
    7765 //No corresponding x86 intrinsics exist, so some tricks are needed
   7766 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8
   7767 _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8
   7768 {
   7769     //no 8 bit shift available, go to 16 bit
   7770     int8x8_t res64;
   7771     __m128i r, maskb;
   7772     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   7773     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
   7774     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
   7775     r = _mm_srai_epi16 (r, b);
   7776     r = _mm_add_epi16 (r, maskb); //actual rounding
    7777     r = _mm_packs_epi16 (r,r); //we need 64 bits only
   7778     return64(r);
   7779 }
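//Worked example, illustrative only: for a lane value of 7 and b = 2, maskb extracts bit (b-1) = 1,
//so the result is (7 >> 2) + 1 = 2, i.e. 7/4 rounded to nearest as VRSHR requires.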
   7780 
   7781 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16
   7782 _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a,  __constrange(1,16) int b)
   7783 {
   7784     int16x4_t res64;
   7785     return64(vrshrq_n_s16(_pM128i(a), b));
   7786 }
   7787 
   7788 
   7789 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32
   7790 _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a,  __constrange(1,32) int b)
   7791 {
   7792     int32x2_t res64;
   7793     return64(vrshrq_n_s32(_pM128i(a), b));
   7794 }
   7795 
   7796 
   7797 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64
   7798 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   7799 {
   7800     //serial solution is faster
   7801     int64x1_t res;
   7802     int64_t a_i64 = *( int64_t*)&a;
   7803     if(b==64) {
    7804         res.m64_i64[0] = 0; //for some compilers rounding happens and we would need to use (a_i64 & _SIGNBIT64) >> 63;
   7805     } else {
   7806         int64_t maskb = a_i64 & (( int64_t)1 << (b - 1));
   7807         res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1));
   7808     }
   7809     return res;
   7810 }
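//Worked example, illustrative only: a = -1, b = 1: maskb extracts bit 0 (= 1), so the result is
//(-1 >> 1) + 1 = 0, matching the VRSHR.S64 round-to-nearest behaviour, whereas a plain a >> 1 would give -1.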
   7811 
   7812 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8
   7813 _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8
   7814 {
    7815     //no 8 bit shift available, go to 16 bit; the solution may not be optimal compared with the serial one
   7816     uint8x8_t res64;
   7817     __m128i r, maskb;
   7818     r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   7819     maskb =  _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit
   7820     maskb = _mm_srli_epi16 (maskb, 15); //1 or 0
   7821     r = _mm_srli_epi16 (r, b);
   7822     r = _mm_add_epi16 (r, maskb); //actual rounding
    7823     r =  _mm_packus_epi16 (r,r); //we need 64 bits only
   7824     return64(r);
   7825 }
   7826 
   7827 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16
   7828 _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a,  __constrange(1,16) int b)
   7829 {
   7830     uint16x4_t res64;
   7831     return64(vrshrq_n_u16(_pM128i(a), b));
   7832 }
   7833 
   7834 
   7835 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32
   7836 _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a,  __constrange(1,32) int b)
   7837 {
   7838     uint32x2_t res64;
   7839     return64(vrshrq_n_u32(_pM128i(a), b));
   7840 }
   7841 
   7842 
   7843 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64
   7844 _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b)
   7845 {
   7846     uint64x1_t res64;
   7847     return64(vrshrq_n_u64(_pM128i(a), b));
   7848 }
   7849 
   7850 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8
   7851 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8
   7852 {
   7853     //no 8 bit shift available, go to 16 bit trick
   7854     __m128i r, mask1, maskb;
    7855     _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set to 1 in each byte
    7856     r = vshrq_n_s8 (a, b);
    7857     mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set to 1 in each byte of the 16-bit lane, needed for rounding
    7858     maskb = _mm_and_si128(a, mask1); //get 2^(b-1) or 0 per byte for rounding
   7859     maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
   7860     return _mm_add_epi8(r, maskb); //actual rounding
   7861 }
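//Illustrative note, not part of the original header: mask2b[b] has bit (b-1) set in every byte, so the AND plus
//the shift by (b-1) yields the per-lane rounding increment (0 or 1) that is added to the truncating shift result.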
   7862 
   7863 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16
   7864 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
   7865 {
   7866     __m128i maskb, r;
   7867     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
   7868     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
   7869     r = _mm_srai_epi16 (a, b);
   7870     return _mm_add_epi16 (r, maskb); //actual rounding
   7871 }
   7872 
   7873 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32
   7874 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
   7875 {
   7876     __m128i maskb,  r;
   7877     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
   7878     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
   7879     r = _mm_srai_epi32(a, b);
   7880     return _mm_add_epi32 (r, maskb); //actual rounding
   7881 }
   7882 
   7883 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64
   7884 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b)
   7885 {
    7886     //the solution may not be optimal compared with a serial one
   7887     __m128i maskb;
   7888     int64x2_t r;
   7889     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
   7890     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
   7891     r = vshrq_n_s64(a, b);
   7892     return _mm_add_epi64 (r, maskb); //actual rounding
   7893 }
   7894 
   7895 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8
   7896 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8
   7897 {
   7898     //no 8 bit shift available, go to 16 bit trick
   7899     __m128i r, mask1, maskb;
    7900     _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // bit (b-1), i.e. 2^(b-1), set to 1 in each byte
    7901     r = vshrq_n_u8 (a, b);
    7902     mask1 = _mm_set1_epi16(mask2b[b]); // bit (b-1) set to 1 in each byte of the 16-bit lane, needed for rounding
    7903     maskb = _mm_and_si128(a, mask1); //get 2^(b-1) or 0 per byte for rounding
   7904     maskb =  _mm_srli_epi16 (maskb, b - 1); // to add 1
   7905     return _mm_add_epi8(r, maskb); //actual rounding
   7906 }
   7907 
   7908 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16
   7909 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16
   7910 {
   7911     __m128i maskb, r;
   7912     maskb =  _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit
   7913     maskb = _mm_srli_epi16(maskb, 15); //1 or 0
   7914     r = _mm_srli_epi16 (a, b);
   7915     return _mm_add_epi16 (r, maskb); //actual rounding
   7916 }
   7917 
   7918 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32
   7919 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32
   7920 {
   7921     __m128i maskb,  r;
   7922     maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit
   7923     maskb = _mm_srli_epi32 (maskb,31); //1 or 0
   7924     r = _mm_srli_epi32(a, b);
   7925     return _mm_add_epi32 (r, maskb); //actual rounding
   7926 }
   7927 
   7928 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64
   7929 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b)
   7930 {
    7931     //the solution may not be optimal compared with a serial one
   7932     __m128i maskb,  r;
   7933     maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit
   7934     maskb = _mm_srli_epi64 (maskb,63); //1 or 0
   7935     r = _mm_srli_epi64(a, b);
   7936     return _mm_add_epi64 (r, maskb); //actual rounding
   7937 }
   7938 
   7939 //************* Vector shift right by constant and accumulate *********
   7940 //*********************************************************************
   7941 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8
   7942 _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8
   7943 {
   7944     int8x8_t shift;
   7945     shift = vshr_n_s8(b, c);
   7946     return vadd_s8( a, shift);
   7947 }
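//Usage sketch, illustrative only (names are arbitrary): vsra_n_* is simply "shift right, then accumulate", e.g.
//    acc = vsra_n_s16(acc, x, 4);    // per lane: acc += x >> 4 (arithmetic shift)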
   7948 
   7949 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16
   7950 _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16
   7951 {
   7952     int16x4_t shift;
   7953     shift = vshr_n_s16( b, c);
   7954     return vadd_s16(a, shift);
   7955 }
   7956 
   7957 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32
   7958 _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32
   7959 {
    7960     //may not be optimal compared with the serial execution
   7961     int32x2_t shift;
   7962     shift = vshr_n_s32(b, c);
   7963     return vadd_s32( a, shift);
   7964 }
   7965 
   7966 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64
   7967 _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
   7968 {
    7969     //may not be optimal compared with a serial solution
   7970     int64x1_t shift;
   7971     shift = vshr_n_s64(b, c);
   7972     return vadd_s64( a, shift);
   7973 }
   7974 
   7975 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8
   7976 _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8
   7977 {
   7978     uint8x8_t shift;
   7979     shift = vshr_n_u8(b, c);
   7980     return vadd_u8(a, shift);
   7981 }
   7982 
   7983 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16
   7984 _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16
   7985 {
   7986     uint16x4_t shift;
   7987     shift = vshr_n_u16(b, c);
   7988     return vadd_u16(a,shift);
   7989 }
   7990 
   7991 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32
   7992 _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32
   7993 {
    7994     //may not be optimal compared with the serial execution
   7995     uint32x2_t shift;
   7996     shift = vshr_n_u32(b, c);
   7997     return vadd_u32( a, shift);
   7998 }
   7999 
   8000 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64
   8001 _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64
   8002 {
    8003     //may not be optimal compared with the serial execution
   8004     uint64x1_t shift;
   8005     shift = vshr_n_u64(b, c);
   8006     return vadd_u64(a, shift);
   8007 }
   8008 
   8009 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8
   8010 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8
   8011 {
   8012     int8x16_t shift;
   8013     shift = vshrq_n_s8(b, c);
   8014     return vaddq_s8(a, shift);
   8015 }
   8016 
   8017 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16
   8018 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16
   8019 {
   8020     int16x8_t shift;
   8021     shift = vshrq_n_s16(b, c);
   8022     return vaddq_s16(a, shift);
   8023 }
   8024 
   8025 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32
   8026 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32
   8027 {
   8028     int32x4_t shift;
   8029     shift = vshrq_n_s32(b, c);
   8030     return vaddq_s32(a, shift);
   8031 }
   8032 
   8033 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64
   8034 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64
   8035 {
   8036     int64x2_t shift;
   8037     shift = vshrq_n_s64(b, c);
   8038     return vaddq_s64( a, shift);
   8039 }
   8040 
   8041 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8
   8042 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8
   8043 {
   8044     uint8x16_t shift;
   8045     shift = vshrq_n_u8(b, c);
   8046     return vaddq_u8(a, shift);
   8047 }
   8048 
   8049 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16
   8050 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16
   8051 {
   8052     uint16x8_t shift;
   8053     shift = vshrq_n_u16(b, c);
   8054     return vaddq_u16(a,  shift);
   8055 }
   8056 
   8057 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32
   8058 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32
   8059 {
   8060     uint32x4_t shift;
   8061     shift = vshrq_n_u32(b, c);
   8062     return vaddq_u32(a, shift);
   8063 }
   8064 
   8065 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64
   8066 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64
   8067 {
   8068     uint64x2_t shift;
   8069     shift = vshrq_n_u64(b, c);
   8070     return vaddq_u64(a, shift);
   8071 }
   8072 
   8073 //************* Vector rounding shift right by constant and accumulate ****************************
   8074 //************************************************************************************************
   8075 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8
   8076 _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8
   8077 {
   8078     int8x8_t shift;
   8079     shift = vrshr_n_s8(b, c);
   8080     return vadd_s8( a, shift);
   8081 }
   8082 
   8083 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16
   8084 _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16
   8085 {
   8086     int16x4_t shift;
   8087     shift = vrshr_n_s16( b, c);
   8088     return vadd_s16(a, shift);
   8089 }
   8090 
   8091 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32
   8092 _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32
   8093 {
    8094     //may not be optimal compared with the serial execution
   8095     int32x2_t shift;
   8096     shift = vrshr_n_s32(b, c);
   8097     return vadd_s32( a, shift);
   8098 }
   8099 
   8100 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64
   8101 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
   8102 {
   8103     int64x1_t shift;
   8104     shift = vrshr_n_s64(b, c);
   8105     return vadd_s64( a, shift);
   8106 }
   8107 
   8108 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8
   8109 _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8
   8110 {
   8111     uint8x8_t shift;
   8112     shift = vrshr_n_u8(b, c);
   8113     return vadd_u8(a, shift);
   8114 }
   8115 
   8116 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16
   8117 _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16
   8118 {
   8119     uint16x4_t shift;
   8120     shift = vrshr_n_u16(b, c);
   8121     return vadd_u16(a,shift);
   8122 }
   8123 
   8124 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32
   8125 _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32
   8126 {
    8127     //may not be optimal compared with the serial execution
   8128     uint32x2_t shift;
   8129     shift = vrshr_n_u32(b, c);
   8130     return vadd_u32( a, shift);
   8131 }
   8132 
   8133 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64
   8134 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution
   8135 {
    8136     //may not be optimal compared with the serial execution
   8137     uint64x1_t shift;
   8138     shift = vrshr_n_u64(b, c);
   8139     return vadd_u64( a, shift);
   8140 }
   8141 
   8142 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8
   8143 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8
   8144 {
   8145     int8x16_t shift;
   8146     shift = vrshrq_n_s8(b, c);
   8147     return vaddq_s8(a, shift);
   8148 }
   8149 
   8150 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16
   8151 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16
   8152 {
   8153     int16x8_t shift;
   8154     shift = vrshrq_n_s16(b, c);
   8155     return vaddq_s16(a, shift);
   8156 }
   8157 
   8158 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32
   8159 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32
   8160 {
   8161     int32x4_t shift;
   8162     shift = vrshrq_n_s32(b, c);
   8163     return vaddq_s32(a, shift);
   8164 }
   8165 
   8166 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64
   8167 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
   8168 {
   8169     int64x2_t shift;
   8170     shift = vrshrq_n_s64(b, c);
   8171     return vaddq_s64(a, shift);
   8172 }
   8173 
   8174 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8
   8175 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8
   8176 {
   8177     uint8x16_t shift;
   8178     shift = vrshrq_n_u8(b, c);
   8179     return vaddq_u8(a, shift);
   8180 }
   8181 
   8182 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16
   8183 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16
   8184 {
   8185     uint16x8_t shift;
   8186     shift = vrshrq_n_u16(b, c);
   8187     return vaddq_u16(a,  shift);
   8188 }
   8189 
   8190 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32
   8191 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32
   8192 {
   8193     uint32x4_t shift;
   8194     shift = vrshrq_n_u32(b, c);
   8195     return vaddq_u32(a, shift);
   8196 }
   8197 
   8198 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64
   8199 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c)
   8200 {
   8201     uint64x2_t shift;
   8202     shift = vrshrq_n_u64(b, c);
   8203     return vaddq_u64(a, shift);
   8204 }
   8205 
   8206 //**********************Vector saturating shift left by constant *****************************
   8207 //********************************************************************************************
    8208 //we don't check const ranges, assuming they are met
   8209 _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0
   8210 _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0
   8211 {
   8212     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function)
   8213     int8x8_t res64;
   8214     __m128i a128, r128;
   8215     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   8216     r128 = _mm_slli_epi16 (a128, b);
   8217     r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only
   8218     return64(r128);
   8219 }
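//Worked example, illustrative only: an int8_t lane of 100 shifted left by 1 becomes 200 in the 16-bit domain,
//and _mm_packs_epi16 then saturates it to 127, exactly what VQSHL.S8 requires.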
   8220 
   8221 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0
   8222 _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0
   8223 {
   8224     // go to 32 bit to get the auto saturation (in packs function)
   8225     int16x4_t res64;
   8226     __m128i a128, r128;
   8227     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
   8228     r128 = _mm_slli_epi32 (a128, b); //shift_res
   8229     r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only
   8230     return64(r128);
   8231 }
   8232 
   8233 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0
   8234 _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a,  __constrange(0,31) int b)
   8235 {
   8236     //serial execution may be faster
   8237     int32x2_t res64;
   8238     return64(vqshlq_n_s32 (_pM128i(a), b));
   8239 }
   8240 
   8241 
   8242 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0
   8243 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8244 {
   8245     // no effective SIMD solution here
   8246     int64x1_t res;
   8247     int64_t bmask;
   8248     int64_t a_i64 = *( int64_t*)&a;
   8249     bmask = ( int64_t)1 << (63 - b); //positive
   8250     if (a_i64 >= bmask) {
   8251         res.m64_i64[0] = ~(_SIGNBIT64);
   8252     } else {
   8253         res.m64_i64[0]  = (a_i64 <= -bmask) ? _SIGNBIT64 : a_i64 << b;
   8254     }
   8255     return res;
   8256 }
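//Worked example, illustrative only: for b = 1, bmask = 1 << 62, so any a >= 2^62 saturates to 0x7fffffffffffffff
//and any a <= -2^62 saturates to 0x8000000000000000; otherwise the plain a << b is returned.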
   8257 
   8258 
   8259 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0
   8260 _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0
   8261 {
   8262     //no 8 bit shift available in IA32 SIMD, go to 16 bit
   8263     uint8x8_t res64;
   8264     __m128i a128, r128;
   8265     a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1
   8266     r128 = _mm_slli_epi16 (a128, b); //shift_res
   8267     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
   8268     return64(r128);
   8269 }
   8270 
   8271 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0
   8272 _NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0
   8273 {
   8274     // go to 32 bit to get the auto saturation (in packus function)
   8275     uint16x4_t res64;
   8276     __m128i a128, r128;
   8277     a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1
   8278     r128 = _mm_slli_epi32 (a128, b); //shift_res
   8279     r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16
   8280     return64(r128);
   8281 }
   8282 
   8283 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0
   8284 _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a,  __constrange(0,31) int b)
   8285 {
   8286     uint32x2_t res64;
   8287     return64(vqshlq_n_u32(_pM128i(a), b));
   8288 }
   8289 
   8290 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0
   8291 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8292 {
   8293     // no effective SIMD solution here
   8294     uint64x1_t res;
   8295     uint64_t bmask;
   8296     uint64_t a_i64 = *(uint64_t*)&a;
   8297     bmask = ( uint64_t)1 << (64 - b);
   8298     res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a
   8299     return res;
   8300 }
   8301 
   8302 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0
   8303 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0
   8304 {
   8305     // go to 16 bit to get the auto saturation (in packs function)
   8306     __m128i a128, r128_1, r128_2;
   8307     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
   8308     r128_1 = _mm_slli_epi16 (a128, b);
   8309     //swap hi and low part of a128 to process the remaining data
   8310     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8311     a128 = _MM_CVTEPI8_EPI16 (a128);
   8312     r128_2 = _mm_slli_epi16 (a128, b);
   8313     return _mm_packs_epi16 (r128_1, r128_2); //saturated s8
   8314 }
   8315 
   8316 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0
   8317 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0
   8318 {
    8319     // a manual saturation solution looks less optimal than the 32-bit conversion one
   8320     // go to 32 bit to get the auto saturation (in packs function)
   8321     __m128i a128, r128_1, r128_2;
   8322     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
   8323     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
   8324     //swap hi and low part of a128 to process the remaining data
   8325     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8326     a128 = _MM_CVTEPI16_EPI32 (a128);
   8327     r128_2 = _mm_slli_epi32 (a128, b);
   8328     return _mm_packs_epi32 (r128_1, r128_2); //saturated s16
   8329 }
   8330 
   8331 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0
   8332 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0
   8333 {
   8334     // no 64 bit saturation option available, special tricks necessary
   8335     __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask;
   8336     c1 = _mm_cmpeq_epi32(a,a); //0xff..ff
    8337     maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers: (b+1) zeros then (31-b) ones, the largest value that can be shifted left by b
   8338     saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0  otherwise
   8339     c7ffffff_mask  = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not
   8340     shift_res = _mm_slli_epi32 (a, b);
   8341     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
   8342     //result with positive numbers saturated
   8343     shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask);
   8344     //treat negative numbers
    8345     maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers: (b+1) ones then (31-b) zeros, the most negative value that can be shifted left by b
   8346     saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0  otherwise
   8347     c7ffffff_mask  = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not
   8348     shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res);
   8349     return _mm_or_si128 (c7ffffff_mask, shift_res_mask);
   8350 }
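//Illustrative note, not part of the original header: the first maskA equals 0x7fffffff >> b, the largest value
//that can be shifted left by b without overflow; greater lanes are forced to 0x7fffffff. The mirrored mask built
//with the left shift handles the negative side, forcing overflowing lanes to 0x80000000.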
   8351 
   8352 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0
   8353 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8354 {
   8355     // no effective SIMD solution here
   8356     _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2];
   8357     int64_t bmask;
   8358     int i;
   8359     bmask = ( int64_t)1 << (63 - b); //positive
   8360     _mm_store_si128((__m128i*)atmp, a);
   8361     for (i = 0; i<2; i++) {
   8362         if (atmp[i] >= bmask) {
   8363             res[i] = ~(_SIGNBIT64);
   8364         } else {
   8365             res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b;
   8366         }
   8367     }
   8368     return _mm_load_si128((__m128i*)res);
   8369 }
   8370 
   8371 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0
   8372 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0
   8373 {
    8374     // go to 16 bit to get the auto saturation (in the packus function)
   8375     __m128i a128, r128_1, r128_2;
   8376     a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1
   8377     r128_1 = _mm_slli_epi16 (a128, b);
   8378     //swap hi and low part of a128 to process the remaining data
   8379     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8380     a128 = _MM_CVTEPU8_EPI16 (a128);
   8381     r128_2 = _mm_slli_epi16 (a128, b);
   8382     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
   8383 }
   8384 
   8385 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0
   8386 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0
   8387 {
    8388     // a manual saturation solution looks more optimal than the 32-bit conversion one
   8389     __m128i cb, c8000, a_signed, saturation_mask,  shift_res;
   8390     cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 );
   8391     c8000 = _mm_set1_epi16 ((int16_t)0x8000);
    8392 //no unsigned 16-bit comparison in SSE, only signed is available, so a bias trick is needed
   8393     a_signed = _mm_sub_epi16(a, c8000); //go to signed
   8394     saturation_mask = _mm_cmpgt_epi16 (a_signed, cb);
   8395     shift_res = _mm_slli_epi16 (a, b);
   8396     return _mm_or_si128 (shift_res, saturation_mask);
   8397 }
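//Worked example, illustrative only: for b = 2 the threshold cb corresponds to the unsigned value 0x3fff; the bias
//by 0x8000 turns the unsigned comparison a > 0x3fff into a signed _mm_cmpgt_epi16, and lanes above the threshold
//are OR-ed with 0xffff (saturated) while the rest keep the plain left shift.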
   8398 
   8399 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0
   8400 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0
   8401 {
   8402     // manual saturation solution, no 64 bit saturation option, the serial version may be faster
   8403     __m128i cb, c80000000, a_signed, saturation_mask,  shift_res;
   8404     cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 );
   8405     c80000000 = _mm_set1_epi32 (0x80000000);
    8406 //no unsigned 32-bit comparison in SSE, only signed is available, so a bias trick is needed
   8407     a_signed = _mm_sub_epi32(a, c80000000); //go to signed
   8408     saturation_mask = _mm_cmpgt_epi32 (a_signed, cb);
   8409     shift_res = _mm_slli_epi32 (a, b);
   8410     return _mm_or_si128 (shift_res, saturation_mask);
   8411 }
   8412 
   8413 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0
   8414 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL)
   8415 {
   8416     // no effective SIMD solution here
   8417     _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2];
   8418     uint64_t bmask;
   8419     int i;
   8420     bmask = ( uint64_t)1 << (64 - b);
   8421     _mm_store_si128((__m128i*)atmp, a);
   8422     for (i = 0; i<2; i++) {
   8423         res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a
   8424     }
   8425     return _mm_load_si128((__m128i*)res);
   8426 }
   8427 
   8428 //**************Vector signed->unsigned saturating shift left by constant *************
   8429 //*************************************************************************************
   8430 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0
   8431 _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0
   8432 {
    8433     //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in the packus function)
   8434     uint8x8_t res64;
   8435     __m128i a128, r128;
   8436     a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   8437     r128 = _mm_slli_epi16 (a128, b);
   8438     r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only
   8439     return64(r128);
   8440 }
   8441 
   8442 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0
   8443 _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0
   8444 {
   8445     uint16x4_t res64;
   8446     __m128i a128, r128;
   8447     a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1
   8448     r128 = _mm_slli_epi32 (a128, b); //shift_res
   8449     r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only
   8450     return64(r128);
   8451 }
   8452 
   8453 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0
    8454 _NEON2SSE_INLINE uint32x2_t vqshlu_n_s32(int32x2_t a,  __constrange(0,31) int b)
    8455 {
    8456     uint32x2_t res64;
   8457     return64( vqshluq_n_s32(_pM128i(a), b));
   8458 }
   8459 
   8460 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0
   8461 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster
   8462 {
   8463     uint64x1_t res;
   8464     uint64_t limit;
   8465     if (a.m64_i64[0]<=0) {
   8466         res.m64_u64[0] = 0;
   8467     } else {
   8468         limit = (uint64_t) 1 << (64 - b);
    8469         res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? ~((uint64_t)0) : (uint64_t)a.m64_i64[0] << b;
   8470     }
   8471     return res;
   8472 }
   8473 
   8474 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0
   8475 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0
   8476 {
   8477     __m128i a128, r128_1, r128_2;
   8478     a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1
   8479     r128_1 = _mm_slli_epi16 (a128, b);
   8480     //swap hi and low part of a128 to process the remaining data
   8481     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8482     a128 = _MM_CVTEPI8_EPI16 (a128);
   8483     r128_2 = _mm_slli_epi16 (a128, b);
   8484     return _mm_packus_epi16 (r128_1, r128_2); //saturated u8
   8485 }
   8486 
   8487 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0
   8488 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0
   8489 {
   8490     // manual saturation solution looks LESS optimal than 32 bits conversion one
   8491     __m128i a128, r128_1, r128_2;
   8492     a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1
   8493     r128_1 = _mm_slli_epi32 (a128, b); //shift_res
   8494     //swap hi and low part of a128 to process the remaining data
   8495     a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32);
   8496     a128 = _MM_CVTEPI16_EPI32 (a128);
   8497     r128_2 = _mm_slli_epi32 (a128, b);
    8498     return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated u16
   8499 }
   8500 
   8501 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0
   8502 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0
   8503 {
   8504     //solution may be  not optimal compared with the serial one
   8505     __m128i zero, maskA, maskGT0, a0,  a_masked, a_shift;
   8506     zero = _mm_setzero_si128();
   8507     maskA = _mm_cmpeq_epi32(a, a);
   8508     maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros
   8509     //saturate negative numbers to zero
    8510     maskGT0   = _mm_cmpgt_epi32 (a, zero); //0xffffffff if positive number and zero otherwise (negative numbers)
   8511     a0 = _mm_and_si128 (a,  maskGT0); //negative are zeros now
   8512     //saturate positive to 0xffffffff
   8513     a_masked = _mm_and_si128 (a0, maskA);
   8514     a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise
   8515     a_shift = _mm_slli_epi32 (a0, b);
   8516     return _mm_or_si128 (a_shift, a_masked); //actual saturation
   8517 }
   8518 
   8519 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0
   8520 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b),  _NEON2SSE_REASON_SLOW_SERIAL)
   8521 {
   8522     // no effective SIMD solution here, serial execution looks faster
   8523     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   8524     _NEON2SSE_ALIGN_16 uint64_t res[2];
   8525     uint64_t limit;
   8526     int i;
   8527     _mm_store_si128((__m128i*)atmp, a);
   8528     for (i = 0; i<2; i++) {
   8529         if (atmp[i]<=0) {
   8530             res[i] = 0;
   8531         } else {
   8532             limit = (uint64_t) 1 << (64 - b);
    8533             res[i] = ( ((uint64_t)atmp[i]) >= limit) ? ~((uint64_t)0) : (uint64_t)atmp[i] << b;
   8534         }
   8535     }
   8536     return _mm_load_si128((__m128i*)res);
   8537 }
   8538 
   8539 //************** Vector narrowing  shift right by constant **************
   8540 //**********************************************************************
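//Illustrative scalar model (not from the original source): the plain narrowing shifts below shift right
//and then keep only the low half of each element - truncation, no saturation.
/*
static int8_t vshrn_n_s16_lane(int16_t a, int b)     // one lane, assuming 1 <= b <= 8
{
    return (int8_t)(a >> b);                         // arithmetic shift, then take the low 8 bits
}
*/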
   8541 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   8542 _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
   8543 {
   8544     int8x8_t res64;
   8545     __m128i r16;
   8546     r16  = vshrq_n_s16(a,b);
   8547     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8548     return64(r16);
   8549 }
   8550 
   8551 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   8552 _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
   8553 {
   8554     int16x4_t res64;
   8555     __m128i r32;
   8556     r32  = vshrq_n_s32(a,b);
   8557     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8558     return64(r32);
   8559 }
   8560 
   8561 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   8562 _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
   8563 {
   8564     int32x2_t res64;
   8565     __m128i r64;
   8566     r64  = vshrq_n_s64(a,b);
   8567     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8568     return64(r64);
   8569 }
   8570 
   8571 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8
   8572 _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8
   8573 {
   8574     uint8x8_t res64;
   8575     __m128i mask, r16;
   8576     mask = _mm_set1_epi16(0xff);
   8577     r16  = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8578     r16 = _mm_and_si128(r16, mask); //to avoid saturation
   8579     r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only
   8580     return64(r16);
   8581 }
   8582 
   8583 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16
   8584 _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16
   8585 {
   8586     uint16x4_t res64;
   8587     __m128i mask, r32;
   8588     mask = _mm_set1_epi32(0xffff);
   8589     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8590     r32 = _mm_and_si128(r32, mask); //to avoid saturation
   8591     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8592     return64(r32);
   8593 }
   8594 
   8595 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32
   8596 _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
   8597 {
   8598     uint32x2_t res64;
   8599     __m128i r64;
   8600     r64  = vshrq_n_u64(a,b);
   8601     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8602     return64(r64);
   8603 }
   8604 
   8605 //************** Vector signed->unsigned narrowing saturating shift right by constant ********
   8606 //*********************************************************************************************
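//Illustrative scalar model (not from the original source): these shift right and then saturate to the
//UNSIGNED range of the narrower type, so negative results become zero.
/*
static uint8_t vqshrun_n_s16_lane(int16_t a, int b)  // one lane, assuming 1 <= b <= 8
{
    int16_t r = (int16_t)(a >> b);
    if (r < 0) return 0;                             // negative results saturate to zero
    if (r > 0xff) return 0xff;                       // overflow saturates to the unsigned maximum
    return (uint8_t)r;
}
*/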
   8607 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8
   8608 _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8
   8609 {
   8610     uint8x8_t res64;
   8611     __m128i r16;
   8612     r16  = vshrq_n_s16(a,b);
   8613     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
   8614     return64(r16);
   8615 }
   8616 
   8617 _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16
   8618 _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16
   8619 {
   8620     uint16x4_t res64;
   8621     __m128i r32;
   8622     r32  = vshrq_n_s32(a,b);
   8623     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow(signed to unsigned), use low 64 bits only
   8624     return64(r32);
   8625 }
   8626 
   8627 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32
   8628 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
   8629 {
   8630     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   8631     uint32x2_t res;
   8632     int64_t res64;
   8633     _mm_store_si128((__m128i*)atmp, a);
   8634     if (atmp[0] < 0) {
   8635         res.m64_u32[0] = 0;
   8636     } else {
   8637         res64 = (atmp[0] >> b);
   8638         res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
   8639     }
   8640     if (atmp[1] < 0) {
   8641         res.m64_u32[1] = 0;
   8642     } else {
   8643         res64 = (atmp[1] >> b);
   8644         res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
   8645     }
   8646     return res;
   8647 }
   8648 
   8649 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
   8650 _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
   8651 _NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
   8652 {
   8653     //solution may be not optimal compared with the serial one
   8654     __m128i r16;
   8655     uint8x8_t res64;
   8656     r16 = vrshrq_n_s16(a,b);
   8657     r16 =  _mm_packus_epi16 (r16,r16); //saturate and  narrow (signed to unsigned), use low 64 bits only
   8658     return64(r16);
   8659 }
   8660 
   8661 _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
   8662 _NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
   8663 {
   8664     //solution may be not optimal compared with the serial one
   8665     __m128i r32;
   8666     uint16x4_t res64;
   8667     r32 = vrshrq_n_s32(a,b);
   8668     r32 =  _MM_PACKUS1_EPI32 (r32); //saturate and  narrow (signed to unsigned), use low 64 bits only
   8669     return64(r32);
   8670 }
   8671 
   8672 _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
   8673 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
   8674 {
   8675     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   8676     uint32x2_t res;
   8677     int64_t res64;
   8678     _mm_store_si128((__m128i*)atmp, a);
   8679     if (atmp[0] < 0) {
   8680         res.m64_u32[0] = 0;
   8681     } else {
   8682         res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
   8683         res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
   8684     }
   8685     if (atmp[1] < 0) {
   8686         res.m64_u32[1] = 0;
   8687     } else {
    8688         res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1)  );
   8689         res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
   8690     }
   8691     return res;
   8692 }
   8693 
   8694 //***** Vector narrowing saturating shift right by constant ******
   8695 //*****************************************************************
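//Illustrative scalar model (not from the original source): unlike the plain narrowing shifts above,
//these saturate the shifted value to the range of the narrower signed type instead of truncating it.
/*
static int8_t vqshrn_n_s16_lane(int16_t a, int b)    // one lane, assuming 1 <= b <= 8
{
    int16_t r = (int16_t)(a >> b);
    if (r > 127) return 127;                         // saturate to the int8 range
    if (r < -128) return -128;
    return (int8_t)r;
}
*/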
   8696 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8
   8697 _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8
   8698 {
   8699     int8x8_t res64;
   8700     __m128i r16;
   8701     r16  = vshrq_n_s16(a,b);
   8702     r16  = _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8703     return64(r16);
   8704 }
   8705 
   8706 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16
   8707 _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16
   8708 {
   8709     int16x4_t res64;
   8710     __m128i r32;
   8711     r32  = vshrq_n_s32(a,b);
   8712     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
   8713     return64(r32);
   8714 }
   8715 
   8716 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32
   8717 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   8718 {
   8719     //no optimal SIMD solution found
   8720     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2];
   8721     int32x2_t res;
   8722     _mm_store_si128((__m128i*)atmp, a);
   8723     res64[0] = (atmp[0] >> b);
   8724     res64[1] = (atmp[1] >> b);
   8725     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
   8726     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
   8727     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
   8728     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
   8729     res.m64_i32[0] = (int32_t)res64[0];
   8730     res.m64_i32[1] = (int32_t)res64[1];
   8731     return res;
   8732 }
   8733 
   8734 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8
   8735 _NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8
   8736 {
   8737     uint8x8_t res64;
   8738     __m128i r16;
   8739     r16  = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8740     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8741     return64(r16);
   8742 }
   8743 
   8744 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16
   8745 _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16
   8746 {
   8747     uint16x4_t res64;
   8748     __m128i r32;
    8749     r32  = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8750     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8751     return64(r32);
   8752 }
   8753 
   8754 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32
   8755 _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
   8756 {
   8757     //serial solution may be faster
   8758     uint32x2_t res64;
   8759     __m128i r64, res_hi, zero;
   8760     zero = _mm_setzero_si128();
   8761     r64  = vshrq_n_u64(a,b);
   8762     res_hi = _mm_srli_epi64(r64,  32);
   8763     res_hi = _mm_cmpgt_epi32(res_hi, zero);
   8764     r64 = _mm_or_si128(r64, res_hi);
   8765     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8766     return64(r64);
   8767 }
   8768 
   8769 
   8770 //********* Vector rounding narrowing shift right by constant *************************
   8771 //****************************************************************************************
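//Illustrative scalar model (not from the original source): the "rounding" variants add 1 to the
//truncated result whenever the most significant dropped bit is set, which is equivalent to adding
//1 << (b - 1) before the shift.
/*
static int8_t vrshrn_n_s16_lane(int16_t a, int b)    // one lane, assuming 1 <= b <= 8
{
    int rounded = (a >> b) + ((a >> (b - 1)) & 1);   // add the most significant dropped bit
    return (int8_t)rounded;                          // then narrow by truncation, no saturation
}
*/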
   8772 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   8773 _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
   8774 {
   8775     int8x8_t res64;
   8776     __m128i r16;
   8777      r16  = vrshrq_n_s16(a,b);
   8778     r16  = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8779     return64(r16);
   8780 }
   8781 
   8782 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   8783 _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
   8784 {
   8785     int16x4_t res64;
   8786     __m128i r32;
   8787     r32  = vrshrq_n_s32(a,b);
   8788     r32  =  _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems
   8789     return64(r32);
   8790 }
   8791 
   8792 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   8793 _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b)
   8794 {
   8795     int32x2_t res64;
   8796     __m128i r64;
   8797     r64  = vrshrq_n_s64(a,b);
   8798     r64  = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8799     return64(r64);
   8800 }
   8801 
   8802 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8
   8803 _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8
   8804 {
   8805     uint8x8_t res64;
   8806     __m128i mask, r16;
   8807     mask = _mm_set1_epi16(0xff);
   8808     r16  = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8809     r16 = _mm_and_si128(r16, mask); //to avoid saturation
   8810     r16 = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8811     return64(r16);
   8812 }
   8813 
   8814 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16
   8815 _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16
   8816 {
   8817     uint16x4_t res64;
   8818     __m128i mask, r32;
   8819     mask = _mm_set1_epi32(0xffff);
    8820     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8821     r32 = _mm_and_si128(r32, mask); //to avoid saturation
   8822     r32 = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8823     return64(r32);
   8824 }
   8825 
   8826 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32
   8827 _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster
   8828 {
   8829     uint32x2_t res64;
   8830     __m128i r64;
   8831     r64  = vrshrq_n_u64(a,b);
   8832     r64  =  _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8833     return64(r64);
   8834 }
   8835 
   8836 //************* Vector rounding narrowing saturating shift right by constant ************
   8837 //****************************************************************************************
   8838 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8
   8839 _NEON2SSE_INLINE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8
   8840 {
   8841     int8x8_t res64;
   8842     __m128i r16;
   8843     r16  = vrshrq_n_s16(a,b);
   8844     r16  =  _mm_packs_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8845     return64(r16);
   8846 }
   8847 
   8848 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16
   8849 _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16
   8850 {
   8851     int16x4_t res64;
   8852     __m128i r32;
   8853     r32  = vrshrq_n_s32(a,b);
   8854     r32  = _mm_packs_epi32 (r32,r32); //saturate and  narrow, use low 64 bits only
   8855     return64(r32);
   8856 }
   8857 
   8858 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32
   8859 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   8860 {
   8861     //no optimal SIMD solution found
   8862     _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2];
   8863     int32x2_t res;
   8864     _mm_store_si128((__m128i*)atmp, a);
   8865     maskb[0] = atmp[0] & (( int64_t)1 << (b - 1));
   8866     res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result
   8867     maskb[1] = atmp[1] & (( int64_t)1 << (b - 1));
   8868     res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result
   8869     if(res64[0]>SINT_MAX) res64[0] = SINT_MAX;
   8870     if(res64[0]<SINT_MIN) res64[0] = SINT_MIN;
   8871     if(res64[1]>SINT_MAX) res64[1] = SINT_MAX;
   8872     if(res64[1]<SINT_MIN) res64[1] = SINT_MIN;
   8873     res.m64_i32[0] = (int32_t)res64[0];
   8874     res.m64_i32[1] = (int32_t)res64[1];
   8875     return res;
   8876 }
   8877 
   8878 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8
   8879 _NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8
   8880 {
   8881     uint8x8_t res64;
   8882     __m128i r16;
   8883     r16  = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8)
   8884     r16  = _mm_packus_epi16 (r16,r16); //saturate and  narrow, use low 64 bits only
   8885     return64(r16);
   8886 }
   8887 
   8888 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16
   8889 _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16
   8890 {
   8891     uint16x4_t res64;
   8892     __m128i r32;
    8893     r32  = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16)
   8894     r32  = _MM_PACKUS1_EPI32 (r32); //saturate and  narrow, use low 64 bits only
   8895     return64(r32);
   8896 }
   8897 
   8898 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32
   8899 _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b)
   8900 {
   8901     //serial solution may be faster
   8902     uint32x2_t res64;
   8903     __m128i r64, res_hi, zero;
   8904     zero = _mm_setzero_si128();
   8905     r64  = vrshrq_n_u64(a,b);
   8906     res_hi = _mm_srli_epi64(r64,  32);
   8907     res_hi = _mm_cmpgt_epi32(res_hi, zero);
   8908     r64 = _mm_or_si128(r64, res_hi);
   8909     r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   8910     return64(r64);
   8911 }
   8912 
   8913 //************** Vector widening shift left by constant ****************
   8914 //************************************************************************
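//Illustrative scalar model (not from the original source): widening shifts promote each element to
//double width before shifting, so even b equal to the source element size loses no bits.
/*
static uint16_t vshll_n_u8_lane(uint8_t a, int b)    // one lane, assuming 0 <= b <= 8
{
    return (uint16_t)((uint16_t)a << b);             // widen to 16 bits first, then shift
}
*/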
   8915 _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0
   8916 _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0
   8917 {
   8918     __m128i r;
   8919     r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1
   8920     return _mm_slli_epi16 (r, b);
   8921 }
   8922 
   8923 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0
   8924 _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0
   8925 {
   8926     __m128i r;
   8927     r =  _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1,
   8928     return _mm_slli_epi32 (r, b);
   8929 }
   8930 
   8931 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0
   8932 _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0
   8933 {
   8934     __m128i r;
   8935     r =  _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1,
   8936     return _mm_slli_epi64 (r, b);
   8937 }
   8938 
   8939 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0
   8940 _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0
   8941 {
   8942     //no uint8 to uint16 conversion available, manual conversion used
   8943     __m128i zero,  r;
   8944     zero = _mm_setzero_si128 ();
   8945     r = _mm_unpacklo_epi8(_pM128i(a), zero);
   8946     return _mm_slli_epi16 (r, b);
   8947 }
   8948 
   8949 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0
   8950 _NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0
   8951 {
   8952     //no uint16 to uint32 conversion available, manual conversion used
   8953     __m128i zero,  r;
   8954     zero = _mm_setzero_si128 ();
   8955     r = _mm_unpacklo_epi16(_pM128i(a), zero);
   8956     return _mm_slli_epi32 (r, b);
   8957 }
   8958 
   8959 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0
   8960 _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0
   8961 {
   8962     //no uint32 to uint64 conversion available, manual conversion used
   8963     __m128i zero,  r;
   8964     zero = _mm_setzero_si128 ();
   8965     r = _mm_unpacklo_epi32(_pM128i(a), zero);
   8966     return _mm_slli_epi64 (r, b);
   8967 }
   8968 
   8969 //************************************************************************************
   8970 //**************************** Shifts with insert ************************************
   8971 //************************************************************************************
    8972 //These intrinsics take each element in a vector, shift it by an immediate value,
    8973 //and insert the result into the destination vector. Bits shifted out of each element are lost.
   8974 
   8975 //**************** Vector shift right and insert ************************************
    8976 //Actually the "c" leftmost bits of "a" are the only bits of "a" that remain after the shift;
    8977 //all other bits are taken from the shifted "b".
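//A scalar model of the insertion for one 8-bit lane may make the bit layout clearer (illustrative only):
/*
static uint8_t vsri_n_u8_lane(uint8_t a, uint8_t b, int c)   // assuming 1 <= c <= 8
{
    uint8_t a_mask  = (uint8_t)(0xff << (8 - c));            // keep the c leftmost bits of a
    uint8_t b_shift = (uint8_t)(b >> c);                     // c leading zeros appear in b
    return (uint8_t)((a & a_mask) | b_shift);
}
*/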
   8978 _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   8979 _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a,  int8x8_t b, __constrange(1,8) int c)
   8980 {
   8981     int8x8_t res64;
   8982     return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c));
   8983 }
   8984 
   8985 
   8986 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   8987 _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a,  int16x4_t b, __constrange(1,16) int c)
   8988 {
   8989     int16x4_t res64;
   8990     return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c));
   8991 }
   8992 
   8993 
   8994 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   8995 _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a,  int32x2_t b, __constrange(1,32) int c)
   8996 {
   8997     int32x2_t res64;
   8998     return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c));
   8999 }
   9000 
   9001 
   9002 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   9003 _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c)
   9004 {
   9005     int64x1_t res;
    9006     if (c == 64)
    9007         res = a;
    9008     else{
    9009         res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_u64[0] >> (64 - c)) << (64 - c)); //treat a and b as unsigned for the shifts to get zero-filling (logical) behavior
    9010     }
   9011     return res;
   9012 }
   9013 
   9014 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   9015 #define vsri_n_u8 vsri_n_s8
   9016 
   9017 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   9018 #define vsri_n_u16 vsri_n_s16
   9019 
   9020 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32
   9021 #define vsri_n_u32 vsri_n_s32
   9022 
   9023 
   9024 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64
   9025 #define vsri_n_u64 vsri_n_s64
   9026 
   9027 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8
   9028 #define vsri_n_p8 vsri_n_u8
   9029 
   9030 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16
   9031 #define vsri_n_p16 vsri_n_u16
   9032 
   9033 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   9034 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8
   9035 {
   9036     __m128i maskA, a_masked;
   9037     uint8x16_t b_shift;
   9038     _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used
   9039     maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros
   9040     a_masked = _mm_and_si128 (a, maskA);
   9041     b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift
   9042     return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a)
   9043 }
   9044 
   9045 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   9046 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16
   9047 {
   9048     //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
   9049     uint16x8_t b_shift;
   9050     uint16x8_t a_c;
   9051     b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift
   9052     a_c = vshrq_n_u16( a, (16 - c));
   9053     a_c  = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a
   9054     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
   9055 }
   9056 
   9057 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   9058 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32
   9059 {
   9060     //to cut "c" left bits from a we do shift right and then  shift back left providing c right zeros in a
   9061     uint32x4_t b_shift;
   9062     uint32x4_t a_c;
   9063     b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift
   9064     a_c = vshrq_n_u32( a, (32 - c));
   9065     a_c  = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a
   9066     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
   9067 }
   9068 
   9069 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   9070 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c)
   9071 {
   9072     //serial solution may be faster
   9073     uint64x2_t b_shift;
   9074     uint64x2_t a_c;
   9075     b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift
   9076     a_c = _mm_srli_epi64(a, (64 - c));
   9077     a_c  = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a
   9078     return _mm_or_si128 (a_c, b_shift); //combine (insert b into a)
   9079 }
   9080 
   9081 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   9082 #define vsriq_n_u8 vsriq_n_s8
   9083 
   9084 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   9085 #define vsriq_n_u16 vsriq_n_s16
   9086 
   9087 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32
   9088 #define vsriq_n_u32 vsriq_n_s32
   9089 
   9090 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64
   9091 #define vsriq_n_u64 vsriq_n_s64
   9092 
   9093 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8
   9094 #define vsriq_n_p8 vsriq_n_u8
   9095 
   9096 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16
   9097 #define vsriq_n_p16 vsriq_n_u16
   9098 
   9099 //***** Vector shift left and insert *********************************************
   9100 //*********************************************************************************
    9101 //Actually the "c" rightmost bits of "a" are the only bits of "a" that remain after the shift;
    9102 //all other bits are taken from the shifted "b". The shift inserts trailing zeros into b, so we need to combine "a" and the shifted "b".
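//A scalar model of the insertion for one 8-bit lane (illustrative only, not part of the port):
/*
static uint8_t vsli_n_u8_lane(uint8_t a, uint8_t b, int c)   // assuming 0 <= c <= 7
{
    uint8_t a_mask  = (uint8_t)(0xff >> (8 - c));            // keep the c rightmost bits of a
    uint8_t b_shift = (uint8_t)(b << c);                     // c trailing zeros appear in b
    return (uint8_t)(b_shift | (a & a_mask));
}
*/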
   9103 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   9104 _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a,  int8x8_t b, __constrange(0,7) int c)
   9105 {
   9106     int8x8_t res64;
   9107     return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c));
   9108 }
   9109 
   9110 
   9111 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   9112 _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a,  int16x4_t b, __constrange(0,15) int c)
   9113 {
   9114     int16x4_t res64;
   9115     return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c));
   9116 }
   9117 
   9118 
   9119 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   9120 _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a,  int32x2_t b, __constrange(0,31) int c)
   9121 {
   9122     int32x2_t res64;
   9123     return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c));
   9124 }
   9125 
   9126 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   9127 _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c)
   9128 {
   9129     int64x1_t res;
   9130     res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros
   9131     return res;
   9132 }
   9133 
   9134 
   9135 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   9136 #define vsli_n_u8 vsli_n_s8
   9137 
   9138 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   9139 #define vsli_n_u16 vsli_n_s16
   9140 
   9141 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0
   9142 #define vsli_n_u32 vsli_n_s32
   9143 
   9144 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0
   9145 #define vsli_n_u64 vsli_n_s64
   9146 
   9147 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0
   9148 #define vsli_n_p8 vsli_n_u8
   9149 
   9150 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0
   9151 #define vsli_n_p16 vsli_n_u16
   9152 
   9153 _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   9154 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0
   9155 {
   9156     __m128i maskA, a_masked;
   9157     int8x16_t b_shift;
   9158     _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask
   9159     maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones
   9160     b_shift = vshlq_n_s8( b, c);
   9161     a_masked = _mm_and_si128 (a, maskA);
   9162     return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a)
   9163 }
   9164 
   9165 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   9166 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0
   9167 {
   9168     //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a
   9169     int16x8_t b_shift;
   9170     int16x8_t a_c;
   9171     b_shift = vshlq_n_s16( b, c);
   9172     a_c = vshlq_n_s16( a, (16 - c));
   9173     a_c  = _mm_srli_epi16(a_c, (16 - c));
   9174     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
   9175 }
   9176 
   9177 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   9178 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0
   9179 {
   9180     //solution may be  not optimal compared with the serial one
   9181     //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a
   9182     int32x4_t b_shift;
   9183     int32x4_t a_c;
   9184     b_shift = vshlq_n_s32( b, c);
   9185     a_c = vshlq_n_s32( a, (32 - c));
   9186     a_c  = _mm_srli_epi32(a_c, (32 - c));
   9187     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
   9188 }
   9189 
   9190 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   9191 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0
   9192 {
   9193     //solution may be  not optimal compared with the serial one
   9194     //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a
   9195     int64x2_t b_shift;
   9196     int64x2_t a_c;
   9197     b_shift = vshlq_n_s64( b, c);
   9198     a_c = vshlq_n_s64( a, (64 - c));
   9199     a_c  = _mm_srli_epi64(a_c, (64 - c));
   9200     return _mm_or_si128 (b_shift, a_c); //combine (insert b into a)
   9201 }
   9202 
   9203 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   9204 #define vsliq_n_u8 vsliq_n_s8
   9205 
   9206 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   9207 #define vsliq_n_u16 vsliq_n_s16
   9208 
   9209 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0
   9210 #define vsliq_n_u32 vsliq_n_s32
   9211 
   9212 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0
   9213 #define vsliq_n_u64 vsliq_n_s64
   9214 
   9215 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0
   9216 #define vsliq_n_p8 vsliq_n_u8
   9217 
   9218 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0
   9219 #define vsliq_n_p16 vsliq_n_u16
   9220 
   9221 // ***********************************************************************************************
   9222 // ****************** Loads and stores of a single vector ***************************************
   9223 // ***********************************************************************************************
   9224 //Performs loads and stores of a single vector of some type.
   9225 //*******************************  Loads ********************************************************
   9226 // ***********************************************************************************************
    9227 //In the general case we assume ptr is NOT aligned and use __m128i _mm_loadu_si128 ((__m128i*) ptr).
    9228 //On SSE3-capable systems the __m128i _mm_lddqu_si128 (__m128i const* p) intrinsic may also be advantageous for unaligned access:
    9229 //it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access.
    9230 //If ptr is aligned then __m128i _mm_load_si128 ((__m128i*) ptr) could be used instead.
   9231 #define LOAD_SI128(ptr) \
   9232         ( ((uintptr_t)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr))
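//Usage sketch (illustrative only, hypothetical helper): the loads below are used exactly like their
//NEON counterparts, e.g. to move 16 bytes through a vector register.
/*
static void copy16_bytes(uint8_t * dst, uint8_t const * src)
{
    uint8x16_t v = vld1q_u8(src);      // alignment-safe load via LOAD_SI128 above
    vst1q_u8(dst, v);                  // the matching store is defined in the Store section below
}
*/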
   9233 
   9234 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   9235 #define vld1q_u8 LOAD_SI128
   9236 
   9237 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   9238 #define vld1q_u16 LOAD_SI128
   9239 
   9240 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   9241 #define vld1q_u32 LOAD_SI128
   9242 
   9243 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9244 #define vld1q_u64 LOAD_SI128
   9245 
   9246 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   9247 #define vld1q_s8 LOAD_SI128
   9248 
   9249 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   9250 #define vld1q_s16 LOAD_SI128
   9251 
   9252 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   9253 #define vld1q_s32 LOAD_SI128
   9254 
   9255 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9256 #define vld1q_s64 LOAD_SI128
   9257 
   9258 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0]
   9259 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers
   9260 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0]
   9261 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
   9262 __m128 f2;
   9263 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]);
   9264 }*/
   9265 
   9266 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0]
   9267 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr)
   9268 {
    9269     if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16 bytes aligned
   9270         return _mm_load_ps(ptr);
   9271     else
   9272         return _mm_loadu_ps(ptr);
   9273 }
   9274 
   9275 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0]
   9276 #define vld1q_p8  LOAD_SI128
   9277 
   9278 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0]
   9279 #define vld1q_p16 LOAD_SI128
   9280 
   9281 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0]
   9282 #define vld1_u8(ptr)  *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr))
   9283 
   9284 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0]
   9285 #define vld1_u16 vld1_u8
   9286 
   9287 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0]
   9288 #define vld1_u32 vld1_u8
   9289 
   9290 
   9291 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   9292 #define vld1_u64 vld1_u8
   9293 
   9294 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0]
   9295 #define vld1_s8 vld1_u8
   9296 
   9297 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0]
   9298 #define vld1_s16 vld1_u16
   9299 
   9300 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0]
   9301 #define vld1_s32 vld1_u32
   9302 
   9303 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   9304 #define vld1_s64 vld1_u64
   9305 
   9306 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0]
   9307 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]);
   9308 
   9309 _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
   9310 _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr)
   9311 {
   9312     float32x2_t res;
   9313     res.m64_f32[0] = *(ptr);
   9314     res.m64_f32[1] = *(ptr + 1);
   9315     return res;
   9316 }
   9317 
   9318 _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
   9319 #define vld1_p8 vld1_u8
   9320 
   9321 _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]
   9322 #define vld1_p16 vld1_u16
   9323 
   9324 
    9325 _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
    9326 _NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(2) float64_t const * ptr)
   9327 {
    9328     if ((((uintptr_t)(ptr)) & 15) == 0) //16 bytes aligned
   9329         return _mm_load_pd(ptr);
   9330     else
   9331         return _mm_loadu_pd(ptr);
   9332 }
   9333 
   9334 
   9335 //***********************************************************************************************************
   9336 //******* Lane load functions - insert the data at  vector's given position (lane) *************************
   9337 //***********************************************************************************************************
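//Usage sketch (illustrative only, hypothetical helper): replace a single lane of an existing vector
//with a value loaded from memory; the lane index must be an immediate constant.
/*
static uint32x4_t load_into_lane2(uint32x4_t v, uint32_t const * p)
{
    return vld1q_lane_u32(p, v, 2);    // expands to _MM_INSERT_EPI32(v, *p, 2)
}
*/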
   9338 _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
   9339 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   9340 
   9341 _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1)    uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9342 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   9343 
   9344 _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   9345 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
   9346 
   9347 _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
    9348 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
   9349 
   9350 
   9351 _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
   9352 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   9353 
   9354 _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9355 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   9356 
   9357 _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   9358 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane)
   9359 
   9360 _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9361 //current IA SIMD doesn't support float16
   9362 
   9363 _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
   9364 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane)
   9365 {
    9366     //we need to deal with the case of ptr NOT being 16-byte aligned
   9367     __m128 p;
   9368     p = _mm_set1_ps(*(ptr));
   9369     return _MM_INSERT_PS(vec,  p, _INSERTPS_NDX(0, lane));
   9370 }
   9371 
   9372 _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
   9373 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane)
   9374 
   9375 _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0]
   9376 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane)
   9377 
   9378 _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
   9379 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane)
   9380 
   9381 _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   9382 _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane)
   9383 {
   9384     uint8x8_t res;
   9385     res = vec;
   9386     res.m64_u8[lane] = *(ptr);
   9387     return res;
   9388 }
   9389 
   9390 _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9391 _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane)
   9392 {
   9393     uint16x4_t res;
   9394     res = vec;
   9395     res.m64_u16[lane] = *(ptr);
   9396     return res;
   9397 }
   9398 
   9399 _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   9400 _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane)
   9401 {
   9402     uint32x2_t res;
   9403     res = vec;
   9404     res.m64_u32[lane] = *(ptr);
   9405     return res;
   9406 }
   9407 
   9408 _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
   9409 _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane)
   9410 {
   9411     uint64x1_t res;
   9412     res.m64_u64[0] = *(ptr);
   9413     return res;
   9414 }
   9415 
   9416 
   9417 _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   9418 #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane)
   9419 
   9420 _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9421 #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane)
   9422 
   9423 _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   9424 #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane)
   9425 
   9426 _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9427 //current IA SIMD doesn't support float16
   9428 
   9429 _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0]
   9430 _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane)
   9431 {
   9432     float32x2_t res;
   9433     res = vec;
   9434     res.m64_f32[lane] = *(ptr);
   9435     return res;
   9436 }
   9437 
   9438 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0]
   9439 #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane)
   9440 
   9441 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
   9442 #define vld1_lane_p8 vld1_lane_u8
   9443 
   9444 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0]
   9445 #define vld1_lane_p16 vld1_lane_s16
   9446 
   9447 // ****************** Load single value ( set all lanes of vector with same value from memory)**********************
   9448 // ******************************************************************************************************************
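//Usage sketch (illustrative only, hypothetical helper): broadcast one scalar from memory to all lanes,
//e.g. to prepare a constant coefficient vector.
/*
static uint16x8_t broadcast_u16(uint16_t const * coeff)
{
    return vld1q_dup_u16(coeff);       // expands to _mm_set1_epi16(*coeff)
}
*/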
   9449 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9450 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))
   9451 
   9452 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9453 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))
   9454 
   9455 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9456 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))
   9457 
   9458 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   9459 _NEON2SSE_INLINE uint64x2_t   vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
   9460 {
   9461     _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
   9462     return LOAD_SI128(val);
   9463 }
   9464 
   9465 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9466 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))
   9467 
   9468 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9469 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))
   9470 
   9471 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9472 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))
   9473 
   9474 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   9475 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)
   9476 
   9477 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   9478 //current IA SIMD doesn't support float16, need to go to 32 bits
   9479 
   9480 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9481 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))
   9482 
   9483 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9484 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))
   9485 
   9486 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9487 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))
   9488 
   9489 _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9490 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
   9491 {
   9492     uint8x8_t res;
   9493     int i;
   9494     for(i = 0; i<8; i++) {
   9495         res.m64_u8[i] =  *(ptr);
   9496     }
   9497     return res;
   9498 }
   9499 
   9500 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9501 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
   9502 {
   9503     uint16x4_t res;
   9504     int i;
   9505     for(i = 0; i<4; i++) {
   9506         res.m64_u16[i] =  *(ptr);
   9507     }
   9508     return res;
   9509 }
   9510 
   9511 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9512 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL)
   9513 {
   9514     uint32x2_t res;
   9515     res.m64_u32[0] = *(ptr);
   9516     res.m64_u32[1] = *(ptr);
   9517     return res;
   9518 }
   9519 
   9520 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
   9521 _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr)
   9522 {
   9523     uint64x1_t res;
   9524     res.m64_u64[0] = *(ptr);
   9525     return res;
   9526 }
   9527 
   9528 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9529 #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr)
   9530 
   9531 
   9532 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9533 #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr)
   9534 
   9535 
   9536 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9537 #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr)
   9538 
   9539 
   9540 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
   9541 #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr)
   9542 
   9543 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
   9544 //current IA SIMD doesn't support float16
   9545 
   9546 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
   9547 _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr)
   9548 {
   9549     float32x2_t res;
   9550     res.m64_f32[0] = *(ptr);
   9551     res.m64_f32[1] = res.m64_f32[0];
   9552     return res; // use last 64bits only
   9553 }
   9554 
   9555 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
   9556 #define vld1_dup_p8 vld1_dup_u8
   9557 
   9558 
   9559 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
   9560 #define vld1_dup_p16 vld1_dup_u16
   9561 
   9562 
   9563 //*************************************************************************************
   9564 //********************************* Store **********************************************
   9565 //*************************************************************************************
    9566 // If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
    9567 //here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned, _mm_store_si128 is used, as shown in the following macro
   9568 #define STORE_SI128(ptr, val) \
   9569         (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val);
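//A minimal usage sketch (illustrative only, not part of the library): thanks to the run-time check above the same
//vst1q_* call is safe for both aligned and unaligned destinations; src, dst and v are hypothetical names:
//    uint8_t dst[16];                            //alignment not known in general
//    uint8x16_t v = vld1q_u8(src);               //any 128-bit value
//    vst1q_u8(dst, v);                           //_mm_store_si128 if dst is 16-byte aligned, _mm_storeu_si128 otherwise
//If dst is known to be 16-byte aligned and cache pollution must be avoided, _mm_stream_si128((__m128i*)dst, v) may be used instead.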
   9570 
   9571 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0]
   9572 #define vst1q_u8 STORE_SI128
   9573 
   9574 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0]
   9575 #define vst1q_u16 STORE_SI128
   9576 
   9577 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0]
   9578 #define vst1q_u32 STORE_SI128
   9579 
   9580 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0]
   9581 #define vst1q_u64 STORE_SI128
   9582 
   9583 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0]
   9584 #define vst1q_s8 STORE_SI128
   9585 
   9586 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0]
   9587 #define vst1q_s16 STORE_SI128
   9588 
   9589 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0]
   9590 #define vst1q_s32 STORE_SI128
   9591 
   9592 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0]
   9593 #define vst1q_s64 STORE_SI128
   9594 
   9595 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0]
   9596 // IA32 SIMD doesn't work with 16bit floats currently
   9597 
   9598 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0]
   9599 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val)
   9600 {
    9601     if( ((uintptr_t)(ptr) & 15)  == 0 ) //16 bytes aligned
   9602         _mm_store_ps (ptr, val);
   9603     else
   9604         _mm_storeu_ps (ptr, val);
   9605 }
   9606 
   9607 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0]
   9608 #define vst1q_p8  vst1q_u8
   9609 
   9610 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0]
   9611 #define vst1q_p16 vst1q_u16
   9612 
   9613 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0]
   9614 _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val)
   9615 {
   9616     int i;
   9617     for (i = 0; i<8; i++) {
   9618         *(ptr + i) = ((uint8_t*)&val)[i];
   9619     }
   9620     //_mm_storel_epi64((__m128i*)ptr, val);
   9621     return;
   9622 }
   9623 
   9624 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0]
   9625 _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val)
   9626 {
   9627     int i;
   9628     for (i = 0; i<4; i++) {
   9629         *(ptr + i) = ((uint16_t*)&val)[i];
   9630     }
   9631     //_mm_storel_epi64((__m128i*)ptr, val);
   9632     return;
   9633 }
   9634 
   9635 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0]
   9636 _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val)
   9637 {
   9638     int i;
   9639     for (i = 0; i<2; i++) {
   9640         *(ptr + i) = ((uint32_t*)&val)[i];
   9641     }
   9642     //_mm_storel_epi64((__m128i*)ptr, val);
   9643     return;
   9644 }
   9645 
   9646 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0]
   9647 _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val)
   9648 {
   9649     *(ptr) = *((uint64_t*)&val);
   9650     //_mm_storel_epi64((__m128i*)ptr, val);
   9651     return;
   9652 }
   9653 
   9654 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0]
   9655 #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val)
   9656 
   9657 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0]
   9658 #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val)
   9659 
   9660 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0]
   9661 #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val)
   9662 
   9663 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0]
   9664 #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val)
   9665 
   9666 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0]
   9667 //current IA SIMD doesn't support float16
   9668 
   9669 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0]
   9670 _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val)
   9671 {
   9672     *(ptr) =   val.m64_f32[0];
   9673     *(ptr + 1) = val.m64_f32[1];
   9674     return;
   9675 }
   9676 
   9677 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0]
   9678 #define vst1_p8 vst1_u8
   9679 
   9680 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0]
   9681 #define vst1_p16 vst1_u16
   9682 
   9683 //***********Store a lane of a vector into memory (extract given lane) *********************
   9684 //******************************************************************************************
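//A minimal usage sketch (illustrative only, not part of the library): storing a single lane straight to memory,
//e.g. lane 2 of a 32-bit vector; src, dst and v are hypothetical names:
//    uint32x4_t v = vld1q_u32(src);
//    vst1q_lane_u32(dst, v, 2);                  //*dst = lane 2 of v, via _MM_EXTRACT_EPI32
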
   9685 _NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
   9686 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane)
   9687 
   9688 _NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9689 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane)
   9690 
   9691 _NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
   9692 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane)
   9693 
   9694 _NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
   9695 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane)
   9696 
   9697 _NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
   9698 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane)
   9699 
   9700 _NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9701 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane)
   9702 
   9703 _NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
   9704 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane)
   9705 
   9706 _NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0]
   9707 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane)
   9708 
   9709 _NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9710 //current IA SIMD doesn't support float16
   9711 
   9712 _NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0]
   9713 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane)
   9714 {
   9715     int32_t ilane;
   9716     ilane = _MM_EXTRACT_PS(val,lane);
   9717     *(ptr) =  *((float*)&ilane);
   9718 }
   9719 
   9720 _NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0]
   9721 #define vst1q_lane_p8   vst1q_lane_u8
   9722 
   9723 _NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0]
   9724 #define vst1q_lane_p16   vst1q_lane_s16
   9725 
   9726 _NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
   9727 _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane)
   9728 {
   9729     *(ptr) = val.m64_u8[lane];
   9730 }
   9731 
   9732 _NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9733 _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane)
   9734 {
   9735     *(ptr) = val.m64_u16[lane];
   9736 }
   9737 
   9738 _NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
   9739 _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane)
   9740 {
   9741     *(ptr) = val.m64_u32[lane];
   9742 }
   9743 
   9744 _NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
   9745 _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane)
   9746 {
   9747     *(ptr) = val.m64_u64[0];
   9748 }
   9749 
   9750 _NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
   9751 #define  vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane)
   9752 
   9753 _NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9754 #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane)
   9755 
   9756 _NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
   9757 #define vst1_lane_s32(ptr, val, lane)  vst1_lane_u32((uint32_t*)ptr, val, lane)
   9758 
   9759 
   9760 _NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0]
   9761 #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane)
   9762 
   9763 
   9764 _NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9765 //current IA SIMD doesn't support float16
   9766 
   9767 _NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0]
   9768 _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane)
   9769 {
   9770     *(ptr) = val.m64_f32[lane];
   9771 }
   9772 
   9773 _NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0]
   9774 #define vst1_lane_p8 vst1_lane_u8
   9775 
   9776 _NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0]
   9777 #define vst1_lane_p16 vst1_lane_s16
   9778 
   9779 //***********************************************************************************************
   9780 //**************** Loads and stores of an N-element structure **********************************
   9781 //***********************************************************************************************
    9782 //These intrinsics load or store an n-element structure. The array structures are defined at the beginning of this file
    9783 //We assume ptr is NOT aligned in the general case; for more details see the "Loads and stores of a single vector" functions above
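//A minimal usage sketch (illustrative only, not part of the library): the vld2/vld3/vld4 families below de-interleave
//on load (and the corresponding stores re-interleave); for a hypothetical buffer packed as x0,y0,x1,y1,...:
//    uint8x16x2_t xy = vld2q_u8(packed);         //xy.val[0] = x0..x15, xy.val[1] = y0..y15
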
   9784 //****************** 2 elements load  *********************************************
   9785 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   9786 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0]
   9787 {
   9788     uint8x16x2_t v;
   9789     v.val[0] = vld1q_u8(ptr);
   9790     v.val[1] = vld1q_u8((ptr + 16));
   9791     v = vuzpq_s8(v.val[0], v.val[1]);
   9792     return v;
   9793 }
   9794 
   9795 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   9796 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0]
   9797 {
   9798     uint16x8x2_t v;
   9799     v.val[0] = vld1q_u16( ptr);
   9800     v.val[1] = vld1q_u16( (ptr + 8));
   9801     v = vuzpq_s16(v.val[0], v.val[1]);
   9802     return v;
   9803 }
   9804 
   9805 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   9806 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
   9807 {
   9808     uint32x4x2_t v;
   9809     v.val[0] = vld1q_u32 ( ptr);
   9810     v.val[1] = vld1q_u32 ( (ptr + 4));
   9811     v = vuzpq_s32(v.val[0], v.val[1]);
   9812     return v;
   9813 }
   9814 
   9815 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr);
   9816 #define  vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr)
   9817 
   9818 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   9819 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr)
   9820 
   9821 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   9822 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr)
   9823 
   9824 
   9825 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0]
   9826 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   9827 
   9828 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0]
   9829 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0]
   9830 {
   9831     float32x4x2_t v;
   9832     v.val[0] =  vld1q_f32 (ptr);
   9833     v.val[1] =  vld1q_f32 ((ptr + 4));
   9834     v = vuzpq_f32(v.val[0], v.val[1]);
   9835     return v;
   9836 }
   9837 
   9838 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0]
   9839 #define  vld2q_p8 vld2q_u8
   9840 
   9841 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0]
   9842 #define vld2q_p16 vld2q_u16
   9843 
   9844 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   9845 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr)
   9846 {
   9847     uint8x8x2_t v;
   9848     __m128i ld128;
    9849     ld128 = vld1q_u8(ptr); //load both 64-bit halves into one 128-bit register
   9850     ld128 =  _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd);
   9851     vst1q_u8((v.val), ld128); //  v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32);
   9852     return v;
   9853 }
   9854 
   9855 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   9856 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr)
   9857 {
   9858     _NEON2SSE_ALIGN_16 uint16x4x2_t v;
   9859     __m128i ld128;
    9860     ld128 = vld1q_u16(ptr); //load both 64-bit halves into one 128-bit register
   9861     ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd);
   9862     vst1q_u16((v.val), ld128);
   9863     return v;
   9864 }
   9865 
   9866 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   9867 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr)
   9868 {
   9869     _NEON2SSE_ALIGN_16 uint32x2x2_t v;
   9870     __m128i ld128;
    9871     ld128 = vld1q_u32(ptr); //load both 64-bit halves into one 128-bit register
   9872     ld128 = _mm_shuffle_epi32(ld128,  0 | (2 << 2) | (1 << 4) | (3 << 6));
   9873     vst1q_u32((v.val), ld128);
   9874     return v;
   9875 }
   9876 
   9877 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9878 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr)
   9879 {
   9880     uint64x1x2_t v;
   9881     v.val[0].m64_u64[0] = *(ptr);
   9882     v.val[1].m64_u64[0] = *(ptr + 1);
   9883     return v;
   9884 }
   9885 
   9886 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   9887 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr)
   9888 
   9889 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   9890 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr)
   9891 
   9892 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   9893 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr)
   9894 
   9895 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   9896 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr)
   9897 
   9898 _NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0]
   9899 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1_f16 for example
   9900 
   9901 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0]
   9902 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr)
   9903 {
   9904     float32x2x2_t v;
   9905     v.val[0].m64_f32[0] = *(ptr);
   9906     v.val[0].m64_f32[1] = *(ptr + 2);
   9907     v.val[1].m64_f32[0] = *(ptr + 1);
   9908     v.val[1].m64_f32[1] = *(ptr + 3);
   9909     return v;
   9910 }
   9911 
   9912 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0]
   9913 #define vld2_p8 vld2_u8
   9914 
   9915 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0]
   9916 #define vld2_p16 vld2_u16
   9917 
   9918 //******************** Triplets ***************************************
   9919 //*********************************************************************
   9920 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   9921 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0]
   9922 {
   9923     //a0,a1,a2,a3,...a7,a8,...a15,  b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 ->
   9924     //a:0,3,6,9,12,15,b:2,5,8,11,14,  c:1,4,7,10,13
   9925     //a:1,4,7,10,13,  b:0,3,6,9,12,15,c:2,5,8,11,14,
   9926     //a:2,5,8,11,14,  b:1,4,7,10,13,  c:0,3,6,9,12,15
   9927     uint8x16x3_t v;
   9928     __m128i tmp0, tmp1,tmp2, tmp3;
   9929     _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14};
   9930     _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13};
   9931     _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15};
   9932 
   9933     v.val[0] =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15
   9934     v.val[1] =  vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15
   9935     v.val[2] =  vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15
   9936 
    9937     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14
   9938     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13
    9939     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15
   9940 
   9941     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15
   9942     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x
   9943     tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14,
   9944     tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0
   9945     v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13,
   9946     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13,
   9947 
   9948     tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13,
   9949     tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0
    9950     v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,b:1,4,7,10,13, 0,0,0,0,0
    9951     v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,b:1,4,7,10,13,
    9952     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,b:1,4,7,10,13,
   9953     v.val[1] =  _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15,
   9954     v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0
   9955     tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0
   9956     tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14,
   9957     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14,
   9958 
   9959     tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0,
   9960     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15,
   9961     v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0
   9962     v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0
   9963     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15,
   9964     tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0,
   9965     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15,
   9966     return v;
   9967 }
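
//A minimal usage sketch (illustrative only, not part of the library): de-interleaving packed RGB bytes with vld3q_u8 above;
//rgb is a hypothetical uint8_t[48] laid out as R0,G0,B0,R1,G1,B1,...
//    uint8x16x3_t pix = vld3q_u8(rgb);           //pix.val[0] = R0..R15, pix.val[1] = G0..G15, pix.val[2] = B0..B15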
   9968 
   9969 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   9970 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0]
   9971 {
   9972     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
   9973     uint16x8x3_t v;
   9974     __m128i tmp0, tmp1,tmp2, tmp3;
   9975     _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
   9976     _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13};
   9977     _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15};
   9978 
   9979     v.val[0] =  vld1q_u16 (ptr); //a0,a1,a2,a3,...a7,
   9980     v.val[1] =  vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7
   9981     v.val[2] =  vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7
   9982 
   9983     tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5,
   9984     tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6
   9985     tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7
   9986 
   9987     tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6,
   9988     tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x
   9989     tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7
   9990     tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0
   9991     v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5,
   9992     v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5
   9993 
   9994     tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7
   9995     tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0
   9996     v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0
   9997     v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6,
   9998     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6,
   9999     v.val[1] =  _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5,
   10000     v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0,
   10001     tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0
   10002     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6,
   10003     v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6,
   10004 
   10005     tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0
   10006     tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7,
   10007     v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0
   10008     v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0
   10009     v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7,
   10010     tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0
   10011     v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7,
   10012     return v;
   10013 }
   10014 
   10015 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   10016 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
   10017 {
   10018     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   10019     uint32x4x3_t v;
   10020     __m128i tmp0, tmp1,tmp2, tmp3;
   10021     v.val[0] =  vld1q_u32 (ptr); //a0,a1,a2,a3,
   10022     v.val[1] =  vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3
   10023     v.val[2] =  vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3,
   10024 
   10025     tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2
   10026     tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1
   10027     tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3
   10028 
   10029     tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2
   10030     v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1
   10031     tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1
   10032     v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0,
   10033     v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2
   10034     v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3
   10035     return v;
   10036 }
   10037 
   10038 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   10039 #define  vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr))
   10040 
   10041 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   10042 #define  vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr))
   10043 
   10044 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   10045 #define  vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr))
   10046 
   10047 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   10048 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10049 
   10050 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0]
   10051 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0]
   10052 {
   10053     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   10054     float32x4x3_t v;
   10055     __m128 tmp0, tmp1,tmp2, tmp3;
   10056     v.val[0] =  vld1q_f32 (ptr); //a0,a1,a2,a3,
   10057     v.val[1] =  vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3
   10058     v.val[2] =  vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3,
   10059 
   10060     tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2
   10061     tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1
   10062     tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3
   10063     tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2
   10064 
   10065     v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1
   10066     tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1
   10067     v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0,
   10068     v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2
   10069     v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3
   10070     return v;
   10071 }
   10072 
    10073 _NEON2SSESTORAGE poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0]
   10074 #define vld3q_p8 vld3q_u8
   10075 
   10076 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0]
   10077 #define vld3q_p16 vld3q_u16
   10078 
   10079 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   10080 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0]
   10081 {
   10082     //a0, a1,a2,a3,...a7,  b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7
   10083     uint8x8x3_t v;
   10084     __m128i val0, val1, val2, tmp0, tmp1;
   10085     _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14};
   10086     _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0};
   10087     val0 =  vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7
   10088     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7
   10089 
   10090     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6,
   10091     tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x
   10092     val0 = _mm_slli_si128(tmp0,10);
   10093     val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0
   10094     val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x
   10095     val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x
   10096     _M64(v.val[0], val0);
   10097     val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5,
   10098     val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0,
   10099     val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0
   10100     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0
   10101     val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x
   10102     _M64(v.val[1], val1);
   10103 
   10104     tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0,
   10105     val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0
   10106     val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7,
   10107     val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x
   10108     _M64(v.val[2], val2);
   10109     return v;
   10110 }
   10111 
   10112 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10113 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0]
   10114 {
   10115     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1,  a1,b0,b3,c2, a2,b1,c0,c3,
   10116     uint16x4x3_t v;
   10117     __m128i val0, val1, val2, tmp0, tmp1;
   10118     _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11};
   10119     val0 =  vld1q_u16 (ptr); //a0,a1,a2,a3,  b0,b1,b2,b3
   10120     val2 =  _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x
   10121 
   10122     tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1
   10123     tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01     : c1, c2, c0, c3,
   10124     val0 = _mm_slli_si128(tmp0,10);
   10125     val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0,
   10126     val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1
   10127     val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0
   10128     val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x
   10129     _M64(v.val[0], val0);
   10130 
   10131     val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3
   10132     val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0,
   10133     val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0,
   10134     val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0
   10135     val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x
   10136     _M64(v.val[1], val1);
   10137 
   10138     tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0
   10139     tmp1 = _mm_srli_si128(tmp1,4);
   10140     tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3,
   10141     val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3,
   10142     _M64(v.val[2], val2);
   10143     return v;
   10144 }
   10145 
   10146 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   10147 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0]
   10148 {
   10149     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
   10150     uint32x2x3_t v;
   10151     __m128i val0, val1, val2;
   10152     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
   10153     val2 =   _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x
   10154 
   10155     val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0
   10156     _M64(v.val[0], val0);
   10157     val2 =  _mm_slli_si128(val2, 8); //x, x,c0,c1,
   10158     val1 =  _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1
   10159     _M64(v.val[1], val1);
   10160     val2 =  _mm_srli_si128(val1, 8); //b0, c1, x, x,
   10161     _M64(v.val[2], val2);
   10162     return v;
   10163 }
   10164 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10165 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
   10166 {
   10167     uint64x1x3_t v;
   10168     v.val[0].m64_u64[0] = *(ptr);
   10169     v.val[1].m64_u64[0] = *(ptr + 1);
   10170     v.val[2].m64_u64[0] = *(ptr + 2);
   10171     return v;
   10172 }
   10173 
   10174 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   10175 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr)
   10176 
   10177 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10178 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr)
   10179 
   10180 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   10181 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr)
   10182 
    10183 _NEON2SSESTORAGE int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10184 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr)
   10185 
   10186 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10187 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10188 
   10189 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0]
   10190 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr)
   10191 {
   10192     //a0,a1,  b0,b1, c0,c1,  -> a0,b1, a1,c0, b0,c1
   10193     float32x2x3_t v;
   10194     v.val[0].m64_f32[0] = *(ptr);
   10195     v.val[0].m64_f32[1] = *(ptr + 3);
   10196 
   10197     v.val[1].m64_f32[0] = *(ptr + 1);
   10198     v.val[1].m64_f32[1] = *(ptr + 4);
   10199 
   10200     v.val[2].m64_f32[0] = *(ptr + 2);
   10201     v.val[2].m64_f32[1] = *(ptr + 5);
   10202     return v;
   10203 }
   10204 
   10205 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0]
   10206 #define vld3_p8 vld3_u8
   10207 
   10208 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0]
   10209 #define vld3_p16 vld3_u16
   10210 
   10211 //***************  Quadruples load ********************************
   10212 //*****************************************************************
   10213 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   10214 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0]
   10215 {
   10216     uint8x16x4_t v;
   10217     __m128i tmp3, tmp2, tmp1, tmp0;
   10218 
   10219     v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15
   10220     v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15
   10221     v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15
   10222     v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15
   10223 
   10224     tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7
   10225     tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... c7,d7
   10226     tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15
   10227     tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15
   10228 
   10229     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8,  a1,a9, b1,b9, ....a3,a11, b3,b11
   10230     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15
   10231     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11
   10232     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15
   10233 
   10234     tmp0 =  _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8,  d0,d8, a1,a9, b1,b9, c1,c9, d1,d9
   10235     tmp1 =  _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11
   10236     tmp2 =  _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13,
   10237     tmp3 =  _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15
   10238 
   10239     v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12
   10240     v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13
   10241     v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14
   10242     v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15
   10243     return v;
   10244 }
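
//A minimal usage sketch (illustrative only, not part of the library): splitting packed RGBA bytes with vld4q_u8 above;
//rgba is a hypothetical uint8_t[64] laid out as R0,G0,B0,A0,R1,...
//    uint8x16x4_t pix = vld4q_u8(rgba);          //pix.val[0] = R plane, val[1] = G, val[2] = B, val[3] = A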
   10245 
   10246 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10247 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0]
   10248 {
   10249     uint16x8x4_t v;
   10250     __m128i tmp3, tmp2, tmp1, tmp0;
   10251     tmp0  =  vld1q_u16 (ptr); //a0,a1,a2,...a7
   10252     tmp1  =  vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7
   10253     tmp2  =  vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7
   10254     tmp3  =  vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7
   10255     v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3,
   10256     v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3,
   10257     v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7
   10258     v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7
   10259     tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5
   10260     tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7
   10261     tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5
   10262     tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7
   10263     v.val[0] =  _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4,
   10264     v.val[1] =  _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5
   10265     v.val[2] =  _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6,
   10266     v.val[3] =  _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7
   10267     return v;
   10268 }
   10269 
   10270 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   10271 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
   10272 {
   10273     uint32x4x4_t v;
   10274     __m128i tmp3, tmp2, tmp1, tmp0;
   10275     v.val[0] =  vld1q_u32 (ptr);
   10276     v.val[1] =  vld1q_u32 ((ptr + 4));
   10277     v.val[2] =  vld1q_u32 ((ptr + 8));
   10278     v.val[3] =  vld1q_u32 ((ptr + 12));
   10279     tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]);
   10280     tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]);
   10281     tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]);
   10282     tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]);
   10283     v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1);
   10284     v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1);
   10285     v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3);
   10286     v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3);
   10287     return v;
   10288 }
   10289 
   10290 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   10291 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr)
   10292 
   10293 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10294 #define  vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr)
   10295 
   10296 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   10297 #define  vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr)
   10298 
   10299 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10300 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10301 
   10302 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0]
   10303 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0]
   10304 {
   10305     float32x4x4_t v;
   10306     __m128 tmp3, tmp2, tmp1, tmp0;
   10307 
   10308     v.val[0] =  vld1q_f32 ((float*) ptr);
   10309     v.val[1] =  vld1q_f32 ((float*) (ptr + 4));
   10310     v.val[2] =  vld1q_f32 ((float*) (ptr + 8));
   10311     v.val[3] =  vld1q_f32 ((float*) (ptr + 12));
   10312     tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]);
   10313     tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]);
   10314     tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]);
   10315     tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]);
   10316     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
   10317     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
   10318     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
   10319     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
   10320     return v;
   10321 }
   10322 
   10323 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0]
   10324 #define vld4q_p8 vld4q_u8
   10325 
   10326 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0]
   10327 #define vld4q_p16 vld4q_s16
   10328 
   10329 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   10330 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0]
   10331 {
   10332     uint8x8x4_t v;
   10333     __m128i sh0, sh1;
   10334     __m128i val0,  val2;
    10335     _NEON2SSE_ALIGN_16 static const int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15};
   10336 
    10337     val0 = vld1q_u8(( ptr)); //load the first 16 interleaved bytes (structures 0..3)
    10338     val2 = vld1q_u8(( ptr + 16)); //load the next 16 interleaved bytes (structures 4..7)
   10339 
   10340     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8);
   10341     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8);
   10342     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29
   10343     vst1q_u8(&v.val[0], val0 );
   10344     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31
   10345     vst1q_u8(&v.val[2], val2 );
   10346     return v;
   10347 }
   10348 
   10349 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10350 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0]
   10351 {
   10352     uint16x4x4_t v;
   10353     __m128i sh0, sh1;
   10354     __m128i val0, val2;
   10355     _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7
    10356     val0 = vld1q_u16 ( (ptr)); //load the first 8 interleaved 16-bit elements (structures 0..1)
    10357     val2 = vld1q_u16 ( (ptr + 8)); //load the next 8 interleaved 16-bit elements (structures 2..3)
   10358     sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16);
   10359     sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16);
   10360     val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13
   10361     vst1q_u16(&v.val[0], val0 );
   10362     val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15
   10363     vst1q_u16(&v.val[2], val2 );
   10364     return v;
   10365 }
   10366 
   10367 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   10368 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr)
   10369 {
   10370     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   10371     uint32x2x4_t v;
   10372     __m128i val0, val01, val2;
   10373     val0 =  vld1q_u32 (ptr); //a0,a1,  b0,b1,
   10374     val2 =  vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1
   10375     val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1,
   10376     val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1
   10377     vst1q_u32(&v.val[0], val01);
   10378     vst1q_u32(&v.val[2], val2 );
   10379     return v;
   10380 }
   10381 
   10382 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10383 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
   10384 {
   10385     uint64x1x4_t v;
    10386     v.val[0].m64_u64[0] = *(ptr); //load the first 64-bit element into val[0]
    10387     v.val[1].m64_u64[0] = *(ptr + 1); //load the second 64-bit element into val[1]
    10388     v.val[2].m64_u64[0] = *(ptr + 2); //load the third 64-bit element into val[2]
    10389     v.val[3].m64_u64[0] = *(ptr + 3); //load the fourth 64-bit element into val[3]
   10390     return v;
   10391 }
   10392 
   10393 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   10394 #define  vld4_s8(ptr) vld4_u8((uint8_t*)ptr)
   10395 
   10396 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10397 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr)
   10398 
   10399 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   10400 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr)
   10401 
    10402 _NEON2SSESTORAGE int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10403 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr)
   10404 
   10405 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10407 
   10408 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0]
   10409 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0]
   10410 {
   10411     //a0,a1,  b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1
   10412     float32x2x4_t res;
   10413     res.val[0].m64_f32[0] = *(ptr);
   10414     res.val[0].m64_f32[1] = *(ptr + 4);
   10415     res.val[1].m64_f32[0] = *(ptr + 1);
   10416     res.val[1].m64_f32[1] = *(ptr + 5);
   10417     res.val[2].m64_f32[0] = *(ptr + 2);
   10418     res.val[2].m64_f32[1] = *(ptr + 6);
   10419     res.val[3].m64_f32[0] = *(ptr + 3);
   10420     res.val[3].m64_f32[1] = *(ptr + 7);
   10421     return res;
   10422 }
   10423 
   10424 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0]
   10425 #define vld4_p8 vld4_u8
   10426 
   10427 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0]
   10428 #define vld4_p16 vld4_u16
   10429 
   10430 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes *******************
   10431 //*******************************************************************************************************************
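//A minimal usage sketch (illustrative only, not part of the library): the functions below read two adjacent scalars
//and broadcast each one into its own vector; coeffs is a hypothetical uint16_t[2] holding c0,c1:
//    uint16x4x2_t taps = vld2_dup_u16(coeffs);   //taps.val[0] = c0,c0,c0,c0   taps.val[1] = c1,c1,c1,c1
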
   10432 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   10433 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0]
   10434 {
   10435     uint8x8x2_t v;
   10436     __m128i val0, val1;
   10437     val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x
   10438     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x,
   10439     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x
   10440     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   10441     vst1q_u8(v.val, val0);
   10442     return v;
   10443 }
   10444 
   10445 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10446 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0]
   10447 {
   10448     uint16x4x2_t v;
   10449     __m128i val0, val1;
   10450     val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x
   10451     val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0)
   10452     _M64(v.val[0], val0);
   10453     val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1)
   10454     _M64(v.val[1], val1);
   10455     return v;
   10456 }
   10457 
   10458 _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   10459 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
   10460 {
   10461     uint32x2x2_t v;
   10462     __m128i val0;
   10463     val0 = LOAD_SI128(ptr); //0,1,x,x
   10464     val0 = _mm_shuffle_epi32(val0,   0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1
   10465     vst1q_u32(v.val, val0);
   10466     return v;
   10467 }
   10468 
   10469 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   10470 #define vld2_dup_u64 vld2_u64
   10471 
   10472 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   10473 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr)
   10474 
   10475 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10476 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr)
   10477 
   10478 _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   10479 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr)
   10480 
   10481 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0]
   10482 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr)
   10483 
   10484 _NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10485 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10486 
   10487 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0]
   10488 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0]
   10489 {
   10490     float32x2x2_t v;
   10491     v.val[0].m64_f32[0] = *(ptr); //0,0
   10492     v.val[0].m64_f32[1] = *(ptr); //0,0
   10493     v.val[1].m64_f32[0] = *(ptr + 1); //1,1
   10494     v.val[1].m64_f32[1] = *(ptr + 1); //1,1
   10495     return v;
   10496 }
   10497 
   10498 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0]
   10499 #define vld2_dup_p8 vld2_dup_u8
   10500 
   10501 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0]
   10502 #define vld2_dup_p16 vld2_dup_s16
   10503 
    10504 //************* Duplicate (or propagate) triplets: *******************
   10505 //********************************************************************
   10506 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes
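//Usage sketch (illustrative only), assuming a caller-provided 3-element array:
//    uint32_t rgb[3] = {0xff, 0x80, 0x10};
//    uint32x2x3_t c = vld3_dup_u32(rgb);
//    //c.val[0] = {0xff,0xff},  c.val[1] = {0x80,0x80},  c.val[2] = {0x10,0x10}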
   10507 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10508 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10509 {
   10510     uint8x8x3_t v;
   10511     __m128i val0, val1, val2;
   10512     val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x
   10513     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x,
   10514     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x,
   10515     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   10516     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x,
   10517     vst1q_u8(v.val, val0);
   10518     _M64(v.val[2], val2);
   10519     return v;
   10520 }
   10521 
   10522 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10523 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10524 {
   10525     uint16x4x3_t v;
   10526     __m128i val0, val1, val2;
   10527     val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x
   10528     val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0)
   10529     val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1)
   10530     val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2)
   10531     _M64(v.val[0], val0);
   10532     _M64(v.val[1], val1);
   10533     _M64(v.val[2], val2);
   10534     return v;
   10535 }
   10536 
   10537 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10538 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10539 {
   10540     uint32x2x3_t v;
   10541     __m128i val0, val1, val2;
   10542     val2 = LOAD_SI128(ptr); //0,1,2,x
   10543     val0 = _mm_shuffle_epi32(val2,   0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2
   10544     val1 = _mm_shuffle_epi32(val2,   1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2
   10545     val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0
   10546     _M64(v.val[0], val0);
   10547     _M64(v.val[1], val1);
   10548     _M64(v.val[2], val2);
   10549     return v;
   10550 }
   10551 
   10552 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10553 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0]
   10554 {
   10555     uint64x1x3_t v;
   10556     v.val[0].m64_u64[0] = *(ptr);
   10557     v.val[1].m64_u64[0] = *(ptr + 1);
   10558     v.val[2].m64_u64[0] = *(ptr + 2);
   10559     return v;
   10560 }
   10561 
   10562 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10563 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr)
   10564 
   10565 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10566 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr)
   10567 
   10568 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10569 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr)
   10570 
    10571 _NEON2SSESTORAGE int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0]
   10572 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr)
   10573 
   10574 
   10575 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10576 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10577 
   10578 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10579 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0]
   10580 {
   10581     float32x2x3_t v;
   10582     int i;
   10583     for (i = 0; i<3; i++) {
   10584         v.val[i].m64_f32[0] = *(ptr + i);
   10585         v.val[i].m64_f32[1] = *(ptr + i);
   10586     }
   10587     return v;
   10588 }
   10589 
   10590 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0]
   10591 #define vld3_dup_p8 vld3_dup_u8
   10592 
   10593 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0]
   10594 #define vld3_dup_p16 vld3_dup_s16
   10595 
   10596 
   10597 //************* Duplicate (or propagate) quadruples: *******************
   10598 //***********************************************************************
   10599 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes  and  ptr[3] to all val[3] lanes
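//Usage sketch (illustrative only), assuming a caller-provided 4-element array:
//    uint8_t rgba[4] = {255, 128, 64, 32};
//    uint8x8x4_t px = vld4_dup_u8(rgba);
//    //px.val[0] = eight copies of 255, px.val[1] of 128, px.val[2] of 64, px.val[3] of 32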
   10600 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10601 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10602 {
   10603     uint8x8x4_t v;
   10604     __m128i val0, val1, val2;
   10605     val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x
   10606     val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x,
   10607     val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3
   10608     val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,
   10609     val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3
   10610     vst1q_u8(&v.val[0], val0);
   10611     vst1q_u8(&v.val[2], val2);
   10612     return v;
   10613 }
   10614 
   10615 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10616 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10617 {
   10618     uint16x4x4_t v;
   10619     __m128i val0, val1, val2, val3;
   10620     val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x
   10621     val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0)
   10622     val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1)
   10623     val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2)
   10624     val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3)
   10625     _M64(v.val[0], val0);
   10626     _M64(v.val[1], val1);
   10627     _M64(v.val[2], val2);
   10628     _M64(v.val[3], val3);
   10629     return v;
   10630 }
   10631 
   10632 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10633 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10634 {
   10635     uint32x2x4_t v;
   10636     __m128i val0, val1, val2, val3;
   10637     val3 = LOAD_SI128(ptr); //0,1,2,3
   10638     val0 = _mm_shuffle_epi32(val3,   0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3
   10639     val1 = _mm_shuffle_epi32(val3,   1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3
   10640     val2 = _mm_shuffle_epi32(val3,   2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3
    10641     val3 = _mm_shuffle_epi32(val3,   3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,3,3
   10642     _M64(v.val[0], val0);
   10643     _M64(v.val[1], val1);
   10644     _M64(v.val[2], val2);
   10645     _M64(v.val[3], val3);
   10646     return v;
   10647 }
   10648 
   10649 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10650 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0]
   10651 {
   10652     uint64x1x4_t v;
   10653     v.val[0].m64_u64[0] = *(ptr);
   10654     v.val[1].m64_u64[0] = *(ptr + 1);
   10655     v.val[2].m64_u64[0] = *(ptr + 2);
   10656     v.val[3].m64_u64[0] = *(ptr + 3);
   10657     return v;
   10658 }
   10659 
   10660 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10661 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr)
   10662 
   10663 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10664 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr)
   10665 
   10666 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10667 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr)
   10668 
    10669 _NEON2SSESTORAGE int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0]
   10670 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr)
   10671 
   10672 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10673 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example
   10674 
   10675 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10676 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
   10677 {
   10678     float32x2x4_t v;
   10679     int i;
   10680     for (i = 0; i<4; i++) {
   10681         v.val[i].m64_f32[0] = *(ptr + i);
   10682         v.val[i].m64_f32[1] = *(ptr + i);
   10683     }
   10684     return v;
   10685 }
   10686 
   10687 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const  * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
   10688 #define vld4_dup_p8 vld4_dup_u8
   10689 
   10690 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
   10691 #define vld4_dup_p16 vld4_dup_u16
   10692 
   10693 
   10694 //**********************************************************************************
    10695 //******************* Lane loads for N-element structures **************************
   10696 //**********************************************************************************
   10697 //********************** Lane pairs  ************************************************
    10698 //vld1_lane_xx loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
    10699 //we assume src is 16-bit aligned
   10700 
    10701 //!!!!!! The Microsoft compiler does not allow xxxxxx_2t function arguments, resulting in a "formal parameter with __declspec(align('16')) won't be aligned" error;
    10702 //to work around it, all the functions below take xxxxxx_2t pointers and the corresponding original functions are redefined as macros
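//Usage sketch (illustrative only); note that lane must be a compile-time constant:
//    uint16_t pair[2] = {7, 9};
//    uint16x8x2_t acc;
//    acc.val[0] = vdupq_n_u16(0);
//    acc.val[1] = vdupq_n_u16(0);
//    acc = vld2q_lane_u16(pair, acc, 3);   //lane 3 of acc.val[0] becomes 7, lane 3 of acc.val[1] becomes 9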
   10703 
   10704 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10705 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
   10706 {
   10707     uint16x8x2_t v;
   10708     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
   10709     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
   10710     return v;
   10711 }
   10712 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)
   10713 
   10714 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   10715 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
   10716 {
   10717     uint32x4x2_t v;
   10718     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
   10719     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
   10720     return v;
   10721 }
   10722 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)
   10723 
   10724 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10725 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
   10726 {
   10727     int16x8x2_t v;
   10728     v.val[0] = vld1q_lane_s16 (ptr, src->val[0],  lane);
   10729     v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1],  lane);
   10730     return v;
   10731 }
   10732 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)
   10733 
   10734 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   10735 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane)
   10736 {
   10737     int32x4x2_t v;
   10738     v.val[0] = _MM_INSERT_EPI32 (src->val[0],  ptr[0], lane);
   10739     v.val[1] = _MM_INSERT_EPI32 (src->val[1],  ptr[1], lane);
   10740     return v;
   10741 }
   10742 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane)
   10743 
   10744 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10745 //current IA SIMD doesn't support float16
   10746 
   10747 //float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
   10748 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
   10749 {
   10750     float32x4x2_t v;
   10751     v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
   10752     v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
   10753     return v;
   10754 }
   10755 #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane)
   10756 
   10757 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
   10758 #define vld2q_lane_p16 vld2q_lane_u16
   10759 
   10760 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   10761 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0]
   10762 {
   10763     uint8x8x2_t v;
   10764     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
   10765     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
   10766     return v;
   10767 }
   10768 
   10769 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   10770 _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane)
   10771 {
   10772     uint16x4x2_t v;
   10773     v.val[0]  =  vld1_lane_u16(ptr, src.val[0], lane);
   10774     v.val[1]  = vld1_lane_u16((ptr + 1), src.val[1], lane);
   10775     return v;
   10776 }
   10777 
   10778 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
   10779 _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane)
   10780 {
   10781     uint32x2x2_t v;
   10782     v.val[0]  =  vld1_lane_u32(ptr, src.val[0], lane);
   10783     v.val[1]  = vld1_lane_u32((ptr + 1), src.val[1], lane);
   10784     return v;
   10785 }
   10786 
   10787 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   10788 #define vld2_lane_s8(ptr, src, lane)  vld2_lane_u8(( uint8_t*) ptr, src, lane)
   10789 
   10790 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   10791 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane)
   10792 
   10793 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0]
   10794 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)
   10795 
   10796 //float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   10797 //current IA SIMD doesn't support float16
   10798 
   10799 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
   10800 _NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t  src,__constrange(0,1) int lane)
   10801 {
   10802     float32x2x2_t v;
   10803     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
   10804     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
   10805     return v;
   10806 }
   10807 
   10808 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
   10809 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
   10810 #define vld2_lane_p8 vld2_lane_u8
   10811 
   10812 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
   10813 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
   10814 #define vld2_lane_p16 vld2_lane_u16
   10815 
   10816 //*********** Lane triplets **********************
   10817 //*************************************************
    10818 //vld1_lane_xx loads ptr[0] into src->val[0], ptr[1] into src->val[1] and ptr[2] into src->val[2] at the given lane position
    10819 //we assume src is 16-bit aligned
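//Usage sketch (illustrative only):
//    float32_t xyz[3] = {1.0f, 2.0f, 3.0f};
//    float32x2x3_t pos;
//    pos.val[0] = pos.val[1] = pos.val[2] = vdup_n_f32(0.0f);
//    pos = vld3_lane_f32(xyz, pos, 1);     //lane 1 of pos.val[i] now holds xyz[i], lane 0 is unchanged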
   10820 
   10821 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10822 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10823 {
   10824     uint16x8x3_t v;
   10825     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   10826     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   10827     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   10828     return v;
   10829 }
   10830 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)
   10831 
   10832 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10833 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10834 {
   10835     uint32x4x3_t v;
   10836     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   10837     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   10838     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   10839     return v;
   10840 }
   10841 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)
   10842 
   10843 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10844 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10845 {
   10846     int16x8x3_t v;
   10847     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   10848     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   10849     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   10850     return v;
   10851 }
   10852 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane)
   10853 
   10854 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10855 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10856 {
   10857     int32x4x3_t v;
   10858     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   10859     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   10860     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   10861     return v;
   10862 }
   10863 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane)
   10864 
   10865 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10866 //current IA SIMD doesn't support float16
   10867 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane)
   10868 
   10869 
   10870 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10871 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
   10872 {
   10873     float32x4x3_t v;
   10874     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
   10875     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
   10876     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
   10877     return v;
   10878 }
   10879 #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane)
   10880 
   10881 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
   10882 #define vld3q_lane_p16 vld3q_lane_u16
   10883 
   10884 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10885 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10886 {
   10887     uint8x8x3_t v;
   10888     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
   10889     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
   10890     v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
   10891     return v;
   10892 }
   10893 
   10894 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t   const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10895 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10896 {
   10897     uint16x4x3_t v;
   10898     v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
   10899     v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
   10900     v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
   10901     return v;
   10902 }
   10903 
   10904 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10905 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10906 {
   10907     //need to merge into 128 bit anyway
   10908     uint32x2x3_t v;
    10909     v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
    10910     v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
    10911     v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
   10912     return v;
   10913 }
   10914 
   10915 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t  src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10916 #define vld3_lane_s8(ptr, src, lane)  vld3_lane_u8(( uint8_t*) ptr, src, lane)
   10917 
   10918 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t  src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10919 #define vld3_lane_s16(ptr, src, lane)  vld3_lane_u16(( uint16_t*) ptr, src, lane)
   10920 
   10921 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t  src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10922 #define vld3_lane_s32(ptr, src, lane)  vld3_lane_u32(( uint32_t*) ptr, src, lane)
   10923 
   10924 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10925 //current IA SIMD doesn't support float16
   10926 
   10927 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10928 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
   10929 {
   10930     float32x2x3_t v;
   10931     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
   10932     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
   10933     v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
   10934     return v;
   10935 }
   10936 
   10937 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
   10938 #define vld3_lane_p8 vld3_lane_u8
   10939 
   10940 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
   10941 #define vld3_lane_p16 vld3_lane_u16
   10942 
   10943 //******************* Lane Quadruples  load ***************************
   10944 //*********************************************************************
    10945 //vld1_lane_xx loads ptr[0] into src->val[0], ptr[1] into src->val[1], ptr[2] into src->val[2] and ptr[3] into src->val[3] at the given lane position
    10946 //we assume src is 16-bit aligned
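//Usage sketch (illustrative only):
//    uint32_t quad[4] = {10, 20, 30, 40};
//    uint32x2x4_t s;
//    s.val[0] = s.val[1] = s.val[2] = s.val[3] = vdup_n_u32(0);
//    s = vld4_lane_u32(quad, s, 0);        //lane 0: s.val[0]=10, s.val[1]=20, s.val[2]=30, s.val[3]=40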
   10947 
   10948 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10949 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
   10950 {
   10951     uint16x8x4_t v;
   10952     v.val[0] = _MM_INSERT_EPI16 ( src->val[0],  ptr[0], lane);
   10953     v.val[1] = _MM_INSERT_EPI16 ( src->val[1],  ptr[1], lane);
   10954     v.val[2] = _MM_INSERT_EPI16 ( src->val[2],  ptr[2], lane);
   10955     v.val[3] = _MM_INSERT_EPI16 ( src->val[3],  ptr[3], lane);
   10956     return v;
   10957 }
   10958 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
   10959 
   10960 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10961 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
   10962 {
   10963     uint32x4x4_t v;
   10964     v.val[0] = _MM_INSERT_EPI32 ( src->val[0],  ptr[0], lane);
   10965     v.val[1] = _MM_INSERT_EPI32 ( src->val[1],  ptr[1], lane);
   10966     v.val[2] = _MM_INSERT_EPI32 ( src->val[2],  ptr[2], lane);
   10967     v.val[3] = _MM_INSERT_EPI32 ( src->val[3],  ptr[3], lane);
   10968     return v;
   10969 }
   10970 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane)
   10971 
   10972 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10973 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10974 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane)
   10975 
   10976 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10977 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10978 #define vld4q_lane_s32(ptr, src, lane)  vld4q_lane_u32(( uint32_t*) ptr, src, lane)
   10979 
   10980 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10981 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10982 //current IA SIMD doesn't support float16
   10983 
   10984 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10985 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane)
   10986 {
   10987     float32x4x4_t v;
   10988     v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane);
   10989     v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane);
   10990     v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane);
   10991     v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane);
   10992     return v;
   10993 }
   10994 #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane)
   10995 
   10996 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10997 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   10998 #define vld4q_lane_p16 vld4q_lane_u16
   10999 
   11000 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11001 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)
   11002 {
   11003     uint8x8x4_t v;
   11004     v.val[0] = vld1_lane_u8(ptr, src.val[0], lane);
   11005     v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane);
   11006     v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane);
   11007     v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane);
   11008     return v;
   11009 }
   11010 
   11011 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11012 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane)
   11013 {
   11014     uint16x4x4_t v;
   11015     v.val[0] = vld1_lane_u16(ptr, src.val[0], lane);
   11016     v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane);
   11017     v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane);
   11018     v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane);
   11019     return v;
   11020 }
   11021 
   11022 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11023 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane)
   11024 {
   11025     uint32x2x4_t v;
   11026     v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
   11027     v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
   11028     v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
   11029     v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane);
   11030     return v;
   11031 }
   11032 
   11033 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11034 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
   11035 
   11036 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11037 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
   11038 
   11039 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11040 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
   11041 
   11042 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11043 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
   11044 //current IA SIMD doesn't support float16
   11045 
   11046 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11047 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)
   11048 {
   11049     //serial solution may be faster
   11050     float32x2x4_t v;
   11051     v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
   11052     v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
   11053     v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
   11054     v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane);
   11055     return v;
   11056 }
   11057 
   11058 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11059 #define vld4_lane_p8 vld4_lane_u8
   11060 
   11061 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11062 #define vld4_lane_p16 vld4_lane_u16
   11063 
   11064 //******************* Store duplets *********************************************
   11065 //********************************************************************************
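//Usage sketch (illustrative only), interleaving two planes into one buffer:
//    uint32_t out[8];
//    uint32x4x2_t z;
//    z.val[0] = vdupq_n_u32(1);            //e.g. real parts
//    z.val[1] = vdupq_n_u32(2);            //e.g. imaginary parts
//    vst2q_u32(out, z);                    //out = {1,2, 1,2, 1,2, 1,2}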
   11066 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0]
   11067 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val)
   11068 {
   11069     uint8x16x2_t v;
   11070     v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]);
   11071     v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]);
   11072     vst1q_u8 (ptr, v.val[0]);
   11073     vst1q_u8 ((ptr + 16),  v.val[1]);
   11074 }
   11075 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val)
   11076 
   11077 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0]
   11078 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val)
   11079 {
   11080     uint16x8x2_t v;
   11081     v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]);
   11082     v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]);
   11083     vst1q_u16 (ptr, v.val[0]);
   11084     vst1q_u16 ((ptr + 8),  v.val[1]);
   11085 }
   11086 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val)
   11087 
   11088 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0]
   11089 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val)
   11090 {
   11091     uint32x4x2_t v;
   11092     v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]);
   11093     v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]);
   11094     vst1q_u32 (ptr, v.val[0]);
   11095     vst1q_u32 ((ptr + 4),  v.val[1]);
   11096 }
   11097 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val)
   11098 
   11099 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0]
   11100 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val);
   11101 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val)
   11102 
   11103 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   11104 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val);
   11105 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val)
   11106 
   11107 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0]
   11108 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val);
   11109 #define vst2q_s32(ptr, val)  vst2q_u32((uint32_t*)(ptr), val)
   11110 
   11111 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   11112 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val);
   11113 // IA32 SIMD doesn't work with 16bit floats currently
   11114 
   11115 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0]
   11116 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val)
   11117 {
   11118     float32x4x2_t v;
   11119     v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]);
   11120     v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]);
   11121     vst1q_f32 (ptr, v.val[0]);
   11122     vst1q_f32 ((ptr + 4),  v.val[1]);
   11123 }
   11124 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val)
   11125 
   11126 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0]
   11127 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val);
   11128 #define vst2q_p8 vst2q_u8
   11129 
   11130 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0]
   11131 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val);
   11132 #define vst2q_p16 vst2q_u16
   11133 
   11134 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0]
   11135 _NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val)
   11136 {
   11137     __m128i v0;
   11138     v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1]));
   11139     vst1q_u8 (ptr, v0);
   11140 }
   11141 
   11142 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0]
   11143 _NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val)
   11144 {
   11145     __m128i v0;
   11146     v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1]));
   11147     vst1q_u16 (ptr, v0);
   11148 }
   11149 
   11150 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0]
   11151 _NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val)
   11152 {
   11153     __m128i v0;
   11154     v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1]));
   11155     vst1q_u32 (ptr, v0);
   11156 }
   11157 
   11158 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0]
   11159 _NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val)
   11160 {
   11161     *(ptr) = val.val[0].m64_u64[0];
   11162     *(ptr + 1) = val.val[1].m64_u64[0];
   11163 }
   11164 
   11165 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0]
   11166 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val)
   11167 
   11168 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   11169 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val)
   11170 
   11171 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   11172 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val)
   11173 
   11174 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val);
   11175 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val)
   11176 
   11177 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0]
   11178 //current IA SIMD doesn't support float16
   11179 
   11180 _NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0]
   11181 _NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val)
   11182 {
   11183     *(ptr) =   val.val[0].m64_f32[0];
   11184     *(ptr + 1) = val.val[1].m64_f32[0];
   11185     *(ptr + 2) = val.val[0].m64_f32[1];
   11186     *(ptr + 3) = val.val[1].m64_f32[1];
   11187 }
   11188 
   11189 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t  val); // VST2.8 {d0, d1}, [r0]
   11190 #define vst2_p8 vst2_u8
   11191 
   11192 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t  val); // VST2.16 {d0, d1}, [r0]
   11193 #define vst2_p16 vst2_u16
   11194 
   11195 //******************** Triplets store  *****************************************
   11196 //******************************************************************************
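//Usage sketch (illustrative only), interleaving three planes into one buffer:
//    uint8_t out[48];
//    uint8x16x3_t planes;
//    planes.val[0] = vdupq_n_u8(1);        //e.g. R plane
//    planes.val[1] = vdupq_n_u8(2);        //e.g. G plane
//    planes.val[2] = vdupq_n_u8(3);        //e.g. B plane
//    vst3q_u8(out, planes);                //out = {1,2,3, 1,2,3, ...}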
   11197 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0]
   11198 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val)
   11199 {
   11200     uint8x16x3_t v;
   11201     __m128i v0,v1,v2, cff, bldmask;
   11202     _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10};
   11203     _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10};
   11204     _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff};
   11205     _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff};
   11206     _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff};
   11207     _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15};
   11208 
   11209     v0 =  _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22
   11210     v2 =  _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25,  27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46
   11211     v1 =  _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25,  27,28, 30,31, 33,34
   11212     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
   11213     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
   11214     cff = _mm_cmpeq_epi8(v0, v0); //all ff
   11215     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff);
   11216     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
   11217     vst1q_u8(ptr,   v.val[0]);
   11218     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
   11219     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
   11220     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff);
   11221     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
   11222     vst1q_u8((ptr + 16),  v.val[1]);
   11223     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
   11224     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
   11225     bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff);
   11226     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
   11227     vst1q_u8((ptr + 32),  v.val[2]);
   11228 }
   11229 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val)
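//Worked example (sketch) for vst3q_u8 above, with symbolic lanes val[0]={a0..a15}, val[1]={b0..b15}, val[2]={c0..c15}:
//    v0 = unpacklo(val[0],val[1]) = a0,b0,a1,b1, ... ,a7,b7
//    shuffling v0 by mask0 leaves holes at the 0xff positions: a0,b0,_, a1,b1,_, a2,b2,_, a3,b3,_, a4,b4,_, a5
//    blending in val[2] (shuffled by mask2lo) fills the holes, so the first stored vector is
//    a0,b0,c0, a1,b1,c1, a2,b2,c2, a3,b3,c3, a4,b4,c4, a5  - i.e. the VST3 interleave order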
   11230 
   11231 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0]
   11232 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val)
   11233 {
   11234     uint16x8x3_t v;
   11235     __m128i v0,v1,v2, cff, bldmask;
   11236     _NEON2SSE_ALIGN_16 static const uint8_t mask0[16]   = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11};
   11237     _NEON2SSE_ALIGN_16 static const uint8_t mask1[16]   = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9};
   11238     _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] =    {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff};
   11239     _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff};
   11240     _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff};
   11241     _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15};
   11242 
   11243     v0 =  _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10
   11244     v2 =  _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22,
   11245     v1 =  _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19
   11246     v.val[0] =  _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding
   11247     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding
   11248     cff = _mm_cmpeq_epi16(v0, v0); //all ff
   11249     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff);
   11250     v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask);
   11251     vst1q_u16(ptr,      v.val[0]);
   11252     v.val[0] =  _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding
   11253     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding
   11254     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff);
   11255     v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask);
   11256     vst1q_u16((ptr + 8),  v.val[1]);
   11257     v.val[0] =  _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding
   11258     v.val[2] =  _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding
   11259     bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff);
   11260     v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask );
   11261     vst1q_u16((ptr + 16), v.val[2]);
   11262 }
   11263 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val)
   11264 
   11265 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
   11266 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val)
   11267 {
   11268     //a0,a1,a2,a3,  b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3
   11269     uint32x4x3_t v;
   11270     __m128i tmp0, tmp1,tmp2;
   11271     tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1
   11272     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3
   11273     tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1
   11274     v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2,
   11275     v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3
   11276     v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3
   11277     tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1
   11278     v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1,
   11279 
   11280     vst1q_u32(ptr,      v.val[0]);
   11281     vst1q_u32((ptr + 4),  v.val[1]);
   11282     vst1q_u32((ptr + 8),  v.val[2]);
   11283 }
   11284 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val)
   11285 
   11286 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val);
   11287 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val);
   11288 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val)
   11289 
   11290 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val);
   11291 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val);
   11292 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val)
   11293 
   11294 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val);
   11295 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val);
   11296 #define vst3q_s32(ptr, val)  vst3q_u32((uint32_t*)(ptr), val)
   11297 
   11298 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
   11299 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val);
   11300 // IA32 SIMD doesn't work with 16bit floats currently
   11301 
   11302 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0]
   11303 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val)
   11304 {
   11305     float32x4x3_t v;
   11306     __m128 tmp0, tmp1,tmp2;
   11307     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1
   11308     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3
   11309     tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1
   11310     v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2,
   11311     v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3
   11312     v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3
   11313     tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1
   11314     v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1,
   11315 
   11316     vst1q_f32( ptr,    v.val[0]);
   11317     vst1q_f32( (ptr + 4),  v.val[1]);
   11318     vst1q_f32( (ptr + 8),  v.val[2]);
   11319 }
   11320 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val)
   11321 
   11322 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0]
   11323 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val);
   11324 #define vst3q_p8 vst3q_u8
   11325 
   11326 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0]
   11327 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val);
   11328 #define vst3q_p16 vst3q_u16
   11329 
   11330 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
   11331 _NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)
   11332 {
   11333     __m128i tmp, sh0, sh1, val0, val2;
   11334     _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5};
   11335     _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0};
   11336     _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0};
   11337     _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0};
   11338     tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) );
    11339     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for index bi>15 only the low 4 bits are used, so bi is wrapped (bi-=16)
   11340     val2 = _pM128i(val.val[2]);
   11341     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask0);
   11342     val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel);
   11343     vst1q_u8(ptr,   val0); //store as 128 bit structure
    11344     sh0 =  _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for index bi>15 only the low 4 bits are used, so bi is wrapped (bi-=16)
   11345     sh1 =  _mm_shuffle_epi8(val2, *(__m128i*)mask1);
   11346     val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel);
   11347     _M64((*(__m64_128*)(ptr + 16)),  val2); //need it to fit into *ptr memory
   11348 }
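         // A minimal usage sketch for the 3-way interleave above (r, g, b and out are assumed
         // caller-provided buffers of 8, 8, 8 and 24 bytes; uncomment to use):
         //   uint8x8x3_t rgb;
         //   rgb.val[0] = vld1_u8(r);   // r0..r7
         //   rgb.val[1] = vld1_u8(g);   // g0..g7
         //   rgb.val[2] = vld1_u8(b);   // b0..b7
         //   vst3_u8(out, rgb);         // out gets 24 bytes: r0,g0,b0, r1,g1,b1, ..., r7,g7,b7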
   11349 
   11350 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   11351 _NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)
   11352 {
   11353     __m128i tmp, val0, val1, val2;
   11354     _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13};
   11355     _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23,   0,0,0,0,0,0,0,0};
   11356     _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0]  otherwise from v.val[1]
   11357     _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1]  otherwise from v.val[0]
   11358     tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]));
   11359     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0);
   11360     val2 = _pM128i(val.val[2]);
   11361     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0);
   11362     val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f);
   11363     vst1q_u16(ptr,    val0); //store as 128 bit structure
   11364     val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1);
   11365     val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1);
   11366     val1 = _MM_BLENDV_EPI8(val0, val1,  *(__m128i*)mask1f); //change the operands order
   11367     _M64((*(__m64_128*)(ptr + 8)),  val1); //need it to fit into *ptr memory
   11368 }
   11369 
   11370 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
   11371 _NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)
   11372 {
   11373     //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x;
   11374     __m128i val0, val1;
   11375     val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5
   11376     val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5
   11377     val1 = _mm_srli_si128(val0, 8); //4,5, x,x
   11378     _M64((*(__m64_128*)(ptr + 4)),  val1);
   11379     val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2
   11380     val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3
   11381     vst1q_u32(ptr, val0); //store as 128 bit structure
   11382 }
   11383 
   11384 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0]
   11385 _NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)
   11386 {
   11387     *(ptr) = val.val[0].m64_u64[0];
   11388     *(ptr + 1) = val.val[1].m64_u64[0];
   11389     *(ptr + 2) = val.val[2].m64_u64[0];
   11390 }
   11391 
   11392 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val);  // VST3.8 {d0, d1, d2}, [r0]
   11393 #define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val)
   11394 
   11395 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val);  // VST3.16 {d0, d1, d2}, [r0]
   11396 #define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val)
   11397 
   11398 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0]
   11399 #define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val)
   11400 
   11401 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0]
   11402 #define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val)
   11403 
   11404 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   11405 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0]
   11406 // IA32 SIMD doesn't work with 16-bit floats currently, so the data needs to be converted to 32-bit floats and then processed in two 128-bit registers. See vld1q_f16 for an example
   11407 
   11408 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0]
   11409 _NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)
   11410 {
   11411     //val.val[0]:0,3; val.val[1]:1,4; val.val[2]:2,5  ->  0,1,2,3,4,5 interleaved in memory
   11412     *(ptr) =   val.val[0].m64_f32[0];
   11413     *(ptr + 1) = val.val[1].m64_f32[0];
   11414     *(ptr + 2) = val.val[2].m64_f32[0];
   11415     *(ptr + 3) = val.val[0].m64_f32[1];
   11416     *(ptr + 4) = val.val[1].m64_f32[1];
   11417     *(ptr + 5) = val.val[2].m64_f32[1];
   11418 }
   11419 
   11420 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0]
   11421 #define vst3_p8 vst3_u8
   11422 
   11423 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0]
   11424 #define vst3_p16 vst3_u16
   11425 
   11426 //***************  Quadruples store ********************************
   11427 //*********************************************************************
   11428 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0]
   11429 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val)
   11430 {
   11431     __m128i tmp1, tmp2, res;
   11432     tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //  0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29
   11433     tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); //  2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31
   11434     res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15
   11435     vst1q_u8(ptr,  res);
   11436     res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31
   11437     vst1q_u8((ptr + 16), res);
   11438     tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //
   11439     tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); //
   11440     res = _mm_unpacklo_epi16(tmp1, tmp2); //
   11441     vst1q_u8((ptr + 32), res);
   11442     res = _mm_unpackhi_epi16(tmp1, tmp2); //
   11443     vst1q_u8((ptr + 48), res);
   11444 }
   11445 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val)
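         // Illustrative sketch of the 4-way interleave (r, g, b, a and dst are assumed caller
         // buffers of 16, 16, 16, 16 and 64 bytes; uncomment to use):
         //   uint8x16x4_t rgba;
         //   rgba.val[0] = vld1q_u8(r);
         //   rgba.val[1] = vld1q_u8(g);
         //   rgba.val[2] = vld1q_u8(b);
         //   rgba.val[3] = vld1q_u8(a);
         //   vst4q_u8(dst, rgba);       // dst gets 64 bytes: r0,g0,b0,a0, r1,g1,b1,a1, ...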
   11446 
   11447 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0]
   11448 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val)
   11449 {
   11450     uint16x8x4_t v;
   11451     __m128i tmp1, tmp2;
   11452     tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13
   11453     tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15
   11454     v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2);
   11455     v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2);
   11456     tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //16,17, 20,21, 24,25, 28,29
   11457     tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //18,19, 22,23, 26,27, 30,31
   11458     v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2);
   11459     v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2);
   11460     vst1q_u16(ptr,     v.val[0]);
   11461     vst1q_u16((ptr + 8), v.val[1]);
   11462     vst1q_u16((ptr + 16),v.val[2]);
   11463     vst1q_u16((ptr + 24), v.val[3]);
   11464 }
   11465 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val)
   11466 
   11467 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
   11468 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val)
   11469 {
   11470     uint32x4x4_t v;
   11471     __m128i tmp1, tmp2;
   11472     tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5
   11473     tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7
   11474     v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); //0,1, 2,3
   11475     v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); //4,5, 6,7
   11476     tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //8,9, 12,13
   11477     tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //10,11, 14,15
   11478     v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2);
   11479     v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2);
   11480     vst1q_u32(ptr,      v.val[0]);
   11481     vst1q_u32((ptr + 4),  v.val[1]);
   11482     vst1q_u32((ptr + 8),  v.val[2]);
   11483     vst1q_u32((ptr + 12), v.val[3]);
   11484 }
   11485 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val)
   11486 
   11487 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val);
   11488 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val);
   11489 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val)
   11490 
   11491 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val);
   11492 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val);
   11493 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val)
   11494 
   11495 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val);
   11496 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val);
   11497 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val)
   11498 
   11499 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
   11500 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val);
   11501 // IA32 SIMD doesn't work with 16bit floats currently
   11502 
   11503 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0]
   11504 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val)
   11505 {
   11506     __m128 tmp3, tmp2, tmp1, tmp0;
   11507     float32x4x4_t v;
   11508     tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]);
   11509     tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]);
   11510     tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]);
   11511     tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]);
   11512     v.val[0] = _mm_movelh_ps(tmp0, tmp2);
   11513     v.val[1] = _mm_movehl_ps(tmp2, tmp0);
   11514     v.val[2] = _mm_movelh_ps(tmp1, tmp3);
   11515     v.val[3] = _mm_movehl_ps(tmp3, tmp1);
   11516     vst1q_f32(ptr,   v.val[0]);
   11517     vst1q_f32((ptr + 4), v.val[1]);
   11518     vst1q_f32((ptr + 8), v.val[2]);
   11519     vst1q_f32((ptr + 12), v.val[3]);
   11520 }
   11521 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val)
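         // The unpacklo/unpackhi + movelh/movehl sequence above is the usual SSE 4x4 float
         // transpose, so the four registers end up interleaved in memory. A hedged usage sketch
         // (x, y, z, w and out are assumed caller arrays of 4, 4, 4, 4 and 16 floats):
         //   float32x4x4_t q;
         //   q.val[0] = vld1q_f32(x);
         //   q.val[1] = vld1q_f32(y);
         //   q.val[2] = vld1q_f32(z);
         //   q.val[3] = vld1q_f32(w);
         //   vst4q_f32(out, q);         // out = x0,y0,z0,w0, x1,y1,z1,w1, ..., x3,y3,z3,w3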
   11522 
   11523 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0]
   11524 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val);
   11525 #define vst4q_p8 vst4q_u8
   11526 
   11527 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0]
   11528 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val);
   11529 #define vst4q_p16 vst4q_s16
   11530 
   11531 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
   11532 _NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)
   11533 {
   11534     __m128i sh0, sh1, val0, val2;
   11535     sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7,
   11536     sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7
   11537     val0 = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3,
   11538     val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7
   11539     vst1q_u8(ptr,    val0);
   11540     vst1q_u8((ptr + 16),  val2);
   11541 }
   11542 
   11543 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   11544 _NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)
   11545 {
   11546     __m128i sh0, sh1, val0, val2;
   11547     sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1,
   11548     sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3
   11549     val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3
   11550     val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3
   11551     vst1q_u16(ptr,      val0); //store as 128 bit structure
   11552     vst1q_u16((ptr + 8),  val2);
   11553 }
   11554 
   11555 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
   11556 _NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)
   11557 {
   11558     //0,4,   1,5,  2,6,  3,7
   11559     __m128i sh0, sh1, val0, val1;
   11560     sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5
   11561     sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7
   11562     val0 = _mm_unpacklo_epi64(sh0,sh1); //
   11563     val1 = _mm_unpackhi_epi64(sh0,sh1); //
   11564     vst1q_u32(ptr,     val0); //store as 128 bit structure
   11565     vst1q_u32((ptr + 4),  val1);
   11566 }
   11567 
   11568 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0]
   11569 _NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)
   11570 {
   11571     *(ptr) =  val.val[0].m64_u64[0];
   11572     *(ptr + 1) =  val.val[1].m64_u64[0];
   11573     *(ptr + 2) =  val.val[2].m64_u64[0];
   11574     *(ptr + 3) =  val.val[3].m64_u64[0];
   11575 }
   11576 
   11577 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val)  //VST4.8 {d0, d1, d2, d3}, [r0]
   11578 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val)
   11579 
   11580 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val)  // VST4.16 {d0, d1, d2, d3}, [r0]
   11581 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val)
   11582 
   11583 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0]
   11584 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val)
   11585 
   11586 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0]
   11587 _NEON2SSESTORAGE void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val);
   11588 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val)
   11589 
   11590 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   11591 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val);
   11592 // IA32 SIMD doesn't work with 16-bit floats currently, so the data needs to be converted to 32-bit floats and then processed in two 128-bit registers. See vld1q_f16 for an example
   11593 
   11594 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0]
   11595 _NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)
   11596 {
   11597     //0,4,   1,5,  2,6,  3,7 -> 0,1, 2,3, 4,5, 6,7
   11598     *(ptr) =   val.val[0].m64_f32[0];
   11599     *(ptr + 1) = val.val[1].m64_f32[0];
   11600     *(ptr + 2) = val.val[2].m64_f32[0];
   11601     *(ptr + 3) = val.val[3].m64_f32[0];
   11602     *(ptr + 4) = val.val[0].m64_f32[1];
   11603     *(ptr + 5) = val.val[1].m64_f32[1];
   11604     *(ptr + 6) = val.val[2].m64_f32[1];
   11605     *(ptr + 7) = val.val[3].m64_f32[1];
   11606 }
   11607 
   11608 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0]
   11609 #define vst4_p8 vst4_u8
   11610 
   11611 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0]
   11612 #define vst4_p16 vst4_u16
   11613 
   11614 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors  *********************
   11615 //********************************************************************************************************************
   11616 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0]
   11617 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane)
   11618 {
   11619     vst1q_lane_s16(ptr, val->val[0], lane);
   11620     vst1q_lane_s16((ptr + 1), val->val[1], lane);
   11621 }
   11622 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane)
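         // Usage sketch: store the same lane of two vectors as one adjacent pair
         // (values and the lane are arbitrary; pair is an assumed 2-element buffer):
         //   uint16x8x2_t v2;
         //   v2.val[0] = vdupq_n_u16(1);
         //   v2.val[1] = vdupq_n_u16(2);
         //   uint16_t pair[2];
         //   vst2q_lane_u16(pair, v2, 5);  // pair[0] == 1, pair[1] == 2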
   11623 
   11624 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
   11625 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane)
   11626 {
   11627     vst1q_lane_u32(ptr, val->val[0], lane);
   11628     vst1q_lane_u32((ptr + 1), val->val[1], lane);
   11629 }
   11630 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane)
   11631 
   11632 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   11633 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane);
   11634 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane)
   11635 
   11636 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0]
   11637 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane);
   11638 #define vst2q_lane_s32(ptr, val, lane)  vst2q_lane_u32((uint32_t*)ptr, val, lane)
   11639 
   11640 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   11641 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane);
   11642 //current IA SIMD doesn't support float16
   11643 
   11644 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0]
   11645 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane)
   11646 {
   11647     vst1q_lane_f32(ptr, val->val[0], lane);
   11648     vst1q_lane_f32((ptr + 1), val->val[1], lane);
   11649 }
   11650 #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane)
   11651 
   11652 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0]
   11653 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane);
   11654 #define vst2q_lane_p16 vst2q_lane_s16
   11655 
   11656 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   11657 _NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0]
   11658 {
   11659     *(ptr) = val.val[0].m64_u8[lane];
   11660     *(ptr + 1) = val.val[1].m64_u8[lane];
   11661 }
   11662 
   11663 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   11664 _NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane)
   11665 {
   11666     *(ptr) = val.val[0].m64_u16[lane];
   11667     *(ptr + 1) = val.val[1].m64_u16[lane];
   11668 }
   11669 
   11670 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
   11671 _NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane)
   11672 {
   11673     *(ptr) = val.val[0].m64_u32[lane];
   11674     *(ptr + 1) = val.val[1].m64_u32[lane];
   11675 }
   11676 
   11677 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   11678 #define vst2_lane_s8(ptr, val, lane)  vst2_lane_u8((uint8_t*)ptr, val, lane)
   11679 
   11680 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   11681 #define vst2_lane_s16(ptr, val, lane)  vst2_lane_u16((uint16_t*)ptr, val, lane)
   11682 
   11683 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0]
   11684 #define vst2_lane_s32(ptr, val, lane)  vst2_lane_u32((uint32_t*)ptr, val, lane)
   11685 
   11686 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0]
   11687 //current IA SIMD doesn't support float16
   11688 
   11689 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0]
   11690 _NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane)
   11691 {
   11692     *(ptr) = val.val[0].m64_f32[lane];
   11693     *(ptr + 1) = val.val[1].m64_f32[lane];
   11694 }
   11695 
   11696 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0]
   11697 #define vst2_lane_p8 vst2_lane_u8
   11698 
   11699 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0]
   11700 #define vst2_lane_p16 vst2_lane_u16
   11701 
   11702 //************************* Triple lanes  stores *******************************************************
   11703 //*******************************************************************************************************
   11704 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11705 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane)
   11706 {
   11707     vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane);
   11708     vst1q_lane_u16((ptr + 2), val->val[2], lane);
   11709 }
   11710 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane)
   11711 
   11712 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   11713 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane)
   11714 {
   11715     vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane);
   11716     vst1q_lane_u32((ptr + 2), val->val[2], lane);
   11717 }
   11718 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane)
   11719 
   11720 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11721 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane);
   11722 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane)
   11723 
   11724 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   11725 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane);
   11726 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane)
   11727 
   11728 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11729 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane);
   11730 //current IA SIMD doesn't support float16
   11731 
   11732 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0]
   11733 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane)
   11734 {
   11735     vst1q_lane_f32(ptr,   val->val[0], lane);
   11736     vst1q_lane_f32((ptr + 1),   val->val[1], lane);
   11737     vst1q_lane_f32((ptr + 2), val->val[2], lane);
   11738 }
   11739 #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane)
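         // Illustrative sketch: write the (x,y,z) triple held in lane 2 of three float vectors
         // to 3 consecutive floats (xyz is assumed to be filled elsewhere, out is a caller buffer):
         //   float32x4x3_t xyz;            // xyz.val[0] = x0..x3, val[1] = y0..y3, val[2] = z0..z3
         //   float32_t out[3];
         //   vst3q_lane_f32(out, xyz, 2);  // out = { x2, y2, z2 }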
   11740 
   11741 //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0]
   11742 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane);
   11743 #define vst3q_lane_p16 vst3q_lane_s16
   11744 
   11745 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   11746 _NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane)
   11747 {
   11748     *(ptr) =     val.val[0].m64_u8[lane];
   11749     *(ptr + 1) = val.val[1].m64_u8[lane];
   11750     *(ptr + 2) = val.val[2].m64_u8[lane];
   11751 }
   11752 
   11753 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11754 _NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane)
   11755 {
   11756     *(ptr) =     val.val[0].m64_u16[lane];
   11757     *(ptr + 1) = val.val[1].m64_u16[lane];
   11758     *(ptr + 2) = val.val[2].m64_u16[lane];
   11759 }
   11760 
   11761 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   11762 _NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane)
   11763 {
   11764     *(ptr) =     val.val[0].m64_u32[lane];
   11765     *(ptr + 1) = val.val[1].m64_u32[lane];
   11766     *(ptr + 2) = val.val[2].m64_u32[lane];
   11767 }
   11768 
   11769 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   11770 #define  vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane)
   11771 
   11772 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11773 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane)
   11774 
   11775 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   11776 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane)
   11777 
   11778 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11779 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane);
   11780 //current IA SIMD doesn't support float16
   11781 
   11782 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0]
   11783 _NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)
   11784 {
   11785     *(ptr) = val.val[0].m64_f32[lane];
   11786     *(ptr + 1) = val.val[1].m64_f32[lane];
   11787     *(ptr + 2) = val.val[2].m64_f32[lane];
   11788 }
   11789 
   11790 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0]
   11791 #define vst3_lane_p8 vst3_lane_u8
   11792 
   11793 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0]
   11794 #define vst3_lane_p16 vst3_lane_u16
   11795 
   11796 //******************************** Quadruple lanes stores ***********************************************
   11797 //*******************************************************************************************************
   11798 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11799 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane)
   11800 {
   11801     vst2q_lane_u16_ptr(ptr,    (uint16x8x2_t*)val4->val, lane);
   11802     vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane);
   11803 }
   11804 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane)
   11805 
   11806 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11807 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane)
   11808 {
   11809     vst2q_lane_u32_ptr(ptr,     (uint32x4x2_t*)val4->val, lane);
   11810     vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane);
   11811 }
   11812 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane)
   11813 
   11814 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11815 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane);
   11816 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane)
   11817 
   11818 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11819 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane);
   11820 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane)
   11821 
   11822 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11823 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane);
   11824 //current IA SIMD doesn't support float16
   11825 
   11826 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11827 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane)
   11828 {
   11829     vst1q_lane_f32(ptr,   val->val[0], lane);
   11830     vst1q_lane_f32((ptr + 1), val->val[1], lane);
   11831     vst1q_lane_f32((ptr + 2), val->val[2], lane);
   11832     vst1q_lane_f32((ptr + 3), val->val[3], lane);
   11833 }
   11834 #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane)
   11835 
   11836 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
   11837 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane);
   11838 #define vst4q_lane_p16 vst4q_lane_u16
   11839 
   11840 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11841 _NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)
   11842 {
   11843     *(ptr) =     val.val[0].m64_u8[lane];
   11844     *(ptr + 1) = val.val[1].m64_u8[lane];
   11845     *(ptr + 2) = val.val[2].m64_u8[lane];
   11846     *(ptr + 3) = val.val[3].m64_u8[lane];
   11847 }
   11848 
   11849 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11850 _NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)
   11851 {
   11852     *(ptr) =     val.val[0].m64_u16[lane];
   11853     *(ptr + 1) = val.val[1].m64_u16[lane];
   11854     *(ptr + 2) = val.val[2].m64_u16[lane];
   11855     *(ptr + 3) = val.val[3].m64_u16[lane];
   11856 }
   11857 
   11858 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11859 _NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)
   11860 {
   11861     *(ptr) =     val.val[0].m64_u32[lane];
   11862     *(ptr + 1) = val.val[1].m64_u32[lane];
   11863     *(ptr + 2) = val.val[2].m64_u32[lane];
   11864     *(ptr + 3) = val.val[3].m64_u32[lane];
   11865 }
   11866 
   11867 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11868 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane)
   11869 
   11870 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11871 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane)
   11872 
   11873 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11874 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane)
   11875 
   11876 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11877 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane);
   11878 //current IA SIMD doesn't support float16
   11879 
   11880 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t  val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11881 _NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)
   11882 {
   11883     *(ptr) = val.val[0].m64_f32[lane];
   11884     *(ptr + 1) = val.val[1].m64_f32[lane];
   11885     *(ptr + 2) = val.val[2].m64_f32[lane];
   11886     *(ptr + 3) = val.val[3].m64_f32[lane];
   11887 }
   11888 
   11889 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11890 #define vst4_lane_p8 vst4_lane_u8
   11891 
   11892 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
   11893 #define vst4_lane_p16 vst4_lane_u16
   11894 
   11895 //**************************************************************************************************
   11896 //************************ Extract lanes from a vector ********************************************
   11897 //**************************************************************************************************
   11898 //These intrinsics extract a single lane (element) from a vector.
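         //A couple of hedged examples of lane extraction (constants chosen arbitrarily):
         //   uint8x8_t  d = vdup_n_u8(7);
         //   uint8_t    x = vget_lane_u8(d, 3);     // x == 7
         //   int32x4_t  q = vdupq_n_s32(-1);
         //   int32_t    y = vgetq_lane_s32(q, 2);   // y == -1; lane must be a compile-time constant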
   11899 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   11900 #define vget_lane_u8(vec, lane) vec.m64_u8[lane]
   11901 
   11902 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
   11903 #define vget_lane_u16(vec, lane) vec.m64_u16[lane]
   11904 
   11905 
   11906 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   11907 #define vget_lane_u32(vec, lane) vec.m64_u32[lane]
   11908 
   11909 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0]
   11910 #define vget_lane_s8(vec, lane) vec.m64_i8[lane]
   11911 
   11912 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0]
   11913 #define vget_lane_s16(vec, lane) vec.m64_i16[lane]
   11914 
   11915 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   11916 #define vget_lane_s32(vec, lane) vec.m64_i32[lane]
   11917 
   11918 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0]
   11919 #define vget_lane_p8 vget_lane_u8
   11920 
   11921 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0]
   11922 #define vget_lane_p16 vget_lane_u16
   11923 
   11924 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0]
   11925 #define vget_lane_f32(vec, lane) vec.m64_f32[lane]
   11926 
   11927 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   11928 #define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8
   11929 
   11930 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
   11931 #define  vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16
   11932 
   11933 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   11934 #define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32
   11935 
   11936 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0]
   11937 #define vgetq_lane_s8 _MM_EXTRACT_EPI8
   11938 
   11939 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0]
   11940 #define vgetq_lane_s16 _MM_EXTRACT_EPI16
   11941 
   11942 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   11943 #define vgetq_lane_s32 _MM_EXTRACT_EPI32
   11944 
   11945 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0]
   11946 #define vgetq_lane_p8 vgetq_lane_u8
   11947 
   11948 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0]
   11949 #define vgetq_lane_p16 vgetq_lane_u16
   11950 
   11951 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0]
   11952 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane)
   11953 {
   11954     int32_t ilane;
   11955     ilane = _MM_EXTRACT_PS(vec,lane);
   11956     return *(float*)&ilane;
   11957 }
   11958 
   11959 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   11960 #define vget_lane_s64(vec, lane) vec.m64_i64[0]
   11961 
   11962 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0
   11963 #define vget_lane_u64(vec, lane) vec.m64_u64[0]
   11964 
   11965 
   11966 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
   11967 #define vgetq_lane_s64 _MM_EXTRACT_EPI64
   11968 
   11969 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0
   11970 #define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64
   11971 
   11972 // ***************** Set lanes within a vector ********************************************
   11973 // **************************************************************************************
   11974 //These intrinsics set a single lane (element) within a vector.
   11975 //Same as the corresponding vld1_lane_xx functions, but they take the value to be set directly.
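         //Hedged usage sketch (values and lanes are arbitrary):
         //   uint16x4_t  d = vdup_n_u16(0);
         //   d = vset_lane_u16(0xABCD, d, 2);       // lane 2 becomes 0xABCD, other lanes stay 0
         //   float32x4_t q = vdupq_n_f32(1.0f);
         //   q = vsetq_lane_f32(3.5f, q, 0);        // lane 0 becomes 3.5f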
   11976 
   11977 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   11978 _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane)
   11979 {
   11980     uint8_t val;
   11981     val = value;
   11982     return vld1_lane_u8(&val, vec,  lane);
   11983 }
   11984 
   11985 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   11986 _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane)
   11987 {
   11988     uint16_t val;
   11989     val = value;
   11990     return vld1_lane_u16(&val, vec,  lane);
   11991 }
   11992 
   11993 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   11994 _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane)
   11995 {
   11996     uint32_t val;
   11997     val = value;
   11998     return vld1_lane_u32(&val, vec,  lane);
   11999 }
   12000 
   12001 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   12002 _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane)
   12003 {
   12004     int8_t val;
   12005     val = value;
   12006     return vld1_lane_s8(&val, vec,  lane);
   12007 }
   12008 
   12009 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   12010 _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane)
   12011 {
   12012     int16_t val;
   12013     val = value;
   12014     return vld1_lane_s16(&val, vec,  lane);
   12015 }
   12016 
   12017 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   12018 _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane)
   12019 {
   12020     int32_t val;
   12021     val = value;
   12022     return vld1_lane_s32(&val, vec,  lane);
   12023 }
   12024 
   12025 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0
   12026 #define vset_lane_p8  vset_lane_u8
   12027 
   12028 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0
   12029 #define vset_lane_p16  vset_lane_u16
   12030 
   12031 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0
   12032 _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane)
   12033 {
   12034     float32_t val;
   12035     val = value;
   12036     return vld1_lane_f32(&val, vec,  lane);
   12037 }
   12038 
   12039 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   12040 _NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
   12041 {
   12042     uint8_t val;
   12043     val = value;
   12044     return vld1q_lane_u8(&val, vec,  lane);
   12045 }
   12046 
   12047 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   12048 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
   12049 {
   12050     uint16_t val;
   12051     val = value;
   12052     return vld1q_lane_u16(&val, vec,  lane);
   12053 }
   12054 
   12055 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   12056 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
   12057 {
   12058     uint32_t val;
   12059     val = value;
   12060     return vld1q_lane_u32(&val, vec,  lane);
   12061 }
   12062 
   12063 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   12064 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
   12065 {
   12066     int8_t val;
   12067     val = value;
   12068     return vld1q_lane_s8(&val, vec,  lane);
   12069 }
   12070 
   12071 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   12072 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
   12073 {
   12074     int16_t val;
   12075     val = value;
   12076     return vld1q_lane_s16(&val, vec,  lane);
   12077 }
   12078 
   12079 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   12080 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
   12081 {
   12082     int32_t val;
   12083     val = value;
   12084     return vld1q_lane_s32(&val, vec,  lane);
   12085 }
   12086 
   12087 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
   12088 #define vsetq_lane_p8 vsetq_lane_u8
   12089 
   12090 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
   12091 #define vsetq_lane_p16 vsetq_lane_u16
   12092 
   12093 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
   12094 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
   12095 {
   12096     float32_t val;
   12097     val = value;
   12098     return vld1q_lane_f32(&val, vec,  lane);
   12099 }
   12100 
   12101 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   12102 _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane)
   12103 {
   12104     int64_t val;
   12105     val = value;
   12106     return vld1_lane_s64(&val, vec,  lane);
   12107 }
   12108 
   12109 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0
   12110 _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane)
   12111 {
   12112     uint64_t val;
   12113     val = value;
   12114     return vld1_lane_u64(&val, vec,  lane);
   12115 }
   12116 
   12117 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
   12118 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
   12119 {
   12120     int64_t val;
   12121     val = value;
   12122     return vld1q_lane_s64(&val, vec,  lane);
   12123 }
   12124 
   12125 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
   12126 #define vsetq_lane_u64 vsetq_lane_s64
   12127 
   12128 // *******************************************************************************
   12129 // **************** Initialize a vector from bit pattern ***************************
   12130 // *******************************************************************************
   12131 //These intrinsics create a vector from a literal bit pattern.
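         //Sketch of the intended mapping (lane 0 takes the least significant bits of the 64-bit
         //pattern, which matches the little-endian reinterpretation used below):
         //   uint8x8_t v = vcreate_u8(0x0706050403020100ULL); // v = {0x00, 0x01, ..., 0x07}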
   12132 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0
   12133 _NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a)
   12134 {
   12135     return (*(__m64_128*)&(a)); //a function rather than a macro is used here because the argument may be an immediate value, whose address can't be taken
   12136 }
   12137 
   12138 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0
   12139 #define vcreate_s16  vcreate_s8
   12140 
   12141 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0
   12142 #define vcreate_s32  vcreate_s8
   12143 
   12144 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0
   12145 //no IA32 SIMD available
   12146 
   12147 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0
   12148 _NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a)
   12149 {
   12150     return (*(__m64_128*)&(a)); //a function rather than a macro is used here because the argument may be an immediate value, whose address can't be taken
   12151 }
   12152 
   12153 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0
   12154 #define vcreate_u8 vcreate_s8
   12155 
   12156 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0
   12157 #define vcreate_u16 vcreate_s16
   12158 
   12159 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0
   12160 #define vcreate_u32 vcreate_s32
   12161 
   12162 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0
   12163 #define vcreate_u64  vcreate_s8
   12164 
   12165 
   12166 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0
   12167 #define vcreate_p8 vcreate_u8
   12168 
   12169 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0
   12170 #define vcreate_p16 vcreate_u16
   12171 
   12172 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0
   12173 #define vcreate_s64 vcreate_u64
   12174 
   12175 //********************* Set all lanes to same value ********************************
   12176 //*********************************************************************************
   12177 //These intrinsics set all lanes to the same value.
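         //Hedged examples (any scalar works, including variables):
         //   int16x4_t   d = vdup_n_s16(-42);   // d = {-42, -42, -42, -42}; serial on IA32, see the performance warning below
         //   float32x4_t q = vdupq_n_f32(0.5f); // q = {0.5f, 0.5f, 0.5f, 0.5f}; maps to _mm_set1_ps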
   12178 _NEON2SSESTORAGE uint8x8_t   vdup_n_u8(uint8_t value); // VDUP.8 d0,r0
   12179 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t  vdup_n_u8(uint8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12180 {
   12181     uint8x8_t res;
   12182     int i;
   12183     for (i = 0; i<8; i++) {
   12184         res.m64_u8[i] = value;
   12185     }
   12186     return res;
   12187 }
   12188 
   12189 _NEON2SSESTORAGE uint16x4_t   vdup_n_u16(uint16_t value); // VDUP.16 d0,r0
   12190 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t  vdup_n_u16(uint16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12191 {
   12192     uint16x4_t res;
   12193     int i;
   12194     for (i = 0; i<4; i++) {
   12195         res.m64_u16[i] = value;
   12196     }
   12197     return res;
   12198 }
   12199 
   12200 _NEON2SSESTORAGE uint32x2_t   vdup_n_u32(uint32_t value); // VDUP.32 d0,r0
   12201 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t  vdup_n_u32(uint32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12202 {
   12203     uint32x2_t res;
   12204     res.m64_u32[0] = value;
   12205     res.m64_u32[1] = value;
   12206     return res;
   12207 }
   12208 
   12209 _NEON2SSESTORAGE int8x8_t   vdup_n_s8(int8_t value); // VDUP.8 d0,r0
   12210 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t  vdup_n_s8(int8_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12211 {
   12212     int8x8_t res;
   12213     int i;
   12214     for (i = 0; i<8; i++) {
   12215         res.m64_i8[i] = value;
   12216     }
   12217     return res;
   12218 }
   12219 
   12220 _NEON2SSESTORAGE int16x4_t   vdup_n_s16(int16_t value); // VDUP.16 d0,r0
   12221 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t  vdup_n_s16(int16_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12222 {
   12223     int16x4_t res;
   12224     int i;
   12225     for (i = 0; i<4; i++) {
   12226         res.m64_i16[i] = value;
   12227     }
   12228     return res;
   12229 }
   12230 
   12231 _NEON2SSESTORAGE int32x2_t   vdup_n_s32(int32_t value); // VDUP.32 d0,r0
   12232 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t  vdup_n_s32(int32_t value),  _NEON2SSE_REASON_SLOW_SERIAL)
   12233 {
   12234     int32x2_t res;
   12235     res.m64_i32[0] = value;
   12236     res.m64_i32[1] = value;
   12237     return res;
   12238 }
   12239 
   12240 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0
   12241 #define vdup_n_p8 vdup_n_u8
   12242 
   12243 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0
   12244 #define vdup_n_p16 vdup_n_s16
   12245 
   12246 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0
   12247 _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value)
   12248 {
   12249     float32x2_t res;
   12250     res.m64_f32[0] = value;
   12251     res.m64_f32[1] = value;
   12252     return res;
   12253 }
   12254 
   12255 _NEON2SSESTORAGE uint8x16_t   vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0
   12256 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value))
   12257 
   12258 _NEON2SSESTORAGE uint16x8_t   vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0
   12259 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value))
   12260 
   12261 _NEON2SSESTORAGE uint32x4_t   vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0
   12262 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value))
   12263 
   12264 _NEON2SSESTORAGE int8x16_t   vdupq_n_s8(int8_t value); // VDUP.8 q0,r0
   12265 #define vdupq_n_s8 _mm_set1_epi8
   12266 
   12267 _NEON2SSESTORAGE int16x8_t   vdupq_n_s16(int16_t value); // VDUP.16 q0,r0
   12268 #define vdupq_n_s16 _mm_set1_epi16
   12269 
   12270 _NEON2SSESTORAGE int32x4_t   vdupq_n_s32(int32_t value); // VDUP.32 q0,r0
   12271 #define vdupq_n_s32 _mm_set1_epi32
   12272 
   12273 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0
   12274 #define  vdupq_n_p8 vdupq_n_u8
   12275 
   12276 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0
   12277 #define  vdupq_n_p16 vdupq_n_u16
   12278 
   12279 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0
   12280 #define vdupq_n_f32 _mm_set1_ps
   12281 
   12282 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0
   12283 _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value)
   12284 {
   12285     int64x1_t res;
   12286     res.m64_i64[0] = value;
   12287     return res;
   12288 }
   12289 
   12290 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0
   12291 _NEON2SSE_INLINE uint64x1_t  vdup_n_u64(uint64_t value)
   12292 {
   12293     uint64x1_t res;
   12294     res.m64_u64[0] = value;
   12295     return res;
   12296 }
   12297 
   12298 _NEON2SSESTORAGE int64x2_t   vdupq_n_s64(int64_t value); // VMOV d0,r0,r0
   12299 _NEON2SSE_INLINE int64x2_t   vdupq_n_s64(int64_t value)
   12300 {
   12301     _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate
   12302     return LOAD_SI128(value2);
   12303 }
   12304 
   12305 _NEON2SSESTORAGE uint64x2_t   vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0
   12306 _NEON2SSE_INLINE uint64x2_t   vdupq_n_u64(uint64_t value)
   12307 {
   12308     _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate
   12309     return LOAD_SI128(val);
   12310 }
   12311 
   12312 //****  Set all lanes to same value  ************************
   12313 //Same functions as above - just aliases.********************
   12314 //Probably they reflect the fact that the 128-bit versions use the VMOV instruction **********
   12315 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0
   12316 #define vmov_n_u8 vdup_n_s8
   12317 
   12318 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0
#define vmov_n_u16 vdup_n_u16
   12320 
   12321 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0
   12322 #define vmov_n_u32 vdup_n_u32
   12323 
   12324 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0
   12325 #define vmov_n_s8 vdup_n_s8
   12326 
   12327 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0
   12328 #define vmov_n_s16 vdup_n_s16
   12329 
   12330 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0
   12331 #define vmov_n_s32 vdup_n_s32
   12332 
   12333 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0
   12334 #define vmov_n_p8 vdup_n_u8
   12335 
   12336 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0
   12337 #define vmov_n_p16 vdup_n_s16
   12338 
   12339 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0
   12340 #define vmov_n_f32 vdup_n_f32
   12341 
   12342 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0
   12343 #define vmovq_n_u8 vdupq_n_u8
   12344 
   12345 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0
#define vmovq_n_u16 vdupq_n_u16
   12347 
   12348 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0
   12349 #define vmovq_n_u32 vdupq_n_u32
   12350 
   12351 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0
   12352 #define vmovq_n_s8 vdupq_n_s8
   12353 
   12354 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0
   12355 #define vmovq_n_s16 vdupq_n_s16
   12356 
   12357 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0
   12358 #define vmovq_n_s32 vdupq_n_s32
   12359 
   12360 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0
   12361 #define vmovq_n_p8 vdupq_n_u8
   12362 
   12363 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0
   12364 #define vmovq_n_p16 vdupq_n_s16
   12365 
   12366 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0
   12367 #define vmovq_n_f32 vdupq_n_f32
   12368 
   12369 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0
   12370 #define vmov_n_s64 vdup_n_s64
   12371 
   12372 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0
   12373 #define vmov_n_u64 vdup_n_u64
   12374 
   12375 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0
   12376 #define vmovq_n_s64 vdupq_n_s64
   12377 
   12378 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0
   12379 #define vmovq_n_u64 vdupq_n_u64
   12380 
   12381 //**************Set all lanes to the value of one lane of a vector *************
   12382 //****************************************************************************
//here a shuffle is a better solution than lane extraction followed by a set1 call
   12384 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   12385 _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane)
   12386 {
   12387     uint8x8_t res;
   12388     uint8_t valane;
   12389     int i = 0;
   12390     valane = vec.m64_u8[lane];
   12391     for (i = 0; i<8; i++) {
   12392         res.m64_u8[i] = valane;
   12393     }
   12394     return res;
   12395 }
   12396 
   12397 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   12398 _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane)
   12399 {
   12400     uint16x4_t res;
   12401     uint16_t valane;
   12402     valane = vec.m64_u16[lane];
   12403     res.m64_u16[0] = valane;
   12404     res.m64_u16[1] = valane;
   12405     res.m64_u16[2] = valane;
   12406     res.m64_u16[3] = valane;
   12407     return res;
   12408 }
   12409 
   12410 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   12411 _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
   12412 {
   12413     uint32x2_t res;
   12414     res.m64_u32[0] = vec.m64_u32[lane];
   12415     res.m64_u32[1] = res.m64_u32[0];
   12416     return res;
   12417 }
   12418 
   12419 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec,  __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   12420 #define vdup_lane_s8 vdup_lane_u8
   12421 
   12422 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec,  __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   12423 #define vdup_lane_s16 vdup_lane_u16
   12424 
   12425 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec,  __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   12426 #define vdup_lane_s32 vdup_lane_u32
   12427 
   12428 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0]
   12429 #define vdup_lane_p8 vdup_lane_u8
   12430 
   12431 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0]
   12432 #define vdup_lane_p16 vdup_lane_s16
   12433 
   12434 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0]
   12435 _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane)
   12436 {
   12437     float32x2_t res;
   12438     res.m64_f32[0] = vec.m64_f32[lane];
   12439     res.m64_f32[1] = res.m64_f32[0];
   12440     return res;
   12441 }
   12442 
   12443 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   12444 _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0]
   12445 {
   12446     const int8_t lane8 = (int8_t) lane;
   12447     _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8};
   12448     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*) lanemask8);
   12449 }
   12450 
   12451 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   12452 _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0]
   12453 {
   12454     //we could use 8bit shuffle for 16 bit as well
   12455     const int8_t lane16 = ((int8_t) lane) << 1;
   12456     const int8_t lane16_1 = lane16 + 1;
   12457     _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1,
   12458                                                 lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1};
   12459     return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16);
   12460 }
   12461 
   12462 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   12463 _NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane)
   12464 {
    //a function (not a macro) is needed here to be gcc-friendly and to meet the immediate-constant requirement of _mm_shuffle_epi32
   12466     if (lane == 1)
   12467         return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) );
   12468     else
   12469         return _mm_shuffle_epi32 (_pM128i(vec), 0);
   12470 }
   12471 
   12472 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   12473 #define vdupq_lane_s8 vdupq_lane_u8
   12474 
   12475 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   12476 #define vdupq_lane_s16 vdupq_lane_u16
   12477 
   12478 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   12479 #define vdupq_lane_s32 vdupq_lane_u32
   12480 
   12481 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0]
   12482 #define vdupq_lane_p8 vdupq_lane_u8
   12483 
   12484 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0]
   12485 #define vdupq_lane_p16 vdupq_lane_s16
   12486 
   12487 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0]
   12488 #define  vdupq_lane_f32(vec, lane)  _mm_load1_ps((vec.m64_f32 + lane))
   12489 
   12490 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   12491 #define vdup_lane_s64(vec,lane) vec
   12492 
   12493 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0
   12494 #define vdup_lane_u64(vec,lane) vec
   12495 
   12496 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
   12497 _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane)
   12498 {
   12499     __m128i vec128;
   12500     vec128 = _pM128i(vec);
   12501     return _mm_unpacklo_epi64(vec128,vec128);
   12502 }
   12503 
   12504 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0
   12505 #define vdupq_lane_u64 vdupq_lane_s64
   12506 
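//Illustrative sketch (the helper name is ours): vdupq_lane_* replicates a single element of a
//64-bit vector across a whole 128-bit vector, e.g. to scale a vector by one of its own coefficients.
_NEON2SSE_INLINE float32x4_t neon2sse_example_scale_by_lane0(float32x4_t v, float32x2_t coeff)
{
    float32x4_t c = vdupq_lane_f32(coeff, 0);  //coeff[0] broadcast to all four lanes
    return _mm_mul_ps(v, c);
}
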
   12507 // ********************************************************************
   12508 // ********************  Combining vectors *****************************
   12509 // ********************************************************************
   12510 //These intrinsics join two 64 bit vectors into a single 128bit vector.
   12511 _NEON2SSESTORAGE int8x16_t   vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0
   12512 _NEON2SSE_INLINE int8x16_t  vcombine_s8(int8x8_t low, int8x8_t high)
   12513 {
   12514    return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) );
   12515 }
   12516 
   12517 _NEON2SSESTORAGE int16x8_t   vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0
   12518 #define vcombine_s16 vcombine_s8
   12519 
   12520 _NEON2SSESTORAGE int32x4_t   vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0
   12521 #define vcombine_s32 vcombine_s8
   12522 
   12523 _NEON2SSESTORAGE int64x2_t   vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0
   12524 #define vcombine_s64 vcombine_s8
   12525 
   12526 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0
   12527 //current IA SIMD doesn't support float16
   12528 
   12529 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0
   12530 _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high)
   12531 {
   12532     __m128i res;
   12533     res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) );
   12534     return _M128(res);
   12535 }
   12536 
   12537 _NEON2SSESTORAGE uint8x16_t   vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0
   12538 #define vcombine_u8 vcombine_s8
   12539 
   12540 _NEON2SSESTORAGE uint16x8_t   vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0
   12541 #define vcombine_u16 vcombine_s16
   12542 
   12543 _NEON2SSESTORAGE uint32x4_t   vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0
   12544 #define vcombine_u32 vcombine_s32
   12545 
   12546 _NEON2SSESTORAGE uint64x2_t   vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0
   12547 #define vcombine_u64 vcombine_s64
   12548 
   12549 _NEON2SSESTORAGE poly8x16_t   vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0
   12550 #define vcombine_p8 vcombine_u8
   12551 
   12552 _NEON2SSESTORAGE poly16x8_t   vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0
   12553 #define vcombine_p16 vcombine_u16
   12554 
   12555 //**********************************************************************
   12556 //************************* Splitting vectors **************************
   12557 //**********************************************************************
   12558 //**************** Get high part ******************************************
   12559 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors
   12560 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0
   12561 _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a)
   12562 {
   12563     int8x8_t res64;
   12564     __m128i res;
   12565     res = _mm_unpackhi_epi64(a,a); //SSE2
   12566     return64(res);
   12567 }
   12568 
   12569 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0
   12570 _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a)
   12571 {
   12572     int16x4_t res64;
   12573     __m128i res;
   12574     res =  _mm_unpackhi_epi64(a,a); //SSE2
   12575     return64(res);
   12576 }
   12577 
   12578 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0
   12579 _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a)
   12580 {
   12581     int32x2_t res64;
   12582     __m128i res;
   12583     res =  _mm_unpackhi_epi64(a,a); //SSE2
   12584     return64(res);
   12585 }
   12586 
   12587 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0
   12588 _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a)
   12589 {
   12590     int64x1_t res64;
   12591     __m128i res;
   12592     res =  _mm_unpackhi_epi64(a,a); //SSE2
   12593     return64(res);
   12594 }
   12595 
   12596 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0
   12597 // IA32 SIMD doesn't work with 16bit floats currently
   12598 
   12599 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0
   12600 _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a)
   12601 {
   12602     __m128i res;
   12603     __m64_128 res64;
   12604     res = _mm_unpackhi_epi64(_M128i(a),_M128i(a));
   12605     return64(res);
   12606 }
   12607 
   12608 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0
   12609 #define vget_high_u8 vget_high_s8
   12610 
   12611 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0
   12612 #define vget_high_u16 vget_high_s16
   12613 
   12614 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0
   12615 #define vget_high_u32 vget_high_s32
   12616 
   12617 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0
   12618 #define vget_high_u64 vget_high_s64
   12619 
   12620 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0
   12621 #define vget_high_p8 vget_high_u8
   12622 
   12623 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0
   12624 #define vget_high_p16 vget_high_u16
   12625 
   12626 //********************** Get low part **********************
   12627 //**********************************************************
   12628 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0
   12629 _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0
   12630 {
    int8x8_t res64;
   12632     return64(a);
   12633 }
   12634 
   12635 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0
   12636 _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0
   12637 {
   12638     int16x4_t res64;
   12639     return64(a);
   12640 }
   12641 
   12642 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0
   12643 _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0
   12644 {
   12645     int32x2_t res64;
   12646     return64(a);
   12647 }
   12648 
   12649 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0
   12650 _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0
   12651 {
   12652     int64x1_t res64;
   12653     return64 (a);
   12654 }
   12655 
   12656 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0
   12657 // IA32 SIMD doesn't work with 16bit floats currently
   12658 
   12659 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0
   12660 _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a)
   12661 {
   12662     float32x2_t res64;
   12663     _M64f(res64, a);
   12664     return res64;
   12665 }
   12666 
   12667 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0
   12668 #define vget_low_u8 vget_low_s8
   12669 
   12670 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0
   12671 #define vget_low_u16 vget_low_s16
   12672 
   12673 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0
   12674 #define vget_low_u32 vget_low_s32
   12675 
   12676 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0
   12677 #define vget_low_u64 vget_low_s64
   12678 
   12679 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0
   12680 #define vget_low_p8 vget_low_u8
   12681 
   12682 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0
   12683 #define vget_low_p16 vget_low_s16
   12684 
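//Hedged usage sketch (the helper name is illustrative only): vcombine/vget_high/vget_low together
//let 64-bit halves be rearranged; here the two halves of a q-register are swapped.
_NEON2SSE_INLINE uint8x16_t neon2sse_example_swap_halves(uint8x16_t v)
{
    uint8x8_t lo = vget_low_u8(v);
    uint8x8_t hi = vget_high_u8(v);
    return vcombine_u8(hi, lo); //the former high half is now the low half and vice versa
}
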
   12685 //**************************************************************************
   12686 //************************ Converting vectors **********************************
   12687 //**************************************************************************
   12688 //************* Convert from float ***************************************
// the current rounding mode needs to be set accordingly via _MM_SET_ROUNDING_MODE(x)
   12690 _NEON2SSESTORAGE int32x2_t   vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0
   12691 _NEON2SSE_INLINE int32x2_t   vcvt_s32_f32(float32x2_t a)
   12692 {
   12693     int32x2_t res64;
   12694     __m128i res;
   12695     res =  _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only
   12696     return64(res);
   12697 }
   12698 
   12699 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0
   12700 _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a)
   12701 {
   12702     uint32x2_t res64;
   12703     __m128i res;
   12704     res = vcvtq_u32_f32(_pM128(a));
   12705     return64(res);
   12706 }
   12707 
   12708 _NEON2SSESTORAGE int32x4_t  vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0
   12709 _NEON2SSE_INLINE int32x4_t  vcvtq_s32_f32(float32x4_t a)
   12710 {
   12711     __m128 dif;
   12712     __m128i res;
    //_mm_cvttps_epi32 returns 0x80000000 for any a >= 2.14748364e+009 (2^31), therefore special processing is necessary
   12714     _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f };
   12715     dif = _mm_cmpge_ps(a, *(__m128*)fmax);
   12716     res = _mm_cvttps_epi32(a);
   12717     return _mm_xor_si128(res, _M128i(dif));
   12718 }
   12719 
   12720 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0
_NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0
{
    //no single-instruction SSE solution, but it can be implemented as follows:
    //negative lanes become 0, lanes >= 2^32 saturate to 0xffffffff, all others are truncated toward zero
    __m128i res, mask, sign32;
    __m128  x, big, huge;
    _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2147483648.f, 2147483648.f, 2147483648.f, 2147483648.f }; //2^31
    _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4294967296.f, 4294967296.f, 4294967296.f, 4294967296.f }; //2^32
    mask = _mm_cmpgt_epi32(_M128i(a), _mm_setzero_si128());
    x = _mm_and_ps(_M128(mask), a); //negative lanes (and -0.0) zeroed out
    huge = _mm_cmpge_ps(x, *(__m128*)fmax_unsigned); //lanes that must saturate to 0xffffffff
    big = _mm_cmpge_ps(x, *(__m128*)fmax); //lanes >= 2^31 need the bias trick below

    x = _mm_sub_ps(x, _mm_and_ps(big, *(__m128*)fmax)); //subtract 2^31 from those lanes only
    res = _mm_cvttps_epi32(x); //exact signed truncation, every remaining in-range lane is now below 2^31
    sign32 = _mm_slli_epi32(_mm_srli_epi32(_M128i(big), 31), 31); //0x80000000 for the biased lanes

    res = _mm_add_epi32(res, sign32); //add the bias back
    return _mm_or_si128(res, _M128i(huge)); //out-of-range lanes become all ones
}
   12741 
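//For reference, an illustrative scalar model (names are ours, not part of the API) of the saturating
//behaviour the unsigned conversion above emulates: negatives go to 0, values of 2^32 or more go to
//0xffffffff, everything else is truncated toward zero.
_NEON2SSE_INLINE uint32_t neon2sse_example_scalar_cvt_u32_f32(float x)
{
    if (!(x > 0.0f)) return 0;                     //negative, zero or NaN input
    if (x >= 4294967296.0f) return 0xffffffff;     //out of the unsigned 32-bit range
    return (uint32_t)x;                            //truncation toward zero
}
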
   12742 // ***** Convert to the fixed point  with   the number of fraction bits specified by b ***********
   12743 //*************************************************************************************************
   12744 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32
   12745 _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b)
   12746 {
   12747     int32x2_t res64;
   12748     return64(vcvtq_n_s32_f32(_pM128(a),b));
   12749 }
   12750 
   12751 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32
   12752 _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b)
   12753 {
   12754     uint32x2_t res;
   12755     float convconst;
   12756     convconst = (float)((uint32_t)1 << b);
   12757     res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst);
   12758     res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst);
   12759     return res;
   12760 }
   12761 
   12762 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32
   12763 _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b)
   12764 {
   12765     float convconst;
   12766     _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   12767     __m128 cconst128;
   12768     __m128i mask, res;
    convconst = (float)((uint32_t)1 << b);
   12770     cconst128 = vdupq_n_f32(convconst);
   12771     res =  _mm_cvttps_epi32(_mm_mul_ps(a,cconst128));
   12772     mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask);
   12773     return _mm_xor_si128 (res,  mask); //res saturated for 0x80000000
   12774 }
   12775 
   12776 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32
   12777 _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b)
   12778 {
   12779     float convconst;
   12780     __m128 cconst128;
    convconst = (float)((uint32_t)1 << b);
   12782     cconst128 = vdupq_n_f32(convconst);
   12783     return vcvtq_u32_f32(_mm_mul_ps(a,cconst128));
   12784 }
   12785 
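//A short worked sketch (the helper name is ours): vcvtq_n_s32_f32(a, b) produces round-toward-zero
//fixed-point values a * 2^b, e.g. 1.5f with b = 8 becomes 384 (Q24.8 format).
_NEON2SSE_INLINE int32x4_t neon2sse_example_to_q24_8(float32x4_t a)
{
    return vcvtq_n_s32_f32(a, 8);   //each lane is now (int32_t)(lane * 256.0f)
}
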
   12786 
   12787 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0
   12788 _NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a)
   12789 {
   12790   return _mm_cvtps_epi32(a);
   12791 }
   12792 
   12793 //***************** Convert to float *************************
   12794 //*************************************************************
   12795 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0
   12796 _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits
   12797 {
   12798     float32x2_t res;
   12799     res.m64_f32[0] = (float) a.m64_i32[0];
   12800     res.m64_f32[1] = (float) a.m64_i32[1];
   12801     return res;
   12802 }
   12803 
   12804 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0
   12805 _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a)
   12806 {
   12807     float32x2_t res;
   12808     res.m64_f32[0] = (float) a.m64_u32[0];
   12809     res.m64_f32[1] = (float) a.m64_u32[1];
   12810     return res;
   12811 }
   12812 
   12813 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0
   12814 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a)
   12815 
   12816 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0
   12817 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0
   12818 {
    //this solution may not be optimal
   12820     __m128 two16, fHi, fLo;
   12821     __m128i hi, lo;
   12822     two16 = _mm_set1_ps((float)0x10000); //2^16
   12823     // Avoid double rounding by doing two exact conversions
   12824     // of high and low 16-bit segments
   12825     hi = _mm_srli_epi32(a, 16);
   12826     lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16);
   12827     fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16);
   12828     fLo = _mm_cvtepi32_ps(lo);
   12829     // do single rounding according to current rounding mode
   12830     return _mm_add_ps(fHi, fLo);
   12831 }
   12832 
   12833 // ***** Convert to the float from fixed point  with   the number of fraction bits specified by b ***********
   12834 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32
   12835 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b)
   12836 {
   12837     float32x2_t res;
   12838     float convconst;
   12839     convconst = (float)(1. / ((uint32_t)1 << b));
   12840     res.m64_f32[0] =  a.m64_i32[0] * convconst;
   12841     res.m64_f32[1] = a.m64_i32[1] * convconst;
   12842     return res;
   12843 }
   12844 
   12845 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32
   12846 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32
   12847 {
   12848     float32x2_t res;
   12849     float convconst;
   12850     convconst = (float)(1. / ((uint32_t)1 << b));
   12851     res.m64_f32[0] =  a.m64_u32[0] * convconst;
   12852     res.m64_f32[1] = a.m64_u32[1] * convconst;
   12853     return res;
   12854 }
   12855 
   12856 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32
   12857 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b)
   12858 {
   12859     float convconst;
   12860     __m128 cconst128, af;
   12861     convconst = (float)(1. / ((uint32_t)1 << b));
   12862     af = _mm_cvtepi32_ps(a);
   12863     cconst128 = vdupq_n_f32(convconst);
   12864     return _mm_mul_ps(af,cconst128);
   12865 }
   12866 
   12867 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32
   12868 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b)
   12869 {
   12870     float convconst;
   12871     __m128 cconst128, af;
    convconst = (float)(1. / ((uint32_t)1 << b));
   12873     af = vcvtq_f32_u32(a);
   12874     cconst128 = vdupq_n_f32(convconst);
   12875     return _mm_mul_ps(af,cconst128);
   12876 }
   12877 
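//Illustrative sketch (the helper name is assumed): vcvtq_n_f32_s32 is the inverse scaling, dividing
//by 2^b, so it turns Q24.8 fixed-point data back into floats.
_NEON2SSE_INLINE float32x4_t neon2sse_example_from_q24_8(int32x4_t a)
{
    return vcvtq_n_f32_s32(a, 8);   //each lane becomes lane / 256.0f
}
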
   12878 //**************Convert between floats ***********************
   12879 //************************************************************
   12880 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0
//Intel SIMD doesn't support 16-bit floats currently
   12882 
   12883 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0
//Intel SIMD doesn't support 16-bit floats currently, the only solution is to store the 16-bit floats and load them as 32-bit floats
   12885 
   12886 //************Vector narrow integer conversion (truncation) ******************
   12887 //****************************************************************************
   12888 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0
   12889 _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0
   12890 {
   12891     int8x8_t res64;
   12892     __m128i res;
   12893     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only
   12894     return64(res);
   12895 }
   12896 
   12897 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0
   12898 _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0
   12899 {
   12900     int16x4_t res64;
   12901     __m128i res;
   12902     res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only
   12903     return64(res);
   12904 }
   12905 
   12906 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0
   12907 _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a)
   12908 {
    //may not be as efficient as a serial implementation
   12910     int32x2_t res64;
   12911     __m128i res;
   12912     res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0)
   12913     return64(res);
   12914 }
   12915 
   12916 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0
   12917 #define vmovn_u16 vmovn_s16
   12918 
   12919 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0
   12920 #define vmovn_u32 vmovn_s32
   12921 
   12922 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0
   12923 #define vmovn_u64 vmovn_s64
   12924 
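//Illustrative sketch (the helper name is ours): vmovn_* simply drops the upper half of each element,
//so 0x1234 narrows to 0x34 - there is no saturation here (see the vqmovn_* functions further below for that).
_NEON2SSE_INLINE uint8x8_t neon2sse_example_narrow_truncate(uint16x8_t a)
{
    return vmovn_u16(a);   //eight 16-bit lanes -> eight low bytes
}
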
   12925 //**************** Vector long move   ***********************
   12926 //***********************************************************
   12927 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0
   12928 _NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a)
   12929 {
   12930     return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1
   12931 }
   12932 
   12933 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0
   12934 _NEON2SSE_INLINE int32x4_t vmovl_s16(int16x4_t a)
   12935 {
   12936     return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1
   12937 }
   12938 
   12939 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0
   12940 _NEON2SSE_INLINE int64x2_t  vmovl_s32(int32x2_t a)
   12941 {
   12942     return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1
   12943 }
   12944 
   12945 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0
   12946 _NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a)
   12947 {
   12948     return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1
   12949 }
   12950 
_NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0
   12952 _NEON2SSE_INLINE uint32x4_t  vmovl_u16(uint16x4_t a)
   12953 {
   12954     return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1
   12955 }
   12956 
   12957 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0
   12958 _NEON2SSE_INLINE uint64x2_t  vmovl_u32(uint32x2_t a)
   12959 {
   12960     return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1
   12961 }
   12962 
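//Illustrative sketch (the helper name is ours): vmovl_* zero- or sign-extends each element, the usual
//first step before arithmetic that would overflow in the narrow type.
_NEON2SSE_INLINE uint16x8_t neon2sse_example_widen_and_double(uint8x8_t a)
{
    uint16x8_t wide = vmovl_u8(a);          //eight bytes -> eight 16-bit lanes
    return _mm_add_epi16(wide, wide);       //doubling now cannot wrap around 255
}
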
   12963 //*************Vector saturating narrow integer*****************
   12964 //**************************************************************
   12965 _NEON2SSESTORAGE int8x8_t   vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0
   12966 _NEON2SSE_INLINE int8x8_t   vqmovn_s16(int16x8_t a)
   12967 {
   12968     int8x8_t res64;
   12969     __m128i res;
   12970     res = _mm_packs_epi16(a, a);
   12971     return64(res);
   12972 }
   12973 
   12974 _NEON2SSESTORAGE int16x4_t   vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0
   12975 _NEON2SSE_INLINE int16x4_t   vqmovn_s32(int32x4_t a)
   12976 {
   12977     int16x4_t res64;
   12978     __m128i res;
   12979     res = _mm_packs_epi32(a, a);
   12980     return64(res);
   12981 }
   12982 
   12983 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0
   12984 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution
   12985 {
   12986     int32x2_t res;
   12987     _NEON2SSE_ALIGN_16 int64_t atmp[2];
   12988     _mm_store_si128((__m128i*)atmp, a);
   12989     if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX;
   12990     if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN;
   12991     if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX;
   12992     if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN;
   12993     res.m64_i32[0] = (int32_t)atmp[0];
   12994     res.m64_i32[1] = (int32_t)atmp[1];
   12995     return res;
   12996 }
   12997 
_NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0
_NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.U16 d0,q0
   13000 {
    //SSE has no unsigned 16->8 saturating pack, so the values are first truncated to the signed maximum. _mm_shuffle_epi8 is avoided here because of its high latency on older Atom CPUs
   13002     uint8x8_t res64;
   13003     __m128i c7fff, a_trunc, mask_trunc;
   13004     c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero
   13005     a_trunc =  _mm_and_si128(a,  c7fff); // a truncated to max signed
    mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //if the masked value compares greater (signed) than the original then bit 15 was set initially
   13007     mask_trunc =  _mm_and_si128(mask_trunc,  c7fff);  //zero or c7fff if the 15-th bit had been set initially
   13008     a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
   13009     a_trunc =  _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
   13010     return64(a_trunc);
   13011 }
   13012 
   13013 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
   13014 _NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
   13015 {
   13016      #ifdef USE_SSE4
   13017         //no uint32 to uint16 conversion in SSE, need truncate to max signed first
   13018         uint16x4_t res64;
   13019         __m128i c7fffffff, a_trunc, mask_trunc;
        c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // bit 31 set to zero
        a_trunc =  _mm_and_si128(a,  c7fffffff); // a truncated to max signed
        mask_trunc =  _mm_cmpgt_epi16(a_trunc, a); //the 16-bit compare of the upper halves flags lanes whose bit 31 was set initially (a_trunc > a only then)
        mask_trunc =  _mm_and_si128(mask_trunc,  c7fffffff);  //zero, or 0x7fff0000 per lane if bit 31 was set initially
   13024         a_trunc = _mm_or_si128(a_trunc,  mask_trunc);
   13025         a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
   13026         return64(a_trunc);
   13027     #else
   13028         uint16x4_t res64;
   13029        __m128i res_hi, mask;
   13030         mask = _mm_setzero_si128();
   13031         res_hi = _mm_srli_epi32(a, 16);
   13032         res_hi = _mm_cmpeq_epi16(res_hi, mask);
   13033         mask = _mm_cmpeq_epi16(mask,mask); //all fff
        mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to flag the lanes with numbers wider than 16 bits
   13035         res_hi = _mm_or_si128(a, mask); //saturated res
   13036         res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
   13037         return64(res_hi);
   13038     #endif
   13039 }
   13040 
   13041 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
   13042 _NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
   13043 {
   13044     //serial solution may be faster
   13045     uint32x2_t res64;
   13046     __m128i res_hi, mask;
   13047     mask = _mm_setzero_si128();
   13048     res_hi = _mm_srli_epi64(a, 32);
   13049     res_hi = _mm_cmpeq_epi32(res_hi, mask);
   13050     mask = _mm_cmpeq_epi32(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to flag the lanes with numbers wider than 32 bits
   13052     res_hi = _mm_or_si128(a, mask);
   13053     res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   13054     return64(res_hi);
   13055 }
   13056 //************* Vector saturating narrow integer signed->unsigned **************
   13057 //*****************************************************************************
   13058 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
   13059 _NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
   13060 {
   13061     uint8x8_t res64;
   13062     __m128i res;
   13063     res = _mm_packus_epi16(a, a); //use low 64bits only
   13064     return64(res);
   13065 }
   13066 
   13067 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
   13068 _NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
   13069 {
   13070     uint16x4_t res64;
   13071     __m128i res;
   13072     res = _MM_PACKUS1_EPI32(a); //use low 64bits only
   13073     return64(res);
   13074 }
   13075 
   13076 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
   13077 _NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
   13078 {
   13079     uint32x2_t res64;
   13080     __m128i res_hi,res_lo, zero, cmp;
   13081     zero = _mm_setzero_si128();
   13082     res_hi = _mm_srli_epi64(a,  32);
   13083     cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
   13084     res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0  and the result is 0
   13085     cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive
    res_lo =  _mm_or_si128(res_lo, cmp); //if cmp is positive the value does not fit in 32 bits and must saturate to 0xffffffff
   13087     res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
   13088     return64(res_lo);
   13089 }
   13090 
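//Illustrative sketch (the helper name is ours): the difference between the saturating narrows -
//vqmovn_s16 clamps to the signed byte range [-128,127] while vqmovun_s16 clamps to the unsigned
//range [0,255], so a lane holding -1 becomes 0xff in the first case and 0x00 in the second.
_NEON2SSE_INLINE uint8x8_t neon2sse_example_narrow_to_unsigned(int16x8_t a)
{
    return vqmovun_s16(a);   //negative lanes saturate to 0, lanes above 255 saturate to 255
}
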
   13091 // ********************************************************
   13092 // **************** Table look up **************************
   13093 // ********************************************************
   13094 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
   13095 //in a table and generate a new vector. Indexes out of range return 0.
   13096 //for Intel SIMD we need to set the MSB to 1 for zero return
   13097 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   13098 _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b)
   13099 {
   13100     uint8x8_t res64;
   13101     __m128i c7, maskgt, bmask, b128;
   13102     c7 = _mm_set1_epi8 (7);
   13103     b128 = _pM128i(b);
   13104     maskgt = _mm_cmpgt_epi8(b128,c7);
   13105     bmask = _mm_or_si128(b128,maskgt);
   13106     bmask = _mm_shuffle_epi8(_pM128i(a),bmask);
   13107     return64(bmask);
   13108 }
   13109 
   13110 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a,  int8x8_t b); // VTBL.8 d0, {d0}, d0
   13111 #define vtbl1_s8 vtbl1_u8
   13112 
   13113 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0
   13114 #define vtbl1_p8 vtbl1_u8
   13115 
   13116 _NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13117 _NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b)
   13118 {
   13119     uint8x8_t res64;
   13120     __m128i c15, a01, maskgt15, bmask, b128;
   13121     c15 = _mm_set1_epi8 (15);
   13122     b128 = _pM128i(b);
   13123     maskgt15 = _mm_cmpgt_epi8(b128,c15);
   13124     bmask = _mm_or_si128(b128, maskgt15);
   13125     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1]));
   13126     a01 =  _mm_shuffle_epi8(a01, bmask);
   13127     return64(a01);
   13128 }
   13129 
   13130 //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13131 #define vtbl2_s8 vtbl2_u8
   13132 
   13133 //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0
   13134 #define vtbl2_p8 vtbl2_u8
   13135 
   13136 _NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13137 _NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b)
   13138 {
    //this solution may not be optimal
   13140     uint8x8_t res64;
   13141     __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128;
   13142     c15 = _mm_set1_epi8 (15);
   13143     c23 = _mm_set1_epi8 (23);
   13144     b128 = _pM128i(b);
   13145     maskgt23 = _mm_cmpgt_epi8(b128,c23);
   13146     bmask = _mm_or_si128(b128, maskgt23);
   13147     maskgt15 = _mm_cmpgt_epi8(b128,c15);
   13148     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
   13149     sh0 =  _mm_shuffle_epi8(a01, bmask);
    sh1 =  _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 the index wraps modulo 16 (bi -= 16)
   13151     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1
   13152     return64(sh0);
   13153 }
   13154 
   13155 _NEON2SSESTORAGE int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13156 #define vtbl3_s8 vtbl3_u8
   13157 
   13158 _NEON2SSESTORAGE poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0
   13159 #define vtbl3_p8 vtbl3_u8
   13160 
   13161 _NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13162 _NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b)
   13163 {
    //this solution may not be optimal
   13165     uint8x8_t res64;
   13166     __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128;
   13167     c15 = _mm_set1_epi8 (15);
   13168     c31 = _mm_set1_epi8 (31);
   13169     b128 = _pM128i(b);
   13170     maskgt31 = _mm_cmpgt_epi8(b128,c31);
   13171     bmask = _mm_or_si128(b128, maskgt31);
   13172     maskgt15 = _mm_cmpgt_epi8(b128,c15);
   13173     a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1]));
   13174     a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3]));
   13175     sh0 =  _mm_shuffle_epi8(a01, bmask);
    sh1 =  _mm_shuffle_epi8(a23, bmask); //for bi>15 the index wraps modulo 16 (bi -= 16)
   13177     sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1
   13178     return64(sh0);
   13179 }
   13180 
   13181 _NEON2SSESTORAGE int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13182 #define vtbl4_s8 vtbl4_u8
   13183 
   13184 _NEON2SSESTORAGE poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0
   13185 #define vtbl4_p8 vtbl4_u8
   13186 
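//Illustrative sketch (the helper name and index table are ours): a classic vtbl1 use is a byte
//permutation, here reversing the eight bytes of a d-register; any index above 7 would yield 0.
_NEON2SSE_INLINE uint8x8_t neon2sse_example_reverse_bytes(uint8x8_t v)
{
    uint8x8_t idx;
    int i;
    for (i = 0; i < 8; i++) {
        idx.m64_u8[i] = (uint8_t)(7 - i);   //indexes 7,6,...,0
    }
    return vtbl1_u8(v, idx);
}
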
   13187 //****************** Extended table look up intrinsics ***************************
   13188 //**********************************************************************************
   13189 //VTBX (Vector Table Extension) works in the same way as VTBL do,
   13190 // except that indexes out of range leave the destination element unchanged.
   13191 
   13192 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   13193 _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c)
   13194 {
   13195     uint8x8_t res64;
   13196     __m128i c7, maskgt, sh, c128;
   13197     c7 = _mm_set1_epi8 (7);
   13198     c128 = _pM128i(c);
   13199     maskgt = _mm_cmpgt_epi8(c128,c7);
   13200     c7 = _mm_and_si128(maskgt,_pM128i(a));
   13201     sh = _mm_shuffle_epi8(_pM128i(b),c128);
   13202     sh = _mm_andnot_si128(maskgt,sh);
   13203     sh =  _mm_or_si128(sh,c7);
   13204     return64(sh);
   13205 }
   13206 
   13207 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a,  int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0
   13208 #define vtbx1_s8 vtbx1_u8
   13209 
   13210 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0
   13211 #define vtbx1_p8 vtbx1_u8
   13212 
   13213 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   13214 _NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c)
   13215 {
   13216     uint8x8_t res64;
   13217     __m128i c15, b01, maskgt15, sh, c128;
   13218     c15 = _mm_set1_epi8 (15);
   13219     c128 = _pM128i(c);
   13220     maskgt15 = _mm_cmpgt_epi8(c128, c15);
   13221     c15 = _mm_and_si128(maskgt15, _pM128i(a));
   13222     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1]));
   13223     sh =  _mm_shuffle_epi8(b01, c128);
   13224     sh = _mm_andnot_si128(maskgt15, sh);
   13225     sh =  _mm_or_si128(sh,c15);
   13226     return64(sh);
   13227 }
   13228 
   13229 //int8x8_t vtbx2_s8(int8x8_t a,  int8x8x2_t b, int8x8_t c);  // VTBX.8 d0, {d0, d1}, d0
   13230 #define vtbx2_s8 vtbx2_u8
   13231 
   13232 //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0
   13233 #define vtbx2_p8 vtbx2_u8
   13234 
   13235 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   13236 _NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c)
   13237 {
    //this solution may not be optimal
   13239     uint8x8_t res64;
   13240     __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128;
   13241     c15 = _mm_set1_epi8 (15);
   13242     c23 = _mm_set1_epi8 (23);
   13243     c128 = _pM128i(c);
   13244     maskgt15 = _mm_cmpgt_epi8(c128,c15);
   13245     maskgt23 = _mm_cmpgt_epi8(c128,c23);
   13246     c23 = _mm_and_si128(maskgt23, _pM128i(a));
   13247     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
   13248     sh0 =  _mm_shuffle_epi8(b01, c128);
    sh1 =  _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 the index wraps modulo 16 (bi -= 16)
   13250     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
   13251     sh0 = _mm_andnot_si128(maskgt23,sh0);
   13252     sh0 = _mm_or_si128(sh0,c23);
   13253     return64(sh0);
   13254 }
   13255 
   13256 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   13257 #define vtbx3_s8 vtbx3_u8
   13258 
   13259 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0
   13260 #define vtbx3_p8 vtbx3_u8
   13261 
   13262 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   13263 _NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c)
   13264 {
    //this solution may not be optimal
   13266     uint8x8_t res64;
   13267     __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128;
   13268     c15 = _mm_set1_epi8 (15);
   13269     c31 = _mm_set1_epi8 (31);
   13270     c128 = _pM128i(c);
   13271     maskgt15 = _mm_cmpgt_epi8(c128,c15);
   13272     maskgt31 = _mm_cmpgt_epi8(c128,c31);
   13273     c31 = _mm_and_si128(maskgt31, _pM128i(a));
   13274 
   13275     b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1]));
   13276     b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3]));
   13277     sh0 =  _mm_shuffle_epi8(b01, c128);
    sh1 =  _mm_shuffle_epi8(b23, c128); //for bi>15 the index wraps modulo 16 (bi -= 16)
   13279     sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15);
   13280     sh0 = _mm_andnot_si128(maskgt31,sh0);
   13281     sh0 =  _mm_or_si128(sh0,c31);
   13282     return64(sh0);
   13283 }
   13284 
   13285 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   13286 #define vtbx4_s8 vtbx4_u8
   13287 
   13288 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0
   13289 #define vtbx4_p8 vtbx4_u8
   13290 
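//Illustrative sketch (the helper name is ours): with vtbx1 an out-of-range index keeps the
//corresponding lane of the first argument, so a lookup result can be merged into existing data.
_NEON2SSE_INLINE uint8x8_t neon2sse_example_tbx_keep_odd(uint8x8_t dst, uint8x8_t table)
{
    uint8x8_t idx;
    int i;
    for (i = 0; i < 8; i++) {
        idx.m64_u8[i] = (i & 1) ? (uint8_t)0xff : (uint8_t)i;  //odd lanes get an out-of-range index
    }
    return vtbx1_u8(dst, table, idx);  //even lanes come from table, odd lanes keep dst
}
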
   13291 //*************************************************************************************************
   13292 // *************************** Operations with a scalar value *********************************
   13293 //*************************************************************************************************
   13294 
   13295 //******* Vector multiply accumulate by scalar *************************************************
   13296 //**********************************************************************************************
   13297 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
   13298 _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0]
   13299 {
   13300     int16_t c;
   13301     int16x4_t scalar;
   13302     c = vget_lane_s16(v, l);
   13303     scalar = vdup_n_s16(c);
   13304     return vmla_s16(a, b, scalar);
   13305 }
   13306 
   13307 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
   13308 _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0]
   13309 {
   13310     int32_t c;
   13311     int32x2_t scalar;
   13312     c = vget_lane_s32(v, l);
   13313     scalar = vdup_n_s32(c);
   13314     return vmla_s32(a, b, scalar);
   13315 }
   13316 
   13317 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0]
   13318 #define vmla_lane_u16 vmla_lane_s16
   13319 
   13320 
   13321 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0]
   13322 #define vmla_lane_u32 vmla_lane_s32
   13323 
   13324 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0]
   13325 _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
   13326 {
   13327     float32_t vlane;
   13328     float32x2_t c;
   13329     vlane = vget_lane_f32(v, l);
   13330     c = vdup_n_f32(vlane);
   13331     return vmla_f32(a,b,c);
   13332 }
   13333 
   13334 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
   13335 _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0]
   13336 {
   13337     int16_t vlane;
   13338     int16x8_t c;
   13339     vlane = vget_lane_s16(v, l);
   13340     c = vdupq_n_s16(vlane);
   13341     return vmlaq_s16(a,b,c);
   13342 }
   13343 
   13344 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
   13345 _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0]
   13346 {
   13347     int32_t vlane;
   13348     int32x4_t c;
   13349     vlane = vget_lane_s32(v, l);
   13350     c = vdupq_n_s32(vlane);
   13351     return vmlaq_s32(a,b,c);
   13352 }
   13353 
   13354 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0]
   13355 #define vmlaq_lane_u16 vmlaq_lane_s16
   13356 
   13357 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0]
   13358 #define vmlaq_lane_u32 vmlaq_lane_s32
   13359 
   13360 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0]
   13361 _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0]
   13362 {
   13363     float32_t vlane;
   13364     float32x4_t c;
   13365     vlane = vget_lane_f32(v, l);
   13366     c = vdupq_n_f32(vlane);
   13367     return vmlaq_f32(a,b,c);
   13368 }
   13369 
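//Illustrative sketch (the helper name is ours): the *_lane multiply-accumulate forms are typically
//used to build a matrix-vector product, accumulating column * x[lane] without re-broadcasting x.
_NEON2SSE_INLINE float32x4_t neon2sse_example_mat2_vec(float32x4_t col0, float32x4_t col1, float32x2_t x)
{
    float32x4_t acc = _mm_mul_ps(col0, vdupq_lane_f32(x, 0)); //col0 * x[0]
    return vmlaq_lane_f32(acc, col1, x, 1);                   //+ col1 * x[1]
}
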
   13370 //***************** Vector widening multiply accumulate by scalar **********************
   13371 //***************************************************************************************
   13372 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0]
   13373 _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0]
   13374 {
   13375     int16_t vlane;
   13376     int16x4_t c;
   13377     vlane = vget_lane_s16(v, l);
   13378     c = vdup_n_s16(vlane);
   13379     return vmlal_s16(a, b, c);
   13380 }
   13381 
   13382 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0]
   13383 _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0]
   13384 {
   13385     int32_t vlane;
   13386     int32x2_t c;
   13387     vlane = vget_lane_s32(v, l);
   13388     c = vdup_n_s32(vlane);
   13389     return vmlal_s32(a, b, c);
   13390 }
   13391 
_NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0, d0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.U16 q0, d0, d0[0]
   13394 {
   13395     uint16_t vlane;
   13396     uint16x4_t c;
   13397     vlane = vget_lane_u16(v, l);
   13398     c = vdup_n_u16(vlane);
   13399     return vmlal_u16(a, b, c);
   13400 }
   13401 
   13402 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0]
   13403 _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0]
   13404 {
   13405     uint32_t vlane;
   13406     uint32x2_t c;
   13407     vlane = vget_lane_u32(v, l);
   13408     c = vdup_n_u32(vlane);
   13409     return vmlal_u32(a, b, c);
   13410 }
   13411 
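//Illustrative sketch (the helper name is ours): vmlal_lane_* widens before accumulating, so 16-bit
//samples scaled by a 16-bit coefficient can be summed in 32 bits without intermediate overflow.
_NEON2SSE_INLINE int32x4_t neon2sse_example_widening_mac(int32x4_t acc, int16x4_t samples, int16x4_t coeffs)
{
    return vmlal_lane_s16(acc, samples, coeffs, 0);   //acc[i] += (int32_t)samples[i] * coeffs[0]
}
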
   13412 // ******** Vector widening saturating doubling multiply accumulate by scalar *******************************
   13413 // ************************************************************************************************
   13414 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0]
   13415 _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
   13416 {
   13417     int16_t vlane;
   13418     int16x4_t c;
   13419     vlane = vget_lane_s16(v, l);
   13420     c = vdup_n_s16(vlane);
   13421     return vqdmlal_s16(a, b, c);
   13422 }
   13423 
   13424 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0]
   13425 _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l)
   13426 {
   13427     int32_t vlane;
    int32x2_t c;
   13429     vlane = vget_lane_s32(v, l);
   13430     c = vdup_n_s32(vlane);
   13431     return vqdmlal_s32(a, b, c);
   13432 }
   13433 
   13434 // ****** Vector multiply subtract by scalar *****************
   13435 // *************************************************************
   13436 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
   13437 _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
   13438 {
   13439     int16_t vlane;
   13440     int16x4_t c;
   13441     vlane = vget_lane_s16(v, l);
   13442     c = vdup_n_s16(vlane);
   13443     return vmls_s16(a, b, c);
   13444 }
   13445 
   13446 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
   13447 _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
   13448 {
   13449     int32_t vlane;
   13450     int32x2_t c;
   13451     vlane = vget_lane_s32(v, l);
   13452     c = vdup_n_s32(vlane);
   13453     return vmls_s32(a, b, c);
   13454 }
   13455 
   13456 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0]
   13457 _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0]
   13458 {
   13459     uint16_t vlane;
   13460     uint16x4_t c;
   13461     vlane = vget_lane_s16(v, l);
   13462     c = vdup_n_s16(vlane);
   13463     return vmls_s16(a, b, c);
   13464 }
   13465 
   13466 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0]
   13467 _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0]
   13468 {
   13469     uint32_t vlane;
   13470     uint32x2_t c;
   13471     vlane = vget_lane_u32(v, l);
   13472     c = vdup_n_u32(vlane);
   13473     return vmls_u32(a, b, c);
   13474 }
   13475 
   13476 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0]
   13477 _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l)
   13478 {
   13479     float32_t vlane;
   13480     float32x2_t c;
   13481     vlane = (float) vget_lane_f32(v, l);
   13482     c = vdup_n_f32(vlane);
   13483     return vmls_f32(a,b,c);
   13484 }
   13485 
   13486 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
   13487 _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
   13488 {
   13489     int16_t vlane;
   13490     int16x8_t c;
   13491     vlane = vget_lane_s16(v, l);
   13492     c = vdupq_n_s16(vlane);
   13493     return vmlsq_s16(a, b,c);
   13494 }
   13495 
   13496 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
   13497 _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
   13498 {
   13499     int32_t vlane;
   13500     int32x4_t c;
   13501     vlane = vget_lane_s32(v, l);
   13502     c = vdupq_n_s32(vlane);
   13503     return vmlsq_s32(a,b,c);
   13504 }
   13505 
   13506 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0]
   13507 _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0]
   13508 {
   13509     uint16_t vlane;
   13510     uint16x8_t c;
   13511     vlane = vget_lane_u16(v, l);
   13512     c = vdupq_n_u16(vlane);
   13513     return vmlsq_u16(a,b,c);
   13514 }
   13515 
   13516 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0]
   13517 _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0]
   13518 {
   13519     uint32_t vlane;
   13520     uint32x4_t c;
   13521     vlane = vget_lane_u32(v, l);
   13522     c = vdupq_n_u32(vlane);
   13523     return vmlsq_u32(a,b,c);
   13524 }
   13525 
   13526 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0, q0, d0[0]
   13527 _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLS.F32 q0, q0, d0[0]
   13528 {
   13529     float32_t vlane;
   13530     float32x4_t c;
   13531     vlane = (float) vget_lane_f32(v, l);
   13532     c = vdupq_n_f32(vlane);
   13533     return vmlsq_f32(a,b,c);
   13534 }
   13535 
   13536 // **** Vector widening multiply subtract by scalar ****
   13537 // ****************************************************
   13538 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0, d0[0]
   13539 _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLSL.S16 q0, d0, d0[0]
   13540 {
   13541     int16_t vlane;
   13542     int16x4_t c;
   13543     vlane = vget_lane_s16(v, l);
   13544     c = vdup_n_s16(vlane);
   13545     return vmlsl_s16(a, b, c);
   13546 }
   13547 
   13548 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0, d0[0]
   13549 _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLSL.S32 q0, d0, d0[0]
   13550 {
   13551     int32_t vlane;
   13552     int32x2_t c;
   13553     vlane = vget_lane_s32(v, l);
   13554     c = vdup_n_s32(vlane);
   13555     return vmlsl_s32(a, b, c);
   13556 }
   13557 
   13558 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0, d0, d0[0]
   13559 _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLSL.U16 q0, d0, d0[0]
   13560 {
   13561     uint16_t vlane;
   13562     uint16x4_t c;
   13563     vlane = vget_lane_u16(v, l);
   13564     c = vdup_n_u16(vlane);
   13565     return vmlsl_u16(a, b, c); //the unsigned widening multiply is required here, the signed variant would sign-extend the lanes
   13566 }
   13567 
   13568 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0, d0, d0[0]
   13569 _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLSL.U32 q0, d0, d0[0]
   13570 {
   13571     uint32_t vlane;
   13572     uint32x2_t c;
   13573     vlane = vget_lane_u32(v, l);
   13574     c = vdup_n_u32(vlane);
   13575     return vmlsl_u32(a, b, c);
   13576 }
   13577 
   13578 //********* Vector widening saturating doubling multiply subtract by scalar **************************
   13579 //******************************************************************************************************
   13580 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0]
   13581 _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l)
   13582 {
   13583     int16_t vlane;
   13584     int16x4_t c;
   13585     vlane = vget_lane_s16(v, l);
   13586     c = vdup_n_s16(vlane);
   13587     return vqdmlsl_s16(a, b, c);
   13588 }
   13589 
   13590 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0]
   13591 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL)
   13592 {
   13593     int32_t vlane;
   13594     int32x2_t c;
   13595     vlane = vget_lane_s32(v, l);
   13596     c = vdup_n_s32(vlane);
   13597     return vqdmlsl_s32(a, b, c);
   13598 }
   13599 //********** Vector multiply with scalar *****************************
   13600 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0]
   13601 _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0]
   13602 {
   13603     int16x4_t b16x4;
   13604     b16x4 = vdup_n_s16(b);
   13605     return vmul_s16(a, b16x4);
   13606 }
   13607 
   13608 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0]
   13609 _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0]
   13610 {
   13611     //serial solution looks faster
   13612     int32x2_t b32x2;
   13613     b32x2 = vdup_n_s32(b);
   13614     return vmul_s32(a, b32x2);
   13615 }
   13616 
   13617 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0]
   13618 _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0]
   13619 {
   13620     float32x2_t b32x2;
   13621     b32x2 = vdup_n_f32(b);
   13622     return vmul_f32(a, b32x2);
   13623 }
   13624 
   13625 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0]
   13626 _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0]
   13627 {
   13628     uint16x4_t b16x4;
   13629     b16x4 = vdup_n_s16(b);
   13630     return vmul_s16(a, b16x4);
   13631 }
   13632 
   13633 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0]
   13634 _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0]
   13635 {
   13636     //serial solution looks faster
   13637     uint32x2_t b32x2;
   13638     b32x2 = vdup_n_u32(b);
   13639     return vmul_u32(a, b32x2);
   13640 }
   13641 
   13642 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0]
   13643 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0]
   13644 {
   13645     int16x8_t b16x8;
   13646     b16x8 = vdupq_n_s16(b);
   13647     return vmulq_s16(a, b16x8);
   13648 }
   13649 
   13650 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0]
   13651 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0]
   13652 {
   13653     int32x4_t b32x4;
   13654     b32x4 = vdupq_n_s32(b);
   13655     return vmulq_s32(a, b32x4);
   13656 }
   13657 
   13658 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0]
   13659 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0]
   13660 {
   13661     float32x4_t b32x4;
   13662     b32x4 = vdupq_n_f32(b);
   13663     return vmulq_f32(a, b32x4);
   13664 }
   13665 
   13666 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0]
   13667 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0]
   13668 {
   13669     uint16x8_t b16x8;
   13670     b16x8 = vdupq_n_s16(b);
   13671     return vmulq_s16(a, b16x8);
   13672 }
   13673 
   13674 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0]
   13675 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0]
   13676 {
   13677     uint32x4_t b32x4;
   13678     b32x4 = vdupq_n_u32(b);
   13679     return vmulq_u32(a, b32x4);
   13680 }
   13681 
   13682 //********** Vector multiply lane *****************************
   13683 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
   13684 _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c)
   13685 {
   13686     int16x4_t b16x4;
   13687     int16_t vlane;
   13688     vlane = vget_lane_s16(b, c);
   13689     b16x4 = vdup_n_s16(vlane);
   13690     return vmul_s16(a, b16x4);
   13691 }
   13692 
   13693 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
   13694 _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c)
   13695 {
   13696     int32x2_t b32x2;
   13697     int32_t vlane;
   13698     vlane = vget_lane_s32(b, c);
   13699     b32x2 = vdup_n_s32(vlane);
   13700     return vmul_s32(a, b32x2);
   13701 }
   13702 
   13703 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
   13704 _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c)
   13705 {
   13706     float32x2_t b32x2;
   13707     float32_t vlane;
   13708     vlane = vget_lane_f32(b, c);
   13709     b32x2 = vdup_n_f32(vlane);
   13710     return vmul_f32(a, b32x2);
   13711 }
   13712 
   13713 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
   13714 #define vmul_lane_u16 vmul_lane_s16
   13715 
   13716 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
   13717 #define vmul_lane_u32 vmul_lane_s32
   13718 
   13719 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c);
   13720 _NEON2SSE_INLINE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c)
   13721 {
   13722     int16x8_t b16x8;
   13723     int16_t vlane;
   13724     vlane = vget_lane_s16(b, c);
   13725     b16x8 = vdupq_n_s16(vlane);
   13726     return vmulq_s16(a, b16x8);
   13727 }
   13728 
   13729 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
   13730 _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c)
   13731 {
   13732     int32x4_t b32x4;
   13733     int32_t vlane;
   13734     vlane = vget_lane_s32(b, c);
   13735     b32x4 = vdupq_n_s32(vlane);
   13736     return vmulq_s32(a, b32x4);
   13737 }
   13738 
   13739 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
   13740 _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c)
   13741 {
   13742     float32x4_t b32x4;
   13743     float32_t vlane;
   13744     vlane = vget_lane_f32(b, c);
   13745     b32x4 = vdupq_n_f32(vlane);
   13746     return vmulq_f32(a, b32x4);
   13747 }
   13748 
   13749 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
   13750 #define vmulq_lane_u16 vmulq_lane_s16
   13751 
   13752 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
   13753 #define vmulq_lane_u32 vmulq_lane_s32
   13754 
   13755 //**** Vector long multiply with scalar ************
   13756 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0]
   13757 _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0]
   13758 {
   13759     int16x4_t b16x4;
   13760     b16x4 = vdup_n_s16(val2);
   13761     return vmull_s16(vec1, b16x4);
   13762 }
   13763 
   13764 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0]
   13765 _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0]
   13766 {
   13767     int32x2_t b32x2;
   13768     b32x2 = vdup_n_s32(val2);
   13769     return vmull_s32(vec1, b32x2);
   13770 }
   13771 
   13772 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0]
   13773 _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0]
   13774 {
   13775     uint16x4_t b16x4;
   13776     b16x4 = vdup_n_u16(val2);
   13777     return vmull_u16(vec1, b16x4); //the unsigned widening multiply is required here, not the signed one
   13778 }
   13779 
   13780 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0]
   13781 _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0]
   13782 {
   13783     uint32x2_t b32x2;
   13784     b32x2 = vdup_n_u32(val2);
   13785     return vmull_u32(vec1, b32x2);
   13786 }
   13787 
   13788 //**** Vector long multiply by scalar ****
   13789 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0]
   13790 _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0]
   13791 {
   13792     int16_t vlane;
   13793     int16x4_t b;
   13794     vlane = vget_lane_s16(val2, val3);
   13795     b = vdup_n_s16(vlane);
   13796     return vmull_s16(vec1, b);
   13797 }
   13798 
   13799 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0]
   13800 _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0]
   13801 {
   13802     int32_t vlane;
   13803     int32x2_t b;
   13804     vlane = vget_lane_s32(val2, val3);
   13805     b = vdup_n_s32(vlane);
   13806     return vmull_s32(vec1, b);
   13807 }
   13808 
   13809 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0]
   13810 _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0]
   13811 {
   13812     uint16_t vlane;
   13813     uint16x4_t b;
   13814     vlane = vget_lane_u16(val2, val3);
   13815     b = vdup_n_u16(vlane);
   13816     return vmull_u16(vec1, b); //the unsigned widening multiply is required here, not the signed one
   13817 }
   13818 
   13819 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0]
   13820 _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0]
   13821 {
   13822     uint32_t vlane;
   13823     uint32x2_t b;
   13824     vlane = vget_lane_u32(val2, val3);
   13825     b = vdup_n_u32(vlane);
   13826     return vmull_u32(vec1, b);
   13827 }
   13828 
   13829 //********* Vector saturating doubling long multiply with scalar  *******************
   13830 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0]
   13831 _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2)
   13832 {
   13833     //the serial solution may be faster due to saturation
   13834     int16x4_t b;
   13835     b = vdup_n_s16(val2);
   13836     return vqdmull_s16(vec1, b);
   13837 }
   13838 
   13839 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0]
   13840 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL)
   13841 {
   13842     int32x2_t b;
   13843     b = vdup_n_s32(val2);
   13844     return vqdmull_s32(vec1,b); //slow serial function!!!!
   13845 }
   13846 
   13847 //************* Vector saturating doubling long multiply by scalar ***********************************************
   13848 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0]
   13849 _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3)
   13850 {
   13851     int16_t c;
   13852     int16x4_t scalar;
   13853     c = vget_lane_s16(val2, val3);
   13854     scalar = vdup_n_s16(c);
   13855     return vqdmull_s16(vec1, scalar);
   13856 }
   13857 
   13858 
   13859 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULL.S32 q0,d0,d0[0]
   13860 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL)
   13861 {
   13862     int32_t c;
   13863     int32x2_t scalar;
   13864     c = vget_lane_s32(val2, val3);
   13865     scalar = vdup_n_s32(c);
   13866     return vqdmull_s32(vec1,scalar); //slow serial function!!!!
   13867 }
   13868 
   13869 // *****Vector saturating doubling multiply high with scalar *****
   13870 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2); //  VQDMULH.S16 d0,d0,d0[0]
   13871 _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1,  int16_t val2)
   13872 {
   13873     int16x4_t res64;
   13874     return64(vqdmulhq_n_s16(_pM128i(vec1), val2));
   13875 }
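
//Worked example of the "saturating doubling multiply returning high half" semantics, res[i] = sat((2 * a[i] * b) >> 16) for s16:
//a[i] = 0x4000, b = 0x4000 gives a doubled product of 0x20000000 and res[i] = 0x2000;
//the only saturating case is a[i] = b = -32768, which yields 0x7fff.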
   13876 
   13877 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2); //  VQDMULH.S32 d0,d0,d0[0]
   13878 _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1,  int32_t val2)
   13879 {
   13880     int32x2_t res64;
   13881     return64(vqdmulhq_n_s32(_pM128i(vec1), val2));
   13882 }
   13883 
   13884 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); //  VQDMULH.S16 q0,q0,d0[0]
   13885 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) //  VQDMULH.S16 q0,q0,d0[0]
   13886 {
   13887     //solution may not be optimal
   13888     int16x8_t scalar;
   13889     scalar = vdupq_n_s16(val2);
   13890     return vqdmulhq_s16(vec1, scalar);
   13891 }
   13892 
   13893 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); //  VQDMULH.S32 q0,q0,d0[0]
   13894 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13895 {
   13896     int32x4_t scalar;
   13897     scalar = vdupq_n_s32(val2);
   13898     return vqdmulhq_s32(vec1, scalar);
   13899 }
   13900 
   13901 //***** Vector saturating doubling multiply high by scalar ****************
   13902 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 d0,d0,d0[0]
   13903 _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 d0,d0,d0[0]
   13904 {
   13905     //solution may not be optimal
   13906     int16_t vlane;
   13907     int16x4_t scalar;
   13908     vlane = vget_lane_s16(val2, val3);
   13909     scalar = vdup_n_s16(vlane);
   13910     return vqdmulh_s16(vec1, scalar);
   13911 }
   13912 
   13913 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 d0,d0,d0[0]
   13914 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13915 {
   13916     int32_t vlane;
   13917     int32x2_t scalar;
   13918     vlane = vget_lane_s32(val2, val3);
   13919     scalar = vdup_n_s32(vlane);
   13920     return vqdmulh_s32(vec1, scalar);
   13921 }
   13922 
   13923 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); //  VQDMULH.S16 q0,q0,d0[0]
   13924 _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) //  VQDMULH.S16 q0,q0,d0[0]
   13925 {
   13926     //solution may not be optimal
   13927     int16_t vlane;
   13928     int16x8_t scalar;
   13929     vlane = vget_lane_s16(val2, val3);
   13930     scalar = vdupq_n_s16(vlane );
   13931     return vqdmulhq_s16(vec1, scalar);
   13932 }
   13933 
   13934 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); //  VQDMULH.S32 q0,q0,d0[0]
   13935 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13936 {
   13937     //solution may not be optimal
   13938     int32_t vlane;
   13939     int32x4_t scalar;
   13940     vlane = vgetq_lane_s32(_pM128i(val2), val3);
   13941     scalar = vdupq_n_s32(vlane );
   13942     return vqdmulhq_s32(vec1, scalar);
   13943 }
   13944 
   13945 //******** Vector saturating rounding doubling multiply high with scalar ***
   13946 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0]
   13947 _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0]
   13948 {
   13949     //solution may not be optimal
   13950     int16x4_t scalar;
   13951     scalar = vdup_n_s16(val2);
   13952     return vqrdmulh_s16(vec1, scalar);
   13953 }
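
//The rounding variant adds 0x8000 before taking the high half, res[i] = sat((2 * a[i] * b + 0x8000) >> 16) for s16:
//a[i] = 0x4001, b = 0x4000 gives 0x20008000 + 0x8000 = 0x20010000 and res[i] = 0x2001 (the non-rounding form would give 0x2000).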
   13954 
   13955 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0]
   13956 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13957 {
   13958     int32x2_t scalar;
   13959     scalar = vdup_n_s32(val2);
   13960     return vqrdmulh_s32(vec1, scalar);
   13961 }
   13962 
   13963 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0]
   13964 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0]
   13965 {
   13966     //solution may not be optimal
   13967     int16x8_t scalar;
   13968     scalar = vdupq_n_s16(val2);
   13969     return vqrdmulhq_s16(vec1, scalar);
   13970 }
   13971 
   13972 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0]
   13973 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13974 {
   13975     int32x4_t scalar;
   13976     scalar = vdupq_n_s32(val2);
   13977     return vqrdmulhq_s32(vec1, scalar);
   13978 }
   13979 
   13980 //********* Vector rounding saturating doubling multiply high by scalar  ****
   13981 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0]
   13982 _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0]
   13983 {
   13984     //solution may not be optimal
   13985     int16_t vlane;
   13986     int16x4_t scalar;
   13987     vlane = vget_lane_s16(val2, val3);
   13988     scalar = vdup_n_s16(vlane);
   13989     return vqrdmulh_s16(vec1, scalar);
   13990 }
   13991 
   13992 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0]
   13993 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   13994 {
   13995     int32_t vlane;
   13996     int32x2_t scalar;
   13997     vlane = vget_lane_s32(val2, val3);
   13998     scalar = vdup_n_s32(vlane);
   13999     return vqrdmulh_s32(vec1, scalar);
   14000 }
   14001 
   14002 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0]
   14003 _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0]
   14004 {
   14005     //solution may not be optimal
   14006     int16_t vlane;
   14007     int16x8_t scalar;
   14008     vlane = vget_lane_s16(val2, val3);
   14009     scalar = vdupq_n_s16(vlane);
   14010     return vqrdmulhq_s16(vec1, scalar);
   14011 }
   14012 
   14013 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0]
   14014 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE)
   14015 {
   14016     //solution may not be optimal
   14017     int32_t vlane;
   14018     int32x4_t scalar;
   14019     vlane = vgetq_lane_s32(_pM128i(val2), val3);
   14020     scalar = vdupq_n_s32(vlane );
   14021     return vqrdmulhq_s32(vec1, scalar);
   14022 }
   14023 
   14024 //**************Vector multiply accumulate with scalar *******************
   14025 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0]
   14026 _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0]
   14027 {
   14028     int16x4_t scalar;
   14029     scalar = vdup_n_s16(c);
   14030     return vmla_s16(a, b, scalar);
   14031 }
   14032 
   14033 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0]
   14034 _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0]
   14035 {
   14036     int32x2_t scalar;
   14037     scalar = vdup_n_s32(c);
   14038     return vmla_s32(a, b, scalar);
   14039 }
   14040 
   14041 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a,  uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0]
   14042 #define vmla_n_u16 vmla_n_s16
   14043 
   14044 
   14045 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a,  uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0]
   14046 #define vmla_n_u32 vmla_n_s32
   14047 
   14048 
   14049 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0]
   14050 _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0]
   14051 {
   14052     float32x2_t scalar;
   14053     scalar = vdup_n_f32(c);
   14054     return vmla_f32(a, b, scalar);
   14055 }
   14056 
   14057 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0]
   14058 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0]
   14059 {
   14060     int16x8_t scalar;
   14061     scalar = vdupq_n_s16(c);
   14062     return vmlaq_s16(a,b,scalar);
   14063 }
   14064 
   14065 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0]
   14066 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0]
   14067 {
   14068     int32x4_t scalar;
   14069     scalar = vdupq_n_s32(c);
   14070     return vmlaq_s32(a,b,scalar);
   14071 }
   14072 
   14073 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0]
   14074 #define vmlaq_n_u16 vmlaq_n_s16
   14075 
   14076 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0]
   14077 #define vmlaq_n_u32 vmlaq_n_s32
   14078 
   14079 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0]
   14080 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0]
   14081 {
   14082     float32x4_t scalar;
   14083     scalar = vdupq_n_f32(c);
   14084     return vmlaq_f32(a,b,scalar);
   14085 }
   14086 
   14087 //************Vector widening multiply accumulate with scalar****************************
   14088 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0]
   14089 _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0]
   14090 {
   14091     int16x4_t vc;
   14092     vc = vdup_n_s16(c);
   14093     return vmlal_s16(a, b, vc);
   14094 }
   14095 
   14096 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0]
   14097 _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0]
   14098 {
   14099     int32x2_t vc;
   14100     vc = vdup_n_s32(c);
   14101     return vmlal_s32(a, b, vc);
   14102 }
   14103 
   14104 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0]
   14105 _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0]
   14106 {
   14107     uint16x4_t vc;
   14108     vc = vdup_n_u16(c);
   14109     return vmlal_u16(a, b, vc);
   14110 }
   14111 
   14112 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0]
   14113 _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0]
   14114 {
   14115     uint32x2_t vc;
   14116     vc = vdup_n_u32(c);
   14117     return vmlal_u32(a, b, vc);
   14118 }
   14119 
   14120 //************ Vector widening saturating doubling multiply accumulate with scalar **************
   14121 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0]
   14122 _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c)
   14123 {
   14124     //not an optimal SIMD solution, a serial one may be faster
   14125     int16x4_t vc;
   14126     vc = vdup_n_s16(c);
   14127     return vqdmlal_s16(a, b, vc);
   14128 }
   14129 
   14130 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0]
   14131 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   14132 {
   14133     int32x2_t vc;
   14134     vc = vdup_n_s32(c);
   14135     return vqdmlal_s32(a, b, vc);
   14136 }
   14137 
   14138 //******** Vector multiply subtract with scalar **************
   14139 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0]
   14140 _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0]
   14141 {
   14142     int16x4_t vc;
   14143     vc = vdup_n_s16(c);
   14144     return vmls_s16(a, b, vc);
   14145 }
   14146 
   14147 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0]
   14148 _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0]
   14149 {
   14150     int32x2_t vc;
   14151     vc = vdup_n_s32(c);
   14152     return vmls_s32(a, b, vc);
   14153 }
   14154 
   14155 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0]
   14156 _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0]
   14157 {
   14158     uint16x4_t vc;
   14159     vc = vdup_n_s16(c);
   14160     return vmls_s16(a, b, vc);
   14161 }
   14162 
   14163 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0]
   14164 _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0]
   14165 {
   14166     uint32x2_t vc;
   14167     vc = vdup_n_u32(c);
   14168     return vmls_u32(a, b, vc);
   14169 }
   14170 
   14171 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0]
   14172 _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c)
   14173 {
   14174     float32x2_t res;
   14175     res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c;
   14176     res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c;
   14177     return res;
   14178 }
   14179 
   14180 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0]
   14181 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0]
   14182 {
   14183     int16x8_t vc;
   14184     vc = vdupq_n_s16(c);
   14185     return vmlsq_s16(a, b,vc);
   14186 }
   14187 
   14188 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0]
   14189 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
   14190 {
   14191     int32x4_t vc;
   14192     vc = vdupq_n_s32(c);
   14193     return vmlsq_s32(a,b,vc);
   14194 }
   14195 
   14196 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
   14197 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
   14198 {
   14199     uint16x8_t vc;
   14200     vc = vdupq_n_u16(c);
   14201     return vmlsq_u16(a,b,vc);
   14202 }
   14203 
   14204 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
   14205 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
   14206 {
   14207     uint32x4_t vc;
   14208     vc = vdupq_n_u32(c);
   14209     return vmlsq_u32(a,b,vc);
   14210 }
   14211 
   14212 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
   14213 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
   14214 {
   14215     float32x4_t vc;
   14216     vc = vdupq_n_f32(c);
   14217     return vmlsq_f32(a,b,vc);
   14218 }
   14219 
   14220 //**** Vector widening multiply subtract with scalar ******
   14221 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0]
   14222 _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0]
   14223 {
   14224     int16x4_t vc;
   14225     vc = vdup_n_s16(c);
   14226     return vmlsl_s16(a, b, vc);
   14227 }
   14228 
   14229 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0]
   14230 _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0]
   14231 {
   14232     int32x2_t vc;
   14233     vc = vdup_n_s32(c);
   14234     return vmlsl_s32(a, b, vc);
   14235 }
   14236 
   14237 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0]
   14238 _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0]
   14239 {
   14240     uint16x4_t vc;
   14241     vc = vdup_n_u16(c);
   14242     return vmlsl_u16(a, b, vc);
   14243 }
   14244 
   14245 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0]
   14246 _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0]
   14247 {
   14248     uint32x2_t vc;
   14249     vc = vdup_n_u32(c);
   14250     return vmlsl_u32(a, b, vc);
   14251 }
   14252 
   14253 //***** Vector widening saturating doubling multiply subtract with scalar *********
   14254 //**********************************************************************************
   14255 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0]
   14256 _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c)
   14257 {
   14258     int16x4_t vc;
   14259     vc = vdup_n_s16(c);
   14260     return vqdmlsl_s16(a, b, vc);
   14261 }
   14262 
   14263 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0]
   14264 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL)
   14265 {
   14266     int32x2_t vc;
   14267     vc = vdup_n_s32(c);
   14268     return vqdmlsl_s32(a, b, vc);
   14269 }
   14270 
   14271 //*******************  Vector extract ***********************************************
   14272 //*************************************************************************************
   14273 //VEXT (Vector Extract) extracts elements from the bottom end of the second operand
   14274 //vector and the top end of the first, concatenates them, and places the result in the destination vector:
   14275 //c elements from the bottom end of the second operand and (n-c) from the top end of the first, where n is the number of lanes
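//A disabled sketch of the expected behaviour (the helper name is illustrative only):
#if 0
static uint8x8_t ext_example(uint8x8_t a, uint8x8_t b)
{
    //if a = {0,1,2,3,4,5,6,7} and b = {8,9,10,11,12,13,14,15}, then
    //vext_u8(a, b, 3) = {3,4,5,6,7,8,9,10} - the 5 top elements of a followed by the 3 bottom elements of b
    return vext_u8(a, b, 3);
}
#endif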
   14276 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   14277 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL)
   14278 {
   14279     int8x8_t res;
   14280     int i;
   14281     for (i = 0; i<8 - c; i++) {
   14282         res.m64_i8[i] = a.m64_i8[i + c];
   14283     }
   14284     for(i = 0; i<c; i++) {
   14285         res.m64_i8[8 - c + i] = b.m64_i8[i];
   14286     }
   14287     return res;
   14288 }
   14289 
   14290 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a,  uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   14291 #define vext_u8 vext_s8
   14292 //same result tested
   14293 
   14294 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0
   14295 #define vext_p8 vext_u8
   14296 
   14297 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   14298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL)
   14299 {
   14300     int16x4_t res;
   14301     int i;
   14302     for (i = 0; i<4 - c; i++) {
   14303         res.m64_i16[i] = a.m64_i16[i + c];
   14304     }
   14305     for(i = 0; i<c; i++) {
   14306         res.m64_i16[4 - c + i] = b.m64_i16[i];
   14307     }
   14308     return res;
   14309 }
   14310 
   14311 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a,  uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   14312 #define vext_u16 vext_s16
   14313 
   14314 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0
   14315 #define vext_p16 vext_s16
   14316 
   14317 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   14318 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
   14319 {
   14320     int32x2_t res;
   14321     if (c==0) {
   14322         res.m64_i32[0] = a.m64_i32[0];
   14323         res.m64_i32[1] = a.m64_i32[1];
   14324     } else {
   14325         res.m64_i32[0] = a.m64_i32[1];
   14326         res.m64_i32[1] = b.m64_i32[0];
   14327     }
   14328     return res;
   14329 }
   14330 
   14331 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   14332 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL)
   14333 {
   14334     float32x2_t res;
   14335     if (c==0) {
   14336         res.m64_f32[0] = a.m64_f32[0];
   14337         res.m64_f32[1] = a.m64_f32[1];
   14338     } else {
   14339         res.m64_f32[0] = a.m64_f32[1];
   14340         res.m64_f32[1] = b.m64_f32[0];
   14341     }
   14342     return res;
   14343 }
   14344 
   14345 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a,  uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0
   14346 #define vext_u32 vext_s32
   14347 
   14348 
   14349 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   14350 #define vext_s64(a,b,c) a
   14351 
   14352 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0
   14353 #define vext_u64(a,b,c) a
   14354 
   14355 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   14356 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
   14357 
   14358 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   14359 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)
   14360 
   14361 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
   14362 #define vextq_p8 vextq_s8
   14363 
   14364 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   14365 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
   14366 
   14367 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   14368 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)
   14369 
   14370 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
   14371 #define vextq_p16 vextq_s16
   14372 
   14373 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   14374 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
   14375 
   14376 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   14377 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)
   14378 
   14379 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
   14380 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) )
   14381 
   14382 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
   14383 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
   14384 
   14385 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
   14386 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
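
//Note on the q-forms above: a 128-bit VEXT maps onto palignr (the _MM_ALIGNR_EPI8 helper used above), the byte shift
//being the lane index times the element size, e.g. vextq_u32(a, b, 1) expands to _MM_ALIGNR_EPI8(b, a, 4).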
   14387 
   14388 //************ Reverse vector elements (swap endianness)*****************
   14389 //*************************************************************************
   14390 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
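//e.g. for 8-bit lanes a = {0,1,2,3,4,5,6,7}: vrev64_s8(a) = {7,6,5,4,3,2,1,0}, vrev32_s8(a) = {3,2,1,0,7,6,5,4}, vrev16_s8(a) = {1,0,3,2,5,4,7,6}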
   14391 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
   14392 _NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec)
   14393 {
   14394     int8x8_t res64;
   14395     __m128i res;
   14396     res = vrev64q_s8(_pM128i(vec));
   14397     return64(res);
   14398 }
   14399 
   14400 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
   14401 _NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec)
   14402 {
   14403     int16x4_t res64;
   14404     __m128i res;
   14405     res = vrev64q_s16(_pM128i(vec));
   14406     return64(res);
   14407 }
   14408 
   14409 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
   14410 _NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec)
   14411 {
   14412     int32x2_t res;
   14413     res.m64_i32[0] = vec.m64_i32[1];
   14414     res.m64_i32[1] = vec.m64_i32[0];
   14415     return res;
   14416 }
   14417 
   14418 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
   14419 #define vrev64_u8 vrev64_s8
   14420 
   14421 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
   14422 #define vrev64_u16 vrev64_s16
   14423 
   14424 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
   14425 #define vrev64_u32 vrev64_s32
   14426 
   14427 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
   14428 #define vrev64_p8 vrev64_u8
   14429 
   14430 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
   14431 #define vrev64_p16 vrev64_u16
   14432 
   14433 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
   14434 _NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec)
   14435 {
   14436     float32x2_t res;
   14437     res.m64_f32[0] = vec.m64_f32[1];
   14438     res.m64_f32[1] = vec.m64_f32[0];
   14439     return res;
   14440 }
   14441 
   14442 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
   14443 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0
   14444 {
   14445     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8};
   14446     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   14447 }
   14448 
   14449 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
   14450 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0
   14451 {
   14452     //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask
   14453     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9};
   14454     return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16);
   14455 }
   14456 
   14457 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
   14458 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0
   14459 {
   14460     return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) );
   14461 }
   14462 
   14463 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
   14464 #define vrev64q_u8 vrev64q_s8
   14465 
   14466 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
   14467 #define vrev64q_u16 vrev64q_s16
   14468 
   14469 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
   14470 #define vrev64q_u32 vrev64q_s32
   14471 
   14472 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
   14473 #define vrev64q_p8 vrev64q_u8
   14474 
   14475 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
   14476 #define vrev64q_p16 vrev64q_u16
   14477 
   14478 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
   14479 #define vrev64q_f32(vec) _mm_shuffle_ps (vec,  vec, _MM_SHUFFLE(2,3, 0,1))
   14480 
   14481 //********************  32 bit shuffles **********************
   14482 //************************************************************
   14483 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
   14484 _NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec)
   14485 {
   14486     int8x8_t res64;
   14487     __m128i res;
   14488     res = vrev32q_s8(_pM128i(vec));
   14489     return64(res);
   14490 }
   14491 
   14492 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
   14493 _NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec)
   14494 {
   14495     int16x4_t res64;
   14496     __m128i res;
   14497     res = vrev32q_s16(_pM128i(vec));
   14498     return64(res);
   14499 }
   14500 
   14501 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
   14502 #define vrev32_u8 vrev32_s8
   14503 
   14504 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
   14505 #define vrev32_u16 vrev32_s16
   14506 
   14507 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
   14508 #define vrev32_p8 vrev32_u8
   14509 
   14510 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
   14511 #define vrev32_p16 vrev32_u16
   14512 
   14513 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
   14514 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0
   14515 {
   14516     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12};
   14517     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   14518 }
   14519 
   14520 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
   14521 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0
   14522 {
   14523     _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13};
   14524     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev_e8);
   14525 }
   14526 
   14527 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
   14528 #define vrev32q_u8 vrev32q_s8
   14529 
   14530 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
   14531 #define vrev32q_u16 vrev32q_s16
   14532 
   14533 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
   14534 #define vrev32q_p8 vrev32q_u8
   14535 
   14536 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
   14537 #define vrev32q_p16 vrev32q_u16
   14538 
   14539 //*************  16 bit shuffles **********************
   14540 //******************************************************
   14541 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
   14542 _NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec)
   14543 {
   14544     int8x8_t res64;
   14545     __m128i res;
   14546     res = vrev16q_s8(_pM128i(vec));
   14547     return64(res);
   14548 }
   14549 
   14550 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
   14551 #define vrev16_u8 vrev16_s8
   14552 
   14553 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
   14554 #define vrev16_p8 vrev16_u8
   14555 
   14556 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
   14557 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0
   14558 {
   14559     _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14};
   14560     return _mm_shuffle_epi8 (vec, *(__m128i*)  mask_rev8);
   14561 }
   14562 
   14563 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
   14564 #define vrev16q_u8 vrev16q_s8
   14565 
   14566 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
   14567 #define vrev16q_p8 vrev16q_u8
   14568 
   14569 //*********************************************************************
   14570 //**************** Other single operand arithmetic *******************
   14571 //*********************************************************************
   14572 
   14573 //*********** Absolute: Vd[i] = |Va[i]| **********************************
   14574 //************************************************************************
   14575 _NEON2SSESTORAGE int8x8_t   vabs_s8(int8x8_t a); // VABS.S8 d0,d0
   14576 _NEON2SSE_INLINE int8x8_t   vabs_s8(int8x8_t a)
   14577 {
   14578     int8x8_t res64;
   14579     __m128i res;
   14580     res = _mm_abs_epi8(_pM128i(a));
   14581     return64(res);
   14582 }
   14583 
   14584 
   14585 _NEON2SSESTORAGE int16x4_t   vabs_s16(int16x4_t a); // VABS.S16 d0,d0
   14586 _NEON2SSE_INLINE int16x4_t   vabs_s16(int16x4_t a)
   14587 {
   14588     int16x4_t res64;
   14589     __m128i res;
   14590     res = _mm_abs_epi16(_pM128i(a));
   14591     return64(res);
   14592 }
   14593 
   14594 _NEON2SSESTORAGE int32x2_t   vabs_s32(int32x2_t a); // VABS.S32 d0,d0
   14595 _NEON2SSE_INLINE int32x2_t   vabs_s32(int32x2_t a)
   14596 {
   14597     int32x2_t res64;
   14598     __m128i res;
   14599     res = _mm_abs_epi32(_pM128i(a));
   14600     return64(res);
   14601 }
   14602 
   14603 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0
   14604 _NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0
   14605 {
   14606     float32x4_t res;
   14607     __m64_128 res64;
   14608     _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
   14609     res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only
   14610     _M64f(res64, res);
   14611     return res64;
   14612 }
   14613 
   14614 _NEON2SSESTORAGE int8x16_t   vabsq_s8(int8x16_t a); // VABS.S8 q0,q0
   14615 #define vabsq_s8 _mm_abs_epi8
   14616 
   14617 _NEON2SSESTORAGE int16x8_t   vabsq_s16(int16x8_t a); // VABS.S16 q0,q0
   14618 #define vabsq_s16 _mm_abs_epi16
   14619 
   14620 _NEON2SSESTORAGE int32x4_t   vabsq_s32(int32x4_t a); // VABS.S32 q0,q0
   14621 #define vabsq_s32 _mm_abs_epi32
   14622 
   14623 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0
   14624 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0
   14625 {
   14626     _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
   14627     return _mm_and_ps (a, *(__m128*)c7fffffff);
   14628 }
   14629 
   14630 #ifdef _NEON2SSE_64BIT
   14631 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0
   14632 _NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0
   14633 {
   14634     __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31);
   14635     return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign);
   14636 }
   14637 
   14638 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0
   14639 _NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0
   14640 {
   14641     _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL};
   14642     return _mm_and_pd (a, *(__m128d*)mask);
   14643 }
   14644 #endif
   14645 
   14646 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) *********************
   14647 //**********************************************************************
   14648 //For signed-integer data types the absolute value of the most negative value is not representable by the data type, so saturation takes place
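//e.g. a lane holding -128 becomes 127 after vqabs_s8, while plain vabs_s8 wraps it back to -128; the same holds for -32768 (s16) and -2147483648 (s32)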
   14649 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0
   14650 _NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a)
   14651 {
   14652     int8x8_t res64;
   14653     __m128i res;
   14654     res = vqabsq_s8(_pM128i(a));
   14655     return64(res);
   14656 }
   14657 
   14658 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0
   14659 _NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a)
   14660 {
   14661     int16x4_t res64;
   14662     __m128i res;
   14663     res = vqabsq_s16(_pM128i(a));
   14664     return64(res);
   14665 }
   14666 
   14667 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0
   14668 _NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a)
   14669 {
   14670     int32x2_t res64;
   14671     __m128i res;
   14672     res = vqabsq_s32(_pM128i(a));
   14673     return64(res);
   14674 }
   14675 
   14676 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0
   14677 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0
   14678 {
   14679     __m128i c_128, abs, abs_cmp;
   14680     c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128
   14681     abs = _mm_abs_epi8 (a);
   14682     abs_cmp = _mm_cmpeq_epi8 (abs, c_128);
   14683     return _mm_xor_si128 (abs,  abs_cmp);
   14684 }
   14685 
   14686 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0
   14687 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0
   14688 {
   14689     __m128i c_32768, abs, abs_cmp;
   14690     c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768
   14691     abs = _mm_abs_epi16 (a);
   14692     abs_cmp = _mm_cmpeq_epi16 (abs, c_32768);
   14693     return _mm_xor_si128 (abs,  abs_cmp);
   14694 }
   14695 
   14696 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0
   14697 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0
   14698 {
   14699     __m128i c80000000, abs, abs_cmp;
   14700     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
   14701     abs = _mm_abs_epi32 (a);
   14702     abs_cmp = _mm_cmpeq_epi32 (abs, c80000000);
   14703     return _mm_xor_si128 (abs,  abs_cmp);
   14704 }
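
//A scalar reference sketch (illustration only, hypothetical helper name) of the saturating absolute
//value computed above: the only special case is the most negative input, whose absolute value is not
//representable and therefore saturates to the maximum positive value. The SSE code above gets the same
//effect by xoring abs(0x80000000) == 0x80000000 with the all-ones compare mask, yielding 0x7fffffff.
_NEON2SSE_INLINE int32_t _neon2sse_qabs_s32_sketch(int32_t x)
{
    if (x == (int32_t)0x80000000) return 0x7fffffff; //saturate |INT32_MIN|
    return (x < 0) ? -x : x;
}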
   14705 
   14706 //*************** Negate: Vd[i] = - Va[i] *************************************
   14707 //*****************************************************************************
    14708 //several Negate implementations are possible for SIMD,
    14709 //e.g. the _mm_sign_* functions with a vector of negative numbers as the second argument, but the following one gives good performance:
    14710 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
   14711 _NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
   14712 {
   14713     int8x8_t res64;
   14714     __m128i res;
   14715     res = vnegq_s8(_pM128i(a));
   14716     return64(res);
   14717 }
   14718 
    14719 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
   14720 _NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
   14721 {
   14722     int16x4_t res64;
   14723     __m128i res;
   14724     res = vnegq_s16(_pM128i(a));
   14725     return64(res);
   14726 }
   14727 
    14728 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
   14729 _NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
   14730 {
   14731     int32x2_t res64;
   14732     __m128i res;
   14733     res = vnegq_s32(_pM128i(a));
   14734     return64(res);
   14735 }
   14736 
    14737 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
    14738 _NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
   14739 {
   14740     float32x4_t res;
   14741     __m64_128 res64;
   14742     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   14743     res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
   14744     _M64f(res64, res);
   14745     return res64;
   14746 }
   14747 
    14748 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
    14749 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
   14750 {
   14751     __m128i zero;
   14752     zero = _mm_setzero_si128 ();
   14753     return _mm_sub_epi8 (zero, a);
   14754 } //or _mm_sign_epi8 (a, negative numbers vector)
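
//An alternative sketch (illustration only, hypothetical helper name) of the _mm_sign_epi8 based
//negation mentioned in the comment above: PSIGNB negates every element whose control element is
//negative, so an all minus ones control vector negates the whole register.
_NEON2SSE_INLINE __m128i _neon2sse_negq_s8_sign_sketch(__m128i a)
{
    return _mm_sign_epi8(a, _mm_set1_epi8(-1)); //every control element is negative => negate every element
}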
   14755 
    14756 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
    14757 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
   14758 {
   14759     __m128i zero;
   14760     zero = _mm_setzero_si128 ();
   14761     return _mm_sub_epi16 (zero, a);
   14762 } //or _mm_sign_epi16 (a, negative numbers vector)
   14763 
    14764 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
    14765 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
   14766 {
   14767     __m128i zero;
   14768     zero = _mm_setzero_si128 ();
   14769     return _mm_sub_epi32 (zero, a);
   14770 } //or _mm_sign_epi32 (a, negative numbers vector)
   14771 
    14772 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
    14773 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
   14774 {
   14775     _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
   14776     return _mm_xor_ps (a, *(__m128*) c80000000);
   14777 }
   14778 
   14779 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
   14780 //***************************************************************************************
    14781 //For signed-integer data types, the negation of the most negative value cannot be represented without saturation; with saturation it becomes the maximum positive value (a scalar sketch follows vqnegq_s32 below)
    14782 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
   14783 _NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
   14784 {
   14785     int8x8_t res64;
   14786     __m128i res;
   14787     res = vqnegq_s8(_pM128i(a));
   14788     return64(res);
   14789 }
   14790 
    14791 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
   14792 _NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
   14793 {
   14794     int16x4_t res64;
   14795     __m128i res;
   14796     res = vqnegq_s16(_pM128i(a));
   14797     return64(res);
   14798 }
   14799 
    14800 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
   14801 _NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
   14802 {
   14803     int32x2_t res64;
   14804     __m128i res;
   14805     res = vqnegq_s32(_pM128i(a));
   14806     return64(res);
   14807 }
   14808 
    14809 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
    14810 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
   14811 {
   14812     __m128i zero;
   14813     zero = _mm_setzero_si128 ();
    14814     return _mm_subs_epi8 (zero, a); //saturating subtraction
   14815 }
   14816 
    14817 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
    14818 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
   14819 {
   14820     __m128i zero;
   14821     zero = _mm_setzero_si128 ();
    14822     return _mm_subs_epi16 (zero, a); //saturating subtraction
   14823 }
   14824 
    14825 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
    14826 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
    14827 {
    14828     //the solution may not be optimal compared with a serial one
   14829     __m128i c80000000, zero, sub, cmp;
   14830     c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
   14831     zero = _mm_setzero_si128 ();
    14832     sub =  _mm_sub_epi32 (zero, a); //subtraction
   14833     cmp = _mm_cmpeq_epi32 (a, c80000000);
   14834     return _mm_xor_si128 (sub,  cmp);
   14835 }
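
//A scalar sketch (illustration only, hypothetical helper name) of the xor-with-compare-mask trick used
//by vqnegq_s32 above: the wrapping negation of 0x80000000 is 0x80000000 again, and xoring it with the
//all-ones compare mask turns it into the saturated result 0x7fffffff; all other inputs are unaffected.
_NEON2SSE_INLINE int32_t _neon2sse_qneg_s32_sketch(int32_t x)
{
    uint32_t sub = 0u - (uint32_t)x; //wrapping negation
    uint32_t cmp = (x == (int32_t)0x80000000) ? 0xffffffffu : 0u; //all ones only for the most negative input
    return (int32_t)(sub ^ cmp);
}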
   14836 
   14837 //****************** Count leading zeros ********************************
   14838 //**************************************************************************
    14839 //no corresponding vector intrinsics in IA32, so it needs to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits (a scalar sketch of the 32-bit algorithm follows vclzq_s32 below)
   14840 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
   14841 _NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
   14842 {
   14843     int8x8_t res64;
   14844     __m128i res;
   14845     res = vclzq_s8(_pM128i(a));
   14846     return64(res);
   14847 }
   14848 
   14849 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
   14850 _NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
   14851 {
   14852     int16x4_t res64;
   14853     __m128i res;
   14854     res = vclzq_s16(_pM128i(a));
   14855     return64(res);
   14856 }
   14857 
   14858 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
   14859 _NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
   14860 {
   14861     int32x2_t res64;
   14862     __m128i res;
   14863     res = vclzq_s32(_pM128i(a));
   14864     return64(res);
   14865 }
   14866 
   14867 
   14868 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
   14869 #define vclz_u8 vclz_s8
   14870 
   14871 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
   14872 #define vclz_u16 vclz_s16
   14873 
   14874 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
   14875 #define vclz_u32 vclz_s32
   14876 
   14877 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
   14878 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
   14879 {
   14880     _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
   14881                                                             /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
   14882                                                             /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
   14883                                                             /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0                          };
   14884     __m128i maskLOW, c4, lowclz, mask, hiclz;
    14885     maskLOW = _mm_set1_epi8(0x0f); //mask of the low 4 bits; the low-nibble look-up below needs no masking - PSHUFB zeroes bytes whose MSB is set and that result is discarded below anyway
   14886     c4 = _mm_set1_epi8(4);
   14887     lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
   14888     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
   14889     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
   14890     hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
   14891     mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
   14892     lowclz = _mm_and_si128(lowclz,mask);
   14893     return _mm_add_epi8(lowclz, hiclz);
   14894 }
   14895 
   14896 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0
   14897 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a)
   14898 {
   14899     __m128i c7, res8x16, res8x16_swap;
   14900     _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14};
   14901     _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff};
   14902     c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7
   14903     res8x16 = vclzq_s8(a);
    14904     res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horizontal pairs swap
   14905     res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz
   14906     res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz
   14907     c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros
   14908     res8x16 = _mm_and_si128(res8x16, c7); //lowclz
   14909     return _mm_add_epi16(res8x16_swap, res8x16);
   14910 }
   14911 
   14912 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0
   14913 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a)
   14914 {
   14915     __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res;
   14916     c55555555 = _mm_set1_epi32(0x55555555);
   14917     c33333333 = _mm_set1_epi32(0x33333333);
   14918     c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f);
   14919     c3f = _mm_set1_epi32(0x3f);
   14920     c32 = _mm_set1_epi32(32);
   14921     tmp = _mm_srli_epi32(a, 1);
   14922     res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1);
   14923     tmp = _mm_srli_epi32(res, 2);
   14924     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2);
   14925     tmp = _mm_srli_epi32(res, 4);
   14926     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4);
   14927     tmp = _mm_srli_epi32(res, 8);
   14928     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8);
   14929     tmp = _mm_srli_epi32(res, 16);
   14930     res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16);
   14931 
   14932     tmp = _mm_srli_epi32(res, 1);
   14933     tmp = _mm_and_si128(tmp, c55555555);
   14934     res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555);
   14935 
   14936     tmp = _mm_srli_epi32(res, 2);
   14937     tmp = _mm_and_si128(tmp, c33333333);
   14938     tmp1 = _mm_and_si128(res, c33333333);
   14939     res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333));
   14940 
   14941     tmp = _mm_srli_epi32(res, 4);
   14942     tmp = _mm_add_epi32(tmp, res);
   14943     res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f);
   14944 
   14945     tmp = _mm_srli_epi32(res, 8);
   14946     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8);
   14947 
   14948     tmp = _mm_srli_epi32(res, 16);
   14949     res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16);
   14950 
   14951     res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f;
   14952 
   14953     return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i];
   14954 }
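
//A scalar sketch (illustration only, hypothetical helper name) of the algorithm used by vclzq_s32 above:
//smear the highest set bit into all lower positions, count the set bits of the result, and subtract that
//count from 32 (an input of zero yields 32).
_NEON2SSE_INLINE int32_t _neon2sse_clz_u32_sketch(uint32_t x)
{
    x |= x >> 1;  x |= x >> 2;  x |= x >> 4;  x |= x >> 8;  x |= x >> 16; //ones from the highest set bit downwards
    x = x - ((x >> 1) & 0x55555555);                //2-bit counts
    x = ((x >> 2) & 0x33333333) + (x & 0x33333333); //4-bit counts
    x = ((x >> 4) + x) & 0x0f0f0f0f;                //8-bit counts
    x += x >> 8;
    x += x >> 16;
    return 32 - (int32_t)(x & 0x3f);
}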
   14955 
   14956 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0
   14957 #define vclzq_u8 vclzq_s8
   14958 
   14959 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0
   14960 #define vclzq_u16 vclzq_s16
   14961 
   14962 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0
   14963 #define vclzq_u32 vclzq_s32
   14964 
   14965 //************** Count leading sign bits **************************
   14966 //********************************************************************
   14967 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following
    14968 // the topmost bit that are the same as the topmost bit, in each element in a vector.
    14969 //No corresponding vector intrinsics in IA32, so it needs to be implemented.
    14970 //While the implementation is effective for 8 bits, it may not be for 16 and 32 bits (a scalar reference sketch follows vclsq_s32 below)
   14971 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0
   14972 _NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a)
   14973 {
   14974     int8x8_t res64;
   14975     __m128i res;
   14976     res = vclsq_s8(_pM128i(a));
   14977     return64(res);
   14978 }
   14979 
   14980 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0
   14981 _NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a)
   14982 {
   14983     int16x4_t res64;
   14984     __m128i res;
   14985     res = vclsq_s16(_pM128i(a));
   14986     return64(res);
   14987 }
   14988 
   14989 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0
   14990 _NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a)
   14991 {
   14992     int32x2_t res64;
   14993     __m128i res;
   14994     res = vclsq_s32(_pM128i(a));
   14995     return64(res);
   14996 }
   14997 
   14998 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0
   14999 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a)
   15000 {
   15001     __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb;
   15002     cff = _mm_cmpeq_epi8 (a,a); //0xff
   15003     c80 = _mm_set1_epi8((int8_t)0x80);
   15004     c1 = _mm_set1_epi8(1);
   15005     a_mask = _mm_and_si128(a, c80);
   15006     a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive
   15007     a_neg = _mm_xor_si128(a, cff);
   15008     a_neg = _mm_and_si128(a_mask, a_neg);
   15009     a_pos = _mm_andnot_si128(a_mask, a);
   15010     a_comb = _mm_or_si128(a_pos, a_neg);
   15011     a_comb = vclzq_s8(a_comb);
   15012     return _mm_sub_epi8(a_comb, c1);
   15013 }
   15014 
   15015 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0
   15016 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a)
   15017 {
   15018     __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb;
   15019     cffff = _mm_cmpeq_epi16(a,a);
   15020     c8000 =  _mm_slli_epi16(cffff, 15); //0x8000
   15021     c1 = _mm_srli_epi16(cffff,15); //0x1
   15022     a_mask = _mm_and_si128(a, c8000);
   15023     a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive
   15024     a_neg = _mm_xor_si128(a, cffff);
   15025     a_neg = _mm_and_si128(a_mask, a_neg);
   15026     a_pos = _mm_andnot_si128(a_mask, a);
   15027     a_comb = _mm_or_si128(a_pos, a_neg);
   15028     a_comb = vclzq_s16(a_comb);
   15029     return _mm_sub_epi16(a_comb, c1);
   15030 }
   15031 
   15032 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0
   15033 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a)
   15034 {
   15035     __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb;
   15036     cffffffff = _mm_cmpeq_epi32(a,a);
   15037     c80000000 =  _mm_slli_epi32(cffffffff, 31); //0x80000000
   15038     c1 = _mm_srli_epi32(cffffffff,31); //0x1
   15039     a_mask = _mm_and_si128(a, c80000000);
   15040     a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive
   15041     a_neg = _mm_xor_si128(a, cffffffff);
   15042     a_neg = _mm_and_si128(a_mask, a_neg);
   15043     a_pos = _mm_andnot_si128(a_mask, a);
   15044     a_comb = _mm_or_si128(a_pos, a_neg);
   15045     a_comb = vclzq_s32(a_comb);
   15046     return _mm_sub_epi32(a_comb, c1);
   15047 }
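
//A scalar sketch (illustration only, hypothetical helper name) of the approach used by vclsq_s32 above:
//complement negative inputs so that the sign bits to be counted become leading zeros, count them with the
//vclzq_s32 style algorithm (see the sketch above) and subtract one for the topmost bit itself.
_NEON2SSE_INLINE int32_t _neon2sse_cls_s32_sketch(int32_t x)
{
    uint32_t u = (x < 0) ? ~(uint32_t)x : (uint32_t)x;
    return _neon2sse_clz_u32_sketch(u) - 1;
}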
   15048 
   15049 //************************* Count number of set bits   ********************************
   15050 //*************************************************************************************
    15051 //No corresponding SIMD solution. One option is to extract the elements, convert each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic per element,
    15052 //another option is the following nibble look-up algorithm (a scalar reference sketch follows vcntq_u8 below):
   15053 
   15054 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0
   15055 _NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a)
   15056 {
   15057     uint8x8_t res64;
   15058     __m128i res;
   15059     res = vcntq_u8(_pM128i(a));
   15060     return64(res);
   15061 }
   15062 
   15063 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0
   15064 #define vcnt_s8 vcnt_u8
   15065 
   15066 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0
   15067 #define vcnt_p8 vcnt_u8
   15068 
   15069 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0
   15070 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a)
   15071 {
   15072     _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2,
   15073                                                                  /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3,
   15074                                                                  /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3,
   15075                                                                  /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4};
   15076     __m128i maskLOW, mask, lowpopcnt, hipopcnt;
   15077     maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set
   15078     mask = _mm_and_si128(a, maskLOW);
   15079     lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway
   15080     mask =  _mm_srli_epi16(a, 4); //get high 4 bits as low bits
   15081     mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
   15082     hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway
   15083     return _mm_add_epi8(lowpopcnt, hipopcnt);
   15084 }
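
//A scalar sketch (illustration only, hypothetical helper name) of the nibble look-up used by vcntq_u8
//above: the population count of a byte is the sum of the counts of its low and high nibbles, read from
//a 16-entry table.
_NEON2SSE_INLINE uint8_t _neon2sse_cnt_u8_sketch(uint8_t x)
{
    static const uint8_t nibble_popcount[16] = {0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4};
    return (uint8_t)(nibble_popcount[x & 0x0f] + nibble_popcount[x >> 4]);
}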
   15085 
   15086 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0
   15087 #define vcntq_s8 vcntq_u8
   15088 
   15089 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0
   15090 #define vcntq_p8 vcntq_u8
   15091 
   15092 //**************************************************************************************
   15093 //*********************** Logical operations ****************************************
   15094 //**************************************************************************************
   15095 //************************** Bitwise not ***********************************
    15096 //several Bitwise NOT implementations are possible for SIMD, e.g. "xor" with all ones, but the following one gives good performance (a sketch of the xor variant follows vmvnq_s8 below)
   15097 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0
   15098 _NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a)
   15099 {
   15100     int8x8_t res64;
   15101     __m128i res;
   15102     res = vmvnq_s8(_pM128i(a));
   15103     return64(res);
   15104 }
   15105 
   15106 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0
   15107 _NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a)
   15108 {
   15109     int16x4_t res64;
   15110     __m128i res;
   15111     res = vmvnq_s16(_pM128i(a));
   15112     return64(res);
   15113 }
   15114 
   15115 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0
   15116 _NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a)
   15117 {
   15118     int32x2_t res64;
   15119     __m128i res;
   15120     res = vmvnq_s32(_pM128i(a));
   15121     return64(res);
   15122 }
   15123 
   15124 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0
   15125 #define vmvn_u8 vmvn_s8
   15126 
   15127 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0
   15128 #define vmvn_u16 vmvn_s16
   15129 
   15130 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0
   15131 #define vmvn_u32 vmvn_s32
   15132 
   15133 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0
   15134 #define vmvn_p8 vmvn_u8
   15135 
   15136 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0
   15137 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0
   15138 {
   15139     __m128i c1;
   15140     c1 = _mm_cmpeq_epi8 (a,a); //0xff
   15141     return _mm_andnot_si128 (a, c1);
   15142 }
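
//A minimal sketch (illustration only, hypothetical helper name) of the "xor with all ones" variant
//mentioned above; it produces the same result as the ANDNOT based implementation of vmvnq_s8.
_NEON2SSE_INLINE __m128i _neon2sse_mvnq_xor_sketch(__m128i a)
{
    __m128i ones = _mm_cmpeq_epi8(a, a); //all bits set
    return _mm_xor_si128(a, ones);
}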
   15143 
   15144 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0
   15145 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0
   15146 {
   15147     __m128i c1;
   15148     c1 = _mm_cmpeq_epi16 (a,a); //0xffff
   15149     return _mm_andnot_si128 (a, c1);
   15150 }
   15151 
   15152 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0
   15153 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0
   15154 {
   15155     __m128i c1;
   15156     c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff
   15157     return _mm_andnot_si128 (a, c1);
   15158 }
   15159 
   15160 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0
   15161 #define vmvnq_u8 vmvnq_s8
   15162 
   15163 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0
   15164 #define vmvnq_u16 vmvnq_s16
   15165 
   15166 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0
   15167 #define vmvnq_u32 vmvnq_s32
   15168 
   15169 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0
   15170 #define vmvnq_p8 vmvnq_u8
   15171 
   15172 //****************** Bitwise and ***********************
   15173 //******************************************************
   15174 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0
   15175 _NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b)
   15176 {
   15177     int8x8_t res64;
   15178     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
   15179 }
   15180 
   15181 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0
   15182 _NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b)
   15183 {
   15184     int16x4_t res64;
   15185     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
   15186 }
   15187 
   15188 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0
   15189 _NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b)
   15190 {
   15191     int32x2_t res64;
   15192     return64(_mm_and_si128(_pM128i(a),_pM128i(b)));
   15193 }
   15194 
   15195 
   15196 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a,  int64x1_t b); // VAND d0,d0,d0
   15197 _NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a,  int64x1_t b)
   15198 {
   15199     int64x1_t res;
   15200     res.m64_i64[0] = a.m64_i64[0] & b.m64_i64[0];
   15201     return res;
   15202 }
   15203 
   15204 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0
   15205 #define vand_u8 vand_s8
   15206 
   15207 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0
   15208 #define vand_u16 vand_s16
   15209 
   15210 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0
   15211 #define vand_u32 vand_s32
   15212 
   15213 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a,  uint64x1_t b); // VAND d0,d0,d0
   15214 #define vand_u64 vand_s64
   15215 
   15216 
   15217 _NEON2SSESTORAGE int8x16_t   vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0
   15218 #define vandq_s8 _mm_and_si128
   15219 
   15220 _NEON2SSESTORAGE int16x8_t   vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0
   15221 #define vandq_s16 _mm_and_si128
   15222 
   15223 _NEON2SSESTORAGE int32x4_t   vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0
   15224 #define vandq_s32 _mm_and_si128
   15225 
   15226 _NEON2SSESTORAGE int64x2_t   vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0
   15227 #define vandq_s64 _mm_and_si128
   15228 
   15229 _NEON2SSESTORAGE uint8x16_t   vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0
   15230 #define vandq_u8 _mm_and_si128
   15231 
   15232 _NEON2SSESTORAGE uint16x8_t   vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0
   15233 #define vandq_u16 _mm_and_si128
   15234 
   15235 _NEON2SSESTORAGE uint32x4_t   vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0
   15236 #define vandq_u32 _mm_and_si128
   15237 
   15238 _NEON2SSESTORAGE uint64x2_t   vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0
   15239 #define vandq_u64 _mm_and_si128
   15240 
   15241 //******************** Bitwise or *********************************
   15242 //******************************************************************
   15243 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0
   15244 _NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b)
   15245 {
   15246     int8x8_t res64;
   15247     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
   15248 }
   15249 
   15250 
   15251 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0
   15252 _NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b)
   15253 {
   15254     int16x4_t res64;
   15255     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
   15256 }
   15257 
   15258 
   15259 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0
   15260 _NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b)
   15261 {
   15262     int32x2_t res64;
   15263     return64(_mm_or_si128(_pM128i(a),_pM128i(b)));
   15264 }
   15265 
   15266 
   15267 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b); // VORR d0,d0,d0
   15268 _NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a,  int64x1_t b)
   15269 {
   15270     int64x1_t res;
   15271     res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0];
   15272     return res;
   15273 }
   15274 
   15275 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0
   15276 #define vorr_u8 vorr_s8
   15277 
   15278 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0
   15279 #define vorr_u16 vorr_s16
   15280 
   15281 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0
   15282 #define vorr_u32 vorr_s32
   15283 
   15284 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a,  uint64x1_t b); // VORR d0,d0,d0
   15285 #define vorr_u64 vorr_s64
   15286 
   15287 _NEON2SSESTORAGE int8x16_t   vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0
   15288 #define vorrq_s8 _mm_or_si128
   15289 
   15290 _NEON2SSESTORAGE int16x8_t   vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0
   15291 #define vorrq_s16 _mm_or_si128
   15292 
   15293 _NEON2SSESTORAGE int32x4_t   vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0
   15294 #define vorrq_s32 _mm_or_si128
   15295 
   15296 _NEON2SSESTORAGE int64x2_t   vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0
   15297 #define vorrq_s64 _mm_or_si128
   15298 
   15299 _NEON2SSESTORAGE uint8x16_t   vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0
   15300 #define vorrq_u8 _mm_or_si128
   15301 
   15302 _NEON2SSESTORAGE uint16x8_t   vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0
   15303 #define vorrq_u16 _mm_or_si128
   15304 
   15305 _NEON2SSESTORAGE uint32x4_t   vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0
   15306 #define vorrq_u32 _mm_or_si128
   15307 
   15308 _NEON2SSESTORAGE uint64x2_t   vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0
   15309 #define vorrq_u64 _mm_or_si128
   15310 
   15311 //************* Bitwise exclusive or (EOR or XOR) ******************
   15312 //*******************************************************************
   15313 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0
   15314 _NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b)
   15315 {
   15316     int8x8_t res64;
   15317     return64(_mm_xor_si128(_pM128i(a),_pM128i(b)));
   15318 }
   15319 
   15320 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0
   15321 #define veor_s16 veor_s8
   15322 
   15323 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0
   15324 #define veor_s32 veor_s8
   15325 
   15326 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a,  int64x1_t b); // VEOR d0,d0,d0
   15327 _NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a,  int64x1_t b)
   15328 {
   15329     int64x1_t res;
   15330     res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0];
   15331     return res;
   15332 }
   15333 
   15334 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0
   15335 #define veor_u8 veor_s8
   15336 
   15337 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0
   15338 #define veor_u16 veor_s16
   15339 
   15340 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0
   15341 #define veor_u32 veor_s32
   15342 
   15343 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a,  uint64x1_t b); // VEOR d0,d0,d0
   15344 #define veor_u64 veor_s64
   15345 
   15346 _NEON2SSESTORAGE int8x16_t   veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0
   15347 #define veorq_s8 _mm_xor_si128
   15348 
   15349 _NEON2SSESTORAGE int16x8_t   veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0
   15350 #define veorq_s16 _mm_xor_si128
   15351 
   15352 _NEON2SSESTORAGE int32x4_t   veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0
   15353 #define veorq_s32 _mm_xor_si128
   15354 
   15355 _NEON2SSESTORAGE int64x2_t   veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0
   15356 #define veorq_s64 _mm_xor_si128
   15357 
   15358 _NEON2SSESTORAGE uint8x16_t   veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0
   15359 #define veorq_u8 _mm_xor_si128
   15360 
   15361 _NEON2SSESTORAGE uint16x8_t   veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0
   15362 #define veorq_u16 _mm_xor_si128
   15363 
   15364 _NEON2SSESTORAGE uint32x4_t   veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0
   15365 #define veorq_u32 _mm_xor_si128
   15366 
   15367 _NEON2SSESTORAGE uint64x2_t   veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0
   15368 #define veorq_u64 _mm_xor_si128
   15369 
   15370 //********************** Bit Clear **********************************
   15371 //*******************************************************************
    15372 //Logical AND complement (AND negation, or AND NOT): a & (~b); see the sketch after the vbicq_* defines below
   15373 _NEON2SSESTORAGE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0
   15374 _NEON2SSE_INLINE int8x8_t   vbic_s8(int8x8_t a, int8x8_t b)
   15375 {
   15376     int8x8_t res64;
   15377     return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap"
   15378 }
   15379 
   15380 _NEON2SSESTORAGE int16x4_t   vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0
   15381 #define vbic_s16 vbic_s8
   15382 
   15383 _NEON2SSESTORAGE int32x2_t   vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0
   15384 #define vbic_s32 vbic_s8
   15385 
   15386 _NEON2SSESTORAGE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0
   15387 _NEON2SSE_INLINE int64x1_t   vbic_s64(int64x1_t a, int64x1_t b)
   15388 {
   15389     int64x1_t res;
   15390     res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]);
   15391     return res;
   15392 }
   15393 
   15394 _NEON2SSESTORAGE uint8x8_t   vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0
   15395 #define vbic_u8 vbic_s8
   15396 
   15397 _NEON2SSESTORAGE uint16x4_t   vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0
   15398 #define vbic_u16 vbic_s16
   15399 
   15400 _NEON2SSESTORAGE uint32x2_t   vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0
   15401 #define vbic_u32 vbic_s32
   15402 
   15403 _NEON2SSESTORAGE uint64x1_t   vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0
   15404 #define vbic_u64 vbic_s64
   15405 
   15406 _NEON2SSESTORAGE int8x16_t   vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0
   15407 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15408 
   15409 _NEON2SSESTORAGE int16x8_t   vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0
   15410 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15411 
   15412 _NEON2SSESTORAGE int32x4_t   vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0
   15413 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15414 
   15415 _NEON2SSESTORAGE int64x2_t   vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0
   15416 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15417 
   15418 _NEON2SSESTORAGE uint8x16_t   vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0
   15419 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15420 
   15421 _NEON2SSESTORAGE uint16x8_t   vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0
   15422 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15423 
   15424 _NEON2SSESTORAGE uint32x4_t   vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0
   15425 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
   15426 
   15427 _NEON2SSESTORAGE uint64x2_t   vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0
   15428 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap"
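
//A minimal sketch (illustration only, hypothetical helper name) of why the arguments are "swapped" above:
//_mm_andnot_si128(x, y) computes (~x) & y, while NEON VBIC computes a & (~b), so vbic maps to _mm_andnot_si128(b, a).
_NEON2SSE_INLINE __m128i _neon2sse_bicq_sketch(__m128i a, __m128i b)
{
    return _mm_andnot_si128(b, a); //(~b) & a == a & (~b)
}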
   15429 
   15430 //**************** Bitwise OR complement ********************************
    15431 //**************************************************************************
    15432 //no exact IA32 match, it needs to be implemented as follows
   15433 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b); // VORN d0,d0,d0
   15434 _NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a,  int8x8_t b)
   15435 {
   15436     int8x8_t res64;
   15437     return64(vornq_s8(_pM128i(a), _pM128i(b)));
   15438 }
   15439 
   15440 
   15441 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b); // VORN d0,d0,d0
   15442 _NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a,  int16x4_t b)
   15443 {
   15444     int16x4_t res64;
   15445     return64(vornq_s16(_pM128i(a), _pM128i(b)));
   15446 }
   15447 
   15448 
   15449 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b); // VORN d0,d0,d0
   15450 _NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a,  int32x2_t b)
   15451 {
   15452     int32x2_t res64;
   15453     return64(vornq_s32(_pM128i(a), _pM128i(b)));
   15454 }
   15455 
   15456 
   15457 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0
   15458 _NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b)
   15459 {
   15460     int64x1_t res;
   15461     res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]);
   15462     return res;
   15463 }
   15464 
   15465 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a,  uint8x8_t b); // VORN d0,d0,d0
   15466 #define vorn_u8 vorn_s8
   15467 
   15468 
   15469 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a,  uint16x4_t b); // VORN d0,d0,d0
   15470 #define vorn_u16 vorn_s16
   15471 
   15472 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a,  uint32x2_t b); // VORN d0,d0,d0
   15473 #define vorn_u32 vorn_s32
   15474 
   15475 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0
   15476 #define vorn_u64 vorn_s64
   15477 
   15478 
   15479 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0
   15480 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0
   15481 {
   15482     __m128i b1;
   15483     b1 = vmvnq_s8( b); //bitwise not for b
   15484     return _mm_or_si128 (a, b1);
   15485 }
   15486 
   15487 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0
   15488 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0
   15489 {
   15490     __m128i b1;
   15491     b1 = vmvnq_s16( b); //bitwise not for b
   15492     return _mm_or_si128 (a, b1);
   15493 }
   15494 
   15495 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0
   15496 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0
   15497 {
   15498     __m128i b1;
   15499     b1 = vmvnq_s32( b); //bitwise not for b
   15500     return _mm_or_si128 (a, b1);
   15501 }
   15502 
   15503 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0
   15504 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b)
   15505 {
   15506     __m128i c1, b1;
   15507     c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff
   15508     b1 = _mm_andnot_si128 (b, c1);
   15509     return _mm_or_si128 (a, b1);
   15510 }
   15511 
   15512 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0
   15513 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0
   15514 {
   15515     __m128i b1;
   15516     b1 = vmvnq_u8( b); //bitwise not for b
   15517     return _mm_or_si128 (a, b1);
   15518 }
   15519 
   15520 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0
   15521 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0
   15522 {
   15523     __m128i b1;
    15524     b1 = vmvnq_u16( b); //bitwise not for b
   15525     return _mm_or_si128 (a, b1);
   15526 }
   15527 
   15528 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0
   15529 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0
   15530 {
   15531     __m128i b1;
   15532     b1 = vmvnq_u32( b); //bitwise not for b
   15533     return _mm_or_si128 (a, b1);
   15534 }
   15535 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0
   15536 #define vornq_u64 vornq_s64
   15537 
   15538 //********************* Bitwise Select *****************************
   15539 //******************************************************************
    15540 //Note: on ARM this intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.
   15541 
   15542 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the
   15543 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0.
   15544 
   15545 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination
   15546 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged
   15547 
   15548 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination
   15549 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged.
   15550 
    15551 //Only VBSL is implemented for SIMD here; a minimal sketch of the select identity follows vbslq_s8 below
   15552 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0
   15553 _NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c)
   15554 {
   15555     int8x8_t res64;
   15556     __m128i res;
   15557     res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c));
   15558     return64(res);
   15559 }
   15560 
   15561 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0
   15562 #define vbsl_s16 vbsl_s8
   15563 
   15564 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0
   15565 #define vbsl_s32 vbsl_s8
   15566 
   15567 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0
   15568 _NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c)
   15569 {
   15570     int64x1_t res;
   15571     res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]);
   15572     return res;
   15573 }
   15574 
   15575 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a,  uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0
   15576 #define vbsl_u8 vbsl_s8
   15577 
   15578 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a,  uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0
   15579 #define vbsl_u16 vbsl_s8
   15580 
   15581 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a,  uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0
   15582 #define vbsl_u32 vbsl_s8
   15583 
   15584 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0
   15585 #define vbsl_u64 vbsl_s64
   15586 
   15587 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0
   15588 _NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c)
   15589 {
   15590     __m128 sel1, sel2;
   15591     __m64_128 res64;
   15592     sel1 = _mm_and_ps   (_pM128(a), _pM128(b));
   15593     sel2 = _mm_andnot_ps (_pM128(a), _pM128(c));
   15594     sel1 = _mm_or_ps (sel1, sel2);
   15595     _M64f(res64, sel1);
   15596     return res64;
   15597 }
   15598 
   15599 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0
   15600 #define  vbsl_p8 vbsl_s8
   15601 
   15602 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0
   15603 #define  vbsl_p16 vbsl_s8
   15604 
   15605 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0
   15606 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0
   15607 {
   15608     __m128i sel1, sel2;
   15609     sel1 = _mm_and_si128   (a, b);
   15610     sel2 = _mm_andnot_si128 (a, c);
   15611     return _mm_or_si128 (sel1, sel2);
   15612 }
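
//A minimal sketch (illustration only, hypothetical helper name) of the select identity used above,
//written for a single 32-bit lane: every result bit is taken from b where the mask bit is 1 and from c
//where it is 0. An equivalent form with one operation less is c ^ ((b ^ c) & mask).
_NEON2SSE_INLINE uint32_t _neon2sse_bsl_u32_sketch(uint32_t mask, uint32_t b, uint32_t c)
{
    return (mask & b) | (~mask & c);
}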
   15613 
   15614 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0
   15615 #define vbslq_s16 vbslq_s8
   15616 
   15617 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0
   15618 #define vbslq_s32 vbslq_s8
   15619 
   15620 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0
   15621 #define vbslq_s64 vbslq_s8
   15622 
   15623 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0
   15624 #define vbslq_u8 vbslq_s8
   15625 
   15626 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0
   15627 #define vbslq_u16 vbslq_s8
   15628 
   15629 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0
   15630 #define vbslq_u32 vbslq_s8
   15631 
   15632 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0
   15633 #define vbslq_u64 vbslq_s8
   15634 
   15635 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0
   15636 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0
   15637 {
   15638     __m128 sel1, sel2;
   15639     sel1 = _mm_and_ps   (*(__m128*)&a, b);
   15640     sel2 = _mm_andnot_ps (*(__m128*)&a, c);
   15641     return _mm_or_ps (sel1, sel2);
   15642 }
   15643 
   15644 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0
   15645 #define vbslq_p8 vbslq_u8
   15646 
   15647 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0
   15648 #define vbslq_p16 vbslq_s8
   15649 
   15650 //************************************************************************************
   15651 //**************** Transposition operations ****************************************
   15652 //************************************************************************************
   15653 //*****************  Vector Transpose ************************************************
   15654 //************************************************************************************
   15655 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices.
    15656 // making the result look like (a0, b0, a2, b2, a4, b4, ...), (a1, b1, a3, b3, a5, b5, ...); a usage sketch follows vtrnq_s32 below
   15657 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0
   15658 _NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0
   15659 {
   15660     int8x8x2_t val;
   15661     __m128i tmp, val0;
   15662     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
   15663     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7)
   15664     vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6),
   15665     return val;
   15666 }
   15667 
   15668 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0
   15669 _NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0
   15670 {
   15671     int16x4x2_t val;
   15672     __m128i tmp, val0;
   15673     _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15};
   15674     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
   15675     val0 =  _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3
   15676     vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2),
   15677     return val;
   15678 }
   15679 
   15680 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0
   15681 _NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b)
   15682 {
   15683     int32x2x2_t val;
   15684     __m128i val0;
   15685     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1
   15686     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0,
   15687     return val;
   15688 }
   15689 
   15690 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0
   15691 #define vtrn_u8 vtrn_s8
   15692 
   15693 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0
   15694 #define vtrn_u16 vtrn_s16
   15695 
   15696 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0
   15697 #define vtrn_u32 vtrn_s32
   15698 
   15699 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0
   15700 _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b)
   15701 {
   15702     float32x2x2_t val;
   15703     val.val[0].m64_f32[0] = a.m64_f32[0];
   15704     val.val[0].m64_f32[1] = b.m64_f32[0];
   15705     val.val[1].m64_f32[0] = a.m64_f32[1];
   15706     val.val[1].m64_f32[1] = b.m64_f32[1];
   15707     return val; //a0,b0,a1,b1
   15708 }
   15709 
   15710 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0
   15711 #define  vtrn_p8 vtrn_u8
   15712 
   15713 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0
   15714 #define  vtrn_p16 vtrn_s16
   15715 
    15716 _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0
   15717 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0
   15718 {
   15719     int8x16x2_t r8x16;
   15720     __m128i a_sh, b_sh;
   15721     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
   15722     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
   15723 
   15724     r8x16.val[0] =  _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14)
   15725     r8x16.val[1] =  _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15)
   15726     return r8x16;
   15727 }
   15728 
   15729 _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0
   15730 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0
   15731 {
   15732     int16x8x2_t v16x8;
   15733     __m128i a_sh, b_sh;
   15734     a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
   15735     b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
   15736     v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6
   15737     v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7
   15738     return v16x8;
   15739 }
   15740 
   15741 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0
   15742 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0
   15743 {
    15744     //may not be an optimal solution compared with the serial one
   15745     int32x4x2_t v32x4;
   15746     __m128i a_sh, b_sh;
   15747     a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3
   15748     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
   15749 
   15750     v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2
   15751     v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3,  b3
   15752     return v32x4;
   15753 }
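
//A usage sketch (illustration only, hypothetical values and helper name) for the 2 x 2 transposition above:
_NEON2SSE_INLINE int32x4x2_t _neon2sse_trnq_s32_usage_sketch(void)
{
    int32x4_t a = _mm_setr_epi32(0, 1, 2, 3);     //a0, a1, a2, a3
    int32x4_t b = _mm_setr_epi32(10, 11, 12, 13); //b0, b1, b2, b3
    return vtrnq_s32(a, b); //val[0] == (0, 10, 2, 12), val[1] == (1, 11, 3, 13)
}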
   15754 
   15755 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0
   15756 #define vtrnq_u8 vtrnq_s8
   15757 
   15758 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0
   15759 #define vtrnq_u16 vtrnq_s16
   15760 
   15761 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0
   15762 #define vtrnq_u32 vtrnq_s32
   15763 
   15764 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0
   15765 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0
   15766 {
    15767     //may not be an optimal solution compared with the serial one
   15768     float32x4x2_t f32x4;
   15769     __m128 a_sh, b_sh;
    15770     a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness
    15771     b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness
   15772 
   15773     f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2
   15774     f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3,  b3
   15775     return f32x4;
   15776 }
   15777 
   15778 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0
   15779 #define vtrnq_p8 vtrnq_s8
   15780 
   15781 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0
   15782 #define vtrnq_p16 vtrnq_s16
   15783 
   15784 //***************** Interleave elements ***************************
   15785 //*****************************************************************
   15786 //output has (a0,b0,a1,b1, a2,b2,.....)
   15787 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0
   15788 _NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0
   15789 {
   15790     int8x8x2_t val;
   15791     __m128i val0;
   15792     val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b));
   15793     vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15794     return val;
   15795 }
   15796 
   15797 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0
   15798 _NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0
   15799 {
   15800     int16x4x2_t val;
   15801     __m128i val0;
   15802     val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b));
   15803     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15804     return val;
   15805 }
   15806 
   15807 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0
   15808 #define vzip_s32 vtrn_s32
   15809 
   15810 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0
   15811 #define vzip_u8 vzip_s8
   15812 
   15813 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0
   15814 #define vzip_u16 vzip_s16
   15815 
   15816 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0
   15817 #define vzip_u32 vzip_s32
   15818 
   15819 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0
   15820 #define vzip_f32 vtrn_f32
   15821 
   15822 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0
   15823 #define vzip_p8 vzip_u8
   15824 
   15825 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0
   15826 #define vzip_p16 vzip_u16
   15827 
   15828 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0
   15829 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0
   15830 {
   15831     int8x16x2_t r8x16;
   15832     r8x16.val[0] =  _mm_unpacklo_epi8(a, b);
   15833     r8x16.val[1] =  _mm_unpackhi_epi8(a, b);
   15834     return r8x16;
   15835 }
   15836 
   15837 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0
   15838 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0
   15839 {
   15840     int16x8x2_t r16x8;
   15841     r16x8.val[0] =  _mm_unpacklo_epi16(a, b);
   15842     r16x8.val[1] =  _mm_unpackhi_epi16(a, b);
   15843     return r16x8;
   15844 }
   15845 
   15846 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0
   15847 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0
   15848 {
   15849     int32x4x2_t r32x4;
   15850     r32x4.val[0] =  _mm_unpacklo_epi32(a, b);
   15851     r32x4.val[1] =  _mm_unpackhi_epi32(a, b);
   15852     return r32x4;
   15853 }
   15854 
   15855 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0
   15856 #define vzipq_u8 vzipq_s8
   15857 
   15858 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0
   15859 #define vzipq_u16 vzipq_s16
   15860 
   15861 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0
   15862 #define vzipq_u32 vzipq_s32
   15863 
   15864 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0
   15865 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0
   15866 {
   15867     float32x4x2_t f32x4;
   15868     f32x4.val[0] =   _mm_unpacklo_ps ( a,  b);
   15869     f32x4.val[1] =   _mm_unpackhi_ps ( a,  b);
   15870     return f32x4;
   15871 }
   15872 
   15873 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0
   15874 #define vzipq_p8 vzipq_u8
   15875 
   15876 _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0
   15877 #define vzipq_p16 vzipq_u16
   15878 
   15879 //*********************** De-Interleave elements *************************
   15880 //*************************************************************************
    15881 //As the result of these functions the first val contains (a0, a2, a4, ..., b0, b2, b4, ...) and the second val contains (a1, a3, a5, ..., b1, b3, b5, ...)
   15882 //no such functions in IA32 SIMD, shuffle is required
   15883 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0
   15884 _NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0
   15885 {
   15886     int8x8x2_t val;
   15887     __m128i tmp, val0;
   15888     _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13,  2, 6, 10, 14, 3, 7, 11,15};
   15889     tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7
   15890     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6),  (a1, a3, a5, a7, b1,b3, b5, b7)
   15891     vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15892     return val;
   15893 }
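//Illustrative example: de-interleaving a = (a0..a7) and b = (b0..b7) with vuzp_s8 yields
//    val[0] = a0,a2,a4,a6,b0,b2,b4,b6    val[1] = a1,a3,a5,a7,b1,b3,b5,b7
//which is exactly what the unpack + pshufb sequence above builds in one 128-bit register
//before its two 64-bit halves are stored into val.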
   15894 
   15895 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0
   15896 _NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0
   15897 {
   15898     int16x4x2_t val;
   15899     __m128i tmp, val0;
   15900     _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1,  8,9,  2,3, 10,11,  4,5, 12,13, 6,7, 14,15};
   15901     tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3
   15902     val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3
   15903     vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15904     return val;
   15905 }
   15906 
   15907 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0
   15908 _NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0
   15909 {
   15910     int32x2x2_t val;
   15911     __m128i val0;
   15912     val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1
   15913     vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32);
   15914     return val;
   15915 }
   15916 
   15917 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0
   15918 #define vuzp_u8 vuzp_s8
   15919 
   15920 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0
   15921 #define vuzp_u16 vuzp_s16
   15922 
   15923 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0
   15924 #define vuzp_u32 vuzp_s32
   15925 
   15926 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0
   15927 #define vuzp_f32 vzip_f32
   15928 
   15929 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0
   15930 #define vuzp_p8 vuzp_u8
   15931 
   15932 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0
   15933 #define vuzp_p16 vuzp_u16
   15934 
   15935 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0
   15936 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0
   15937 {
   15938     int8x16x2_t v8x16;
   15939     __m128i a_sh, b_sh;
   15940     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15
   15941     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15
    15942     //we need a 64-bit unpack to combine the lower (upper) 64 bits of a_sh with the lower (upper) 64 bits of b_sh
   15943     v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14,  b0, b2, b4, b6, b8, b10, b12, b14,
   15944     v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15,  b1, b3, b5, b7, b9, b11, b13, b15
   15945     return v8x16;
   15946 }
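//For example, with a = (a0..a15) and b = (b0..b15):
//    int8x16x2_t u = vuzpq_s8(a, b);
//    //u.val[0] = a0,a2,...,a14,b0,b2,...,b14 (even lanes)    u.val[1] = a1,a3,...,a15,b1,b3,...,b15 (odd lanes)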
   15947 
   15948 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0
   15949 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0
   15950 {
   15951     int16x8x2_t v16x8;
   15952     __m128i a_sh, b_sh;
    15953     a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6,  a1, a3, a5, a7
   15954     b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6,  b1, b3, b5, b7
   15955     v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6
   15956     v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7
   15957     return v16x8;
   15958 }
   15959 
   15960 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0
   15961 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0
   15962 {
    15963     //this may not be an optimal solution compared with a serial implementation
   15964     int32x4x2_t v32x4;
   15965     __m128i a_sh, b_sh;
    15966     a_sh = _mm_shuffle_epi32 (a, 216); //216 = _MM_SHUFFLE(3,1,2,0): a0, a2, a1, a3
   15967     b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3
   15968 
   15969     v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2
   15970     v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3
   15971     return v32x4;
   15972 }
   15973 
   15974 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0
   15975 #define vuzpq_u8 vuzpq_s8
   15976 
   15977 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0
   15978 #define vuzpq_u16 vuzpq_s16
   15979 
   15980 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0
   15981 #define vuzpq_u32 vuzpq_s32
   15982 
   15983 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0
   15984 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0
   15985 {
   15986     float32x4x2_t v32x4;
    15987     v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2; the element order (endianness) should be verified, however
    15988     v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3; the element order (endianness) should be verified, however
   15989     return v32x4;
   15990 }
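//For 32-bit float lanes a single _mm_shuffle_ps per output register is enough, because shufps takes its
//two low result lanes from the first operand and its two high result lanes from the second.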
   15991 
   15992 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0
   15993 #define vuzpq_p8 vuzpq_u8
   15994 
   15995 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0
   15996 #define vuzpq_p16 vuzpq_u16
   15997 
   15998 //##############################################################################################
   15999 //*********************** Reinterpret cast intrinsics.******************************************
   16000 //##############################################################################################
    16001 // Not a part of the official NEON instruction set, but available in the gcc compiler *********************
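//Most of the reinterpret casts below need no code at all: where the source and destination types share the
//same underlying container, the macro body is left empty and an expression such as "vreinterpret_p8_u32 (t)"
//simply collapses to "(t)". Only the casts that cross container types (e.g. q-register float <-> integer)
//go through the _M128/_M128i cast helpers or a small inline function.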
   16002 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u32 (uint32x2_t t);
   16003 #define vreinterpret_p8_u32
   16004 
   16005 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u16 (uint16x4_t t);
   16006 #define vreinterpret_p8_u16
   16007 
   16008 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u8 (uint8x8_t t);
   16009 #define vreinterpret_p8_u8
   16010 
   16011 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s32 (int32x2_t t);
   16012 #define vreinterpret_p8_s32
   16013 
   16014 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s16 (int16x4_t t);
   16015 #define vreinterpret_p8_s16
   16016 
   16017 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s8 (int8x8_t t);
   16018 #define vreinterpret_p8_s8
   16019 
   16020 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u64 (uint64x1_t t);
   16021 #define vreinterpret_p8_u64
   16022 
   16023 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s64 (int64x1_t t);
   16024 #define vreinterpret_p8_s64
   16025 
   16026 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_f32 (float32x2_t t);
   16027 #define vreinterpret_p8_f32
   16028 
   16029 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_p16 (poly16x4_t t);
   16030 #define vreinterpret_p8_p16
   16031 
   16032 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t);
   16033 #define vreinterpretq_p8_u32
   16034 
   16035 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t);
   16036 #define vreinterpretq_p8_u16
   16037 
   16038 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t);
   16039 #define vreinterpretq_p8_u8
   16040 
   16041 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s32 (int32x4_t t);
   16042 #define vreinterpretq_p8_s32
   16043 
   16044 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s16 (int16x8_t t);
   16045 #define vreinterpretq_p8_s16
   16046 
   16047 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s8 (int8x16_t t);
   16048 #define vreinterpretq_p8_s8
   16049 
   16050 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t);
   16051 #define vreinterpretq_p8_u64
   16052 
   16053 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s64 (int64x2_t t);
   16054 #define vreinterpretq_p8_s64
   16055 
   16056 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_f32 (float32x4_t t);
   16057 #define vreinterpretq_p8_f32(t) _M128i(t)
   16058 
   16059 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t);
   16060 #define vreinterpretq_p8_p16
   16061 
   16062 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u32 (uint32x2_t t);
   16063 #define vreinterpret_p16_u32
   16064 
   16065 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u16 (uint16x4_t t);
   16066 #define vreinterpret_p16_u16
   16067 
   16068 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u8 (uint8x8_t t);
   16069 #define vreinterpret_p16_u8
   16070 
   16071 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s32 (int32x2_t t);
   16072 #define vreinterpret_p16_s32
   16073 
   16074 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s16 (int16x4_t t);
   16075 #define vreinterpret_p16_s16
   16076 
   16077 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s8 (int8x8_t t);
   16078 #define vreinterpret_p16_s8
   16079 
   16080 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u64 (uint64x1_t t);
   16081 #define vreinterpret_p16_u64
   16082 
   16083 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s64 (int64x1_t t);
   16084 #define vreinterpret_p16_s64
   16085 
   16086 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_f32 (float32x2_t t);
   16087 #define vreinterpret_p16_f32
   16088 
   16089 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_p8 (poly8x8_t t);
   16090 #define vreinterpret_p16_p8
   16091 
   16092 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t);
   16093 #define vreinterpretq_p16_u32
   16094 
   16095 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t);
   16096 #define vreinterpretq_p16_u16
   16097 
   16098 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s32 (int32x4_t t);
   16099 #define vreinterpretq_p16_s32
   16100 
   16101 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s16 (int16x8_t t);
   16102 #define vreinterpretq_p16_s16
   16103 
   16104 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s8 (int8x16_t t);
   16105 #define vreinterpretq_p16_s8
   16106 
   16107 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t);
   16108 #define vreinterpretq_p16_u64
   16109 
   16110 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s64 (int64x2_t t);
   16111 #define vreinterpretq_p16_s64
   16112 
   16113 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_f32 (float32x4_t t);
   16114 #define vreinterpretq_p16_f32(t) _M128i(t)
   16115 
   16116 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t);
   16117 #define vreinterpretq_p16_p8  vreinterpretq_s16_p8
   16118 
    16119 //****  Integer to float reinterpretation (bit pattern preserved, no value conversion)  ******
   16120 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t);
   16121 _NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t)
   16122 {
   16123     return (*(__m64_128*)&(t));
   16124 }
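//The pointer cast through __m64_128 simply relabels the same 64 bits as a float32x2_t;
//all of the remaining d-register to-float reinterprets below reuse this one implementation.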
   16125 
   16126 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u16 (uint16x4_t t);
   16127 #define vreinterpret_f32_u16 vreinterpret_f32_u32
   16128 
   16129 
   16130 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u8 (uint8x8_t t);
   16131 #define vreinterpret_f32_u8 vreinterpret_f32_u32
   16132 
   16133 
   16134 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s32 (int32x2_t t);
   16135 #define vreinterpret_f32_s32 vreinterpret_f32_u32
   16136 
   16137 
   16138 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s16 (int16x4_t t);
   16139 #define vreinterpret_f32_s16 vreinterpret_f32_u32
   16140 
   16141 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s8 (int8x8_t t);
   16142 #define vreinterpret_f32_s8 vreinterpret_f32_u32
   16143 
   16144 
   16145 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u64(uint64x1_t t);
   16146 #define vreinterpret_f32_u64 vreinterpret_f32_u32
   16147 
   16148 
   16149 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s64 (int64x1_t t);
   16150 #define vreinterpret_f32_s64 vreinterpret_f32_u32
   16151 
   16152 
   16153 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p16 (poly16x4_t t);
   16154 #define vreinterpret_f32_p16 vreinterpret_f32_u32
   16155 
   16156 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p8 (poly8x8_t t);
   16157 #define vreinterpret_f32_p8 vreinterpret_f32_u32
   16158 
   16159 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u32 (uint32x4_t t);
   16160 #define  vreinterpretq_f32_u32(t) _M128(t)
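//For q registers the integer and poly types are stored as __m128i while float32x4_t is __m128,
//so the _M128 cast helper is needed here (_M128i covers the opposite direction); the remaining
//to-float q-register reinterprets below all reuse vreinterpretq_f32_u32.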
   16161 
   16162 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u16 (uint16x8_t t);
   16163 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32
   16164 
   16165 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u8 (uint8x16_t t);
   16166 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32
   16167 
   16168 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s32 (int32x4_t t);
   16169 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32
   16170 
   16171 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s16 (int16x8_t t);
   16172 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32
   16173 
   16174 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s8 (int8x16_t t);
   16175 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32
   16176 
   16177 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u64 (uint64x2_t t);
   16178 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32
   16179 
   16180 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s64 (int64x2_t t);
   16181 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32
   16182 
   16183 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p16 (poly16x8_t t);
   16184 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32
   16185 
   16186 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p8 (poly8x16_t t);
   16187 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32
   16188 
    16189 //*** Integer type reinterpretation ******************
    16190 //no conversion is necessary for the following functions because the underlying storage type is the same
   16191 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u32 (uint32x2_t t);
   16192 #define vreinterpret_s64_u32
   16193 
   16194 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u16 (uint16x4_t t);
   16195 #define vreinterpret_s64_u16
   16196 
   16197 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u8 (uint8x8_t t);
   16198 #define vreinterpret_s64_u8
   16199 
   16200 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s32 (int32x2_t t);
   16201 #define  vreinterpret_s64_s32
   16202 
   16203 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s16 (int16x4_t t);
   16204 #define vreinterpret_s64_s16
   16205 
   16206 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s8 (int8x8_t t);
   16207 #define  vreinterpret_s64_s8
   16208 
   16209 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u64 (uint64x1_t t);
   16210 #define  vreinterpret_s64_u64
   16211 
   16212 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_f32 (float32x2_t t);
   16213 #define  vreinterpret_s64_f32
   16214 
   16215 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p16 (poly16x4_t t);
   16216 #define vreinterpret_s64_p16
   16217 
   16218 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p8 (poly8x8_t t);
   16219 #define vreinterpret_s64_p8
   16220 
   16221 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u32 (uint32x4_t t);
   16222 #define vreinterpretq_s64_u32
   16223 
    16224 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s16 (int16x8_t t);
   16225 #define vreinterpretq_s64_s16
   16226 
   16227 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u8 (uint8x16_t t);
   16228 #define vreinterpretq_s64_u8
   16229 
   16230 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s32 (int32x4_t t);
   16231 #define vreinterpretq_s64_s32
   16232 
    16233 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u16 (uint16x8_t t);
   16234 #define vreinterpretq_s64_u16
   16235 
   16236 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s8 (int8x16_t t);
   16237 #define vreinterpretq_s64_s8
   16238 
   16239 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u64 (uint64x2_t t);
   16240 #define vreinterpretq_s64_u64
   16241 
   16242 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_f32 (float32x4_t t);
   16243 #define vreinterpretq_s64_f32(t) _M128i(t)
   16244 
   16245 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p16 (poly16x8_t t);
   16246 #define vreinterpretq_s64_p16
   16247 
   16248 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p8 (poly8x16_t t);
   16249 #define vreinterpretq_s64_p8
   16250 
   16251 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u32 (uint32x2_t t);
   16252 #define vreinterpret_u64_u32
   16253 
   16254 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u16 (uint16x4_t t);
   16255 #define vreinterpret_u64_u16
   16256 
   16257 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u8 (uint8x8_t t);
   16258 #define vreinterpret_u64_u8
   16259 
   16260 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s32 (int32x2_t t);
   16261 #define vreinterpret_u64_s32
   16262 
   16263 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s16 (int16x4_t t);
   16264 #define vreinterpret_u64_s16
   16265 
   16266 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s8 (int8x8_t t);
   16267 #define vreinterpret_u64_s8
   16268 
   16269 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s64 (int64x1_t t);
   16270 #define vreinterpret_u64_s64
   16271 
   16272 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_f32 (float32x2_t t);
   16273 #define vreinterpret_u64_f32
   16274 
   16275 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p16 (poly16x4_t t);
   16276 #define vreinterpret_u64_p16
   16277 
   16278 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p8 (poly8x8_t t);
   16279 #define vreinterpret_u64_p8
   16280 
   16281 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t);
   16282 #define vreinterpretq_u64_u32
   16283 
   16284 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t);
   16285 #define vreinterpretq_u64_u16
   16286 
   16287 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t);
   16288 #define vreinterpretq_u64_u8
   16289 
   16290 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s32 (int32x4_t t);
   16291 #define vreinterpretq_u64_s32
   16292 
   16293 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s16 (int16x8_t t);
   16294 #define vreinterpretq_u64_s16
   16295 
   16296 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s8 (int8x16_t t);
   16297 #define vreinterpretq_u64_s8
   16298 
   16299 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s64 (int64x2_t t);
   16300 #define vreinterpretq_u64_s64
   16301 
   16302 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_f32 (float32x4_t t);
   16303 #define vreinterpretq_u64_f32(t) _M128i(t)
   16304 
   16305 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t);
   16306 #define vreinterpretq_u64_p16
   16307 
   16308 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t);
   16309 #define vreinterpretq_u64_p8
   16310 
   16311 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u32 (uint32x2_t t);
   16312 #define vreinterpret_s8_u32
   16313 
   16314 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u16 (uint16x4_t t);
   16315 #define vreinterpret_s8_u16
   16316 
   16317 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u8 (uint8x8_t t);
   16318 #define vreinterpret_s8_u8
   16319 
   16320 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s32 (int32x2_t t);
   16321 #define vreinterpret_s8_s32
   16322 
   16323 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s16 (int16x4_t t);
   16324 #define vreinterpret_s8_s16
   16325 
   16326 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u64 (uint64x1_t t);
   16327 #define vreinterpret_s8_u64
   16328 
   16329 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s64 (int64x1_t t);
   16330 #define vreinterpret_s8_s64
   16331 
   16332 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_f32 (float32x2_t t);
   16333 #define vreinterpret_s8_f32
   16334 
   16335 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p16 (poly16x4_t t);
   16336 #define vreinterpret_s8_p16
   16337 
   16338 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p8 (poly8x8_t t);
   16339 #define vreinterpret_s8_p8
   16340 
   16341 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u32 (uint32x4_t t);
   16342 #define vreinterpretq_s8_u32
   16343 
   16344 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u16 (uint16x8_t t);
   16345 #define vreinterpretq_s8_u16
   16346 
   16347 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u8 (uint8x16_t t);
   16348 #define vreinterpretq_s8_u8
   16349 
   16350 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s32 (int32x4_t t);
   16351 #define vreinterpretq_s8_s32
   16352 
   16353 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s16 (int16x8_t t);
   16354 #define vreinterpretq_s8_s16
   16355 
   16356 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u64 (uint64x2_t t);
   16357 #define vreinterpretq_s8_u64
   16358 
   16359 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s64 (int64x2_t t);
   16360 #define vreinterpretq_s8_s64
   16361 
   16362 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_f32 (float32x4_t t);
   16363 #define vreinterpretq_s8_f32(t) _M128i(t)
   16364 
   16365 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p16 (poly16x8_t t);
   16366 #define vreinterpretq_s8_p16
   16367 
   16368 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p8 (poly8x16_t t);
   16369 #define vreinterpretq_s8_p8
   16370 
   16371 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u32 (uint32x2_t t);
   16372 #define vreinterpret_s16_u32
   16373 
   16374 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u16 (uint16x4_t t);
   16375 #define vreinterpret_s16_u16
   16376 
   16377 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u8 (uint8x8_t t);
   16378 #define vreinterpret_s16_u8
   16379 
   16380 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s32 (int32x2_t t);
   16381 #define vreinterpret_s16_s32
   16382 
   16383 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s8 (int8x8_t t);
   16384 #define vreinterpret_s16_s8
   16385 
   16386 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u64 (uint64x1_t t);
   16387 #define vreinterpret_s16_u64
   16388 
   16389 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s64 (int64x1_t t);
   16390 #define vreinterpret_s16_s64
   16391 
   16392 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_f32 (float32x2_t t);
   16393 #define vreinterpret_s16_f32
   16394 
   16395 
   16396 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p16 (poly16x4_t t);
   16397 #define vreinterpret_s16_p16
   16398 
   16399 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p8 (poly8x8_t t);
   16400 #define vreinterpret_s16_p8
   16401 
   16402 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u32 (uint32x4_t t);
   16403 #define vreinterpretq_s16_u32
   16404 
   16405 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u16 (uint16x8_t t);
   16406 #define vreinterpretq_s16_u16
   16407 
   16408 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u8 (uint8x16_t t);
   16409 #define vreinterpretq_s16_u8
   16410 
   16411 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s32 (int32x4_t t);
   16412 #define vreinterpretq_s16_s32
   16413 
   16414 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s8 (int8x16_t t);
   16415 #define vreinterpretq_s16_s8
   16416 
   16417 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u64 (uint64x2_t t);
   16418 #define vreinterpretq_s16_u64
   16419 
   16420 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s64 (int64x2_t t);
   16421 #define vreinterpretq_s16_s64
   16422 
   16423 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_f32 (float32x4_t t);
   16424 #define vreinterpretq_s16_f32(t) _M128i(t)
   16425 
   16426 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p16 (poly16x8_t t);
   16427 #define vreinterpretq_s16_p16
   16428 
   16429 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p8 (poly8x16_t t);
   16430 #define vreinterpretq_s16_p8
   16431 
   16432 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u32 (uint32x2_t t);
   16433 #define vreinterpret_s32_u32
   16434 
   16435 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u16 (uint16x4_t t);
   16436 #define vreinterpret_s32_u16
   16437 
   16438 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u8 (uint8x8_t t);
   16439 #define vreinterpret_s32_u8
   16440 
   16441 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s16 (int16x4_t t);
   16442 #define vreinterpret_s32_s16
   16443 
   16444 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s8 (int8x8_t t);
   16445 #define vreinterpret_s32_s8
   16446 
   16447 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u64 (uint64x1_t t);
   16448 #define vreinterpret_s32_u64
   16449 
   16450 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s64 (int64x1_t t);
   16451 #define vreinterpret_s32_s64
   16452 
   16453 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_f32 (float32x2_t t);
   16454 #define vreinterpret_s32_f32
   16455 
   16456 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p16 (poly16x4_t t);
   16457 #define vreinterpret_s32_p16
   16458 
   16459 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p8 (poly8x8_t t);
   16460 #define vreinterpret_s32_p8
   16461 
   16462 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u32 (uint32x4_t t);
   16463 #define vreinterpretq_s32_u32
   16464 
   16465 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u16 (uint16x8_t t);
   16466 #define vreinterpretq_s32_u16
   16467 
   16468 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u8 (uint8x16_t t);
   16469 #define vreinterpretq_s32_u8
   16470 
   16471 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s16 (int16x8_t t);
   16472 #define vreinterpretq_s32_s16
   16473 
   16474 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s8 (int8x16_t t);
   16475 #define vreinterpretq_s32_s8
   16476 
   16477 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u64 (uint64x2_t t);
   16478 #define vreinterpretq_s32_u64
   16479 
   16480 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s64 (int64x2_t t);
   16481 #define vreinterpretq_s32_s64
   16482 
   16483 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_f32 (float32x4_t t);
   16484 #define vreinterpretq_s32_f32(t)  _M128i(t)
   16485 
   16486 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p16 (poly16x8_t t);
   16487 #define vreinterpretq_s32_p16
   16488 
   16489 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p8 (poly8x16_t t);
   16490 #define vreinterpretq_s32_p8
   16491 
   16492 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u32 (uint32x2_t t);
   16493 #define vreinterpret_u8_u32
   16494 
   16495 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u16 (uint16x4_t t);
   16496 #define vreinterpret_u8_u16
   16497 
   16498 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s32 (int32x2_t t);
   16499 #define vreinterpret_u8_s32
   16500 
   16501 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s16 (int16x4_t t);
   16502 #define vreinterpret_u8_s16
   16503 
   16504 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s8 (int8x8_t t);
   16505 #define vreinterpret_u8_s8
   16506 
   16507 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u64 (uint64x1_t t);
   16508 #define vreinterpret_u8_u64
   16509 
   16510 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s64 (int64x1_t t);
   16511 #define vreinterpret_u8_s64
   16512 
   16513 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_f32 (float32x2_t t);
   16514 #define vreinterpret_u8_f32
   16515 
   16516 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p16 (poly16x4_t t);
   16517 #define vreinterpret_u8_p16
   16518 
   16519 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p8 (poly8x8_t t);
   16520 #define vreinterpret_u8_p8
   16521 
   16522 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t);
   16523 #define vreinterpretq_u8_u32
   16524 
   16525 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t);
   16526 #define vreinterpretq_u8_u16
   16527 
   16528 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s32 (int32x4_t t);
   16529 #define vreinterpretq_u8_s32
   16530 
   16531 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s16 (int16x8_t t);
   16532 #define vreinterpretq_u8_s16
   16533 
   16534 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s8 (int8x16_t t);
   16535 #define vreinterpretq_u8_s8
   16536 
   16537 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t);
   16538 #define vreinterpretq_u8_u64
   16539 
   16540 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s64 (int64x2_t t);
   16541 #define vreinterpretq_u8_s64
   16542 
   16543 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_f32 (float32x4_t t);
   16544 #define vreinterpretq_u8_f32(t) _M128i(t)
   16545 
   16546 
   16547 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t);
   16548 #define vreinterpretq_u8_p16
   16549 
   16550 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t);
   16551 #define vreinterpretq_u8_p8
   16552 
   16553 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u32 (uint32x2_t t);
   16554 #define vreinterpret_u16_u32
   16555 
   16556 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u8 (uint8x8_t t);
   16557 #define vreinterpret_u16_u8
   16558 
   16559 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s32 (int32x2_t t);
   16560 #define vreinterpret_u16_s32
   16561 
   16562 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s16 (int16x4_t t);
   16563 #define vreinterpret_u16_s16
   16564 
   16565 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s8 (int8x8_t t);
   16566 #define vreinterpret_u16_s8
   16567 
   16568 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u64 (uint64x1_t t);
   16569 #define vreinterpret_u16_u64
   16570 
   16571 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s64 (int64x1_t t);
   16572 #define vreinterpret_u16_s64
   16573 
   16574 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_f32 (float32x2_t t);
   16575 #define vreinterpret_u16_f32
   16576 
   16577 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p16 (poly16x4_t t);
   16578 #define vreinterpret_u16_p16
   16579 
   16580 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p8 (poly8x8_t t);
   16581 #define vreinterpret_u16_p8
   16582 
   16583 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t);
   16584 #define vreinterpretq_u16_u32
   16585 
   16586 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t);
   16587 #define vreinterpretq_u16_u8
   16588 
   16589 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s32 (int32x4_t t);
   16590 #define vreinterpretq_u16_s32
   16591 
   16592 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s16 (int16x8_t t);
   16593 #define vreinterpretq_u16_s16
   16594 
   16595 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s8 (int8x16_t t);
   16596 #define vreinterpretq_u16_s8
   16597 
   16598 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t);
   16599 #define vreinterpretq_u16_u64
   16600 
   16601 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s64 (int64x2_t t);
   16602 #define vreinterpretq_u16_s64
   16603 
   16604 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_f32 (float32x4_t t);
   16605 #define vreinterpretq_u16_f32(t) _M128i(t)
   16606 
   16607 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t);
   16608 #define vreinterpretq_u16_p16
   16609 
   16610 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t);
   16611 #define vreinterpretq_u16_p8
   16612 
   16613 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u16 (uint16x4_t t);
   16614 #define vreinterpret_u32_u16
   16615 
   16616 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u8 (uint8x8_t t);
   16617 #define vreinterpret_u32_u8
   16618 
   16619 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s32 (int32x2_t t);
   16620 #define vreinterpret_u32_s32
   16621 
   16622 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s16 (int16x4_t t);
   16623 #define vreinterpret_u32_s16
   16624 
   16625 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s8 (int8x8_t t);
   16626 #define vreinterpret_u32_s8
   16627 
   16628 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u64 (uint64x1_t t);
   16629 #define vreinterpret_u32_u64
   16630 
   16631 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s64 (int64x1_t t);
   16632 #define vreinterpret_u32_s64
   16633 
   16634 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_f32 (float32x2_t t);
   16635 #define vreinterpret_u32_f32
   16636 
   16637 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p16 (poly16x4_t t);
   16638 #define vreinterpret_u32_p16
   16639 
   16640 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p8 (poly8x8_t t);
   16641 #define vreinterpret_u32_p8
   16642 
   16643 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t);
   16644 #define vreinterpretq_u32_u16
   16645 
   16646 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t);
   16647 #define vreinterpretq_u32_u8
   16648 
   16649 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s32 (int32x4_t t);
   16650 #define vreinterpretq_u32_s32
   16651 
   16652 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s16 (int16x8_t t);
   16653 #define vreinterpretq_u32_s16
   16654 
   16655 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s8 (int8x16_t t);
   16656 #define vreinterpretq_u32_s8
   16657 
   16658 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t);
   16659 #define vreinterpretq_u32_u64
   16660 
   16661 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s64 (int64x2_t t);
   16662 #define vreinterpretq_u32_s64
   16663 
   16664 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_f32 (float32x4_t t);
   16665 #define  vreinterpretq_u32_f32(t) _M128i(t)
   16666 
   16667 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t);
   16668 #define vreinterpretq_u32_p16
   16669 
   16670 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t);
   16671 #define vreinterpretq_u32_p8
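//Typical usage in ported code (illustrative): accessing the raw IEEE-754 bits of a float vector, e.g.
//    uint32x4_t bits = vreinterpretq_u32_f32(v); //the same 128 bits, now viewed as four 32-bit lanes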
   16672 
   16673 //*************  Round ******************
   16674 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a);
   16675 #ifdef USE_SSE4
   16676 #   define vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
   16677 #else
   16678 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   16679 {
   16680     int i;
   16681     _NEON2SSE_ALIGN_16 float32_t res[4];
   16682     _mm_store_ps(res, a);
    16683     for(i = 0; i<4; i++) {
    16684         res[i] = nearbyintf(res[i]);
    16685     }
   16686     return _mm_load_ps(res);
   16687 }
   16688 #endif
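//Note: both branches implement round to nearest with ties to even, matching NEON VRINTN:
//_mm_round_ps with _MM_FROUND_TO_NEAREST_INT is the SSE4.1 rounding instruction, while nearbyintf
//follows the current FP rounding mode, which defaults to round-to-nearest-even. For example, under
//the default mode vrndnq_f32 applied to (0.5, 1.5, 2.5, -0.5) yields (0.0, 2.0, 2.0, -0.0).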
   16689 
   16690 
   16691 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a);
   16692 #ifdef USE_SSE4
   16693 #   define  vrndnq_f64(a)  _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)
   16694 #else
   16695 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL)
   16696 {
   16697      _NEON2SSE_ALIGN_16 float64_t res[2];
   16698      _mm_store_pd(res, a);
    16699      res[0] = nearbyint(res[0]); //use the double precision nearbyint (not nearbyintf) to avoid losing precision
    16700      res[1] = nearbyint(res[1]);
   16701      return _mm_load_pd(res);
   16702 }
   16703 #endif
   16704 
   16705 
   16706 
   16707 //************* Sqrt ******************
   16708 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a);
   16709 #define vsqrtq_f32 _mm_sqrt_ps
   16710 
   16711 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a);
   16712 #define vsqrtq_f64 _mm_sqrt_pd
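//These map one to one: vsqrtq_f32/vsqrtq_f64 compute the element-wise square root exactly as
//sqrtps/sqrtpd do, e.g. vsqrtq_f32(vdupq_n_f32(4.0f)) returns 2.0f in every lane.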
   16713 
   16714 
   16715 #endif /* NEON2SSE_H */
   16716