//created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina (at) intel.com

//*** Copyright (C) 2012-2018 Intel Corporation.  All rights reserved.

//IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.

//By downloading, copying, installing or using the software you agree to this license.
//If you do not agree to this license, do not download, install, copy or use the software.

//                             License Agreement
//Redistribution and use in source and binary forms, with or without modification,
//are permitted provided that the following conditions are met:

//  * Redistributions of source code must retain the above copyright notice,
//    this list of conditions and the following disclaimer.

//  * The name of the copyright holders may not be used to endorse or promote products
//    derived from this software without specific prior written permission.

//This software is provided by the copyright holders and contributors "as is" and
//any express or implied warranties, including, but not limited to, the implied
//warranties of merchantability and fitness for a particular purpose are disclaimed.
//In no event shall the Intel Corporation or contributors be liable for any direct,
//indirect, incidental, special, exemplary, or consequential damages
//(including, but not limited to, procurement of substitute goods or services;
//loss of use, data, or profits; or business interruption) however caused
//and on any theory of liability, whether in contract, strict liability,
//or tort (including negligence or otherwise) arising in any way out of
//the use of this software, even if advised of the possibility of such damage.

//*****************************************************************************************
// This file is intended to simplify ARM->IA32 porting.
// It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h")
// and x86 SSE (up to SSE4.2) intrinsic functions as defined in the header files included below.
// The MMX instruction set is not used because it is unavailable on x64 systems, it carries a
// performance overhead, and it requires the EMMS instruction (_mm_empty()) for MMX-x87 floating point switching.
//*****************************************************************************************

//!!!!!!!!!!!!!! To use this file just include it in your project that uses ARM NEON intrinsics instead of "arm_neon.h" and compile it as usual,
//!!!!!!!!!!!!!! but please pay attention to the #define USE_SSE4 below - you might need to define it manually for the newest Intel Atom or any Intel Core platforms for greater performance.

#ifndef NEON2SSE_H
#define NEON2SSE_H

/*********************************************************************************************************************/
//!!!!!!!!!!!!!!
//if USE_SSE4 is defined, some functions use SSE4 instructions instead of earlier SSE versions; when it is undefined, SIMD instructions up to SSSE3 are used.
//For older devices without SSE4 support it should be undefined; for newer devices it should be defined, possibly manually if your compiler doesn't set the __SSE4_2__ predefine.
#ifndef USE_SSE4
#    if defined(__SSE4_2__)
#        define USE_SSE4
#    endif
#endif
/*********************************************************************************************************************/
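//Illustrative usage sketch (not part of the original header): code written against arm_neon.h can typically
//be rebuilt for x86 just by swapping the include, e.g.:
//
//    //#include <arm_neon.h>        //original ARM build
//    #define USE_SSE4               //optional: only for SSE4-capable targets, if the compiler does not predefine __SSE4_2__
//    #include "NEON_2_SSE.h"        //this header (the file name is an assumption - use the actual name of this file in your project)
//
//    float32x4_t add4(float32x4_t a, float32x4_t b)
//    {
//        return vaddq_f32(a, b);    //declared below; maps to an SSE packed add on x86
//    }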

#include <xmmintrin.h>     //SSE
#include <emmintrin.h>     //SSE2
#include <pmmintrin.h>     //SSE3
#include <tmmintrin.h>     //SSSE3
#ifdef USE_SSE4
#    include <smmintrin.h> //SSE4.1
#    include <nmmintrin.h> //SSE4.2
#endif

#include <math.h>

//*************** functions and data attributes, compiler dependent *********************************
//***********************************************************************************
#ifdef __GNUC__
#    define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#    define _NEON2SSESTORAGE static
#    define _NEON2SSE_ALIGN_16 __attribute__((aligned(16)))
#    define _NEON2SSE_INLINE _NEON2SSESTORAGE inline __attribute__((__gnu_inline__, __always_inline__, __artificial__))
#    ifndef NEON2SSE_DISABLE_PERFORMANCE_WARNING
#        if _GCC_VERSION < 40500
#            define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function
#        else
#            define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function
#        endif
#    else
#        define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
#    endif
#    if defined(__x86_64__)
#        define _NEON2SSE_64BIT __x86_64__
#    endif
#else
#    define _NEON2SSESTORAGE static
#    define _NEON2SSE_ALIGN_16 __declspec(align(16))
#    define _NEON2SSE_INLINE _NEON2SSESTORAGE __inline
#    if (defined(_MSC_VER) || defined (__INTEL_COMPILER)) && !defined(NEON2SSE_DISABLE_PERFORMANCE_WARNING)
#        define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function
#        if defined(_M_X64)
#            define _NEON2SSE_64BIT _M_X64
#        endif
#    else
#        define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function
#    endif
#endif

#if defined (_NEON2SSE_64BIT) && defined (USE_SSE4)
#    define _NEON2SSE_64BIT_SSE4
#endif

/*********************************************************************************************************************/
// data types conversion
/*********************************************************************************************************************/
#if defined(_MSC_VER) && (_MSC_VER < 1300)
typedef signed char int8_t;
typedef unsigned char uint8_t;
typedef signed short int16_t;
typedef unsigned short uint16_t;
typedef signed int int32_t;
typedef unsigned int uint32_t;
typedef signed long long int64_t;
typedef unsigned long long uint64_t;
#elif defined(_MSC_VER)
typedef signed __int8 int8_t;
typedef unsigned __int8 uint8_t;
typedef signed __int16 int16_t;
typedef unsigned __int16 uint16_t;
typedef signed __int32 int32_t;
typedef unsigned __int32 uint32_t;

typedef signed long long int64_t;
typedef unsigned long long uint64_t;
#else
#    include <stdint.h>
#    include <limits.h>
#endif

typedef union __m64_128 {
    uint64_t m64_u64[1];
    float m64_f32[2];
    int8_t m64_i8[8];
    int16_t m64_i16[4];
    int32_t m64_i32[2];
    int64_t m64_i64[1];
    uint8_t m64_u8[8];
    uint16_t m64_u16[4];
    uint32_t m64_u32[2];
} __m64_128;

typedef __m64_128 int8x8_t;
typedef __m64_128 uint8x8_t;
typedef __m64_128 int16x4_t;
typedef __m64_128 uint16x4_t;
typedef __m64_128 int32x2_t;
typedef __m64_128 uint32x2_t;
typedef __m64_128 int64x1_t;
typedef __m64_128 uint64x1_t;
typedef __m64_128 poly8x8_t;
typedef __m64_128 poly16x4_t;

typedef __m64_128 float32x2_t;
typedef __m128 float32x4_t;

typedef __m128 float16x4_t; //not supported by IA, for compatibility
typedef __m128 float16x8_t; //not supported by IA, for compatibility

typedef __m64_128 float64x1_t;
typedef __m128d float64x2_t;

typedef __m128i int8x16_t;
typedef __m128i int16x8_t;
typedef __m128i int32x4_t;
typedef __m128i int64x2_t;
typedef __m128i uint8x16_t;
typedef __m128i uint16x8_t;
typedef __m128i uint32x4_t;
typedef __m128i uint64x2_t;
typedef __m128i poly8x16_t;
typedef __m128i poly16x8_t;

#if defined(_MSC_VER)
#    define SINT_MIN (-2147483647 - 1) /* min signed int value */
#    define SINT_MAX 2147483647        /* max signed int value */
#else
#    define SINT_MIN INT_MIN /* min signed int value */
#    define SINT_MAX INT_MAX /* max signed int value */
#endif

typedef float float32_t;
#if !defined(__clang__)
typedef float __fp16;
#endif

typedef double float64_t;


typedef uint8_t poly8_t;
typedef uint16_t poly16_t;
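//Illustrative note (not part of the original header): 64-bit NEON "d-register" vectors are emulated by the
//__m64_128 union above, so individual lanes can be inspected through the matching member array.
//A minimal sketch, assuming the vadd_s16 declared further below:
//
//    int16x4_t a, b, r;
//    a.m64_i16[0] = 1;  a.m64_i16[1] = 2;  a.m64_i16[2] = 3;  a.m64_i16[3] = 4;
//    b.m64_i16[0] = 10; b.m64_i16[1] = 20; b.m64_i16[2] = 30; b.m64_i16[3] = 40;
//    r = vadd_s16(a, b);             //r.m64_i16 now holds {11, 22, 33, 44}
//
//Portable NEON code should still prefer the vget_lane/vset_lane style intrinsics over the union members.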

//MSVC compilers (tested up to the VS 2012 version) don't allow using structures or arrays of __m128x types as function arguments, resulting in
//error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need a special trick for functions that use these types.
struct int8x16x2_t {
    int8x16_t val[2];
};
struct int16x8x2_t {
    int16x8_t val[2];
};
struct int32x4x2_t {
    int32x4_t val[2];
};
struct int64x2x2_t {
    int64x2_t val[2];
};
//Unfortunately we are unable to merge two 64-bit vectors into one 128-bit register because the user should be able to access the val[n] members explicitly!!!
struct int8x8x2_t {
    int8x8_t val[2];
};
struct int16x4x2_t {
    int16x4_t val[2];
};
struct int32x2x2_t {
    int32x2_t val[2];
};
struct int64x1x2_t {
    int64x1_t val[2];
};

typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy
typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy
typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy
typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy

typedef struct int8x8x2_t int8x8x2_t; //for C compilers to make them happy
typedef struct int16x4x2_t int16x4x2_t; //for C compilers to make them happy
typedef struct int32x2x2_t int32x2x2_t; //for C compilers to make them happy
typedef struct int64x1x2_t int64x1x2_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above */
typedef struct int8x16x2_t uint8x16x2_t;
typedef struct int16x8x2_t uint16x8x2_t;
typedef struct int32x4x2_t uint32x4x2_t;
typedef struct int64x2x2_t uint64x2x2_t;
typedef struct int8x16x2_t poly8x16x2_t;
typedef struct int16x8x2_t poly16x8x2_t;

typedef struct int8x8x2_t uint8x8x2_t;
typedef struct int16x4x2_t uint16x4x2_t;
typedef struct int32x2x2_t uint32x2x2_t;
typedef struct int64x1x2_t uint64x1x2_t;
typedef struct int8x8x2_t poly8x8x2_t;
typedef struct int16x4x2_t poly16x4x2_t;

//float
struct float32x4x2_t {
    float32x4_t val[2];
};
struct float16x8x2_t {
    float16x8_t val[2];
};
struct float32x2x2_t {
    float32x2_t val[2];
};

typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy
typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy
typedef struct float32x2x2_t float32x2x2_t; //for C compilers to make them happy
typedef float16x8x2_t float16x4x2_t;

//4
struct int8x16x4_t {
    int8x16_t val[4];
};
struct int16x8x4_t {
    int16x8_t val[4];
};
struct int32x4x4_t {
    int32x4_t val[4];
};
struct int64x2x4_t {
    int64x2_t val[4];
};

struct int8x8x4_t {
    int8x8_t val[4];
};
struct int16x4x4_t {
    int16x4_t val[4];
};
struct int32x2x4_t {
    int32x2_t val[4];
};
struct int64x1x4_t {
    int64x1_t val[4];
};

typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy
typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy
typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy
typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy

typedef struct int8x8x4_t int8x8x4_t; //for C compilers to make them happy
typedef struct int16x4x4_t int16x4x4_t; //for C compilers to make them happy
typedef struct int32x2x4_t int32x2x4_t; //for C compilers to make them happy
typedef struct int64x1x4_t int64x1x4_t; //for C compilers to make them happy

/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x8x4_t uint8x8x4_t;
typedef struct int16x4x4_t uint16x4x4_t;
typedef struct int32x2x4_t uint32x2x4_t;
typedef struct int64x1x4_t uint64x1x4_t;
typedef struct int8x8x4_t poly8x8x4_t;
typedef struct int16x4x4_t poly16x4x4_t;

typedef struct int8x16x4_t uint8x16x4_t;
typedef struct int16x8x4_t uint16x8x4_t;
typedef struct int32x4x4_t uint32x4x4_t;
typedef struct int64x2x4_t uint64x2x4_t;
typedef struct int8x16x4_t poly8x16x4_t;
typedef struct int16x8x4_t poly16x8x4_t;

struct float32x4x4_t {
    float32x4_t val[4];
};
struct float16x8x4_t {
    float16x8_t val[4];
};
struct float32x2x4_t {
    float32x2_t val[4];
};

typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy
typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy
typedef struct float32x2x4_t float32x2x4_t; //for C compilers to make them happy
typedef float16x8x4_t float16x4x4_t;

//3
struct int16x8x3_t {
    int16x8_t val[3];
};
struct int32x4x3_t {
    int32x4_t val[3];
};
struct int64x2x3_t {
    int64x2_t val[3];
};
struct int8x16x3_t {
    int8x16_t val[3];
};

struct int16x4x3_t {
    int16x4_t val[3];
};
struct int32x2x3_t {
    int32x2_t val[3];
};
struct int64x1x3_t {
    int64x1_t val[3];
};
struct int8x8x3_t {
    int8x8_t val[3];
};
typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy
typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy
typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy
typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy

typedef struct int8x8x3_t int8x8x3_t; //for C compilers to make them happy
typedef struct int16x4x3_t int16x4x3_t; //for C compilers to make them happy
typedef struct int32x2x3_t int32x2x3_t; //for C compilers to make them happy
typedef struct int64x1x3_t int64x1x3_t; //for C compilers to make them happy


/* to avoid pointer conversions the following unsigned integer structures are defined via the corresponding signed integer structures above: */
typedef struct int8x16x3_t uint8x16x3_t;
typedef struct int16x8x3_t uint16x8x3_t;
typedef struct int32x4x3_t uint32x4x3_t;
typedef struct int64x2x3_t uint64x2x3_t;
typedef struct int8x16x3_t poly8x16x3_t;
typedef struct int16x8x3_t poly16x8x3_t;
typedef struct int8x8x3_t uint8x8x3_t;
typedef struct int16x4x3_t uint16x4x3_t;
typedef struct int32x2x3_t uint32x2x3_t;
typedef struct int64x1x3_t uint64x1x3_t;
typedef struct int8x8x3_t poly8x8x3_t;
typedef struct int16x4x3_t poly16x4x3_t;

//float
struct float32x4x3_t {
    float32x4_t val[3];
};
struct float32x2x3_t {
    float32x2_t val[3];
};
struct float16x8x3_t {
    float16x8_t val[3];
};

typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy
typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy
typedef struct float32x2x3_t float32x2x3_t; //for C compilers to make them happy
typedef float16x8x3_t float16x4x3_t;


//****************************************************************************
//****** Porting auxiliary macros ********************************************

//** floating point related macros **
#define _M128i(a) _mm_castps_si128(a)
#define _M128(a) _mm_castsi128_ps(a)
//here the most performance-effective implementation depends on the compiler and on whether the build is 32- or 64-bit
#if defined (_NEON2SSE_64BIT) || (defined (__INTEL_COMPILER) && (__INTEL_COMPILER >= 1500))
#    define _pM128i(a) _mm_cvtsi64_si128(*(int64_t*)(&(a)))
#    define _M64(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (inp);
#    define _M64f(out, inp) out.m64_i64[0] = _mm_cvtsi128_si64 (_M128i(inp));
#else
//for 32-bit gcc and Microsoft compiler builds
#    define _pM128i(a) _mm_loadl_epi64((__m128i*)&(a))
#    define _M64(out, inp) _mm_storel_epi64 ((__m128i*)&(out), inp)
#    define _M64f(out, inp) _mm_storel_epi64 ((__m128i*)&(out), _M128i(inp))
#endif
#define _pM128(a) _mm_castsi128_ps(_pM128i(a))

#define return64(a) _M64(res64,a); return res64;
#define return64f(a) _M64f(res64,a); return res64;

#define _Ui64(a) (*(uint64_t*)&(a))
#define _UNSIGNED_T(a) u ## a

#define _SIGNBIT64 ((uint64_t)1 << 63)
#define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6))
#define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4))

#define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it"
#define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it"

//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define __constrange(min,max) const
#define __transfersize(size)
//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& mask constants used in porting &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
_NEON2SSE_ALIGN_16 static const int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 };
_NEON2SSE_ALIGN_16 static const int8_t mask8_32_even_odd[16] = { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 };
//&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&
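//Illustrative sketch (an assumption about how the macros above are typically combined, not a definitive implementation):
//the conversion macros let a 64-bit ("d-register") NEON operation be emulated by its 128-bit SSE counterpart,
//e.g. an addition in the spirit of the vadd_s16 declared below could be written as:
//
//    _NEON2SSE_INLINE int16x4_t my_vadd_s16(int16x4_t a, int16x4_t b)  //hypothetical helper, for illustration only
//    {
//        int16x4_t res64;                                  //return64 expects a local named res64
//        return64(_mm_add_epi16(_pM128i(a), _pM128i(b)));  //widen to __m128i, add, store the low 64 bits back
//    }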

//*************************************************************************
//*************************************************************************
//*********  Functions declarations as declared in original arm_neon.h *****
//*************************************************************************
//Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes.
_NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0
//Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width.
_NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0
//Vector wide add: vaddw -> Vr[i]:=Va[i]+Vb[i], Vb lanes are widened to the (double width) lanes of Va before the addition.
_NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0
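//Worked example (illustrative comment, not part of the original header): the long forms avoid overflow by
//widening the result lanes, e.g. for int8 inputs a[i]=100 and b[i]=100, vadd_s8 wraps to -56 (0xC8 as signed),
//while vaddl_s8 yields the full int16 value 200; vaddw_s8 adds the narrow operand to an already-widened accumulator the same way.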
//Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1
_NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0
//Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1
_NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0
//Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i])
_NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
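//Worked example (illustrative comment, not part of the original header): for uint8 lanes a[i]=250 and b[i]=253,
//vhadd_u8 gives (250+253)>>1 = 251 (truncating), vrhadd_u8 gives (250+253+1)>>1 = 252 (rounding),
//and vqadd_u8 saturates the plain sum 503 to the uint8 maximum 255 instead of wrapping.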
//Vector add high half: vaddhn -> Vr[i]:=Va[i]+Vb[i], keeping only the high (most significant) half of each wide sum, narrowed.
_NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0
//Vector rounding add high half: vraddhn
_NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0
//Multiplication
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i]
_NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0
_NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0
_NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0
//multiply lane
_NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c);
_NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c);
_NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c);
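//Worked example (illustrative comment, not part of the original header): for uint16 lanes a[i]=0x1234 and b[i]=0x0101,
//vaddhn_u16 computes the wide sum 0x1335 and keeps only its high byte, so the narrowed result lane is 0x13;
//vraddhn_u16 adds a rounding constant of half a narrowed lane (0x80 here) before taking the high byte.
//The *_lane forms multiply every lane of a by the single lane c of b, e.g. vmul_lane_s16(a, b, 2) uses b[2] for all products.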
//Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0
//Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i]
_NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0
//Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i]
_NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0
//Vector multiply subtract long
_NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0
//Vector saturating doubling multiply high
_NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0
//Vector saturating rounding doubling multiply high
_NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0
_NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0
//Vector saturating doubling multiply accumulate long
_NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0
//Vector saturating doubling multiply subtract long
_NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0
//Vector long multiply
_NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0
_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
//Vector saturating doubling long multiply
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
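//Worked example (illustrative comment, not part of the original header): vqdmulh_s16 returns the high half of the
//doubled product, i.e. (2*a[i]*b[i])>>16 with saturation; for a[i]=0x4000 (16384) and b[i]=0x2000 (8192) the doubled
//product is 0x10000000, so the result lane is 0x1000. The only saturating case is a[i]=b[i]=0x8000 (-32768), whose
//doubled product would overflow to +2^31 and is clamped to 0x7FFF. vqrdmulh additionally adds a rounding constant of 1<<15 before the shift.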
//Subtraction
//Vector subtract
_NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0
//Vector long subtract: vsubl -> Vr[i]:=Va[i]-Vb[i], result lanes are twice the width of the inputs.
_NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0
//Vector wide subtract: vsubw -> Vr[i]:=Va[i]-Vb[i]
_NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0
_NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0
_NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0
_NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0
_NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.U16 q0,q0,d0
_NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0
//Vector saturating subtract
_NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0
_NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0
_NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSESTORAGE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0
//Vector halving subtract
_NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0
//Vector subtract high half
_NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0
//Vector rounding subtract high half
_NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0
_NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0
_NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0
_NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0
//Comparison
//Vector compare equal
_NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0
//Vector compare greater-than or equal
_NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
//Vector compare less-than or equal
_NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0
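//Illustrative note (not part of the original header): all vector comparisons return an unsigned mask vector whose lanes
//are all ones when the predicate holds and all zeros otherwise, e.g. vceq_u8 on equal lanes yields 0xFF per lane
//(0xFFFFFFFF for 32-bit lanes); such masks are typically consumed by bitwise-select style operations.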
//Vector compare greater-than
_NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare less-than
_NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.U16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0
//Vector compare absolute greater-than or equal
_NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute less-than or equal
_NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0
//Vector compare absolute greater-than
_NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
//Vector compare absolute less-than
_NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0
_NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0
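//Illustrative note (not part of the original header): the "absolute" comparisons compare magnitudes,
//e.g. vcage_f32 sets a lane mask when |a[i]| >= |b[i]|, so a[i]=-3.0f and b[i]=2.5f produce an all-ones lane
//even though a[i] < b[i]; vcagt/vcale/vcalt follow the same pattern for the other predicates.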
//Vector test bits
_NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0
_NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0
_NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0
_NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0
_NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0
_NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0
_NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0
//Absolute difference
//Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |
_NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0
//Absolute difference - long
_NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0
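//Worked example (illustrative comment, not part of the original header): vtst sets a lane mask when (a[i] & b[i]) != 0,
//and vabd computes the absolute difference |a[i]-b[i]|, e.g. lanes 10 and 14 give 4 for both vabd_u8 and vabd_s8.
//The accumulating forms that follow (vaba/vabal: Vr[i] = Va[i] + |Vb[i]-Vc[i]|) are the building block of sum-of-absolute-differences loops.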
//Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] |
_NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0
//Absolute difference and accumulate - long
_NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0
_NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0
_NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0
_NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0
_NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.U16 q0,d0,d0
_NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0
//Max/Min
//vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i]
_NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0

_NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0

//vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i]
_NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.U16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0
_NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0
_NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0
_NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0
_NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0
_NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0
_NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0
_NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0

_NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0

//Pairwise addition
//Pairwise add
_NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0
_NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0
_NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0
_NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0
//Long pairwise add
_NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0
_NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0
_NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0
_NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0
_NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.U16 d0,d0
_NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0
_NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0
_NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0
_NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0
_NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0
_NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0
_NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0
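//Worked example (illustrative comment, not part of the original header): the pairwise forms add adjacent lanes,
//e.g. vpadd_s16({1,2,3,4},{10,20,30,40}) = {3,7,30,70}, while vpaddl_s16({1,2,3,4}) = {3,7} with the results
//widened to int32 so the pair sums cannot overflow.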
VPADAL.U32 d0,d0 926 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 927 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 928 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 929 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 930 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0 931 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 932 //Folding maximum vpmax -> takes maximum of adjacent pairs 933 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 934 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 935 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 936 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 937 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.U16 d0,d0,d0 938 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 939 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 940 //Folding minimum vpmin -> takes minimum of adjacent pairs 941 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 942 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 943 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 944 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 945 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.U16 d0,d0,d0 946 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 947 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 948 //Reciprocal/Sqrt 949 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 950 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 951 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 952 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 953 //Shifts by signed variable 954 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) 955 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 956 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 957 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 958 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 959 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 960 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.U16 d0,d0,d0 961 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 962 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 963 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 964 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 965 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 966 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 
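//NOTE (added comment, not part of the original API): x86 SIMD prior to AVX2 has no per-lane variable
//shift instruction, so the vshl/vqshl/vrshl/vqrshl family in this group has to be emulated
//(e.g. serially, or via multiplication by a power of two). The sketch below only illustrates the
//intended per-lane semantics for the signed 32x2 case; the helper name is hypothetical.
/*
static void example_vshl_s32_semantics(int32_t r[2], const int32_t a[2], const int32_t b[2])
{
    int i;
    for (i = 0; i < 2; i++) {
        int8_t shift = (int8_t)b[i]; //only the least significant byte of each shift lane is used
        //for brevity assume -32 < shift < 32 here; the real intrinsic also defines the out-of-range cases
        r[i] = (shift >= 0) ? (a[i] << shift) : (a[i] >> (-shift)); //negative count is an arithmetic shift right
    }
}
*/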
967 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 968 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0 969 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 970 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 971 //Vector saturating shift left: (negative values shift right) 972 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 973 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 974 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 975 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 976 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 977 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.U16 d0,d0,d0 978 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 979 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 980 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 981 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 982 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 983 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 984 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 985 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0 986 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 987 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 988 //Vector rounding shift left: (negative values shift right) 989 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 990 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 991 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 992 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 993 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 994 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.U16 d0,d0,d0 995 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 996 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 997 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 998 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 999 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 1000 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 1001 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 1002 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0 1003 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 1004 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 1005 //Vector saturating rounding shift left: (negative values shift right) 1006 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 
d0,d0,d0 1007 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 1008 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 1009 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 1010 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 1011 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.U16 d0,d0,d0 1012 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 1013 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 1014 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 1015 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 1016 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 1017 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 1018 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 1019 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0 1020 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 1021 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 1022 //Shifts by a constant 1023 //Vector shift right by constant 1024 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 1025 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 1026 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 1027 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 1028 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 1029 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.U16 d0,d0,#16 1030 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 1031 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 1032 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 1033 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 1034 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 1035 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 1036 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 1037 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16 1038 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 1039 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 1040 //Vector shift left by constant 1041 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 1042 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 1043 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 1044 _NEON2SSESTORAGE int64x1_t 
vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 1045 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 1046 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 1047 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 1048 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 1049 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 1050 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 1051 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 1052 _NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 1053 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 1054 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 1055 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 1056 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 1057 //Vector rounding shift right by constant 1058 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 1059 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 1060 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 1061 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 1062 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 1063 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.U16 d0,d0,#16 1064 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 1065 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 1066 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 1067 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 1068 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 1069 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 1070 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 1071 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16 1072 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 1073 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 1074 //Vector shift right by constant and accumulate 1075 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 1076 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 1077 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 1078 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, 
int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 1079 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 1080 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.U16 d0,d0,#16 1081 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32 1082 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 1083 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 1084 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 1085 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 1086 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 1087 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 1088 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16 1089 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 1090 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 1091 //Vector rounding shift right by constant and accumulate 1092 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 1093 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 1094 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 1095 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 1096 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 1097 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.U16 d0,d0,#16 1098 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 1099 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 1100 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 1101 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 1102 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 1103 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 1104 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 1105 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16 1106 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 1107 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 1108 //Vector saturating shift left by constant 1109 
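//NOTE (added comment, illustrative only): one possible SSE2 emulation of the saturating left shift
//by a constant for the signed 16-bit case is to shift, shift back and compare to detect overflow,
//then substitute the saturated value in the overflowing lanes. The helper name below is hypothetical
//and b is expected to be a compile time constant, as in the NEON intrinsics.
/*
static __m128i example_qshl_n_s16(__m128i a, const int b) //0 <= b <= 15
{
    __m128i shifted  = _mm_slli_epi16(a, b);
    __m128i restored = _mm_srai_epi16(shifted, b);
    __m128i ok       = _mm_cmpeq_epi16(restored, a); //0xFFFF in the lanes that did not overflow
    __m128i sat      = _mm_xor_si128(_mm_srai_epi16(a, 15), _mm_set1_epi16(0x7FFF)); //INT16_MAX or INT16_MIN by the sign of a
    return _mm_or_si128(_mm_and_si128(ok, shifted), _mm_andnot_si128(ok, sat));
}
*/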
_NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 1110 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 1111 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 1112 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 1113 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 1114 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.U16 d0,d0,#0 1115 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 1116 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 1117 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 1118 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 1119 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 1120 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 1121 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 1122 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0 1123 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 1124 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 1125 //Vector signed->unsigned saturating shift left by constant 1126 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 1127 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 1128 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 1129 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 1130 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 1131 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 1132 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 1133 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 1134 //Vector narrowing shift right by constant 1135 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 1136 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 1137 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 1138 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 1139 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 1140 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 1141 //Vector signed->unsigned narrowing saturating shift right by constant 1142 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 1143 _NEON2SSESTORAGE uint16x4_t 
vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 1144 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 1145 //Vector signed->unsigned rounding narrowing saturating shift right by constant 1146 _NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8 1147 _NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16 1148 _NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32 1149 //Vector narrowing saturating shift right by constant 1150 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 1151 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 1152 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 1153 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.U16 d0,q0,#8 1154 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 1155 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 1156 //Vector rounding narrowing shift right by constant 1157 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 1158 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 1159 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 1160 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 1161 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 1162 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 1163 //Vector rounding narrowing saturating shift right by constant 1164 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 1165 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 1166 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 1167 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.U16 d0,q0,#8 1168 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 1169 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 1170 //Vector widening shift left by constant 1171 _NEON2SSESTORAGE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 1172 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 1173 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 1174 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 1175 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.U16 q0,d0,#0 1176 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 1177 //Shifts with insert 1178 //Vector shift right and insert 1179 _NEON2SSESTORAGE int8x8_t 
vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 1180 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 1181 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 1182 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 1183 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 1184 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 1185 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 1186 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 1187 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 1188 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 1189 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 1190 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 1191 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 1192 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 1193 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 1194 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 1195 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 1196 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 1197 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 1198 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 1199 //Vector shift left and insert 1200 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 1201 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 1202 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 1203 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 1204 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 1205 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 1206 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 1207 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 1208 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 1209 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 1210 _NEON2SSESTORAGE int8x16_t 
vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 1211 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 1212 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 1213 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 1214 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 1215 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 1216 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 1217 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 1218 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 1219 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 1220 //Loads of a single vector or lane. Perform loads and stores of a single vector of some type. 1221 //Load a single vector from memory 1222 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 1223 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 1224 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 1225 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 1226 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 1227 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 1228 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 1229 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 1230 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] 1231 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 1232 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 1233 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 1234 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] 1235 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] 1236 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] 1237 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 1238 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] 1239 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] 1240 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] 1241 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 1242 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] 1243 _NEON2SSESTORAGE float32x2_t 
vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0]
_NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0]
_NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0]

_NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0]

//Load a single lane from memory
_NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0]
_NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0]
_NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
_NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); //VLD1.32 {d0[0]}, [r0]
_NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0]
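//NOTE (added comment, illustrative only): the single lane loads in this group are a natural fit for
//the SSE insert intrinsics; a possible sketch for the 16-bit q-register case is shown below
//(8- and 32-bit lanes would need SSE4.1 _mm_insert_epi8/_mm_insert_epi32 or another workaround).
//The helper name is hypothetical and lane must be a compile time constant, as in the NEON intrinsic.
/*
static __m128i example_ld1q_lane_u16(uint16_t const * ptr, __m128i vec, const int lane) //0 <= lane <= 7
{
    return _mm_insert_epi16(vec, (int)(*ptr), lane); //replaces the chosen 16-bit lane, keeps the others
}
*/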
_NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] 1271 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); //VLD1.64 {d0}, [r0] 1272 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); //VLD1.8 {d0[0]}, [r0] 1273 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); //VLD1.16 {d0[0]}, [r0] 1274 //Load all lanes of vector with same value from memory 1275 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] 1276 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] 1277 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] 1278 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 1279 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] 1280 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] 1281 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] 1282 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 1283 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] 1284 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] 1285 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] 1286 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] 1287 _NEON2SSESTORAGE uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] 1288 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] 1289 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] 1290 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 1291 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] 1292 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] 1293 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] 1294 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 1295 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] 1296 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] 1297 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] 1298 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] 1299 //Store a single vector or lane. Stores all lanes or a single lane of a vector. 
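//NOTE (added comment, illustrative only): since the NEON prototypes give no alignment guarantee for
//ptr, the full 128-bit stores below map naturally onto the unaligned SSE store intrinsics, for
//example (hypothetical helper names):
/*
static void example_st1q_u8 (uint8_t * ptr, __m128i val) { _mm_storeu_si128((__m128i*)ptr, val); }
static void example_st1q_f32(float   * ptr, __m128  val) { _mm_storeu_ps(ptr, val); }
*/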
1300 //Store a single vector into memory 1301 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] 1302 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] 1303 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] 1304 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] 1305 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] 1306 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] 1307 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] 1308 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] 1309 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] 1310 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] 1311 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] 1312 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] 1313 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] 1314 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] 1315 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] 1316 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] 1317 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] 1318 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] 1319 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] 1320 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] 1321 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] 1322 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] 1323 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] 1324 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] 1325 //Store a lane of a vector into memory 1326 //Loads of an N-element structure 1327 //Load N-element structure from memory 1328 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 1329 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 1330 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 1331 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 1332 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 1333 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 1334 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const 
* ptr); // VLD2.16 {d0, d2}, [r0] 1335 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 1336 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 1337 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 1338 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 1339 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 1340 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 1341 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 1342 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 1343 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 1344 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 1345 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 1346 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] 1347 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 1348 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 1349 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 1350 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 1351 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 1352 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 1353 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 1354 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 1355 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 1356 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 1357 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 1358 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 1359 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 1360 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 1361 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 1362 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 1363 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 1364 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 1365 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 1366 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) 
int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 1367 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 1368 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 1369 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 1370 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 1371 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 1372 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 1373 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 1374 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 1375 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 1376 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 1377 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 1378 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 1379 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 1380 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 1381 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 1382 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 1383 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 1384 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 1385 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 1386 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 1387 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 1388 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 1389 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 1390 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 1391 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 1392 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 1393 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 1394 //Load all lanes of N-element structure with same value from memory 1395 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 1396 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 1397 _NEON2SSESTORAGE uint32x2x2_t 
vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 1398 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 1399 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 1400 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 1401 _NEON2SSESTORAGE int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 1402 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 1403 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 1404 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 1405 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 1406 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 1407 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 1408 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 1409 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 1410 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 1411 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 1412 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 1413 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 1414 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 1415 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 1416 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 1417 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 1418 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 1419 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 1420 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 1421 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 1422 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 1423 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 1424 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 1425 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 1426 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, 
d2, d3}, [r0] 1427 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 1428 _NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 1429 _NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 1430 _NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 1431 //Load a single lane of N-element structure from memory 1432 //the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned 1433 _NEON2SSESTORAGE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 1434 _NEON2SSESTORAGE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] 1435 _NEON2SSESTORAGE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 1436 _NEON2SSESTORAGE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] 1437 _NEON2SSESTORAGE float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 1438 _NEON2SSESTORAGE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] 1439 _NEON2SSESTORAGE poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 1440 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] 1441 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 1442 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] 1443 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] 1444 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0] 1445 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0] 1446 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 1447 _NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] 1448 _NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] 1449 _NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, 
__constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 1450 _NEON2SSESTORAGE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 1451 _NEON2SSESTORAGE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 1452 _NEON2SSESTORAGE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 1453 _NEON2SSESTORAGE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 1454 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 1455 _NEON2SSESTORAGE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 1456 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 1457 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 1458 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 1459 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 1460 _NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 1461 _NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 1462 _NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 1463 _NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 1464 _NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 1465 _NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 1466 _NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 1467 _NEON2SSESTORAGE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1468 _NEON2SSESTORAGE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1469 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, 
__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1470 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1471 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1472 _NEON2SSESTORAGE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1473 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1474 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1475 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1476 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1477 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1478 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1479 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1480 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1481 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1482 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1483 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 1484 //Store N-element structure to memory 1485 _NEON2SSESTORAGE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0] 1486 _NEON2SSESTORAGE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 1487 _NEON2SSESTORAGE void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0] 1488 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0] 1489 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 1490 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0] 1491 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 1492 _NEON2SSESTORAGE void 
vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0] 1493 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0] 1494 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 1495 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val); // VST2.8 {d0, d1}, [r0] 1496 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val); // VST2.16 {d0, d1}, [r0] 1497 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val); // VST2.32 {d0, d1}, [r0] 1498 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val); // VST1.64 {d0, d1}, [r0] 1499 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val); // VST2.8 {d0, d1}, [r0] 1500 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] 1501 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] 1502 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); // VST1.64 {d0, d1}, [r0] 1503 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0] 1504 _NEON2SSESTORAGE void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0] 1505 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0] 1506 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0] 1507 _NEON2SSESTORAGE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] 1508 _NEON2SSESTORAGE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 1509 _NEON2SSESTORAGE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] 1510 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] 1511 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 1512 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] 1513 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 1514 _NEON2SSESTORAGE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] 1515 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] 1516 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 1517 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0] 1518 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0] 1519 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] 1520 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0] 1521 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0] 1522 
_NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0] 1523 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] 1524 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0] 1525 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 1526 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] 1527 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0] 1528 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0] 1529 _NEON2SSESTORAGE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] 1530 _NEON2SSESTORAGE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1531 _NEON2SSESTORAGE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] 1532 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] 1533 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1534 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] 1535 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1536 _NEON2SSESTORAGE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] 1537 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] 1538 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1539 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0] 1540 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0] 1541 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0] 1542 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] 1543 _NEON2SSESTORAGE void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0] 1544 _NEON2SSESTORAGE void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val); // VST4.16 {d0, d1, d2, d3}, [r0] 1545 _NEON2SSESTORAGE void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0] 1546 _NEON2SSESTORAGE void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] 1547 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] 1548 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val); // VST4.32 {d0, d1, d2, d3}, [r0] 1549 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val); // VST4.8 {d0, d1, d2, d3}, [r0] 1550 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t 
val); // VST4.16 {d0, d1, d2, d3}, [r0] 1551 //Store a single lane of N-element structure to memory 1552 _NEON2SSESTORAGE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1553 _NEON2SSESTORAGE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] 1554 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1555 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] 1556 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1557 _NEON2SSESTORAGE void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0] 1558 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1559 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] 1560 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1561 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] 1562 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0] 1563 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1564 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] 1565 _NEON2SSESTORAGE void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1566 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] 1567 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] 1568 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1569 _NEON2SSESTORAGE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1570 _NEON2SSESTORAGE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] 1571 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1572 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] 1573 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], 
d4[0]}, [r0] 1574 _NEON2SSESTORAGE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0] 1575 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1576 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] 1577 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1578 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] 1579 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0] 1580 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1581 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] 1582 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1583 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] 1584 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] 1585 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1586 _NEON2SSESTORAGE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1587 _NEON2SSESTORAGE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] 1588 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1589 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] 1590 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1591 _NEON2SSESTORAGE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1592 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1593 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] 1594 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1595 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * 
ptr, uint32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] 1596 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0] 1597 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1598 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] 1599 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1600 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] 1601 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] 1602 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1603 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector. 1604 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] 1605 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] 1606 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] 1607 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] 1608 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] 1609 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] 1610 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] 1611 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.U16 r0, d0[0] 1612 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] 1613 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 1614 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] 1615 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 1616 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] 1617 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] 1618 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 1619 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 1620 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] 1621 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 1622 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 1623 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, 
__constrange(0,0) int lane); // VMOV r0,r0,d0 1624 _NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 1625 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 1626 //Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector. 1627 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 1628 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 1629 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 1630 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 1631 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 1632 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 1633 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 1634 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 1635 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 1636 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 1637 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 1638 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 1639 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 1640 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 1641 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 1642 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 1643 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 1644 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 1645 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 1646 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 1647 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 1648 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 1649 //Initialize a vector from a literal bit pattern. 
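//Illustrative usage sketch (comment only, not part of the header itself): vcreate_* reinterprets the 64-bit
//integer argument as vector lanes, with lane 0 taken from the least significant bits, e.g.
//    uint8x8_t ramp = vcreate_u8(0x0706050403020100ULL); //lanes 0..7 hold 0x00, 0x01, ..., 0x07
//    int16x4_t ones = vcreate_s16(0x0001000100010001ULL); //four 16-bit lanes, each equal to 1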
1650 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 1651 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 1652 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 1653 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 1654 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 1655 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 1656 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 1657 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 1658 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 1659 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 1660 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 1661 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 1662 //Set all lanes to same value 1663 //Load all lanes of vector to the same literal value 1664 _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 1665 _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 1666 _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 1667 _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 1668 _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 1669 _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 1670 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 1671 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 1672 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 1673 _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 1674 _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 1675 _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 1676 _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 1677 _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 1678 _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 1679 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 1680 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 1681 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 1682 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 1683 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 1684 _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 1685 _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 1686 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 1687 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 1688 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 1689 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 1690 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 1691 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 1692 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 1693 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 1694 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 1695 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 1696 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 1697 _NEON2SSESTORAGE uint32x4_t 
vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 1698 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 1699 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 1700 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 1701 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 1702 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 1703 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 1704 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0 1705 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 1706 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 1707 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 1708 //Load all lanes of the vector to the value of a lane of a vector 1709 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] 1710 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] 1711 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] 1712 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] 1713 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] 1714 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] 1715 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] 1716 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] 1717 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] 1718 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] 1719 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] 1720 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] 1721 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] 1722 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] 1723 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] 1724 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] 1725 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] 1726 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] 1727 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 1728 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 1729 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 1730 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 1731 //Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector. 
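//Illustrative usage sketch (comment only, 'lo' and 'hi' being arbitrary int16x4_t values): the first argument
//becomes the low half of the 128-bit result and the second argument becomes the high half, e.g.
//    int16x8_t q = vcombine_s16(lo, hi); //lanes 0..3 of q come from lo, lanes 4..7 come from hi
//The vget_low_* / vget_high_* intrinsics in the next group perform the inverse split.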
1732 _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 1733 _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 1734 _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 1735 _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 1736 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 1737 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 1738 _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 1739 _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 1740 _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 1741 _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 1742 _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 1743 _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 1744 //Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors 1745 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 1746 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 1747 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 1748 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 1749 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 1750 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 1751 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 1752 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 1753 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 1754 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 1755 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 1756 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 1757 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 1758 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 1759 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 1760 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 1761 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 1762 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 1763 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 1764 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 1765 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 1766 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 1767 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 1768 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 1769 //Converting vectors. These intrinsics are used to convert vectors. 
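//Illustrative usage sketch (comment only, 'f' being an arbitrary float32x4_t): the plain conversions truncate
//towards zero, while the _n_ variants treat the integer as fixed-point with 'b' fractional bits, e.g.
//    int32x4_t   i = vcvtq_s32_f32(f);       //per lane: (int32_t)f[k], rounded towards zero
//    float32x4_t s = vcvtq_n_f32_s32(i, 8);  //per lane: (float)i[k] / 256.0f, i.e. 8 fractional bits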
1770 //Convert from float 1771 _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); // VCVT.S32.F32 d0, d0 1772 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 1773 _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 1774 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 1775 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 1776 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 1777 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 1778 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 1779 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0 1780 //Convert to float 1781 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 1782 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 1783 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 1784 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 1785 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 1786 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 1787 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 1788 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 1789 //Convert between floats 1790 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 1791 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 1792 //Vector narrow integer 1793 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 1794 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 1795 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 1796 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 1797 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 1798 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 1799 //Vector long move 1800 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 1801 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 1802 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 1803 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0 1804 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.U16 q0,d0 1805 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 1806 //Vector saturating narrow integer 1807 _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 1808 _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 1809 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 1810 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.U16 d0,q0 1811 _NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0 1812 _NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0 1813 //Vector saturating narrow integer signed->unsigned 1814 _NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // 
VQMOVUN.S16 d0,q0 1815 _NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0 1816 _NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0 1817 //Table look up 1818 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 1819 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 1820 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 1821 //Extended table look up intrinsics 1822 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 1823 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 1824 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 1825 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 1826 _NEON2SSESTORAGE int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 1827 _NEON2SSESTORAGE poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 1828 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 1829 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 1830 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 1831 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 1832 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 1833 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 1834 //Operations with a scalar value 1835 //Vector multiply accumulate with scalar 1836 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] 1837 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] 1838 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0,d0[0] 1839 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0,d0[0] 1840 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0,d0, d0[0] 1841 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0,d0[0] 1842 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0,d0[0] 1843 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0,q0, d0[0] 1844 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0,q0, d0[0] 1845 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0,q0, d0[0] 1846 //Vector widening multiply accumulate with scalar 1847 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); //VMLAL.S16 q0, d0,d0[0] 1848 _NEON2SSESTORAGE int64x2_t 
vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); //VMLAL.S32 q0, d0,d0[0] 1849 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.U16 q0,d0, d0[0] 1850 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0,d0, d0[0] 1851 //Vector widening saturating doubling multiply accumulate with scalar 1852 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0,d0, d0[0] 1853 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0,d0, d0[0] 1854 //Vector multiply subtract with scalar 1855 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] 1856 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] 1857 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0,d0[0] 1858 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0,d0[0] 1859 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0,d0, d0[0] 1860 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0,d0[0] 1861 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0,d0[0] 1862 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0,q0, d0[0] 1863 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0,q0, d0[0] 1864 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 q0,q0, d0[0] 1865 //Vector widening multiply subtract with scalar 1866 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLSL.S16 q0, d0,d0[0] 1867 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLSL.S32 q0, d0,d0[0] 1868 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLSL.U16 q0,d0, d0[0] 1869 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLSL.U32 q0,d0, d0[0] 1870 //Vector widening saturating doubling multiply subtract with scalar 1871 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0,d0, d0[0] 1872 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0,d0, d0[0] 1873 //Vector multiply by scalar 1874 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] 1875 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] 1876 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] 1877 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // 
VMUL.I16 d0,d0,d0[0] 1878 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] 1879 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] 1880 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] 1881 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] 1882 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] 1883 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] 1884 //Vector long multiply with scalar 1885 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] 1886 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] 1887 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.U16 q0,d0,d0[0] 1888 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] 1889 //Vector long multiply by scalar 1890 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] 1891 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] 1892 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.U16 q0,d0,d0[0] 1893 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] 1894 //Vector saturating doubling long multiply with scalar 1895 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] 1896 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] 1897 //Vector saturating doubling long multiply by scalar 1898 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] 1899 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] 1900 //Vector saturating doubling multiply high with scalar 1901 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] 1902 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] 1903 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] 1904 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] 1905 //Vector saturating doubling multiply high by scalar 1906 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] 1907 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] 1908 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] 1909 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] 1910 //Vector saturating rounding doubling multiply high with scalar 1911 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] 1912 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, 
int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] 1913 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] 1914 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] 1915 //Vector rounding saturating doubling multiply high by scalar 1916 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] 1917 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] 1918 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] 1919 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] 1920 //Vector multiply accumulate with scalar 1921 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] 1922 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] 1923 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0] 1924 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] 1925 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] 1926 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] 1927 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] 1928 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] 1929 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] 1930 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] 1931 //Vector widening multiply accumulate with scalar 1932 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] 1933 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] 1934 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.U16 q0, d0, d0[0] 1935 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] 1936 //Vector widening saturating doubling multiply accumulate with scalar 1937 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] 1938 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] 1939 //Vector multiply subtract with scalar 1940 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] 1941 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] 1942 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] 1943 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] 1944 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] 1945 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] 1946 
_NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] 1947 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] 1948 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] 1949 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] 1950 //Vector widening multiply subtract with scalar 1951 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] 1952 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] 1953 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.U16 q0, d0, d0[0] 1954 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] 1955 //Vector widening saturating doubling multiply subtract with scalar 1956 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] 1957 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] 1958 //Vector extract 1959 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 1960 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 1961 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 1962 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 1963 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 1964 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 1965 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 1966 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 1967 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 1968 _NEON2SSESTORAGE uint64x1_t vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 1969 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 1970 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 1971 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 1972 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 1973 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 1974 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 1975 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 1976 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 1977 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 1978 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, 
int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1979 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
1980 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
1981 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide.
1982 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0
1983 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0
1984 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0
1985 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0
1986 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0
1987 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0
1988 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0
1989 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0
1990 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0
1991 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0
1992 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0
1993 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0
1994 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0
1995 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0
1996 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0
1997 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0
1998 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0
1999 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0
2000 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0
2001 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0
2002 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0
2003 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0
2004 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0
2005 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0
2006 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0
2007 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0
2008 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0
2009 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0
2010 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0
2011 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0
2012 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0
2013 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0
2014 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0
2015 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0
2016 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0
2017 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0
2018 //Other single operand arithmetic
2019 //Absolute: Vd[i] = |Va[i]|
2020 _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0
2021 _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0
2022 _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0
_NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0 2024 _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 2025 _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 2026 _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 2027 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 2028 2029 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0 2030 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0 2031 2032 //Saturating absolute: Vd[i] = sat(|Va[i]|) 2033 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 2034 _NEON2SSESTORAGE int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 2035 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0 2036 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 2037 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 2038 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 2039 //Negate: Vd[i] = - Va[i] 2040 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNE//d0,d0 2041 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNE//d0,d0 2042 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNE//d0,d0 2043 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNE//d0,d0 2044 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0 2045 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0 2046 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0 2047 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0 2048 //Saturating Negate: sat(Vd[i] = - Va[i]) 2049 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNE//d0,d0 2050 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNE//d0,d0 2051 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNE//d0,d0 2052 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0 2053 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0 2054 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0 2055 //Count leading sign bits 2056 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 2057 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 2058 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 2059 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 2060 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 2061 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 2062 //Count leading zeros 2063 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0 2064 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0 2065 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0 2066 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0 2067 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0 2068 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0 2069 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 2070 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 2071 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 2072 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 2073 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 2074 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 2075 //Count number of set bits 2076 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // 
VCNT.8 d0,d0 2077 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 2078 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 2079 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 2080 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 2081 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 2082 //Reciprocal estimate 2083 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 2084 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 2085 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 2086 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 2087 //Reciprocal square root estimate 2088 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 2089 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 2090 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 2091 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 2092 //Logical operations 2093 //Bitwise not 2094 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 2095 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 2096 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 2097 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 2098 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 2099 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 2100 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 2101 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 2102 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 2103 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 2104 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 2105 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 2106 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 2107 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 2108 //Bitwise and 2109 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 2110 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 2111 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 2112 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 2113 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 2114 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 2115 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 2116 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 2117 _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 2118 _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 2119 _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0 2120 _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 2121 _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 2122 _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 2123 _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 2124 _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 2125 //Bitwise or 2126 _NEON2SSESTORAGE 
int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 2127 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 2128 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 2129 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 2130 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 2131 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 2132 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 2133 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 2134 _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 2135 _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 2136 _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 2137 _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 2138 _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 2139 _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 2140 _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 2141 _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 2142 //Bitwise exclusive or (EOR or XOR) 2143 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 2144 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 2145 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 2146 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 2147 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 2148 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 2149 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 2150 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 2151 _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 2152 _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 2153 _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 2154 _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 2155 _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 2156 _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 2157 _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 2158 _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 2159 //Bit Clear 2160 _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // VBIC d0,d0,d0 2161 _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 2162 _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 2163 _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 2164 _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 2165 _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 2166 _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 2167 _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 2168 _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // 
VBIC q0,q0,q0 2169 _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 2170 _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 2171 _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 2172 _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 2173 _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 2174 _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 2175 _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 2176 //Bitwise OR complement 2177 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 2178 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 2179 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 2180 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 2181 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 2182 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 2183 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 2184 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 2185 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 2186 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 2187 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 2188 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 2189 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 2190 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 2191 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 2192 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 2193 //Bitwise Select 2194 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 2195 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 2196 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 2197 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 2198 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 2199 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 2200 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 2201 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 2202 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 2203 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 2204 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 2205 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 2206 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 2207 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 2208 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL 
q0,q0,q0 2209 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 2210 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 2211 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 2212 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 2213 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 2214 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 2215 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 2216 //Transposition operations 2217 //Transpose elements 2218 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 2219 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 2220 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 2221 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 2222 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 2223 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 2224 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 2225 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 2226 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 2227 _NEON2SSESTORAGE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 2228 _NEON2SSESTORAGE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 2229 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 2230 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 2231 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 2232 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 2233 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 2234 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 2235 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 2236 //Interleave elements 2237 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 2238 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 2239 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 2240 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 2241 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 2242 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 2243 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 2244 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 2245 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 2246 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 2247 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 2248 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 2249 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 
2250 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 2251 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 2252 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 2253 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 2254 _NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 2255 //De-Interleave elements 2256 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 2257 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 2258 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 2259 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 2260 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 2261 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 2262 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 2263 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 2264 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 2265 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 2266 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 2267 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 2268 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 2269 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 2270 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 2271 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 2272 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 2273 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 2274 2275 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); // VRND.F32 q0,q0 2276 2277 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); // VRND.F64 q0,q0 2278 2279 //Sqrt 2280 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); // VSQRT.F32 q0,q0 2281 2282 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); // VSQRT.F64 q0,q0 2283 2284 2285 2286 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2287 // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. 
2288 // we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal 2289 // 2290 #if ( defined (__INTEL_COMPILER) || defined (__GNUC__) && !defined(__llvm__) ) 2291 # define _MM_ALIGNR_EPI8 _mm_alignr_epi8 2292 # define _MM_EXTRACT_EPI16 (int16_t) _mm_extract_epi16 2293 # define _MM_INSERT_EPI16 _mm_insert_epi16 2294 # ifdef USE_SSE4 2295 # define _MM_EXTRACT_EPI8 _mm_extract_epi8 2296 # define _MM_EXTRACT_EPI32 _mm_extract_epi32 2297 # define _MM_EXTRACT_PS _mm_extract_ps 2298 # define _MM_INSERT_EPI8 _mm_insert_epi8 2299 # define _MM_INSERT_EPI32 _mm_insert_epi32 2300 # define _MM_INSERT_PS _mm_insert_ps 2301 # ifdef _NEON2SSE_64BIT 2302 # define _MM_INSERT_EPI64 _mm_insert_epi64 2303 # define _MM_EXTRACT_EPI64 _mm_extract_epi64 2304 # endif 2305 # endif //SSE4 2306 #else 2307 # define _NEON2SSE_COMMA , 2308 # define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \ 2309 switch(LANE) \ 2310 { \ 2311 case 0: return NAME(a b, 0); \ 2312 case 1: return NAME(a b, 1); \ 2313 case 2: return NAME(a b, 2); \ 2314 case 3: return NAME(a b, 3); \ 2315 case 4: return NAME(a b, 4); \ 2316 case 5: return NAME(a b, 5); \ 2317 case 6: return NAME(a b, 6); \ 2318 case 7: return NAME(a b, 7); \ 2319 case 8: return NAME(a b, 8); \ 2320 case 9: return NAME(a b, 9); \ 2321 case 10: return NAME(a b, 10); \ 2322 case 11: return NAME(a b, 11); \ 2323 case 12: return NAME(a b, 12); \ 2324 case 13: return NAME(a b, 13); \ 2325 case 14: return NAME(a b, 14); \ 2326 case 15: return NAME(a b, 15); \ 2327 default: return NAME(a b, 0); \ 2328 } 2329 2330 # define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \ 2331 switch(LANE) \ 2332 { \ 2333 case 0: return NAME(vec p,0); \ 2334 case 1: return NAME(vec p,1); \ 2335 case 2: return NAME(vec p,2); \ 2336 case 3: return NAME(vec p,3); \ 2337 case 4: return NAME(vec p,4); \ 2338 case 5: return NAME(vec p,5); \ 2339 case 6: return NAME(vec p,6); \ 2340 case 7: return NAME(vec p,7); \ 2341 default: return NAME(vec p,0); \ 2342 } 2343 2344 # define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \ 2345 switch(LANE) \ 2346 { \ 2347 case case0: return NAME(vec p,case0); \ 2348 case case1: return NAME(vec p,case1); \ 2349 case case2: return NAME(vec p,case2); \ 2350 case case3: return NAME(vec p,case3); \ 2351 default: return NAME(vec p,case0); \ 2352 } 2353 2354 _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) 2355 { 2356 _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE) 2357 } 2358 2359 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE) 2360 { 2361 _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) 2362 } 2363 2364 _NEON2SSE_INLINE int16_t _MM_EXTRACT_EPI16(__m128i vec, const int LANE) 2365 { 2366 _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) 2367 } 2368 2369 #ifdef USE_SSE4 2370 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) 2371 { 2372 _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,) 2373 } 2374 2375 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) 2376 { 2377 _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,) 2378 } 2379 2380 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) 2381 { 2382 _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE) 2383 } 2384 2385 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) 2386 { 2387 _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p) 2388 } 2389 2390 _NEON2SSE_INLINE __m128i 
_MM_INSERT_EPI8(__m128i vec, int p, const int LANE) 2391 { 2392 _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE) 2393 } 2394 2395 #ifdef _NEON2SSE_64BIT 2396 //the special case of functions available only for SSE4 and 64-bit build. 2397 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) 2398 { 2399 switch(LANE) { 2400 case 0: 2401 return _mm_insert_epi64(vec, p, 0); 2402 case 1: 2403 return _mm_insert_epi64(vec, p, 1); 2404 default: 2405 return _mm_insert_epi64(vec, p, 0); 2406 } 2407 } 2408 2409 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) 2410 { 2411 if (LANE ==0) return _mm_extract_epi64(val, 0); 2412 else return _mm_extract_epi64(val, 1); 2413 } 2414 #endif 2415 2416 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) 2417 { 2418 _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p) 2419 } 2420 2421 #endif //USE_SSE4 2422 2423 #endif //#ifdef NDEBUG 2424 2425 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 2426 // Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices 2427 // or for some specific commonly used operations implementation missing in SSE 2428 #ifdef USE_SSE4 2429 # define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16 2430 # define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32 2431 # define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64 2432 2433 # define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16 2434 # define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32 2435 # define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64 2436 2437 # define _MM_MAX_EPI8 _mm_max_epi8 2438 # define _MM_MAX_EPI32 _mm_max_epi32 2439 # define _MM_MAX_EPU16 _mm_max_epu16 2440 # define _MM_MAX_EPU32 _mm_max_epu32 2441 2442 # define _MM_MIN_EPI8 _mm_min_epi8 2443 # define _MM_MIN_EPI32 _mm_min_epi32 2444 # define _MM_MIN_EPU16 _mm_min_epu16 2445 # define _MM_MIN_EPU32 _mm_min_epu32 2446 2447 # define _MM_BLENDV_EPI8 _mm_blendv_epi8 2448 # define _MM_PACKUS_EPI32 _mm_packus_epi32 2449 # define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a) 2450 2451 # define _MM_MULLO_EPI32 _mm_mullo_epi32 2452 # define _MM_MUL_EPI32 _mm_mul_epi32 2453 2454 # define _MM_CMPEQ_EPI64 _mm_cmpeq_epi64 2455 #else //no SSE4 !!!!!! 
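//Added illustration (not part of the original header; the helper name below is hypothetical):
//most of the fallbacks in this branch rebuild missing SSE4.x operations from SSE2 primitives using two recurring ideas -
//sign extension via compare-and-interleave, and unsigned comparison via a 0x80.. bias that maps the unsigned range onto the signed one.
//A minimal sketch of the latter, assuming only SSE2 is available:
_NEON2SSE_INLINE __m128i _neon2sse_demo_cmpgt_epu16(__m128i a, __m128i b) //0xffff where a > b as unsigned 16-bit lanes
{
    __m128i bias = _mm_set1_epi16((int16_t)0x8000); //flip the sign bit of every lane
    return _mm_cmpgt_epi16(_mm_sub_epi16(a, bias), _mm_sub_epi16(b, bias)); //signed compare now yields the unsigned order
}
//This is the same trick used by the _MM_MAX_EPU16/_MM_MIN_EPU16 and _MM_MAX_EPU32/_MM_MIN_EPU32 emulations below.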
2456 _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a) 2457 { 2458 __m128i zero = _mm_setzero_si128(); 2459 return _mm_unpacklo_epi8(a, zero); 2460 } 2461 2462 _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a) 2463 { 2464 __m128i zero = _mm_setzero_si128(); 2465 return _mm_unpacklo_epi16(a, zero); 2466 } 2467 2468 _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a) 2469 { 2470 __m128i zero = _mm_setzero_si128(); 2471 return _mm_unpacklo_epi32(a, zero); 2472 } 2473 2474 _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a) 2475 { 2476 __m128i zero = _mm_setzero_si128(); 2477 __m128i sign = _mm_cmpgt_epi8(zero, a); 2478 return _mm_unpacklo_epi8(a, sign); 2479 } 2480 2481 _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a) 2482 { 2483 __m128i zero = _mm_setzero_si128(); 2484 __m128i sign = _mm_cmpgt_epi16(zero, a); 2485 return _mm_unpacklo_epi16(a, sign); 2486 } 2487 2488 _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a) 2489 { 2490 __m128i zero = _mm_setzero_si128(); 2491 __m128i sign = _mm_cmpgt_epi32(zero, a); 2492 return _mm_unpacklo_epi32(a, sign); 2493 } 2494 2495 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) 2496 { 2497 _NEON2SSE_ALIGN_16 int32_t tmp[4]; 2498 _mm_store_si128((__m128i*)tmp, vec); 2499 return tmp[LANE]; 2500 } 2501 2502 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) 2503 { 2504 _NEON2SSE_ALIGN_16 int8_t tmp[16]; 2505 _mm_store_si128((__m128i*)tmp, vec); 2506 return (int)tmp[LANE]; 2507 } 2508 2509 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) 2510 { 2511 _NEON2SSE_ALIGN_16 int32_t tmp[4]; 2512 _mm_store_si128((__m128i*)tmp, _M128i(vec)); 2513 return tmp[LANE]; 2514 } 2515 2516 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) 2517 { 2518 _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0}; 2519 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; 2520 __m128i vec_masked, p_masked; 2521 pvec[LANE] = p; 2522 mask[LANE] = 0x0; 2523 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p 2524 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec 2525 return _mm_or_si128(vec_masked, p_masked); 2526 } 2527 2528 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) 2529 { 2530 _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; 2531 _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}; 2532 __m128i vec_masked, p_masked; 2533 pvec[LANE] = (int8_t)p; 2534 mask[LANE] = 0x0; 2535 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p 2536 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec 2537 return _mm_or_si128(vec_masked, p_masked); 2538 } 2539 2540 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) 2541 { 2542 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; 2543 __m128 tmp, vec_masked, p_masked; 2544 mask[LANE >> 4] = 0x0; //here the LANE is not actural lane, need to deal with it 2545 vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p 2546 p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec 2547 tmp = _mm_or_ps(vec_masked, p_masked); 2548 return tmp; 2549 } 2550 2551 _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b) 2552 { 2553 __m128i cmp, resa, resb; 2554 cmp = _mm_cmpgt_epi8 (a, b); 2555 resa = _mm_and_si128 (cmp, a); 2556 resb = _mm_andnot_si128 (cmp,b); 2557 return 
_mm_or_si128(resa, resb); 2558 } 2559 2560 _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b) 2561 { 2562 __m128i cmp, resa, resb; 2563 cmp = _mm_cmpgt_epi32(a, b); 2564 resa = _mm_and_si128 (cmp, a); 2565 resb = _mm_andnot_si128 (cmp,b); 2566 return _mm_or_si128(resa, resb); 2567 } 2568 2569 _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b) 2570 { 2571 __m128i c8000, b_s, a_s, cmp; 2572 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff 2573 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 2574 b_s = _mm_sub_epi16 (b, c8000); 2575 a_s = _mm_sub_epi16 (a, c8000); 2576 cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed 2577 a_s = _mm_and_si128 (cmp,a); 2578 b_s = _mm_andnot_si128 (cmp,b); 2579 return _mm_or_si128(a_s, b_s); 2580 } 2581 2582 _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b) 2583 { 2584 __m128i c80000000, b_s, a_s, cmp; 2585 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff 2586 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 2587 b_s = _mm_sub_epi32 (b, c80000000); 2588 a_s = _mm_sub_epi32 (a, c80000000); 2589 cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed 2590 a_s = _mm_and_si128 (cmp,a); 2591 b_s = _mm_andnot_si128 (cmp,b); 2592 return _mm_or_si128(a_s, b_s); 2593 } 2594 2595 _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b) 2596 { 2597 __m128i cmp, resa, resb; 2598 cmp = _mm_cmpgt_epi8 (b, a); 2599 resa = _mm_and_si128 (cmp, a); 2600 resb = _mm_andnot_si128 (cmp,b); 2601 return _mm_or_si128(resa, resb); 2602 } 2603 2604 _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b) 2605 { 2606 __m128i cmp, resa, resb; 2607 cmp = _mm_cmpgt_epi32(b, a); 2608 resa = _mm_and_si128 (cmp, a); 2609 resb = _mm_andnot_si128 (cmp,b); 2610 return _mm_or_si128(resa, resb); 2611 } 2612 2613 _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b) 2614 { 2615 __m128i c8000, b_s, a_s, cmp; 2616 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff 2617 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 2618 b_s = _mm_sub_epi16 (b, c8000); 2619 a_s = _mm_sub_epi16 (a, c8000); 2620 cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed 2621 a_s = _mm_and_si128 (cmp,a); 2622 b_s = _mm_andnot_si128 (cmp,b); 2623 return _mm_or_si128(a_s, b_s); 2624 } 2625 2626 _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b) 2627 { 2628 __m128i c80000000, b_s, a_s, cmp; 2629 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff 2630 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 2631 b_s = _mm_sub_epi32 (b, c80000000); 2632 a_s = _mm_sub_epi32 (a, c80000000); 2633 cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed 2634 a_s = _mm_and_si128 (cmp,a); 2635 b_s = _mm_andnot_si128 (cmp,b); 2636 return _mm_or_si128(a_s, b_s); 2637 } 2638 2639 _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below 2640 { 2641 //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters. 
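//Added note (assumption about intended use, not in the original): a caller holding an arbitrary mask would first need to
//replicate each byte's MSB, e.g. mask = _mm_cmpgt_epi8(_mm_setzero_si128(), mask), to reproduce the real _mm_blendv_epi8
//semantics; every use in this file already passes full 0x00/0xff mask bytes, so the and/andnot/or sequence below is sufficient.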
2642 __m128i a_masked, b_masked; 2643 b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff 2644 a_masked = _mm_andnot_si128 (mask,a); 2645 return _mm_or_si128(a_masked, b_masked); 2646 } 2647 2648 _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b) 2649 { 2650 __m128i a16, b16, res, reshi,cmp, zero; 2651 zero = _mm_setzero_si128(); 2652 a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); 2653 b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); 2654 res = _mm_unpacklo_epi64(a16, b16); //result without saturation 2655 reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation 2656 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero 2657 res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 2658 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive 2659 return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff 2660 } 2661 2662 _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a) 2663 { 2664 __m128i a16, res, reshi,cmp, zero; 2665 zero = _mm_setzero_si128(); 2666 a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); 2667 reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation 2668 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero 2669 res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 2670 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive 2671 return _mm_or_si128(res, cmp); //if cmp positive we are out of 16bits need to saturaate to 0xffff 2672 } 2673 2674 2675 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL) 2676 { 2677 _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4]; 2678 int64_t res64; 2679 int i; 2680 _mm_store_si128((__m128i*)atmp, a); 2681 _mm_store_si128((__m128i*)btmp, b); 2682 for (i = 0; i<4; i++) { 2683 res64 = atmp[i] * btmp[i]; 2684 res[i] = (int)(res64 & 0xffffffff); 2685 } 2686 return _mm_load_si128((__m128i*)res); 2687 } 2688 2689 _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b) 2690 { 2691 __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg; 2692 sign = _mm_xor_si128 (a, b); 2693 sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all fff if negative and all 0 if positive 2694 sign = _mm_shuffle_epi32(sign, _MM_SHUFFLE(2, 2, 0, 0)); //promote sign bit to 3 and 1st data lanes 2695 zero = _mm_setzero_si128(); 2696 a_neg = _mm_abs_epi32 (a); //negate a and b 2697 b_neg = _mm_abs_epi32 (b); //negate a and b 2698 mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result 2699 mul_us_neg = _mm_sub_epi64(zero, mul_us); 2700 mul_us_neg = _mm_and_si128(sign, mul_us_neg); 2701 mul_us = _mm_andnot_si128(sign, mul_us); 2702 return _mm_or_si128 (mul_us, mul_us_neg); 2703 } 2704 2705 _NEON2SSE_INLINE __m128i _MM_CMPEQ_EPI64(__m128i a, __m128i b) 2706 { 2707 __m128i res; 2708 res = _mm_cmpeq_epi32 (a, b); 2709 return _mm_shuffle_epi32 (res, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data 2710 } 2711 #endif //SSE4 2712 2713 //the special case of functions working only for 32 bits, no SSE4 2714 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64_32(__m128i vec, int p, const int LANE) 2715 { 2716 _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0}; 2717 _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff, 0xffffffffffffffff}; 2718 __m128i 
vec_masked, p_masked; 2719 pvec[LANE] = p; 2720 mask[LANE] = 0x0; 2721 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p 2722 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec 2723 return _mm_or_si128(vec_masked, p_masked); 2724 } 2725 2726 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64_32(__m128i val, const int LANE) 2727 { 2728 _NEON2SSE_ALIGN_16 int64_t tmp[2]; 2729 _mm_store_si128((__m128i*)tmp, val); 2730 return tmp[LANE]; 2731 } 2732 2733 #ifndef _NEON2SSE_64BIT_SSE4 2734 # define _MM_INSERT_EPI64 _MM_INSERT_EPI64_32 2735 # define _MM_EXTRACT_EPI64 _MM_EXTRACT_EPI64_32 2736 #endif 2737 2738 _NEON2SSESTORAGE int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints 2739 _NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a) 2740 { 2741 //Overflow happens only if a and sum have the opposite signs 2742 __m128i c7fffffff, res, res_sat, res_xor_a; 2743 c7fffffff = _mm_set1_epi32(0x7fffffff); 2744 res = _mm_slli_epi32 (a, 1); // res = a*2 2745 res_sat = _mm_srli_epi32(a, 31); 2746 res_sat = _mm_add_epi32(res_sat, c7fffffff); 2747 res_xor_a = _mm_xor_si128(res, a); 2748 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if negative, all zeros otherwise 2749 res_sat = _mm_and_si128(res_xor_a, res_sat); 2750 res = _mm_andnot_si128(res_xor_a, res); 2751 return _mm_or_si128(res, res_sat); 2752 } 2753 2754 2755 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2756 //************************************************************************* 2757 //************************************************************************* 2758 //***************** Functions redefinition/implementation starts here ***** 2759 //************************************************************************* 2760 //************************************************************************* 2761 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 2762 2763 /*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here, as in the following sample: 2764 #ifdef ARM 2765 #define vector_addq_s32 vaddq_s32 2766 #else //if we have IA 2767 #define vector_addq_s32 _mm_add_epi32 2768 #endif 2769 2770 ******************************************************************************************** 2771 Functions below are organised in the following way: 2772 2773 Each NEON intrinsic function has one of the following options: 2774 1. a fully equivalent x86 SSE intrinsic - in this case the x86 version simply follows the NEON one under the corresponding #define statement 2775 2. an x86 implementation using more than one x86 intrinsic - in this case it is shaped as an inlined C function with a return statement 2776 3. a reference to another NEON function that returns the same result and is implemented in x86 as above - in this case it is shaped as the matching NEON function definition 2777 4. for about 5% of functions, where the corresponding x86 SIMD support is unavailable or too inefficient, 2778 a serial implementation is provided along with a compiler performance warning. If such functions are on your application's critical path, 2779 please consider removing them from your code.
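For example (illustrative, based on the definitions below): vaddq_s32 falls under option 1 and is simply #define'd to _mm_add_epi32;
vhaddq_s32 falls under option 2 and is an inline function built from _mm_and_si128, _mm_xor_si128, _mm_srai_epi32 and _mm_add_epi32;
vadd_u8 falls under option 3 and is #define'd to vadd_s8, which is implemented just above it; and vqadd_s64 falls under option 4,
carrying a _NEON2SSE_PERFORMANCE_WARNING with _NEON2SSE_REASON_SLOW_SERIAL because it has to be computed serially.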
2780 */ 2781 2782 //*********************************************************************** 2783 //************************ Vector add ***************************** 2784 //*********************************************************************** 2785 _NEON2SSESTORAGE int8x8_t vadd_s8(int8x8_t a, int8x8_t b); // VADD.I8 d0,d0,d0 2786 _NEON2SSE_INLINE int8x8_t vadd_s8(int8x8_t a, int8x8_t b) 2787 { 2788 int8x8_t res64; 2789 return64(_mm_add_epi8(_pM128i(a),_pM128i(b))); 2790 } 2791 2792 2793 _NEON2SSESTORAGE int16x4_t vadd_s16(int16x4_t a, int16x4_t b); // VADD.I16 d0,d0,d0 2794 _NEON2SSE_INLINE int16x4_t vadd_s16(int16x4_t a, int16x4_t b) 2795 { 2796 int16x4_t res64; 2797 return64(_mm_add_epi16(_pM128i(a),_pM128i(b))); 2798 } 2799 2800 2801 _NEON2SSESTORAGE int32x2_t vadd_s32(int32x2_t a, int32x2_t b); // VADD.I32 d0,d0,d0 2802 _NEON2SSE_INLINE int32x2_t vadd_s32(int32x2_t a, int32x2_t b) 2803 { 2804 int32x2_t res64; 2805 return64(_mm_add_epi32(_pM128i(a),_pM128i(b))); 2806 } 2807 2808 2809 _NEON2SSESTORAGE int64x1_t vadd_s64(int64x1_t a, int64x1_t b); // VADD.I64 d0,d0,d0 2810 _NEON2SSE_INLINE int64x1_t vadd_s64(int64x1_t a, int64x1_t b) 2811 { 2812 int64x1_t res64; 2813 res64.m64_i64[0] = a.m64_i64[0] + b.m64_i64[0]; 2814 return res64; 2815 } 2816 2817 2818 _NEON2SSESTORAGE float32x2_t vadd_f32(float32x2_t a, float32x2_t b); // VADD.F32 d0,d0,d0 2819 _NEON2SSE_INLINE float32x2_t vadd_f32(float32x2_t a, float32x2_t b) 2820 { 2821 __m128 res; 2822 __m64_128 res64; 2823 res = _mm_add_ps(_pM128(a),_pM128(b)); //SSE, use only low 64 bits 2824 _M64f(res64, res); 2825 return res64; 2826 } 2827 2828 _NEON2SSESTORAGE uint8x8_t vadd_u8(uint8x8_t a, uint8x8_t b); // VADD.I8 d0,d0,d0 2829 #define vadd_u8 vadd_s8 2830 2831 _NEON2SSESTORAGE uint16x4_t vadd_u16(uint16x4_t a, uint16x4_t b); // VADD.I16 d0,d0,d0 2832 #define vadd_u16 vadd_s16 2833 2834 _NEON2SSESTORAGE uint32x2_t vadd_u32(uint32x2_t a, uint32x2_t b); // VADD.I32 d0,d0,d0 2835 #define vadd_u32 vadd_s32 2836 2837 _NEON2SSESTORAGE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b); // VADD.I64 d0,d0,d0 2838 _NEON2SSE_INLINE uint64x1_t vadd_u64(uint64x1_t a, uint64x1_t b) 2839 { 2840 uint64x1_t res64; 2841 res64.m64_u64[0] = a.m64_u64[0] + b.m64_u64[0]; 2842 return res64; 2843 } 2844 2845 2846 _NEON2SSESTORAGE int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 2847 #define vaddq_s8 _mm_add_epi8 2848 2849 _NEON2SSESTORAGE int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 2850 #define vaddq_s16 _mm_add_epi16 2851 2852 _NEON2SSESTORAGE int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 2853 #define vaddq_s32 _mm_add_epi32 2854 2855 _NEON2SSESTORAGE int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 2856 #define vaddq_s64 _mm_add_epi64 2857 2858 _NEON2SSESTORAGE float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 2859 #define vaddq_f32 _mm_add_ps 2860 2861 _NEON2SSESTORAGE uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 2862 #define vaddq_u8 _mm_add_epi8 2863 2864 _NEON2SSESTORAGE uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 2865 #define vaddq_u16 _mm_add_epi16 2866 2867 _NEON2SSESTORAGE uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 2868 #define vaddq_u32 _mm_add_epi32 2869 2870 _NEON2SSESTORAGE uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 2871 #define vaddq_u64 _mm_add_epi64 2872 2873 //**************************** Vector long add *****************************: 2874 
//*********************************************************************** 2875 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 2876 _NEON2SSESTORAGE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b); // VADDL.S8 q0,d0,d0 2877 _NEON2SSE_INLINE int16x8_t vaddl_s8(int8x8_t a, int8x8_t b) // VADDL.S8 q0,d0,d0 2878 { 2879 __m128i a16, b16; 2880 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, 2881 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, 2882 return _mm_add_epi16 (a16, b16); 2883 } 2884 2885 _NEON2SSESTORAGE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b); // VADDL.S16 q0,d0,d0 2886 _NEON2SSE_INLINE int32x4_t vaddl_s16(int16x4_t a, int16x4_t b) // VADDL.S16 q0,d0,d0 2887 { 2888 __m128i a32, b32; 2889 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 2890 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1 2891 return _mm_add_epi32 (a32, b32); 2892 } 2893 2894 _NEON2SSESTORAGE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b); // VADDL.S32 q0,d0,d0 2895 _NEON2SSE_INLINE int64x2_t vaddl_s32(int32x2_t a, int32x2_t b) // VADDL.S32 q0,d0,d0 2896 { 2897 //may be not optimal 2898 __m128i a64, b64; 2899 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 2900 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 2901 return _mm_add_epi64 ( a64, b64); 2902 } 2903 2904 _NEON2SSESTORAGE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b); // VADDL.U8 q0,d0,d0 2905 _NEON2SSE_INLINE uint16x8_t vaddl_u8(uint8x8_t a, uint8x8_t b) // VADDL.U8 q0,d0,d0 2906 { 2907 __m128i a16, b16; 2908 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1 2909 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 2910 return _mm_add_epi16 (a16, b16); 2911 } 2912 2913 _NEON2SSESTORAGE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b); // VADDL.s16 q0,d0,d0 2914 _NEON2SSE_INLINE uint32x4_t vaddl_u16(uint16x4_t a, uint16x4_t b) // VADDL.s16 q0,d0,d0 2915 { 2916 __m128i a32, b32; 2917 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 2918 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 2919 return _mm_add_epi32 (a32, b32); 2920 } 2921 2922 _NEON2SSESTORAGE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b); // VADDL.U32 q0,d0,d0 2923 _NEON2SSE_INLINE uint64x2_t vaddl_u32(uint32x2_t a, uint32x2_t b) // VADDL.U32 q0,d0,d0 2924 { 2925 //may be not optimal 2926 __m128i a64, b64; 2927 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 2928 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 2929 return _mm_add_epi64 (a64, b64); 2930 } 2931 2932 //*************** Vector wide add: vaddw_<type>. 
Vr[i]:=Va[i]+Vb[i] ****************** 2933 //*************** ********************************************************************* 2934 _NEON2SSESTORAGE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b); // VADDW.S8 q0,q0,d0 2935 _NEON2SSE_INLINE int16x8_t vaddw_s8(int16x8_t a, int8x8_t b) // VADDW.S8 q0,q0,d0 2936 { 2937 __m128i b16; 2938 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, 2939 return _mm_add_epi16 (a, b16); 2940 } 2941 2942 _NEON2SSESTORAGE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b); // VADDW.S16 q0,q0,d0 2943 _NEON2SSE_INLINE int32x4_t vaddw_s16(int32x4_t a, int16x4_t b) // VADDW.S16 q0,q0,d0 2944 { 2945 __m128i b32; 2946 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1, 2947 return _mm_add_epi32 (a, b32); 2948 } 2949 2950 _NEON2SSESTORAGE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b); // VADDW.S32 q0,q0,d0 2951 _NEON2SSE_INLINE int64x2_t vaddw_s32(int64x2_t a, int32x2_t b) // VADDW.S32 q0,q0,d0 2952 { 2953 __m128i b64; 2954 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 2955 return _mm_add_epi64 (a, b64); 2956 } 2957 2958 _NEON2SSESTORAGE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b); // VADDW.U8 q0,q0,d0 2959 _NEON2SSE_INLINE uint16x8_t vaddw_u8(uint16x8_t a, uint8x8_t b) // VADDW.U8 q0,q0,d0 2960 { 2961 __m128i b16; 2962 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1 2963 return _mm_add_epi16 (a, b16); 2964 } 2965 2966 _NEON2SSESTORAGE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b); // VADDW.s16 q0,q0,d0 2967 _NEON2SSE_INLINE uint32x4_t vaddw_u16(uint32x4_t a, uint16x4_t b) // VADDW.s16 q0,q0,d0 2968 { 2969 __m128i b32; 2970 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1 2971 return _mm_add_epi32 (a, b32); 2972 } 2973 2974 _NEON2SSESTORAGE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b); // VADDW.U32 q0,q0,d0 2975 _NEON2SSE_INLINE uint64x2_t vaddw_u32(uint64x2_t a, uint32x2_t b) // VADDW.U32 q0,q0,d0 2976 { 2977 __m128i b64; 2978 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 2979 return _mm_add_epi64 (a, b64); 2980 } 2981 2982 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated ******************************* 2983 //************************************************************************************************************************* 2984 _NEON2SSESTORAGE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b); // VHADD.S8 d0,d0,d0 2985 _NEON2SSE_INLINE int8x8_t vhadd_s8(int8x8_t a, int8x8_t b) 2986 { 2987 int8x8_t res64; 2988 return64(vhaddq_s8(_pM128i(a), _pM128i(b))); 2989 } 2990 2991 2992 _NEON2SSESTORAGE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b); // VHADD.S16 d0,d0,d0 2993 _NEON2SSE_INLINE int16x4_t vhadd_s16(int16x4_t a, int16x4_t b) 2994 { 2995 int16x4_t res64; 2996 return64( vhaddq_s16(_pM128i(a), _pM128i(b))); 2997 } 2998 2999 3000 _NEON2SSESTORAGE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b); // VHADD.S32 d0,d0,d0 3001 _NEON2SSE_INLINE int32x2_t vhadd_s32(int32x2_t a, int32x2_t b) 3002 { 3003 int32x2_t res64; 3004 return64( vhaddq_s32(_pM128i(a), _pM128i(b))); 3005 } 3006 3007 3008 _NEON2SSESTORAGE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b); // VHADD.w d0,d0,d0 3009 _NEON2SSE_INLINE uint8x8_t vhadd_u8(uint8x8_t a, uint8x8_t b) 3010 { 3011 uint8x8_t res64; 3012 return64( vhaddq_u8(_pM128i(a), _pM128i(b))); 3013 } 3014 3015 3016 _NEON2SSESTORAGE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b); // VHADD.s16 d0,d0,d0 3017 _NEON2SSE_INLINE uint16x4_t vhadd_u16(uint16x4_t a, uint16x4_t b) 3018 { 3019 uint16x4_t res64; 3020 return64( vhaddq_u16(_pM128i(a), _pM128i(b))); 3021 } 3022 3023 3024 _NEON2SSESTORAGE 
uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b); // VHADD.U32 d0,d0,d0 3025 _NEON2SSE_INLINE uint32x2_t vhadd_u32(uint32x2_t a, uint32x2_t b) 3026 { 3027 uint32x2_t res64; 3028 return64( vhaddq_u32(_pM128i(a), _pM128i(b))); 3029 } 3030 3031 3032 _NEON2SSESTORAGE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 3033 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b) 3034 { 3035 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 3036 __m128i tmp1, tmp2; 3037 tmp1 = _mm_and_si128(a,b); 3038 tmp2 = _mm_xor_si128(a,b); 3039 tmp2 = vshrq_n_s8(tmp2,1); 3040 return _mm_add_epi8(tmp1,tmp2); 3041 } 3042 3043 _NEON2SSESTORAGE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0 3044 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) 3045 { 3046 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 3047 __m128i tmp1, tmp2; 3048 tmp1 = _mm_and_si128(a,b); 3049 tmp2 = _mm_xor_si128(a,b); 3050 tmp2 = _mm_srai_epi16(tmp2,1); 3051 return _mm_add_epi16(tmp1,tmp2); 3052 } 3053 3054 _NEON2SSESTORAGE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 3055 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0 3056 { 3057 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 3058 __m128i tmp1, tmp2; 3059 tmp1 = _mm_and_si128(a,b); 3060 tmp2 = _mm_xor_si128(a,b); 3061 tmp2 = _mm_srai_epi32(tmp2,1); 3062 return _mm_add_epi32(tmp1,tmp2); 3063 } 3064 3065 _NEON2SSESTORAGE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 3066 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0 3067 { 3068 __m128i c1, sum, res; 3069 c1 = _mm_set1_epi8(1); 3070 sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it 3071 res = _mm_xor_si128(a, b); //for rounding compensation 3072 res = _mm_and_si128(res,c1); //for rounding compensation 3073 return _mm_sub_epi8 (sum, res); //actual rounding compensation 3074 } 3075 3076 _NEON2SSESTORAGE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0 3077 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0 3078 { 3079 __m128i sum, res; 3080 sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it 3081 res = _mm_xor_si128(a, b); //for rounding compensation 3082 res = _mm_slli_epi16 (res,15); //shift left then back right to 3083 res = _mm_srli_epi16 (res,15); //get 1 or zero 3084 return _mm_sub_epi16 (sum, res); //actual rounding compensation 3085 } 3086 3087 _NEON2SSESTORAGE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 3088 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0 3089 { 3090 //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 3091 __m128i tmp1, tmp2; 3092 tmp1 = _mm_and_si128(a,b); 3093 tmp2 = _mm_xor_si128(a,b); 3094 tmp2 = _mm_srli_epi32(tmp2,1); 3095 return _mm_add_epi32(tmp1,tmp2); 3096 } 3097 3098 //************************Vector rounding halving add: vrhadd{q}_<type>. 
Vr[i]:=(Va[i]+Vb[i]+1)>>1 *************************** 3099 //***************************************************************************************************************************** 3100 _NEON2SSESTORAGE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b); // VRHADD.S8 d0,d0,d0 3101 _NEON2SSE_INLINE int8x8_t vrhadd_s8(int8x8_t a, int8x8_t b) 3102 { 3103 int8x8_t res64; 3104 return64(vrhaddq_s8(_pM128i(a), _pM128i(b))); 3105 } 3106 3107 3108 _NEON2SSESTORAGE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b); // VRHADD.S16 d0,d0,d0 3109 _NEON2SSE_INLINE int16x4_t vrhadd_s16(int16x4_t a, int16x4_t b) 3110 { 3111 int16x4_t res64; 3112 return64(vrhaddq_s16(_pM128i(a), _pM128i(b))); 3113 } 3114 3115 3116 _NEON2SSESTORAGE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b); // VRHADD.S32 d0,d0,d0 3117 _NEON2SSE_INLINE int32x2_t vrhadd_s32(int32x2_t a, int32x2_t b) 3118 { 3119 int32x2_t res64; 3120 return64(vrhaddq_s32(_pM128i(a), _pM128i(b))); 3121 } 3122 3123 3124 _NEON2SSESTORAGE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b); // VRHADD.U8 d0,d0,d0 3125 _NEON2SSE_INLINE uint8x8_t vrhadd_u8(uint8x8_t a, uint8x8_t b) 3126 { 3127 uint8x8_t res64; 3128 return64(_mm_avg_epu8(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! 3129 } 3130 3131 3132 _NEON2SSESTORAGE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b); // VRHADD.s16 d0,d0,d0 3133 _NEON2SSE_INLINE uint16x4_t vrhadd_u16(uint16x4_t a, uint16x4_t b) 3134 { 3135 uint16x4_t res64; 3136 return64(_mm_avg_epu16(_pM128i(a),_pM128i(b))); //SSE, result rounding!!! 3137 } 3138 3139 3140 _NEON2SSESTORAGE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b); // VRHADD.U32 d0,d0,d0 3141 _NEON2SSE_INLINE uint32x2_t vrhadd_u32(uint32x2_t a, uint32x2_t b) 3142 { 3143 uint32x2_t res64; 3144 return64(vrhaddq_u32(_pM128i(a), _pM128i(b))); 3145 } 3146 3147 3148 _NEON2SSESTORAGE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 3149 _NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0 3150 { 3151 //no signed average in x86 SIMD, go to unsigned 3152 __m128i c128, au, bu, sum; 3153 c128 = _mm_set1_epi8((int8_t)0x80); //-128 3154 au = _mm_sub_epi8(a, c128); //add 128 3155 bu = _mm_sub_epi8(b, c128); //add 128 3156 sum = _mm_avg_epu8(au, bu); 3157 return _mm_add_epi8 (sum, c128); //sub 128 3158 } 3159 3160 _NEON2SSESTORAGE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 3161 _NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0 3162 { 3163 //no signed average in x86 SIMD, go to unsigned 3164 __m128i cx8000, au, bu, sum; 3165 cx8000 = _mm_set1_epi16((int16_t)0x8000); // - 32768 3166 au = _mm_sub_epi16(a, cx8000); //add 32768 3167 bu = _mm_sub_epi16(b, cx8000); //add 32768 3168 sum = _mm_avg_epu16(au, bu); 3169 return _mm_add_epi16 (sum, cx8000); //sub 32768 3170 } 3171 3172 _NEON2SSESTORAGE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 3173 _NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b) 3174 { 3175 //need to avoid overflow 3176 __m128i a2, b2, res, sum; 3177 a2 = _mm_srai_epi32(a,1); //a2=a/2; 3178 b2 = _mm_srai_epi32(b,1); // b2=b/2; 3179 res = _mm_or_si128(a,b); //for rounding 3180 res = _mm_slli_epi32 (res,31); //shift left then back right to 3181 res = _mm_srli_epi32 (res,31); //get 1 or zero 3182 sum = _mm_add_epi32(a2,b2); 3183 return _mm_add_epi32(sum,res); 3184 } 3185 3186 _NEON2SSESTORAGE uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 3187 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded 
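//Added note (not from the original header): the signed vrhaddq_s8/s16 above reuse _mm_avg_epu8/_mm_avg_epu16 by biasing both
//operands into the unsigned range: per lane (a + b + 1) >> 1 == (((a + 128) + (b + 128) + 1) >> 1) - 128 (32768 for 16-bit lanes),
//and modulo the lane width adding the bias equals subtracting it, which is what the sub/add with 0x80/0x8000 in the code implements.
//The unsigned q-forms map straight to the SSE2 averages because pavgb/pavgw already round exactly like VRHADD.
//Scalar reference of the per-lane semantics (hypothetical helper, illustration only):
_NEON2SSE_INLINE int8_t _neon2sse_ref_vrhadd_s8_lane(int8_t a, int8_t b)
{
    return (int8_t)(((int16_t)a + (int16_t)b + 1) >> 1); //rounding halving add as defined for VRHADD.S8
}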
3188 3189 _NEON2SSESTORAGE uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0 3190 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded 3191 3192 3193 _NEON2SSESTORAGE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 3194 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0 3195 { 3196 //need to avoid overflow 3197 __m128i a2, b2, res, sum; 3198 a2 = _mm_srli_epi32(a,1); //a2=a/2; 3199 b2 = _mm_srli_epi32(b,1); // b2=b/2; 3200 res = _mm_or_si128(a,b); //for rounding 3201 res = _mm_slli_epi32 (res,31); //shift left then back right to 3202 res = _mm_srli_epi32 (res,31); //get 1 or zero 3203 sum = _mm_add_epi32(a2,b2); 3204 return _mm_add_epi32(sum,res); 3205 } 3206 3207 //****************** VQADD: Vector saturating add ************************ 3208 //************************************************************************ 3209 _NEON2SSESTORAGE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b); // VQADD.S8 d0,d0,d0 3210 _NEON2SSE_INLINE int8x8_t vqadd_s8(int8x8_t a, int8x8_t b) 3211 { 3212 int8x8_t res64; 3213 return64(_mm_adds_epi8(_pM128i(a),_pM128i(b))); 3214 } 3215 3216 3217 _NEON2SSESTORAGE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b); // VQADD.S16 d0,d0,d0 3218 _NEON2SSE_INLINE int16x4_t vqadd_s16(int16x4_t a, int16x4_t b) 3219 { 3220 int16x4_t res64; 3221 return64(_mm_adds_epi16(_pM128i(a),_pM128i(b))); 3222 } 3223 3224 3225 _NEON2SSESTORAGE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b); // VQADD.S32 d0,d0,d0 3226 _NEON2SSE_INLINE int32x2_t vqadd_s32(int32x2_t a, int32x2_t b) 3227 { 3228 int32x2_t res64; 3229 return64(vqaddq_s32(_pM128i(a), _pM128i(b))); 3230 } 3231 3232 3233 _NEON2SSESTORAGE int64x1_t vqadd_s64(int64x1_t a, int64x1_t b); // VQADD.S64 d0,d0,d0 3234 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqadd_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3235 { 3236 int64x1_t res; 3237 uint64_t a64, b64; 3238 a64 = a.m64_u64[0]; 3239 b64 = b.m64_u64[0]; 3240 res.m64_u64[0] = a64 + b64; 3241 a64 = (a64 >> 63) + (~_SIGNBIT64); 3242 if ((int64_t)((b64 ^ a64) | ~(res.m64_u64[0] ^ b64))>=0) { 3243 res.m64_u64[0] = a64; 3244 } 3245 return res; 3246 } 3247 3248 _NEON2SSESTORAGE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b); // VQADD.U8 d0,d0,d0 3249 _NEON2SSE_INLINE uint8x8_t vqadd_u8(uint8x8_t a, uint8x8_t b) 3250 { 3251 uint8x8_t res64; 3252 return64(_mm_adds_epu8(_pM128i(a),_pM128i(b))); 3253 } 3254 3255 3256 _NEON2SSESTORAGE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b); // VQADD.s16 d0,d0,d0 3257 _NEON2SSE_INLINE uint16x4_t vqadd_u16(uint16x4_t a, uint16x4_t b) 3258 { 3259 uint16x4_t res64; 3260 return64(_mm_adds_epu16(_pM128i(a),_pM128i(b))); 3261 } 3262 3263 3264 _NEON2SSESTORAGE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b); // VQADD.U32 d0,d0,d0 3265 _NEON2SSE_INLINE uint32x2_t vqadd_u32(uint32x2_t a, uint32x2_t b) 3266 { 3267 uint32x2_t res64; 3268 return64(vqaddq_u32(_pM128i(a), _pM128i(b))); 3269 } 3270 3271 3272 _NEON2SSESTORAGE uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b); // VQADD.U64 d0,d0,d0 3273 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqadd_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3274 { 3275 _NEON2SSE_ALIGN_16 uint64_t a64, b64; 3276 uint64x1_t res; 3277 a64 = a.m64_u64[0]; 3278 b64 = b.m64_u64[0]; 3279 res.m64_u64[0] = a64 + b64; 3280 if (res.m64_u64[0] < a64) { 3281 res.m64_u64[0] = ~(uint64_t)0; 3282 } 3283 return res; 3284 } 3285 3286 _NEON2SSESTORAGE int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // 
VQADD.S8 q0,q0,q0
#define vqaddq_s8 _mm_adds_epi8

_NEON2SSESTORAGE int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0
#define vqaddq_s16 _mm_adds_epi16

_NEON2SSESTORAGE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b)
{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow happens only if a and b have the same sign and the sum has the opposite sign
    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_;
    c7fffffff = _mm_set1_epi32(0x7fffffff);
    res = _mm_add_epi32(a, b);
    res_sat = _mm_srli_epi32(a, 31);
    res_sat = _mm_add_epi32(res_sat, c7fffffff);
    res_xor_a = _mm_xor_si128(res, a);
    b_xor_a_ = _mm_xor_si128(b, a);
    res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: lanes become all ones if overflow occurred, all zeros otherwise
    res_sat = _mm_and_si128(res_xor_a, res_sat);
    res = _mm_andnot_si128(res_xor_a, res);
    return _mm_or_si128(res, res_sat);
}

_NEON2SSESTORAGE int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL)
{
    _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2];
    _mm_store_si128((__m128i*)atmp, a);
    _mm_store_si128((__m128i*)btmp, b);
    res[0] = atmp[0] + btmp[0];
    res[1] = atmp[1] + btmp[1];

    atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64);
    atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64);

    if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) {
        res[0] = atmp[0];
    }
    if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) {
        res[1] = atmp[1];
    }
    return _mm_load_si128((__m128i*)res);
}

_NEON2SSESTORAGE uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0
#define vqaddq_u8 _mm_adds_epu8

_NEON2SSESTORAGE uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0
#define vqaddq_u16 _mm_adds_epu16

_NEON2SSESTORAGE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b)
{
    __m128i c80000000, cmp, subsum, suba, sum;
    c80000000 = _mm_set1_epi32 (0x80000000);
    sum = _mm_add_epi32 (a, b);
    subsum = _mm_sub_epi32 (sum, c80000000);
    suba = _mm_sub_epi32 (a, c80000000);
    cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed
    return _mm_or_si128 (sum, cmp); //saturation
}

_NEON2SSESTORAGE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b)
{
    __m128i c80000000, sum, cmp, suba, subsum;
    c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
    sum = _mm_add_epi64 (a, b);
    subsum = _mm_sub_epi64 (sum, c80000000);
    suba = _mm_sub_epi64 (a, c80000000);
    cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!!
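    //note: subtracting 0x8000000000000000 from both values maps unsigned order onto signed order, so the
    //signed 64-bit compare above effectively tests a > sum (unsigned), i.e. detects the wrap-around;
    //cmp is then all ones exactly in the lanes that must saturate to the maximum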
3359 return _mm_or_si128 (sum, cmp); //saturation 3360 } 3361 #else 3362 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3363 { 3364 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; 3365 _mm_store_si128((__m128i*)atmp, a); 3366 _mm_store_si128((__m128i*)btmp, b); 3367 res[0] = atmp[0] + btmp[0]; 3368 res[1] = atmp[1] + btmp[1]; 3369 if (res[0] < atmp[0]) res[0] = ~(uint64_t)0; 3370 if (res[1] < atmp[1]) res[1] = ~(uint64_t)0; 3371 return _mm_load_si128((__m128i*)(res)); 3372 } 3373 #endif 3374 3375 3376 //******************* Vector add high half (truncated) ****************** 3377 //************************************************************************ 3378 _NEON2SSESTORAGE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b); // VADDHN.I16 d0,q0,q0 3379 _NEON2SSE_INLINE int8x8_t vaddhn_s16(int16x8_t a, int16x8_t b) // VADDHN.I16 d0,q0,q0 3380 { 3381 int8x8_t res64; 3382 __m128i sum; 3383 sum = _mm_add_epi16 (a, b); 3384 sum = _mm_srai_epi16 (sum, 8); 3385 sum = _mm_packs_epi16 (sum, sum); //use 64 low bits only 3386 return64(sum); 3387 } 3388 3389 _NEON2SSESTORAGE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b); // VADDHN.I32 d0,q0,q0 3390 _NEON2SSE_INLINE int16x4_t vaddhn_s32(int32x4_t a, int32x4_t b) // VADDHN.I32 d0,q0,q0 3391 { 3392 int16x4_t res64; 3393 __m128i sum; 3394 sum = _mm_add_epi32 (a, b); 3395 sum = _mm_srai_epi32(sum, 16); 3396 sum = _mm_packs_epi32 (sum, sum); //use 64 low bits only 3397 return64(sum); 3398 } 3399 3400 _NEON2SSESTORAGE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b); // VADDHN.I64 d0,q0,q0 3401 _NEON2SSE_INLINE int32x2_t vaddhn_s64(int64x2_t a, int64x2_t b) 3402 { 3403 int32x2_t res64; 3404 __m128i sum; 3405 sum = _mm_add_epi64 (a, b); 3406 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (0 << 4) | (2 << 6)); 3407 return64(sum); 3408 } 3409 3410 _NEON2SSESTORAGE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b); // VADDHN.I16 d0,q0,q0 3411 _NEON2SSE_INLINE uint8x8_t vaddhn_u16(uint16x8_t a, uint16x8_t b) // VADDHN.I16 d0,q0,q0 3412 { 3413 uint8x8_t res64; 3414 __m128i sum; 3415 sum = _mm_add_epi16 (a, b); 3416 sum = _mm_srli_epi16 (sum, 8); 3417 sum = _mm_packus_epi16 (sum,sum); //use 64 low bits only 3418 return64(sum); 3419 } 3420 3421 _NEON2SSESTORAGE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b); // VADDHN.I32 d0,q0,q0 3422 _NEON2SSE_INLINE uint16x4_t vaddhn_u32(uint32x4_t a, uint32x4_t b) // VADDHN.I32 d0,q0,q0 3423 { 3424 uint16x4_t res64; 3425 __m128i sum; 3426 sum = _mm_add_epi32 (a, b); 3427 sum = _mm_srli_epi32 (sum, 16); 3428 #ifdef USE_SSE4 3429 sum = _MM_PACKUS1_EPI32 (sum); //use 64 low bits only 3430 #else 3431 sum = _mm_shuffle_epi8 (sum, *(__m128i*) mask8_32_even_odd); //go to 16 bits 3432 #endif 3433 return64(sum); 3434 } 3435 3436 _NEON2SSESTORAGE uint32x2_t vaddhn_u64(uint64x2_t a, uint64x2_t b); // VADDHN.I64 d0,q0,q0 3437 #define vaddhn_u64 vaddhn_s64 3438 3439 //*********** Vector rounding add high half: vraddhn_<type> ******************. 
3440 //*************************************************************************** 3441 _NEON2SSESTORAGE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b); // VRADDHN.I16 d0,q0,q0 3442 _NEON2SSE_INLINE int8x8_t vraddhn_s16(int16x8_t a, int16x8_t b) // VRADDHN.I16 d0,q0,q0 3443 { 3444 int8x8_t res64; 3445 __m128i sum, mask1; 3446 sum = _mm_add_epi16 (a, b); 3447 mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to 3448 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero 3449 sum = _mm_srai_epi16 (sum, 8); //get high half 3450 sum = _mm_add_epi16 (sum, mask1); //actual rounding 3451 sum = _mm_packs_epi16 (sum, sum); 3452 return64(sum); 3453 } 3454 3455 _NEON2SSESTORAGE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b); // VRADDHN.I32 d0,q0,q0 3456 _NEON2SSE_INLINE int16x4_t vraddhn_s32(int32x4_t a, int32x4_t b) // VRADDHN.I32 d0,q0,q0 3457 { 3458 //SIMD may be not optimal, serial may be faster 3459 int16x4_t res64; 3460 __m128i sum, mask1; 3461 sum = _mm_add_epi32 (a, b); 3462 mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to 3463 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero 3464 sum = _mm_srai_epi32 (sum, 16); //get high half 3465 sum = _mm_add_epi32 (sum, mask1); //actual rounding 3466 sum = _mm_packs_epi32 (sum, sum); 3467 return64(sum); 3468 } 3469 3470 _NEON2SSESTORAGE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b); // VRADDHN.I64 d0,q0,q0 3471 _NEON2SSE_INLINE int32x2_t vraddhn_s64(int64x2_t a, int64x2_t b) 3472 { 3473 //SIMD may be not optimal, serial may be faster 3474 int32x2_t res64; 3475 __m128i sum, mask1; 3476 sum = _mm_add_epi64 (a, b); 3477 mask1 = _mm_slli_epi64(sum, 33); //shift left then back right to 3478 mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero 3479 sum = _mm_add_epi64 (sum, mask1); //actual high half rounding 3480 sum = _mm_shuffle_epi32(sum, 1 | (3 << 2) | (1 << 4) | (3 << 6)); 3481 return64(sum); 3482 } 3483 3484 _NEON2SSESTORAGE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b); // VRADDHN.I16 d0,q0,q0 3485 _NEON2SSE_INLINE uint8x8_t vraddhn_u16(uint16x8_t a, uint16x8_t b) // VRADDHN.I16 d0,q0,q0 3486 { 3487 uint8x8_t res64; 3488 __m128i sum, mask1; 3489 sum = _mm_add_epi16 (a, b); 3490 mask1 = _mm_slli_epi16(sum, 9); //shift left then back right to 3491 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero 3492 sum = _mm_srai_epi16 (sum, 8); //get high half 3493 sum = _mm_add_epi16 (sum, mask1); //actual rounding 3494 sum = _mm_packus_epi16 (sum, sum); 3495 return64(sum); 3496 } 3497 3498 _NEON2SSESTORAGE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b); // VRADDHN.I32 d0,q0,q0 3499 _NEON2SSE_INLINE uint16x4_t vraddhn_u32(uint32x4_t a, uint32x4_t b) 3500 { 3501 //SIMD may be not optimal, serial may be faster 3502 uint16x4_t res64; 3503 __m128i sum, mask1; 3504 sum = _mm_add_epi32 (a, b); 3505 mask1 = _mm_slli_epi32(sum, 17); //shift left then back right to 3506 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero 3507 sum = _mm_srai_epi32 (sum, 16); //get high half 3508 sum = _mm_add_epi32 (sum, mask1); //actual rounding 3509 sum = _MM_PACKUS1_EPI32 (sum); 3510 return64(sum); 3511 } 3512 3513 _NEON2SSESTORAGE uint32x2_t vraddhn_u64(uint64x2_t a, uint64x2_t b); // VRADDHN.I64 d0,q0,q0 3514 #define vraddhn_u64 vraddhn_s64 3515 3516 //********************************************************************************** 3517 //********* Multiplication ************************************* 3518 //************************************************************************************** 3519 3520 
//Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] 3521 //As we don't go to wider result functions are equal to "multiply low" in x86 3522 _NEON2SSESTORAGE int8x8_t vmul_s8(int8x8_t a, int8x8_t b); // VMUL.I8 d0,d0,d0 3523 _NEON2SSE_INLINE int8x8_t vmul_s8(int8x8_t a, int8x8_t b) // VMUL.I8 d0,d0,d0 3524 { 3525 // no 8 bit simd multiply, need to go to 16 bits in SSE 3526 int8x8_t res64; 3527 __m128i a128, b128, res; 3528 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits 3529 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits 3530 res = _mm_mullo_epi16 (a128, b128); 3531 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit from 16, use 64 low bits only 3532 return64(res); 3533 } 3534 3535 _NEON2SSESTORAGE int16x4_t vmul_s16(int16x4_t a, int16x4_t b); // VMUL.I16 d0,d0,d0 3536 #define vmul_s16 vmul_u16 3537 3538 _NEON2SSESTORAGE int32x2_t vmul_s32(int32x2_t a, int32x2_t b); // VMUL.I32 d0,d0,d0 3539 #define vmul_s32 vmul_u32 3540 3541 _NEON2SSESTORAGE float32x2_t vmul_f32(float32x2_t a, float32x2_t b); // VMUL.F32 d0,d0,d0 3542 _NEON2SSE_INLINE float32x2_t vmul_f32(float32x2_t a, float32x2_t b) 3543 { 3544 float32x4_t tmp; 3545 __m64_128 res64; 3546 tmp = _mm_mul_ps(_pM128(a),_pM128(b)); 3547 _M64f(res64, tmp); //use low 64 bits 3548 return res64; 3549 } 3550 3551 _NEON2SSESTORAGE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b); // VMUL.I8 d0,d0,d0 3552 _NEON2SSE_INLINE uint8x8_t vmul_u8(uint8x8_t a, uint8x8_t b) // VMUL.I8 d0,d0,d0 3553 { 3554 // no 8 bit simd multiply, need to go to 16 bits in SSE 3555 uint8x8_t res64; 3556 __m128i mask, a128, b128, res; 3557 mask = _mm_set1_epi16(0xff); 3558 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); 3559 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); 3560 res = _mm_mullo_epi16 (a128, b128); 3561 res = _mm_and_si128(res, mask); //to avoid saturation 3562 res = _mm_packus_epi16 (res,res); //use only low 64 bits 3563 return64(res); 3564 } 3565 3566 _NEON2SSESTORAGE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b); // VMUL.I16 d0,d0,d0 3567 _NEON2SSE_INLINE uint16x4_t vmul_u16(uint16x4_t a, uint16x4_t b) 3568 { 3569 uint16x4_t res64; 3570 return64(_mm_mullo_epi16(_pM128i(a),_pM128i(b))); 3571 } 3572 3573 _NEON2SSESTORAGE uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b); // VMUL.I32 d0,d0,d0 3574 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint32x2_t vmul_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3575 { 3576 uint32x2_t res; 3577 res.m64_u32[0] = a.m64_u32[0] * b.m64_u32[0]; 3578 res.m64_u32[1] = a.m64_u32[1] * b.m64_u32[1]; 3579 return res; 3580 } 3581 3582 _NEON2SSESTORAGE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b); // VMUL.P8 d0,d0,d0 3583 _NEON2SSE_INLINE poly8x8_t vmul_p8(poly8x8_t a, poly8x8_t b) 3584 { 3585 //may be optimized 3586 poly8x8_t res64; 3587 __m128i a64, b64, c1, res, tmp, bmasked; 3588 int i; 3589 a64 = _pM128i(a); 3590 b64 = _pM128i(b); 3591 c1 = _mm_cmpeq_epi8 (a64,a64); //all ones 0xff.... 
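    //the loop below is a carry-less (GF(2) polynomial) multiply: for each bit i of b, the partial product
    //a << i (modulo 256, obtained by the 8-bit multiply with the isolated bit value 2^i) is accumulated
    //with XOR instead of ADD, which is what VMUL.P8 requires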
3592 c1 = vshrq_n_u8(c1,7); //0x1 3593 bmasked = _mm_and_si128(b64, c1); //0x1 3594 res = vmulq_u8(a64, bmasked); 3595 for(i = 1; i<8; i++) { 3596 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here 3597 bmasked = _mm_and_si128(b64, c1); //0x1 3598 tmp = vmulq_u8(a64, bmasked); 3599 res = _mm_xor_si128(res, tmp); 3600 } 3601 return64 (res); 3602 } 3603 3604 _NEON2SSESTORAGE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 3605 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0 3606 { 3607 // no 8 bit simd multiply, need to go to 16 bits 3608 //solution may be not optimal 3609 __m128i a16, b16, r16_1, r16_2; 3610 a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 3611 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 3612 r16_1 = _mm_mullo_epi16 (a16, b16); 3613 //swap hi and low part of a and b to process the remaining data 3614 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3615 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 3616 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 3617 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2 3618 3619 r16_2 = _mm_mullo_epi16 (a16, b16); 3620 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit 3621 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit 3622 3623 return _mm_unpacklo_epi64(r16_1, r16_2); 3624 } 3625 3626 _NEON2SSESTORAGE int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 3627 #define vmulq_s16 _mm_mullo_epi16 3628 3629 _NEON2SSESTORAGE int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 3630 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1 3631 3632 _NEON2SSESTORAGE float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 3633 #define vmulq_f32 _mm_mul_ps 3634 3635 _NEON2SSESTORAGE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 3636 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0 3637 { 3638 // no 8 bit simd multiply, need to go to 16 bits 3639 //solution may be not optimal 3640 __m128i maskff, a16, b16, r16_1, r16_2; 3641 maskff = _mm_set1_epi16(0xff); 3642 a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1 3643 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 3644 r16_1 = _mm_mullo_epi16 (a16, b16); 3645 r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation 3646 //swap hi and low part of a and b to process the remaining data 3647 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3648 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 3649 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 3650 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 3651 3652 r16_2 = _mm_mullo_epi16 (a16, b16); 3653 r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation 3654 return _mm_packus_epi16 (r16_1, r16_2); 3655 } 3656 3657 _NEON2SSESTORAGE uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 3658 #define vmulq_u16 _mm_mullo_epi16 3659 3660 _NEON2SSESTORAGE uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 3661 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1 3662 3663 _NEON2SSESTORAGE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 3664 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b) 3665 { 3666 //may be optimized 3667 __m128i c1, res, tmp, bmasked; 3668 int i; 3669 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 
3670 c1 = vshrq_n_u8(c1,7); //0x1 3671 bmasked = _mm_and_si128(b, c1); //0x1 3672 res = vmulq_u8(a, bmasked); 3673 for(i = 1; i<8; i++) { 3674 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here 3675 bmasked = _mm_and_si128(b, c1); //0x1 3676 tmp = vmulq_u8(a, bmasked); 3677 res = _mm_xor_si128(res, tmp); 3678 } 3679 return res; 3680 } 3681 3682 //************************* Vector long multiply *********************************** 3683 //**************************************************************************** 3684 _NEON2SSESTORAGE int16x8_t vmull_s8(int8x8_t a, int8x8_t b); // VMULL.S8 q0,d0,d0 3685 _NEON2SSE_INLINE int16x8_t vmull_s8(int8x8_t a, int8x8_t b) // VMULL.S8 q0,d0,d0 3686 { 3687 //no 8 bit simd multiply, need to go to 16 bits 3688 __m128i a16, b16; 3689 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 3690 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 3691 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit 3692 } 3693 3694 _NEON2SSESTORAGE int32x4_t vmull_s16(int16x4_t a, int16x4_t b); // VMULL.S16 q0,d0,d0 3695 _NEON2SSE_INLINE int32x4_t vmull_s16(int16x4_t a, int16x4_t b) // VMULL.S16 q0,d0,d0 3696 { 3697 #ifdef USE_SSE4 3698 __m128i a16, b16; 3699 a16 = _MM_CVTEPI16_EPI32 (_pM128i(a)); // SSE 4.1 3700 b16 = _MM_CVTEPI16_EPI32 (_pM128i(b)); // SSE 4.1 3701 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 3702 #else 3703 __m128i low, hi, a128,b128; 3704 a128 = _pM128i(a); 3705 b128 = _pM128i(b); 3706 low = _mm_mullo_epi16(a128,b128); 3707 hi = _mm_mulhi_epi16(a128,b128); 3708 return _mm_unpacklo_epi16(low,hi); 3709 #endif 3710 } 3711 3712 _NEON2SSESTORAGE int64x2_t vmull_s32(int32x2_t a, int32x2_t b); // VMULL.S32 q0,d0,d0 3713 _NEON2SSE_INLINE int64x2_t vmull_s32(int32x2_t a, int32x2_t b) // VMULL.S32 q0,d0,d0 3714 { 3715 __m128i ab, ba, a128, b128; 3716 a128 = _pM128i(a); 3717 b128 = _pM128i(b); 3718 ab = _mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1 3719 ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1 3720 return _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 3721 } 3722 3723 _NEON2SSESTORAGE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b); // VMULL.U8 q0,d0,d0 3724 _NEON2SSE_INLINE uint16x8_t vmull_u8(uint8x8_t a, uint8x8_t b) // VMULL.U8 q0,d0,d0 3725 { 3726 //no 8 bit simd multiply, need to go to 16 bits 3727 __m128i a16, b16; 3728 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 3729 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 3730 return _mm_mullo_epi16 (a16, b16); //should fit into 16 bit 3731 } 3732 3733 _NEON2SSESTORAGE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b); // VMULL.s16 q0,d0,d0 3734 _NEON2SSE_INLINE uint32x4_t vmull_u16(uint16x4_t a, uint16x4_t b) // VMULL.s16 q0,d0,d0 3735 { 3736 #ifdef USE_SSE4 3737 __m128i a16, b16; 3738 a16 = _MM_CVTEPU16_EPI32 (_pM128i(a)); // SSE 4.1 3739 b16 = _MM_CVTEPU16_EPI32 (_pM128i(b)); // SSE 4.1 3740 return _MM_MULLO_EPI32 (a16, b16); // SSE 4.1 3741 #else 3742 __m128i a128,b128,low, hi; 3743 a128 = _pM128i(a); 3744 b128 = _pM128i(b); 3745 low = _mm_mullo_epi16(a128,b128); 3746 hi = _mm_mulhi_epu16(a128,b128); 3747 return _mm_unpacklo_epi16(low,hi); 3748 #endif 3749 } 3750 3751 _NEON2SSESTORAGE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b); // VMULL.U32 q0,d0,d0 3752 _NEON2SSE_INLINE uint64x2_t vmull_u32(uint32x2_t a, uint32x2_t b) // VMULL.U32 q0,d0,d0 3753 { 3754 ///may be not optimal compared with serial implementation 3755 __m128i ab, ba, a128, b128; 3756 a128 = _pM128i(a); 3757 b128 = _pM128i(b); 3758 ab = 
_mm_unpacklo_epi32 (a128, b128); //a0, b0, a1,b1
    ba = _mm_unpacklo_epi32 (b128, a128); //b0, a0, b1,a1
    return _mm_mul_epu32 (ab, ba); //uses the 1st and 3rd data lanes, the multiplication gives a 64 bit result
}

_NEON2SSESTORAGE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b); // VMULL.P8 q0,d0,d0
_NEON2SSE_INLINE poly16x8_t vmull_p8(poly8x8_t a, poly8x8_t b)
{
    //may be optimized
    __m128i a128,b128, c1, a128_16, bmasked_16, res, tmp, bmasked;
    int i;
    a128 = _pM128i(a);
    b128 = _pM128i(b);
    c1 = _mm_cmpeq_epi8 (a128,a128); //all ones 0xff....
    c1 = vshrq_n_u8(c1,7); //0x1
    bmasked = _mm_and_si128(b128, c1); //0x1

    a128_16 = _MM_CVTEPU8_EPI16 (a128); // SSE 4.1
    bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
    res = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit
    for(i = 1; i<8; i++) {
        c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here
        bmasked = _mm_and_si128(b128, c1); //0x1
        bmasked_16 = _MM_CVTEPU8_EPI16 (bmasked); // SSE 4.1
        tmp = _mm_mullo_epi16 (a128_16, bmasked_16); //should fit into 16 bit, vmull_u8(a, bmasked);
        res = _mm_xor_si128(res, tmp);
    }
    return res;
}

//****************Vector saturating doubling long multiply **************************
//*****************************************************************
_NEON2SSESTORAGE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b); // VQDMULL.S16 q0,d0,d0
_NEON2SSE_INLINE int32x4_t vqdmull_s16(int16x4_t a, int16x4_t b)
{
    //the serial solution may be faster due to saturation
    __m128i res;
    res = vmull_s16(a, b);
    return vqd_s32(res);
}

_NEON2SSESTORAGE int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b); // VQDMULL.S32 q0,d0,d0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL)
{
    //the serial solution may be faster due to saturation
    __m128i res;
    res = vmull_s32(a,b);
    return vqaddq_s64(res,res); //slow serial function!!!!
3806 } 3807 3808 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************ 3809 //****************************************************************************************** 3810 _NEON2SSESTORAGE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLA.I8 d0,d0,d0 3811 _NEON2SSE_INLINE int8x8_t vmla_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLA.I8 d0,d0,d0 3812 { 3813 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits 3814 int8x8_t res64; 3815 __m128i b128, c128, res; 3816 b128 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits 3817 c128 = _MM_CVTEPI8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits 3818 res = _mm_mullo_epi16 (c128, b128); 3819 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); 3820 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits 3821 return64(res); 3822 } 3823 3824 _NEON2SSESTORAGE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLA.I16 d0,d0,d0 3825 _NEON2SSE_INLINE int16x4_t vmla_s16(int16x4_t a, int16x4_t b, int16x4_t c) 3826 { 3827 int16x4_t res64; 3828 return64(vmlaq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); 3829 } 3830 3831 3832 _NEON2SSESTORAGE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLA.I32 d0,d0,d0 3833 _NEON2SSE_INLINE int32x2_t vmla_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLA.I32 d0,d0,d0 3834 { 3835 int32x2_t res64; 3836 __m128i res; 3837 res = _MM_MULLO_EPI32 (_pM128i(b), _pM128i(c)); //SSE4.1 3838 res = _mm_add_epi32 (res, _pM128i(a)); //use the low 64 bits 3839 return64(res); 3840 } 3841 3842 _NEON2SSESTORAGE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLA.F32 d0,d0,d0 3843 _NEON2SSE_INLINE float32x2_t vmla_f32(float32x2_t a, float32x2_t b, float32x2_t c) 3844 { 3845 //fma is coming soon, but right now: 3846 __m128 res; 3847 __m64_128 res64; 3848 res = _mm_mul_ps (_pM128(c), _pM128(b)); 3849 res = _mm_add_ps (_pM128(a), res); 3850 _M64f(res64, res); 3851 return res64; 3852 } 3853 3854 _NEON2SSESTORAGE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLA.I8 d0,d0,d0 3855 _NEON2SSE_INLINE uint8x8_t vmla_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) // VMLA.I8 d0,d0,d0 3856 { 3857 // no 8 bit x86 simd multiply, need to go to 16 bits, and use the low 64 bits 3858 uint8x8_t res64; 3859 __m128i mask, b128, c128, res; 3860 mask = _mm_set1_epi16(0xff); 3861 b128 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 use low 64 bits 3862 c128 = _MM_CVTEPU8_EPI16 (_pM128i(c)); // SSE 4.1 use low 64 bits 3863 res = _mm_mullo_epi16 (c128, b128); 3864 res = _mm_and_si128(res, mask); //to avoid saturation 3865 res = _mm_packus_epi16 (res, res); 3866 res = _mm_add_epi8 (res, _pM128i(a)); //use the low 64 bits 3867 return64(res); 3868 } 3869 3870 _NEON2SSESTORAGE uint16x4_t vmla_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLA.I16 d0,d0,d0 3871 #define vmla_u16 vmla_s16 3872 3873 _NEON2SSESTORAGE uint32x2_t vmla_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLA.I32 d0,d0,d0 3874 #define vmla_u32 vmla_s32 3875 3876 _NEON2SSESTORAGE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 3877 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0 3878 { 3879 //solution may be not optimal 3880 // no 8 bit simd multiply, need to go to 16 bits 3881 __m128i b16, c16, r16_1, a_2,r16_2; 3882 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 3883 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 3884 r16_1 = _mm_mullo_epi16 (b16, c16); 3885 
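    //the 16-bit products are narrowed back to 8 bits below by gathering the low byte of each product
    //(mask8_16_even_odd), which matches NEON's modulo (non-saturating) 8-bit multiply-accumulate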
r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits 3886 r16_1 = _mm_add_epi8 (r16_1, a); 3887 //swap hi and low part of a, b and c to process the remaining data 3888 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3889 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 3890 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 3891 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 3892 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 3893 3894 r16_2 = _mm_mullo_epi16 (b16, c16); 3895 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 3896 r16_2 = _mm_add_epi8(r16_2, a_2); 3897 return _mm_unpacklo_epi64(r16_1,r16_2); 3898 } 3899 3900 _NEON2SSESTORAGE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 3901 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0 3902 { 3903 __m128i res; 3904 res = _mm_mullo_epi16 (c, b); 3905 return _mm_add_epi16 (res, a); 3906 } 3907 3908 _NEON2SSESTORAGE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 3909 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0 3910 { 3911 __m128i res; 3912 res = _MM_MULLO_EPI32 (c, b); //SSE4.1 3913 return _mm_add_epi32 (res, a); 3914 } 3915 3916 _NEON2SSESTORAGE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 3917 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0 3918 { 3919 //fma is coming soon, but right now: 3920 __m128 res; 3921 res = _mm_mul_ps (c, b); 3922 return _mm_add_ps (a, res); 3923 } 3924 3925 _NEON2SSESTORAGE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 3926 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0 3927 { 3928 //solution may be not optimal 3929 // no 8 bit simd multiply, need to go to 16 bits 3930 __m128i b16, c16, r16_1, a_2, r16_2; 3931 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 3932 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 3933 r16_1 = _mm_mullo_epi16 (b16, c16); 3934 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits 3935 r16_1 = _mm_add_epi8 (r16_1, a); 3936 //swap hi and low part of a, b and c to process the remaining data 3937 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3938 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 3939 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 3940 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 3941 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 3942 3943 r16_2 = _mm_mullo_epi16 (b16, c16); 3944 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 3945 r16_2 = _mm_add_epi8(r16_2, a_2); 3946 return _mm_unpacklo_epi64(r16_1,r16_2); 3947 } 3948 3949 _NEON2SSESTORAGE uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 3950 #define vmlaq_u16 vmlaq_s16 3951 3952 _NEON2SSESTORAGE uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 3953 #define vmlaq_u32 vmlaq_s32 3954 3955 //********************** Vector widening multiply accumulate (long multiply accumulate): 3956 // vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************** 3957 //******************************************************************************************** 3958 _NEON2SSESTORAGE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLAL.S8 q0,d0,d0 3959 _NEON2SSE_INLINE int16x8_t vmlal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLAL.S8 q0,d0,d0 3960 { 3961 int16x8_t res; 
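    //widening multiply-accumulate: the 8x8 bit products always fit in 16 bits and VMLAL does not saturate,
    //so a plain _mm_add_epi16 of the accumulator reproduces the NEON result exactly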
3962 res = vmull_s8(b, c); 3963 return _mm_add_epi16 (res, a); 3964 } 3965 3966 _NEON2SSESTORAGE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLAL.S16 q0,d0,d0 3967 _NEON2SSE_INLINE int32x4_t vmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLAL.S16 q0,d0,d0 3968 { 3969 //may be not optimal compared with serial implementation 3970 int32x4_t res; 3971 res = vmull_s16(b, c); 3972 return _mm_add_epi32 (res, a); 3973 } 3974 3975 _NEON2SSESTORAGE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLAL.S32 q0,d0,d0 3976 _NEON2SSE_INLINE int64x2_t vmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLAL.S32 q0,d0,d0 3977 { 3978 //may be not optimal compared with serial implementation 3979 int64x2_t res; 3980 res = vmull_s32( b, c); 3981 return _mm_add_epi64 (res, a); 3982 } 3983 3984 _NEON2SSESTORAGE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLAL.U8 q0,d0,d0 3985 _NEON2SSE_INLINE uint16x8_t vmlal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLAL.U8 q0,d0,d0 3986 { 3987 uint16x8_t res; 3988 res = vmull_u8(b, c); 3989 return _mm_add_epi16 (res, a); 3990 } 3991 3992 _NEON2SSESTORAGE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLAL.s16 q0,d0,d0 3993 _NEON2SSE_INLINE uint32x4_t vmlal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLAL.s16 q0,d0,d0 3994 { 3995 //may be not optimal compared with serial implementation 3996 uint32x4_t res; 3997 res = vmull_u16(b, c); 3998 return _mm_add_epi32 (res, a); 3999 } 4000 4001 _NEON2SSESTORAGE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLAL.U32 q0,d0,d0 4002 _NEON2SSE_INLINE uint64x2_t vmlal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLAL.U32 q0,d0,d0 4003 { 4004 //may be not optimal compared with serial implementation 4005 int64x2_t res; 4006 res = vmull_u32( b,c); 4007 return _mm_add_epi64 (res, a); 4008 } 4009 4010 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] *************************************** 4011 //******************************************************************************************** 4012 _NEON2SSESTORAGE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VMLS.I8 d0,d0,d0 4013 _NEON2SSE_INLINE int8x8_t vmls_s8(int8x8_t a, int8x8_t b, int8x8_t c) // VMLS.I8 d0,d0,d0 4014 { 4015 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits 4016 int8x8_t res64; 4017 __m128i res; 4018 res64 = vmul_s8(b,c); 4019 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); 4020 return64(res); 4021 } 4022 4023 _NEON2SSESTORAGE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VMLS.I16 d0,d0,d0 4024 _NEON2SSE_INLINE int16x4_t vmls_s16(int16x4_t a, int16x4_t b, int16x4_t c) 4025 { 4026 int16x4_t res64; 4027 return64(vmlsq_s16(_pM128i(a),_pM128i(b), _pM128i(c))); 4028 } 4029 4030 4031 _NEON2SSESTORAGE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VMLS.I32 d0,d0,d0 4032 _NEON2SSE_INLINE int32x2_t vmls_s32(int32x2_t a, int32x2_t b, int32x2_t c) // VMLS.I32 d0,d0,d0 4033 { 4034 int32x2_t res64; 4035 __m128i res; 4036 res = _MM_MULLO_EPI32 (_pM128i(c),_pM128i( b)); //SSE4.1 4037 res = _mm_sub_epi32 (_pM128i(a),res); //use low 64 bits only 4038 return64(res); 4039 } 4040 4041 _NEON2SSESTORAGE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c); // VMLS.F32 d0,d0,d0 4042 _NEON2SSE_INLINE float32x2_t vmls_f32(float32x2_t a, float32x2_t b, float32x2_t c) 4043 { 4044 __m128 res; 4045 __m64_128 res64; 4046 res = _mm_mul_ps (_pM128(c), _pM128(b)); 4047 res = 
_mm_sub_ps (_pM128(a), res); 4048 _M64f(res64, res); 4049 return res64; 4050 } 4051 4052 _NEON2SSESTORAGE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VMLS.I8 d0,d0,d0 4053 _NEON2SSE_INLINE uint8x8_t vmls_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) 4054 { 4055 // no 8 bit simd multiply, need to go to 16 bits - and use the low 64 bits 4056 uint8x8_t res64; 4057 __m128i res; 4058 res64 = vmul_u8(b,c); 4059 res = _mm_sub_epi8 (_pM128i(a), _pM128i(res64)); 4060 return64(res); 4061 } 4062 4063 _NEON2SSESTORAGE uint16x4_t vmls_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VMLS.I16 d0,d0,d0 4064 #define vmls_u16 vmls_s16 4065 4066 _NEON2SSESTORAGE uint32x2_t vmls_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VMLS.I32 d0,d0,d0 4067 #define vmls_u32 vmls_s32 4068 4069 4070 _NEON2SSESTORAGE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 4071 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0 4072 { 4073 //solution may be not optimal 4074 // no 8 bit simd multiply, need to go to 16 bits 4075 __m128i b16, c16, r16_1, a_2, r16_2; 4076 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 4077 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 4078 r16_1 = _mm_mullo_epi16 (b16, c16); 4079 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); 4080 r16_1 = _mm_sub_epi8 (a, r16_1); 4081 //swap hi and low part of a, b, c to process the remaining data 4082 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4083 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 4084 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 4085 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 4086 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 4087 4088 r16_2 = _mm_mullo_epi16 (b16, c16); 4089 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 4090 r16_2 = _mm_sub_epi8 (a_2, r16_2); 4091 return _mm_unpacklo_epi64(r16_1,r16_2); 4092 } 4093 4094 _NEON2SSESTORAGE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 4095 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0 4096 { 4097 __m128i res; 4098 res = _mm_mullo_epi16 (c, b); 4099 return _mm_sub_epi16 (a, res); 4100 } 4101 4102 _NEON2SSESTORAGE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 4103 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0 4104 { 4105 __m128i res; 4106 res = _MM_MULLO_EPI32 (c, b); //SSE4.1 4107 return _mm_sub_epi32 (a, res); 4108 } 4109 4110 _NEON2SSESTORAGE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 4111 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0 4112 { 4113 __m128 res; 4114 res = _mm_mul_ps (c, b); 4115 return _mm_sub_ps (a, res); 4116 } 4117 4118 _NEON2SSESTORAGE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 4119 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0 4120 { 4121 //solution may be not optimal 4122 // no 8 bit simd multiply, need to go to 16 bits 4123 __m128i b16, c16, r16_1, a_2, r16_2; 4124 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 4125 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 4126 r16_1 = _mm_mullo_epi16 (b16, c16); 4127 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits 4128 r16_1 = _mm_sub_epi8 (a, r16_1); 4129 //swap hi and low part of a, b and c to process the remaining data 4130 a_2 = 
_mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4131 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 4132 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 4133 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 4134 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 4135 4136 r16_2 = _mm_mullo_epi16 (b16, c16); 4137 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 4138 r16_2 = _mm_sub_epi8(a_2, r16_2); 4139 return _mm_unpacklo_epi64(r16_1,r16_2); 4140 } 4141 4142 _NEON2SSESTORAGE uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 4143 #define vmlsq_u16 vmlsq_s16 4144 4145 _NEON2SSESTORAGE uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 4146 #define vmlsq_u32 vmlsq_s32 4147 4148 //******************** Vector multiply subtract long (widening multiply subtract) ************************************ 4149 //************************************************************************************************************* 4150 _NEON2SSESTORAGE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VMLSL.S8 q0,d0,d0 4151 _NEON2SSE_INLINE int16x8_t vmlsl_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VMLSL.S8 q0,d0,d0 4152 { 4153 int16x8_t res; 4154 res = vmull_s8(b, c); 4155 return _mm_sub_epi16 (a, res); 4156 } 4157 4158 _NEON2SSESTORAGE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VMLSL.S16 q0,d0,d0 4159 _NEON2SSE_INLINE int32x4_t vmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VMLSL.S16 q0,d0,d0 4160 { 4161 //may be not optimal compared with serial implementation 4162 int32x4_t res; 4163 res = vmull_s16(b, c); 4164 return _mm_sub_epi32 (a, res); 4165 } 4166 4167 _NEON2SSESTORAGE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VMLSL.S32 q0,d0,d0 4168 _NEON2SSE_INLINE int64x2_t vmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c) // VMLSL.S32 q0,d0,d0 4169 { 4170 //may be not optimal compared with serial implementation 4171 int64x2_t res; 4172 res = vmull_s32( b,c); 4173 return _mm_sub_epi64 (a, res); 4174 } 4175 4176 _NEON2SSESTORAGE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VMLSL.U8 q0,d0,d0 4177 _NEON2SSE_INLINE uint16x8_t vmlsl_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) // VMLSL.U8 q0,d0,d0 4178 { 4179 uint16x8_t res; 4180 res = vmull_u8(b, c); 4181 return _mm_sub_epi16 (a, res); 4182 } 4183 4184 _NEON2SSESTORAGE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VMLSL.s16 q0,d0,d0 4185 _NEON2SSE_INLINE uint32x4_t vmlsl_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) // VMLSL.s16 q0,d0,d0 4186 { 4187 //may be not optimal compared with serial implementation 4188 uint32x4_t res; 4189 res = vmull_u16(b, c); 4190 return _mm_sub_epi32 (a, res); 4191 } 4192 4193 _NEON2SSESTORAGE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VMLSL.U32 q0,d0,d0 4194 _NEON2SSE_INLINE uint64x2_t vmlsl_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c) // VMLSL.U32 q0,d0,d0 4195 { 4196 //may be not optimal compared with serial implementation 4197 int64x2_t res; 4198 res = vmull_u32( b,c); 4199 return _mm_sub_epi64 (a, res); 4200 } 4201 4202 //****** Vector saturating doubling multiply high ********************** 4203 //************************************************************************* 4204 _NEON2SSESTORAGE int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b); // VQDMULH.S16 d0,d0,d0 4205 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqdmulh_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 4206 { 4207 int16x4_t res; 4208 int32_t a32, b32, i; 4209 for (i = 0; i<4; 
i++) { 4210 a32 = (int32_t) a.m64_i16[i]; 4211 b32 = (int32_t) b.m64_i16[i]; 4212 a32 = (a32 * b32) >> 15; 4213 res.m64_i16[i] = (a32 == 0x8000) ? 0x7fff : (int16_t) a32; 4214 } 4215 return res; 4216 } 4217 4218 _NEON2SSESTORAGE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b); // VQDMULH.S32 d0,d0,d0 4219 _NEON2SSE_INLINE int32x2_t vqdmulh_s32(int32x2_t a, int32x2_t b) // no multiply high 32 bit SIMD in IA32, so need to do some tricks, serial solution may be faster 4220 { 4221 //may be not optimal compared with a serial solution 4222 int32x2_t res64; 4223 __m128i mask; 4224 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 4225 int64x2_t mul; 4226 mul = vmull_s32(a,b); 4227 mul = _mm_slli_epi64(mul,1); //double the result 4228 //at this point start treating 2 64-bit numbers as 4 32-bit 4229 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 4230 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 4231 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 4232 return64(mul); 4233 } 4234 4235 _NEON2SSESTORAGE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 4236 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0 4237 { 4238 __m128i res, res_lo, mask; 4239 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; 4240 res = _mm_mulhi_epi16 (a, b); 4241 res = _mm_slli_epi16 (res, 1); //double the result, don't care about saturation 4242 res_lo = _mm_mullo_epi16 (a, b); 4243 res_lo = _mm_srli_epi16(res_lo,15); //take the highest bit 4244 res = _mm_add_epi16(res, res_lo); //combine results 4245 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); 4246 return _mm_xor_si128 (res, mask); //res saturated for 0x8000 4247 } 4248 4249 _NEON2SSESTORAGE int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 4250 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 4251 { 4252 // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target 4253 __m128i ab, ba, mask, mul, mul1; 4254 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 4255 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 4256 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 4257 mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 4258 mul = _mm_slli_epi64(mul,1); //double the result 4259 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 4260 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 4261 mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 4262 mul1 = _mm_slli_epi64(mul1,1); //double the result 4263 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 4264 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 4265 mul = _mm_unpacklo_epi64(mul, mul1); 4266 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 4267 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 4268 } 4269 4270 //********* Vector saturating rounding doubling multiply high **************** 4271 //**************************************************************************** 4272 //If use _mm_mulhrs_xx functions the result may differ from NEON one a 
little due to different rounding rules and order 4273 _NEON2SSESTORAGE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b); // VQRDMULH.S16 d0,d0,d0 4274 _NEON2SSE_INLINE int16x4_t vqrdmulh_s16(int16x4_t a, int16x4_t b) 4275 { 4276 int16x4_t res64; 4277 return64(vqrdmulhq_s16(_pM128i(a), _pM128i(b))); 4278 } 4279 4280 _NEON2SSESTORAGE int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b); // VQRDMULH.S32 d0,d0,d0 4281 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 4282 { 4283 //may be not optimal compared with a serial solution 4284 int32x2_t res64; 4285 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 4286 __m128i res_sat, mask, mask1; 4287 int64x2_t mul; 4288 mul = vmull_s32(a,b); 4289 res_sat = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered 4290 mask1 = _mm_slli_epi64(res_sat, 32); //shift left then back right to 4291 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero 4292 mul = _mm_add_epi32 (res_sat, mask1); //actual rounding 4293 //at this point start treating 2 64-bit numbers as 4 32-bit 4294 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit 4295 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 4296 mul = _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 4297 return64(mul); 4298 } 4299 4300 _NEON2SSESTORAGE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 4301 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0 4302 { 4303 __m128i mask, res; 4304 _NEON2SSE_ALIGN_16 static const uint16_t cmask[] = {0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000, 0x8000}; 4305 res = _mm_mulhrs_epi16 (a, b); 4306 mask = _mm_cmpeq_epi16 (res, *(__m128i*)cmask); 4307 return _mm_xor_si128 (res, mask); //res saturated for 0x8000 4308 } 4309 4310 _NEON2SSESTORAGE int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 4311 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 4312 { 4313 // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target 4314 __m128i ab, ba, mask, mul, mul1, mask1; 4315 _NEON2SSE_ALIGN_16 static const uint32_t cmask32[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 4316 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 4317 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 4318 mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 4319 mul = _mm_slli_epi64 (mul, 1); //double the result, saturation not considered 4320 mask1 = _mm_slli_epi64(mul, 32); //shift left then back right to 4321 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero 4322 mul = _mm_add_epi32 (mul, mask1); //actual rounding 4323 4324 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 4325 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 4326 mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 4327 mul1 = _mm_slli_epi64 (mul1, 1); //double the result, saturation not considered 4328 mask1 = _mm_slli_epi64(mul1, 32); //shift left then back right to 4329 mask1 = _mm_srli_epi64(mask1,31); //get 31-th bit 1 or zero 4330 mul1 = _mm_add_epi32 (mul1, mask1); //actual rounding 4331 //at this point start treating 2 64-bit numbers as 4 32-bit 4332 mul = 
_mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit 4333 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits from each 64-bit 4334 mul = _mm_unpacklo_epi64(mul, mul1); 4335 mask = _mm_cmpeq_epi32 (mul, *(__m128i*)cmask32); 4336 return _mm_xor_si128 (mul, mask); //res saturated for 0x80000000 4337 } 4338 4339 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) ***** 4340 //************************************************************************************************************************* 4341 _NEON2SSESTORAGE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLAL.S16 q0,d0,d0 4342 _NEON2SSE_INLINE int32x4_t vqdmlal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VQDMLAL.S16 q0,d0,d0 4343 { 4344 //not optimal SIMD soulution, serial may be faster 4345 __m128i res32; 4346 res32 = vmull_s16(b, c); 4347 res32 = vqd_s32(res32); //doubling & saturation ,if no saturation we could use _mm_slli_epi32 (res, 1); 4348 return vqaddq_s32(res32, a); //saturation 4349 } 4350 4351 _NEON2SSESTORAGE int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLAL.S32 q0,d0,d0 4352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_s32(int64x2_t a, int32x2_t b, int32x2_t c),_NEON2SSE_REASON_SLOW_SERIAL) 4353 { 4354 __m128i res64; 4355 res64 = vmull_s32(b,c); 4356 res64 = vqaddq_s64(res64, res64); //doubling & saturation ,if no saturation we could use _mm_slli_epi64 (res, 1); 4357 return vqaddq_s64(res64, a); //saturation 4358 } 4359 4360 //************************************************************************************ 4361 //****************** Vector subtract *********************************************** 4362 //************************************************************************************ 4363 _NEON2SSESTORAGE int8x8_t vsub_s8(int8x8_t a, int8x8_t b); // VSUB.I8 d0,d0,d0 4364 _NEON2SSE_INLINE int8x8_t vsub_s8(int8x8_t a, int8x8_t b) 4365 { 4366 int8x8_t res64; 4367 return64(_mm_sub_epi8(_pM128i(a),_pM128i(b))); 4368 } 4369 4370 4371 _NEON2SSESTORAGE int16x4_t vsub_s16(int16x4_t a, int16x4_t b); // VSUB.I16 d0,d0,d0 4372 _NEON2SSE_INLINE int16x4_t vsub_s16(int16x4_t a, int16x4_t b) 4373 { 4374 int16x4_t res64; 4375 return64(_mm_sub_epi16(_pM128i(a),_pM128i(b))); 4376 } 4377 4378 4379 _NEON2SSESTORAGE int32x2_t vsub_s32(int32x2_t a, int32x2_t b); // VSUB.I32 d0,d0,d0 4380 _NEON2SSE_INLINE int32x2_t vsub_s32(int32x2_t a, int32x2_t b) 4381 { 4382 int32x2_t res64; 4383 return64(_mm_sub_epi32(_pM128i(a),_pM128i(b))); 4384 } 4385 4386 4387 _NEON2SSESTORAGE int64x1_t vsub_s64(int64x1_t a, int64x1_t b); // VSUB.I64 d0,d0,d0 4388 _NEON2SSE_INLINE int64x1_t vsub_s64(int64x1_t a, int64x1_t b) 4389 { 4390 int64x1_t res64; 4391 res64.m64_i64[0] = a.m64_i64[0] - b.m64_i64[0]; 4392 return res64; 4393 } 4394 4395 4396 _NEON2SSESTORAGE float32x2_t vsub_f32(float32x2_t a, float32x2_t b); // VSUB.F32 d0,d0,d0 4397 _NEON2SSE_INLINE float32x2_t vsub_f32(float32x2_t a, float32x2_t b) 4398 { 4399 float32x2_t res; 4400 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0]; 4401 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1]; 4402 return res; 4403 } 4404 4405 _NEON2SSESTORAGE uint8x8_t vsub_u8(uint8x8_t a, uint8x8_t b); // VSUB.I8 d0,d0,d0 4406 #define vsub_u8 vsub_s8 4407 4408 _NEON2SSESTORAGE uint16x4_t vsub_u16(uint16x4_t a, uint16x4_t b); // VSUB.I16 d0,d0,d0 4409 #define vsub_u16 vsub_s16 4410 4411 
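//wrap-around subtraction yields identical bit patterns for signed and unsigned two's complement lanes,
//so the unsigned d-register variants here simply alias their signed counterparts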
_NEON2SSESTORAGE uint32x2_t vsub_u32(uint32x2_t a, uint32x2_t b); // VSUB.I32 d0,d0,d0 4412 #define vsub_u32 vsub_s32 4413 4414 4415 _NEON2SSESTORAGE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b); // VSUB.I64 d0,d0,d0 4416 _NEON2SSE_INLINE uint64x1_t vsub_u64(uint64x1_t a, uint64x1_t b) 4417 { 4418 int64x1_t res64; 4419 res64.m64_u64[0] = a.m64_u64[0] - b.m64_u64[0]; 4420 return res64; 4421 } 4422 4423 4424 _NEON2SSESTORAGE int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 4425 #define vsubq_s8 _mm_sub_epi8 4426 4427 _NEON2SSESTORAGE int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 4428 #define vsubq_s16 _mm_sub_epi16 4429 4430 _NEON2SSESTORAGE int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 4431 #define vsubq_s32 _mm_sub_epi32 4432 4433 _NEON2SSESTORAGE int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 4434 #define vsubq_s64 _mm_sub_epi64 4435 4436 _NEON2SSESTORAGE float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 4437 #define vsubq_f32 _mm_sub_ps 4438 4439 _NEON2SSESTORAGE uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 4440 #define vsubq_u8 _mm_sub_epi8 4441 4442 _NEON2SSESTORAGE uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 4443 #define vsubq_u16 _mm_sub_epi16 4444 4445 _NEON2SSESTORAGE uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 4446 #define vsubq_u32 _mm_sub_epi32 4447 4448 _NEON2SSESTORAGE uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 4449 #define vsubq_u64 _mm_sub_epi64 4450 4451 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ****************** 4452 //*********************************************************************************** 4453 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 
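//A hedged usage sketch (not part of the original header): because vsubl widens before subtracting,
//unsigned byte differences keep their magnitude instead of wrapping at 8 bits; reinterpreting the
//uint16x8_t result as signed recovers negative differences, e.g.:
//    uint8x8_t cur = vdup_n_u8(3), prev = vdup_n_u8(10);
//    int16x8_t delta = vreinterpretq_s16_u16(vsubl_u8(cur, prev)); //every lane holds -7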
4454 _NEON2SSESTORAGE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b); // VSUBL.S8 q0,d0,d0 4455 _NEON2SSE_INLINE int16x8_t vsubl_s8(int8x8_t a, int8x8_t b) // VSUBL.S8 q0,d0,d0 4456 { 4457 __m128i a16, b16; 4458 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, 4459 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, 4460 return _mm_sub_epi16 (a16, b16); 4461 } 4462 4463 _NEON2SSESTORAGE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b); // VSUBL.S16 q0,d0,d0 4464 _NEON2SSE_INLINE int32x4_t vsubl_s16(int16x4_t a, int16x4_t b) // VSUBL.S16 q0,d0,d0 4465 { 4466 __m128i a32, b32; 4467 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 4468 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, 4469 return _mm_sub_epi32 (a32, b32); 4470 } 4471 4472 _NEON2SSESTORAGE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b); // VSUBL.S32 q0,d0,d0 4473 _NEON2SSE_INLINE int64x2_t vsubl_s32(int32x2_t a, int32x2_t b) // VSUBL.S32 q0,d0,d0 4474 { 4475 //may be not optimal 4476 __m128i a64, b64; 4477 a64 = _MM_CVTEPI32_EPI64 (_pM128i(a)); //SSE4.1 4478 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1, 4479 return _mm_sub_epi64 (a64, b64); 4480 } 4481 4482 _NEON2SSESTORAGE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b); // VSUBL.U8 q0,d0,d0 4483 _NEON2SSE_INLINE uint16x8_t vsubl_u8(uint8x8_t a, uint8x8_t b) // VSUBL.U8 q0,d0,d0 4484 { 4485 __m128i a16, b16; 4486 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE4.1, 4487 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, 4488 return _mm_sub_epi16 (a16, b16); 4489 } 4490 4491 _NEON2SSESTORAGE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b); // VSUBL.s16 q0,d0,d0 4492 _NEON2SSE_INLINE uint32x4_t vsubl_u16(uint16x4_t a, uint16x4_t b) // VSUBL.s16 q0,d0,d0 4493 { 4494 __m128i a32, b32; 4495 a32 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE4.1 4496 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, 4497 return _mm_sub_epi32 (a32, b32); 4498 } 4499 4500 _NEON2SSESTORAGE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b); // VSUBL.U32 q0,d0,d0 4501 _NEON2SSE_INLINE uint64x2_t vsubl_u32(uint32x2_t a, uint32x2_t b) // VSUBL.U32 q0,d0,d0 4502 { 4503 //may be not optimal 4504 __m128i a64, b64; 4505 a64 = _MM_CVTEPU32_EPI64 (_pM128i(a)); //SSE4.1 4506 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1, 4507 return _mm_sub_epi64 (a64, b64); 4508 } 4509 4510 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ********************************** 4511 //***************************************************************************************************** 4512 _NEON2SSESTORAGE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b); // VSUBW.S8 q0,q0,d0 4513 _NEON2SSE_INLINE int16x8_t vsubw_s8(int16x8_t a, int8x8_t b) // VSUBW.S8 q0,q0,d0 4514 { 4515 __m128i b16; 4516 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, 4517 return _mm_sub_epi16 (a, b16); 4518 } 4519 4520 _NEON2SSESTORAGE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b); // VSUBW.S16 q0,q0,d0 4521 _NEON2SSE_INLINE int32x4_t vsubw_s16(int32x4_t a, int16x4_t b) // VSUBW.S16 q0,q0,d0 4522 { 4523 __m128i b32; 4524 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, 4525 return _mm_sub_epi32 (a, b32); 4526 } 4527 4528 _NEON2SSESTORAGE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b); // VSUBW.S32 q0,q0,d0 4529 _NEON2SSE_INLINE int64x2_t vsubw_s32(int64x2_t a, int32x2_t b) // VSUBW.S32 q0,q0,d0 4530 { 4531 __m128i b64; 4532 b64 = _MM_CVTEPI32_EPI64 (_pM128i(b)); //SSE4.1 4533 return _mm_sub_epi64 (a, b64); 4534 } 4535 4536 _NEON2SSESTORAGE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b); // VSUBW.U8 q0,q0,d0 4537 _NEON2SSE_INLINE uint16x8_t vsubw_u8(uint16x8_t a, uint8x8_t b) 
// VSUBW.U8 q0,q0,d0 4538 { 4539 __m128i b16; 4540 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, 4541 return _mm_sub_epi16 (a, b16); 4542 } 4543 4544 _NEON2SSESTORAGE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b); // VSUBW.s16 q0,q0,d0 4545 _NEON2SSE_INLINE uint32x4_t vsubw_u16(uint32x4_t a, uint16x4_t b) // VSUBW.s16 q0,q0,d0 4546 { 4547 __m128i b32; 4548 b32 = _MM_CVTEPU16_EPI32 (_pM128i(b)); //SSE4.1, 4549 return _mm_sub_epi32 (a, b32); 4550 } 4551 4552 _NEON2SSESTORAGE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b); // VSUBW.U32 q0,q0,d0 4553 _NEON2SSE_INLINE uint64x2_t vsubw_u32(uint64x2_t a, uint32x2_t b) // VSUBW.U32 q0,q0,d0 4554 { 4555 __m128i b64; 4556 b64 = _MM_CVTEPU32_EPI64 (_pM128i(b)); //SSE4.1 4557 return _mm_sub_epi64 (a, b64); 4558 } 4559 4560 //************************Vector saturating subtract ********************************* 4561 //************************************************************************************* 4562 _NEON2SSESTORAGE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b); // VQSUB.S8 d0,d0,d0 4563 _NEON2SSE_INLINE int8x8_t vqsub_s8(int8x8_t a, int8x8_t b) 4564 { 4565 int8x8_t res64; 4566 return64(_mm_subs_epi8(_pM128i(a),_pM128i(b))); 4567 } 4568 4569 4570 _NEON2SSESTORAGE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b); // VQSUB.S16 d0,d0,d0 4571 _NEON2SSE_INLINE int16x4_t vqsub_s16(int16x4_t a, int16x4_t b) 4572 { 4573 int16x4_t res64; 4574 return64(_mm_subs_epi16(_pM128i(a),_pM128i(b))); 4575 } 4576 4577 4578 _NEON2SSESTORAGE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b); // VQSUB.S32 d0,d0,d0 4579 _NEON2SSE_INLINE int32x2_t vqsub_s32(int32x2_t a, int32x2_t b) 4580 { 4581 int32x2_t res64; 4582 return64(vqsubq_s32(_pM128i(a), _pM128i(b))); 4583 } 4584 4585 4586 _NEON2SSESTORAGE int64x1_t vqsub_s64(int64x1_t a, int64x1_t b); // VQSUB.S64 d0,d0,d0 4587 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqsub_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution 4588 { 4589 uint64x1_t res; 4590 uint64_t a64,b64; 4591 a64 = a.m64_u64[0]; 4592 b64 = b.m64_u64[0]; 4593 res.m64_u64[0] = a64 - b64; 4594 4595 a64 = (a64 >> 63) + (~_SIGNBIT64); 4596 if ((int64_t)((a64 ^ b64) & (a64 ^ res.m64_u64[0])) < 0) { 4597 res.m64_u64[0] = a64; 4598 } 4599 return res; 4600 } 4601 4602 _NEON2SSESTORAGE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b); // VQSUB.U8 d0,d0,d0 4603 _NEON2SSE_INLINE uint8x8_t vqsub_u8(uint8x8_t a, uint8x8_t b) 4604 { 4605 uint8x8_t res64; 4606 return64(_mm_subs_epu8(_pM128i(a),_pM128i(b))); 4607 } 4608 4609 4610 _NEON2SSESTORAGE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b); // VQSUB.s16 d0,d0,d0 4611 _NEON2SSE_INLINE uint16x4_t vqsub_u16(uint16x4_t a, uint16x4_t b) 4612 { 4613 uint16x4_t res64; 4614 return64(_mm_subs_epu16(_pM128i(a),_pM128i(b))); 4615 } 4616 4617 4618 _NEON2SSESTORAGE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b); // VQSUB.U32 d0,d0,d0 4619 _NEON2SSE_INLINE uint32x2_t vqsub_u32(uint32x2_t a, uint32x2_t b) 4620 { 4621 uint32x2_t res64; 4622 return64(vqsubq_u32(_pM128i(a), _pM128i(b))); 4623 } 4624 4625 4626 _NEON2SSESTORAGE uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b); // VQSUB.U64 d0,d0,d0 4627 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqsub_u64(uint64x1_t a, uint64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 4628 { 4629 uint64x1_t res; 4630 uint64_t a64, b64; 4631 a64 = _Ui64(a); 4632 b64 = _Ui64(b); 4633 if (a64 > b64) { 4634 res.m64_u64[0] = a64 - b64; 4635 } else { 4636 res.m64_u64[0] = 0; 4637 } 4638 return res; 4639 } 4640 4641 _NEON2SSESTORAGE int8x16_t 
vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0
#define vqsubq_s8 _mm_subs_epi8

_NEON2SSESTORAGE int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0
#define vqsubq_s16 _mm_subs_epi16

_NEON2SSESTORAGE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0
_NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b)
{
    //no corresponding x86 SIMD solution, special tricks are necessary. Overflow is possible only if a and b have opposite signs and the result has the opposite sign to a
    __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a;
    c7fffffff = _mm_set1_epi32(0x7fffffff);
    res = _mm_sub_epi32(a, b);
    res_sat = _mm_srli_epi32(a, 31);
    res_sat = _mm_add_epi32(res_sat, c7fffffff);
    res_xor_a = _mm_xor_si128(res, a);
    b_xor_a = _mm_xor_si128(b, a);
    res_xor_a = _mm_and_si128(b_xor_a, res_xor_a);
    res_xor_a = _mm_srai_epi32(res_xor_a, 31); //propagate the sign bit: all ones where the subtraction overflowed, all zeros otherwise
    res_sat = _mm_and_si128(res_xor_a, res_sat);
    res = _mm_andnot_si128(res_xor_a, res);
    return _mm_or_si128(res, res_sat);
}

_NEON2SSESTORAGE int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD solution
{
    _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2];
    _NEON2SSE_ALIGN_16 uint64_t res[2];
    _mm_store_si128((__m128i*)atmp, a);
    _mm_store_si128((__m128i*)btmp, b);
    res[0] = atmp[0] - btmp[0];
    res[1] = atmp[1] - btmp[1];
    if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) {
        res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64;
    }
    if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) {
        res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64;
    }
    return _mm_load_si128((__m128i*)res);
}

_NEON2SSESTORAGE uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0
#define vqsubq_u8 _mm_subs_epu8

_NEON2SSESTORAGE uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0
#define vqsubq_u16 _mm_subs_epu16

_NEON2SSESTORAGE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0
_NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0
{
    __m128i min, mask, sub;
    min = _MM_MIN_EPU32(a, b); //SSE4.1
    mask = _mm_cmpeq_epi32 (min, b);
    sub = _mm_sub_epi32 (a, b);
    return _mm_and_si128 ( sub, mask);
}

_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0
#ifdef USE_SSE4
_NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b)
{
    __m128i c80000000, subb, suba, cmp, sub;
    c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0);
    sub = _mm_sub_epi64 (a, b);
    suba = _mm_sub_epi64 (a, c80000000);
    subb = _mm_sub_epi64 (b, c80000000);
    cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned 64-bit comparison, need to go to signed; _mm_cmpgt_epi64 requires SSE4.2
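    //subtracting 0x8000000000000000 from both operands flips only their top bits, which maps the
    //unsigned ordering onto the signed one, so the signed 64-bit compare above yields the unsigned a > b mask;
    //ANDing that mask with the wrapped difference below zeroes (saturates) the lanes where a <= b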
4709 return _mm_and_si128 (sub, cmp); //saturation 4710 } 4711 #else 4712 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 4713 { 4714 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; 4715 _mm_store_si128((__m128i*)atmp, a); 4716 _mm_store_si128((__m128i*)btmp, b); 4717 res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0; 4718 res[1] = (atmp[1] > btmp[1]) ? atmp[1] - btmp[1] : 0; 4719 return _mm_load_si128((__m128i*)(res)); 4720 } 4721 #endif 4722 4723 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ****************************************************** 4724 //**************************************************************** 4725 _NEON2SSESTORAGE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b); // VHSUB.S8 d0,d0,d0 4726 _NEON2SSE_INLINE int8x8_t vhsub_s8(int8x8_t a, int8x8_t b) // VHSUB.S8 d0,d0,d0 4727 { 4728 //no 8 bit shift available, internal overflow is possible, so let's go to 16 bit, 4729 int8x8_t res64; 4730 __m128i r16; 4731 int8x8_t r; 4732 r = vsub_s8 (a, b); 4733 r16 = _MM_CVTEPI8_EPI16 (_pM128i(r)); //SSE 4.1 4734 r16 = _mm_srai_epi16 (r16, 1); //SSE2 4735 r16 = _mm_packs_epi16 (r16,r16); //use low 64 bits 4736 return64(r16); 4737 } 4738 4739 _NEON2SSESTORAGE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b); // VHSUB.S16 d0,d0,d0 4740 _NEON2SSE_INLINE int16x4_t vhsub_s16(int16x4_t a, int16x4_t b) 4741 { 4742 int16x4_t res64; 4743 return64(vhsubq_s16(_pM128i(a), _pM128i(b))); 4744 } 4745 4746 4747 4748 _NEON2SSESTORAGE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b); // VHSUB.S32 d0,d0,d0 4749 _NEON2SSE_INLINE int32x2_t vhsub_s32(int32x2_t a, int32x2_t b) 4750 { 4751 int32x2_t res64; 4752 return64(vhsubq_s32(_pM128i(a), _pM128i(b))); 4753 } 4754 4755 4756 _NEON2SSESTORAGE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b); // VHSUB.U8 d0,d0,d0 4757 _NEON2SSE_INLINE uint8x8_t vhsub_u8(uint8x8_t a, uint8x8_t b) 4758 { 4759 uint8x8_t res64; 4760 return64(vhsubq_u8(_pM128i(a), _pM128i(b))); 4761 } 4762 4763 _NEON2SSESTORAGE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b); // VHSUB.s16 d0,d0,d0 4764 _NEON2SSE_INLINE uint16x4_t vhsub_u16(uint16x4_t a, uint16x4_t b) 4765 { 4766 uint16x4_t res64; 4767 return64(vhsubq_u16(_pM128i(a), _pM128i(b))); 4768 } 4769 4770 _NEON2SSESTORAGE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b); // VHSUB.U32 d0,d0,d0 4771 _NEON2SSE_INLINE uint32x2_t vhsub_u32(uint32x2_t a, uint32x2_t b) 4772 { 4773 uint32x2_t res64; 4774 return64(vhsubq_u32(_pM128i(a), _pM128i(b))); 4775 } 4776 4777 _NEON2SSESTORAGE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 4778 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0 4779 { 4780 // //need to deal with the possibility of internal overflow 4781 __m128i c128, au,bu; 4782 c128 = _mm_set1_epi8((int8_t)128); 4783 au = _mm_add_epi8( a, c128); 4784 bu = _mm_add_epi8( b, c128); 4785 return vhsubq_u8(au,bu); 4786 } 4787 4788 _NEON2SSESTORAGE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 4789 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0 4790 { 4791 //need to deal with the possibility of internal overflow 4792 __m128i c8000, au,bu; 4793 c8000 = _mm_set1_epi16((int16_t)0x8000); 4794 au = _mm_add_epi16( a, c8000); 4795 bu = _mm_add_epi16( b, c8000); 4796 return vhsubq_u16(au,bu); 4797 } 4798 4799 _NEON2SSESTORAGE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 4800 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, 
int32x4_t b) // VHSUB.S32 q0,q0,q0 4801 { 4802 //need to deal with the possibility of internal overflow 4803 __m128i a2, b2,r, b_1; 4804 a2 = _mm_srai_epi32 (a,1); 4805 b2 = _mm_srai_epi32 (b,1); 4806 r = _mm_sub_epi32 (a2, b2); 4807 b_1 = _mm_andnot_si128(a, b); //!a and b 4808 b_1 = _mm_slli_epi32 (b_1,31); 4809 b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit 4810 return _mm_sub_epi32(r,b_1); 4811 } 4812 4813 _NEON2SSESTORAGE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 4814 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0 4815 { 4816 __m128i avg; 4817 avg = _mm_avg_epu8 (a, b); 4818 return _mm_sub_epi8(a, avg); 4819 } 4820 4821 _NEON2SSESTORAGE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0 4822 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0 4823 { 4824 __m128i avg; 4825 avg = _mm_avg_epu16 (a, b); 4826 return _mm_sub_epi16(a, avg); 4827 } 4828 4829 _NEON2SSESTORAGE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 4830 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0 4831 { 4832 //need to deal with the possibility of internal overflow 4833 __m128i a2, b2,r, b_1; 4834 a2 = _mm_srli_epi32 (a,1); 4835 b2 = _mm_srli_epi32 (b,1); 4836 r = _mm_sub_epi32 (a2, b2); 4837 b_1 = _mm_andnot_si128(a, b); //!a and b 4838 b_1 = _mm_slli_epi32 (b_1,31); 4839 b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit 4840 return _mm_sub_epi32(r,b_1); 4841 } 4842 4843 //******* Vector subtract high half (truncated) ** ************ 4844 //************************************************************ 4845 _NEON2SSESTORAGE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b); // VSUBHN.I16 d0,q0,q0 4846 _NEON2SSE_INLINE int8x8_t vsubhn_s16(int16x8_t a, int16x8_t b) // VSUBHN.I16 d0,q0,q0 4847 { 4848 int8x8_t res64; 4849 __m128i sum, sum8; 4850 sum = _mm_sub_epi16 (a, b); 4851 sum8 = _mm_srai_epi16 (sum, 8); 4852 sum8 = _mm_packs_epi16(sum8,sum8); 4853 return64(sum8); 4854 } 4855 4856 _NEON2SSESTORAGE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b); // VSUBHN.I32 d0,q0,q0 4857 _NEON2SSE_INLINE int16x4_t vsubhn_s32(int32x4_t a, int32x4_t b) // VSUBHN.I32 d0,q0,q0 4858 { 4859 int16x4_t res64; 4860 __m128i sum, sum16; 4861 sum = _mm_sub_epi32 (a, b); 4862 sum16 = _mm_srai_epi32 (sum, 16); 4863 sum16 = _mm_packs_epi32(sum16,sum16); 4864 return64(sum16); 4865 } 4866 4867 _NEON2SSESTORAGE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b); // VSUBHN.I64 d0,q0,q0 4868 _NEON2SSE_INLINE int32x2_t vsubhn_s64(int64x2_t a, int64x2_t b) 4869 { 4870 int32x2_t res64; 4871 __m128i sub; 4872 sub = _mm_sub_epi64 (a, b); 4873 sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); 4874 return64(sub); 4875 } 4876 4877 _NEON2SSESTORAGE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b); // VSUBHN.I16 d0,q0,q0 4878 _NEON2SSE_INLINE uint8x8_t vsubhn_u16(uint16x8_t a, uint16x8_t b) // VSUBHN.I16 d0,q0,q0 4879 { 4880 uint8x8_t res64; 4881 __m128i sum, sum8; 4882 sum = _mm_sub_epi16 (a, b); 4883 sum8 = _mm_srli_epi16 (sum, 8); 4884 sum8 = _mm_packus_epi16(sum8,sum8); 4885 return64(sum8); 4886 } 4887 4888 _NEON2SSESTORAGE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b); // VSUBHN.I32 d0,q0,q0 4889 _NEON2SSE_INLINE uint16x4_t vsubhn_u32(uint32x4_t a, uint32x4_t b) // VSUBHN.I32 d0,q0,q0 4890 { 4891 uint16x4_t res64; 4892 __m128i sum, sum16; 4893 sum = _mm_sub_epi32 (a, b); 4894 sum16 = _mm_srli_epi32 (sum, 16); 4895 #ifdef USE_SSE4 4896 sum16 = 
_MM_PACKUS1_EPI32(sum16); 4897 #else 4898 sum16 = _mm_shuffle_epi8 (sum16, *(__m128i*) mask8_32_even_odd); //go to 16 bits 4899 #endif 4900 return64(sum16); 4901 } 4902 4903 _NEON2SSESTORAGE uint32x2_t vsubhn_u64(uint64x2_t a, uint64x2_t b); // VSUBHN.I64 d0,q0,q0 4904 #define vsubhn_u64 vsubhn_s64 4905 4906 //************ Vector rounding subtract high half ********************* 4907 //********************************************************************* 4908 _NEON2SSESTORAGE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b); // VRSUBHN.I16 d0,q0,q0 4909 _NEON2SSE_INLINE int8x8_t vrsubhn_s16(int16x8_t a, int16x8_t b) // VRSUBHN.I16 d0,q0,q0 4910 { 4911 int8x8_t res64; 4912 __m128i sub, mask1; 4913 sub = _mm_sub_epi16 (a, b); 4914 mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to 4915 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero 4916 sub = _mm_srai_epi16 (sub, 8); //get high half 4917 sub = _mm_add_epi16 (sub, mask1); //actual rounding 4918 sub = _mm_packs_epi16 (sub, sub); 4919 return64(sub); 4920 } 4921 4922 _NEON2SSESTORAGE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b); // VRSUBHN.I32 d0,q0,q0 4923 _NEON2SSE_INLINE int16x4_t vrsubhn_s32(int32x4_t a, int32x4_t b) // VRSUBHN.I32 d0,q0,q0 4924 { 4925 //SIMD may be not optimal, serial may be faster 4926 int16x4_t res64; 4927 __m128i sub, mask1; 4928 sub = _mm_sub_epi32 (a, b); 4929 mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to 4930 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero 4931 sub = _mm_srai_epi32 (sub, 16); //get high half 4932 sub = _mm_add_epi32 (sub, mask1); //actual rounding 4933 sub = _mm_packs_epi32 (sub, sub); 4934 return64(sub); 4935 } 4936 4937 _NEON2SSESTORAGE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b); // VRSUBHN.I64 d0,q0,q0 4938 _NEON2SSE_INLINE int32x2_t vrsubhn_s64(int64x2_t a, int64x2_t b) 4939 { 4940 //SIMD may be not optimal, serial may be faster 4941 int32x2_t res64; 4942 __m128i sub, mask1; 4943 sub = _mm_sub_epi64 (a, b); 4944 mask1 = _mm_slli_epi64(sub, 33); //shift left then back right to 4945 mask1 = _mm_srli_epi64(mask1,32); //get 31-th bit 1 or zero 4946 sub = _mm_add_epi64 (sub, mask1); //actual high half rounding 4947 sub = _mm_shuffle_epi32(sub, 1 | (3 << 2) | (0 << 4) | (2 << 6)); 4948 return64(sub); 4949 } 4950 4951 _NEON2SSESTORAGE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b); // VRSUBHN.I16 d0,q0,q0 4952 _NEON2SSE_INLINE uint8x8_t vrsubhn_u16(uint16x8_t a, uint16x8_t b) // VRSUBHN.I16 d0,q0,q0 4953 { 4954 uint8x8_t res64; 4955 __m128i sub, mask1; 4956 sub = _mm_sub_epi16 (a, b); 4957 mask1 = _mm_slli_epi16(sub, 9); //shift left then back right to 4958 mask1 = _mm_srli_epi16(mask1, 15); //get 7-th bit 1 or zero 4959 sub = _mm_srai_epi16 (sub, 8); //get high half 4960 sub = _mm_add_epi16 (sub, mask1); //actual rounding 4961 sub = _mm_packus_epi16 (sub, sub); 4962 return64(sub); 4963 } 4964 4965 _NEON2SSESTORAGE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b); // VRSUBHN.I32 d0,q0,q0 4966 _NEON2SSE_INLINE uint16x4_t vrsubhn_u32(uint32x4_t a, uint32x4_t b) // VRSUBHN.I32 d0,q0,q0 4967 { 4968 //SIMD may be not optimal, serial may be faster 4969 uint16x4_t res64; 4970 __m128i sub, mask1; 4971 sub = _mm_sub_epi32 (a, b); 4972 mask1 = _mm_slli_epi32(sub, 17); //shift left then back right to 4973 mask1 = _mm_srli_epi32(mask1,31); //get 15-th bit 1 or zero 4974 sub = _mm_srai_epi32 (sub, 16); //get high half 4975 sub = _mm_add_epi32 (sub, mask1); //actual rounding 4976 #ifdef USE_SSE4 4977 sub = _MM_PACKUS1_EPI32 (sub); 4978 #else 4979 sub 
= _mm_shuffle_epi8 (sub, *(__m128i*) mask8_32_even_odd); //go to 16 bits 4980 #endif 4981 return64(sub); 4982 } 4983 4984 _NEON2SSESTORAGE uint32x2_t vrsubhn_u64(uint64x2_t a, uint64x2_t b); // VRSUBHN.I64 d0,q0,q0 4985 #define vrsubhn_u64 vrsubhn_s64 4986 4987 //*********** Vector saturating doubling multiply subtract long ******************** 4988 //************************************************************************************ 4989 _NEON2SSESTORAGE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VQDMLSL.S16 q0,d0,d0 4990 _NEON2SSE_INLINE int32x4_t vqdmlsl_s16(int32x4_t a, int16x4_t b, int16x4_t c) 4991 { 4992 //not optimal SIMD soulution, serial may be faster 4993 __m128i res32, mask; 4994 int32x4_t res; 4995 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 4996 res = vmull_s16(b, c); 4997 res32 = _mm_slli_epi32 (res, 1); //double the result, saturation not considered 4998 mask = _mm_cmpeq_epi32 (res32, *(__m128i*)cmask); 4999 res32 = _mm_xor_si128 (res32, mask); //res32 saturated for 0x80000000 5000 return vqsubq_s32(a, res32); //saturation 5001 } 5002 5003 _NEON2SSESTORAGE int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VQDMLSL.S32 q0,d0,d0 5004 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) 5005 { 5006 __m128i res64, mask; 5007 int64x2_t res; 5008 _NEON2SSE_ALIGN_16 static const uint64_t cmask[] = {0x8000000000000000, 0x8000000000000000}; 5009 res = vmull_s32(b, c); 5010 res64 = _mm_slli_epi64 (res, 1); //double the result, saturation not considered 5011 mask = _MM_CMPEQ_EPI64 (res64, *(__m128i*)cmask); 5012 res64 = _mm_xor_si128 (res64, mask); //res32 saturated for 0x80000000 5013 return vqsubq_s64(a, res64); //saturation 5014 } 5015 5016 //****************** COMPARISON *************************************** 5017 //******************* Vector compare equal ************************************* 5018 //**************************************************************************** 5019 _NEON2SSESTORAGE uint8x8_t vceq_s8(int8x8_t a, int8x8_t b); // VCEQ.I8 d0, d0, d0 5020 _NEON2SSE_INLINE int8x8_t vceq_s8(int8x8_t a, int8x8_t b) 5021 { 5022 int8x8_t res64; 5023 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); 5024 } 5025 5026 5027 _NEON2SSESTORAGE uint16x4_t vceq_s16(int16x4_t a, int16x4_t b); // VCEQ.I16 d0, d0, d0 5028 _NEON2SSE_INLINE int16x4_t vceq_s16(int16x4_t a, int16x4_t b) 5029 { 5030 int16x4_t res64; 5031 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); 5032 } 5033 5034 5035 _NEON2SSESTORAGE uint32x2_t vceq_s32(int32x2_t a, int32x2_t b); // VCEQ.I32 d0, d0, d0 5036 _NEON2SSE_INLINE int32x2_t vceq_s32(int32x2_t a, int32x2_t b) 5037 { 5038 int32x2_t res64; 5039 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); 5040 } 5041 5042 5043 _NEON2SSESTORAGE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b); // VCEQ.F32 d0, d0, d0 5044 _NEON2SSE_INLINE uint32x2_t vceq_f32(float32x2_t a, float32x2_t b) 5045 { 5046 uint32x2_t res64; 5047 __m128 res; 5048 res = _mm_cmpeq_ps(_pM128(a), _pM128(b) ); 5049 return64f(res); 5050 } 5051 5052 _NEON2SSESTORAGE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b); // VCEQ.I8 d0, d0, d0 5053 _NEON2SSE_INLINE uint8x8_t vceq_u8(uint8x8_t a, uint8x8_t b) 5054 { 5055 uint8x8_t res64; 5056 return64(_mm_cmpeq_epi8(_pM128i(a),_pM128i(b))); 5057 } 5058 5059 5060 _NEON2SSESTORAGE uint16x4_t vceq_u16(uint16x4_t a, uint16x4_t b); // VCEQ.I16 d0, d0, d0 5061 _NEON2SSE_INLINE uint16x4_t 
vceq_u16(uint16x4_t a, uint16x4_t b) 5062 { 5063 uint16x4_t res64; 5064 return64(_mm_cmpeq_epi16(_pM128i(a),_pM128i(b))); 5065 } 5066 5067 5068 _NEON2SSESTORAGE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b); // VCEQ.I32 d0, d0, d0 5069 _NEON2SSE_INLINE uint32x2_t vceq_u32(uint32x2_t a, uint32x2_t b) 5070 { 5071 uint32x2_t res64; 5072 return64(_mm_cmpeq_epi32(_pM128i(a),_pM128i(b))); 5073 } 5074 5075 5076 _NEON2SSESTORAGE uint8x8_t vceq_p8(poly8x8_t a, poly8x8_t b); // VCEQ.I8 d0, d0, d0 5077 #define vceq_p8 vceq_u8 5078 5079 5080 _NEON2SSESTORAGE uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 5081 #define vceqq_s8 _mm_cmpeq_epi8 5082 5083 _NEON2SSESTORAGE uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 5084 #define vceqq_s16 _mm_cmpeq_epi16 5085 5086 _NEON2SSESTORAGE uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 5087 #define vceqq_s32 _mm_cmpeq_epi32 5088 5089 _NEON2SSESTORAGE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 5090 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b) 5091 { 5092 __m128 res; 5093 res = _mm_cmpeq_ps(a,b); 5094 return _M128i(res); 5095 } 5096 5097 _NEON2SSESTORAGE uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 5098 #define vceqq_u8 _mm_cmpeq_epi8 5099 5100 _NEON2SSESTORAGE uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 5101 #define vceqq_u16 _mm_cmpeq_epi16 5102 5103 _NEON2SSESTORAGE uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 5104 #define vceqq_u32 _mm_cmpeq_epi32 5105 5106 _NEON2SSESTORAGE uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 5107 #define vceqq_p8 _mm_cmpeq_epi8 5108 5109 //******************Vector compare greater-than or equal************************* 5110 //******************************************************************************* 5111 //in IA SIMD no greater-than-or-equal comparison for integers, 5112 // there is greater-than available only, so we need the following tricks 5113 5114 _NEON2SSESTORAGE uint8x8_t vcge_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 5115 _NEON2SSE_INLINE int8x8_t vcge_s8(int8x8_t a, int8x8_t b) 5116 { 5117 int8x8_t res64; 5118 return64(vcgeq_s8(_pM128i(a), _pM128i(b))); 5119 } 5120 5121 5122 _NEON2SSESTORAGE uint16x4_t vcge_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 5123 _NEON2SSE_INLINE int16x4_t vcge_s16(int16x4_t a, int16x4_t b) 5124 { 5125 int16x4_t res64; 5126 return64(vcgeq_s16(_pM128i(a), _pM128i(b))); 5127 } 5128 5129 5130 _NEON2SSESTORAGE uint32x2_t vcge_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 5131 _NEON2SSE_INLINE int32x2_t vcge_s32(int32x2_t a, int32x2_t b) 5132 { 5133 int32x2_t res64; 5134 return64(vcgeq_s32(_pM128i(a), _pM128i(b))); 5135 } 5136 5137 5138 _NEON2SSESTORAGE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0 5139 _NEON2SSE_INLINE uint32x2_t vcge_f32(float32x2_t a, float32x2_t b) 5140 { 5141 uint32x2_t res64; 5142 __m128 res; 5143 res = _mm_cmpge_ps(_pM128(a),_pM128(b)); //use only 2 first entries 5144 return64f(res); 5145 } 5146 5147 _NEON2SSESTORAGE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 5148 _NEON2SSE_INLINE uint8x8_t vcge_u8(uint8x8_t a, uint8x8_t b) 5149 { 5150 uint8x8_t res64; 5151 return64(vcgeq_u8(_pM128i(a), _pM128i(b))); 5152 } 5153 5154 5155 _NEON2SSESTORAGE uint16x4_t vcge_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 5156 _NEON2SSE_INLINE uint16x4_t vcge_u16(uint16x4_t a, 
uint16x4_t b) 5157 { 5158 uint16x4_t res64; 5159 return64(vcgeq_u16(_pM128i(a), _pM128i(b))); 5160 } 5161 5162 5163 _NEON2SSESTORAGE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 5164 _NEON2SSE_INLINE uint32x2_t vcge_u32(uint32x2_t a, uint32x2_t b) 5165 { 5166 //serial solution looks faster 5167 uint32x2_t res64; 5168 return64(vcgeq_u32 (_pM128i(a), _pM128i(b))); 5169 } 5170 5171 5172 5173 _NEON2SSESTORAGE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 5174 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 5175 { 5176 __m128i m1, m2; 5177 m1 = _mm_cmpgt_epi8 ( a, b); 5178 m2 = _mm_cmpeq_epi8 ( a, b); 5179 return _mm_or_si128 ( m1, m2); 5180 } 5181 5182 _NEON2SSESTORAGE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 5183 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 5184 { 5185 __m128i m1, m2; 5186 m1 = _mm_cmpgt_epi16 ( a, b); 5187 m2 = _mm_cmpeq_epi16 ( a, b); 5188 return _mm_or_si128 ( m1,m2); 5189 } 5190 5191 _NEON2SSESTORAGE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 5192 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 5193 { 5194 __m128i m1, m2; 5195 m1 = _mm_cmpgt_epi32 (a, b); 5196 m2 = _mm_cmpeq_epi32 (a, b); 5197 return _mm_or_si128 (m1, m2); 5198 } 5199 5200 _NEON2SSESTORAGE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 5201 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b) 5202 { 5203 __m128 res; 5204 res = _mm_cmpge_ps(a,b); //use only 2 first entries 5205 return *(__m128i*)&res; 5206 } 5207 5208 _NEON2SSESTORAGE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 5209 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 5210 { 5211 //no unsigned chars comparison, only signed available,so need the trick 5212 __m128i cmp; 5213 cmp = _mm_max_epu8(a, b); 5214 return _mm_cmpeq_epi8(cmp, a); //a>=b 5215 } 5216 5217 _NEON2SSESTORAGE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 5218 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 5219 { 5220 //no unsigned shorts comparison, only signed available,so need the trick 5221 #ifdef USE_SSE4 5222 __m128i cmp; 5223 cmp = _mm_max_epu16(a, b); 5224 return _mm_cmpeq_epi16(cmp, a); //a>=b 5225 #else 5226 __m128i as, mask; 5227 __m128i zero = _mm_setzero_si128(); 5228 __m128i cffff = _mm_set1_epi16(0xffff); 5229 as = _mm_subs_epu16(b,a); 5230 mask = _mm_cmpgt_epi16(as, zero); 5231 return _mm_xor_si128 ( mask, cffff); 5232 #endif 5233 } 5234 5235 _NEON2SSESTORAGE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 5236 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 5237 { 5238 //no unsigned ints comparison, only signed available,so need the trick 5239 #ifdef USE_SSE4 5240 __m128i cmp; 5241 cmp = _mm_max_epu32(a, b); 5242 return _mm_cmpeq_epi32(cmp, a); //a>=b 5243 #else 5244 //serial solution may be faster 5245 __m128i c80000000, as, bs, m1, m2; 5246 c80000000 = _mm_set1_epi32 (0x80000000); 5247 as = _mm_sub_epi32(a,c80000000); 5248 bs = _mm_sub_epi32(b,c80000000); 5249 m1 = _mm_cmpgt_epi32 (as, bs); 5250 m2 = _mm_cmpeq_epi32 (as, bs); 5251 return _mm_or_si128 ( m1, m2); 5252 #endif 5253 } 5254 5255 //**********************Vector compare less-than or equal****************************** 5256 
//*************************************************************************************** 5257 //in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks 5258 5259 _NEON2SSESTORAGE uint8x8_t vcle_s8(int8x8_t a, int8x8_t b); // VCGE.S8 d0, d0, d0 5260 _NEON2SSE_INLINE int8x8_t vcle_s8(int8x8_t a, int8x8_t b) 5261 { 5262 int8x8_t res64; 5263 return64(vcleq_s8(_pM128i(a), _pM128i(b))); 5264 } 5265 5266 5267 _NEON2SSESTORAGE uint16x4_t vcle_s16(int16x4_t a, int16x4_t b); // VCGE.S16 d0, d0, d0 5268 _NEON2SSE_INLINE int16x4_t vcle_s16(int16x4_t a, int16x4_t b) 5269 { 5270 int16x4_t res64; 5271 return64(vcleq_s16(_pM128i(a), _pM128i(b))); 5272 } 5273 5274 5275 _NEON2SSESTORAGE uint32x2_t vcle_s32(int32x2_t a, int32x2_t b); // VCGE.S32 d0, d0, d0 5276 _NEON2SSE_INLINE int32x2_t vcle_s32(int32x2_t a, int32x2_t b) 5277 { 5278 int32x2_t res64; 5279 return64(vcleq_s32(_pM128i(a), _pM128i(b))); 5280 } 5281 5282 5283 _NEON2SSESTORAGE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b); // VCGE.F32 d0, d0, d0? 5284 _NEON2SSE_INLINE uint32x2_t vcle_f32(float32x2_t a, float32x2_t b) 5285 { 5286 uint32x2_t res64; 5287 __m128 res; 5288 res = _mm_cmple_ps(_pM128(a),_pM128(b)); 5289 return64f(res); 5290 } 5291 5292 _NEON2SSESTORAGE uint8x8_t vcle_u8(uint8x8_t a, uint8x8_t b); // VCGE.U8 d0, d0, d0 5293 #define vcle_u8(a,b) vcge_u8(b,a) 5294 5295 5296 _NEON2SSESTORAGE uint16x4_t vcle_u16(uint16x4_t a, uint16x4_t b); // VCGE.s16 d0, d0, d0 5297 #define vcle_u16(a,b) vcge_u16(b,a) 5298 5299 5300 _NEON2SSESTORAGE uint32x2_t vcle_u32(uint32x2_t a, uint32x2_t b); // VCGE.U32 d0, d0, d0 5301 #define vcle_u32(a,b) vcge_u32(b,a) 5302 5303 _NEON2SSESTORAGE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 5304 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 5305 { 5306 __m128i c1, res; 5307 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 5308 res = _mm_cmpgt_epi8 ( a, b); 5309 return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal 5310 } 5311 5312 _NEON2SSESTORAGE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 5313 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 5314 { 5315 __m128i c1, res; 5316 c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff.... 5317 res = _mm_cmpgt_epi16 ( a, b); 5318 return _mm_andnot_si128 (res, c1); 5319 } 5320 5321 _NEON2SSESTORAGE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 5322 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 5323 { 5324 __m128i c1, res; 5325 c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff.... 
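    //a <= b is the complement of a > b, so build the greater-than mask and invert it via andnot below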
5326 res = _mm_cmpgt_epi32 ( a, b); 5327 return _mm_andnot_si128 (res, c1); 5328 } 5329 5330 _NEON2SSESTORAGE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 5331 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b) 5332 { 5333 __m128 res; 5334 res = _mm_cmple_ps(a,b); 5335 return *(__m128i*)&res; 5336 } 5337 5338 _NEON2SSESTORAGE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 5339 #ifdef USE_SSE4 5340 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 5341 { 5342 //no unsigned chars comparison in SSE, only signed available,so need the trick 5343 __m128i cmp; 5344 cmp = _mm_min_epu8(a, b); 5345 return _mm_cmpeq_epi8(cmp, a); //a<=b 5346 } 5347 #else 5348 # define vcleq_u8(a,b) vcgeq_u8(b,a) 5349 #endif 5350 5351 5352 _NEON2SSESTORAGE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 5353 #ifdef USE_SSE4 5354 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 5355 { 5356 //no unsigned shorts comparison in SSE, only signed available,so need the trick 5357 __m128i cmp; 5358 cmp = _mm_min_epu16(a, b); 5359 return _mm_cmpeq_epi16(cmp, a); //a<=b 5360 } 5361 #else 5362 # define vcleq_u16(a,b) vcgeq_u16(b,a) 5363 #endif 5364 5365 5366 _NEON2SSESTORAGE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 5367 #ifdef USE_SSE4 5368 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 5369 { 5370 //no unsigned chars comparison in SSE, only signed available,so need the trick 5371 __m128i cmp; 5372 cmp = _mm_min_epu32(a, b); 5373 return _mm_cmpeq_epi32(cmp, a); //a<=b 5374 } 5375 #else 5376 //solution may be not optimal compared with the serial one 5377 # define vcleq_u32(a,b) vcgeq_u32(b,a) 5378 #endif 5379 5380 5381 //****** Vector compare greater-than ****************************************** 5382 //************************************************************************** 5383 _NEON2SSESTORAGE uint8x8_t vcgt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 5384 _NEON2SSE_INLINE int8x8_t vcgt_s8(int8x8_t a, int8x8_t b) 5385 { 5386 int8x8_t res64; 5387 return64(_mm_cmpgt_epi8(_pM128i(a),_pM128i(b))); 5388 } 5389 5390 5391 _NEON2SSESTORAGE uint16x4_t vcgt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 5392 _NEON2SSE_INLINE int16x4_t vcgt_s16(int16x4_t a, int16x4_t b) 5393 { 5394 int16x4_t res64; 5395 return64(_mm_cmpgt_epi16(_pM128i(a),_pM128i(b))); 5396 } 5397 5398 5399 _NEON2SSESTORAGE uint32x2_t vcgt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 5400 _NEON2SSE_INLINE int32x2_t vcgt_s32(int32x2_t a, int32x2_t b) 5401 { 5402 int32x2_t res64; 5403 return64(_mm_cmpgt_epi32(_pM128i(a),_pM128i(b))); 5404 } 5405 5406 5407 _NEON2SSESTORAGE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 5408 _NEON2SSE_INLINE uint32x2_t vcgt_f32(float32x2_t a, float32x2_t b) 5409 { 5410 uint32x2_t res64; 5411 __m128 res; 5412 res = _mm_cmpgt_ps(_pM128(a),_pM128(b)); //use only 2 first entries 5413 return64f(res); 5414 } 5415 5416 _NEON2SSESTORAGE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 5417 _NEON2SSE_INLINE uint8x8_t vcgt_u8(uint8x8_t a, uint8x8_t b) 5418 { 5419 uint8x8_t res64; 5420 return64(vcgtq_u8(_pM128i(a), _pM128i(b))); 5421 } 5422 5423 5424 _NEON2SSESTORAGE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 5425 _NEON2SSE_INLINE uint16x4_t vcgt_u16(uint16x4_t a, uint16x4_t b) 5426 { 5427 uint16x4_t res64; 5428 
return64(vcgtq_u16(_pM128i(a), _pM128i(b))); 5429 } 5430 5431 5432 _NEON2SSESTORAGE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 5433 _NEON2SSE_INLINE uint32x2_t vcgt_u32(uint32x2_t a, uint32x2_t b) 5434 { 5435 uint32x2_t res64; 5436 return64(vcgtq_u32(_pM128i(a), _pM128i(b))); 5437 } 5438 5439 5440 _NEON2SSESTORAGE uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 5441 #define vcgtq_s8 _mm_cmpgt_epi8 5442 5443 _NEON2SSESTORAGE uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 5444 #define vcgtq_s16 _mm_cmpgt_epi16 5445 5446 _NEON2SSESTORAGE uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 5447 #define vcgtq_s32 _mm_cmpgt_epi32 5448 5449 _NEON2SSESTORAGE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 5450 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) 5451 { 5452 __m128 res; 5453 res = _mm_cmpgt_ps(a,b); //use only 2 first entries 5454 return *(__m128i*)&res; 5455 } 5456 5457 _NEON2SSESTORAGE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 5458 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0 5459 { 5460 //no unsigned chars comparison, only signed available,so need the trick 5461 __m128i as; 5462 __m128i zero = _mm_setzero_si128(); 5463 as = _mm_subs_epu8(a, b); 5464 return _mm_cmpgt_epi8(as, zero); 5465 } 5466 5467 _NEON2SSESTORAGE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 5468 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0 5469 { 5470 //no unsigned short comparison, only signed available,so need the trick 5471 __m128i as; 5472 __m128i zero = _mm_setzero_si128(); 5473 as = _mm_subs_epu16(a, b); 5474 return _mm_cmpgt_epi16(as, zero); 5475 } 5476 5477 _NEON2SSESTORAGE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 5478 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b) // VCGT.U32 q0, q0, q0 5479 { 5480 //no unsigned int comparison, only signed available,so need the trick 5481 __m128i c80000000, as, bs; 5482 c80000000 = _mm_set1_epi32 (0x80000000); 5483 as = _mm_sub_epi32(a,c80000000); 5484 bs = _mm_sub_epi32(b,c80000000); 5485 return _mm_cmpgt_epi32 ( as, bs); 5486 } 5487 5488 //********************* Vector compare less-than ************************** 5489 //************************************************************************* 5490 _NEON2SSESTORAGE uint8x8_t vclt_s8(int8x8_t a, int8x8_t b); // VCGT.S8 d0, d0, d0 5491 #define vclt_s8(a,b) vcgt_s8(b,a) //swap the arguments!! 5492 5493 5494 _NEON2SSESTORAGE uint16x4_t vclt_s16(int16x4_t a, int16x4_t b); // VCGT.S16 d0, d0, d0 5495 #define vclt_s16(a,b) vcgt_s16(b,a) //swap the arguments!! 5496 5497 5498 _NEON2SSESTORAGE uint32x2_t vclt_s32(int32x2_t a, int32x2_t b); // VCGT.S32 d0, d0, d0 5499 #define vclt_s32(a,b) vcgt_s32(b,a) //swap the arguments!! 5500 5501 5502 _NEON2SSESTORAGE uint32x2_t vclt_f32(float32x2_t a, float32x2_t b); // VCGT.F32 d0, d0, d0 5503 #define vclt_f32(a,b) vcgt_f32(b, a) //swap the arguments!! 5504 5505 _NEON2SSESTORAGE uint8x8_t vclt_u8(uint8x8_t a, uint8x8_t b); // VCGT.U8 d0, d0, d0 5506 #define vclt_u8(a,b) vcgt_u8(b,a) //swap the arguments!! 5507 5508 _NEON2SSESTORAGE uint16x4_t vclt_u16(uint16x4_t a, uint16x4_t b); // VCGT.s16 d0, d0, d0 5509 #define vclt_u16(a,b) vcgt_u16(b,a) //swap the arguments!! 
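//--- Usage sketch (illustrative only, not part of the original mapping). The compare intrinsics above
//return per-element masks: all ones where the condition holds, all zeros otherwise, just like the NEON
//VCGT/VCGE/VCLT results, so they combine directly with the SSE bitwise intrinsics for branch-free selection.
//The helper name below is hypothetical and the block is guarded by #if 0, so it is never compiled.
#if 0
_NEON2SSE_INLINE int32x4_t neon2sse_example_clamp_to_limit(int32x4_t v, int32x4_t limit)
{
    __m128i gt_mask = vcgtq_s32(v, limit); //all ones in the lanes where v > limit
    //keep limit in the flagged lanes and v everywhere else
    return _mm_or_si128(_mm_and_si128(gt_mask, limit), _mm_andnot_si128(gt_mask, v));
}
#endif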
5510 5511 _NEON2SSESTORAGE uint32x2_t vclt_u32(uint32x2_t a, uint32x2_t b); // VCGT.U32 d0, d0, d0 5512 #define vclt_u32(a,b) vcgt_u32(b,a) //swap the arguments!! 5513 5514 _NEON2SSESTORAGE uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 5515 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!! 5516 5517 _NEON2SSESTORAGE uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 5518 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!! 5519 5520 _NEON2SSESTORAGE uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 5521 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!! 5522 5523 _NEON2SSESTORAGE uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 5524 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!! 5525 5526 _NEON2SSESTORAGE uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 5527 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!! 5528 5529 _NEON2SSESTORAGE uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 5530 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!! 5531 5532 _NEON2SSESTORAGE uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 5533 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!! 5534 5535 //*****************Vector compare absolute greater-than or equal ************ 5536 //*************************************************************************** 5537 _NEON2SSESTORAGE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 5538 _NEON2SSE_INLINE uint32x2_t vcage_f32(float32x2_t a, float32x2_t b) 5539 { 5540 uint32x2_t res64; 5541 __m128i c7fffffff; 5542 __m128 a0, b0; 5543 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5544 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); 5545 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); 5546 a0 = _mm_cmpge_ps ( a0, b0); 5547 return64f(a0); 5548 } 5549 5550 _NEON2SSESTORAGE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 5551 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 5552 { 5553 __m128i c7fffffff; 5554 __m128 a0, b0; 5555 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5556 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 5557 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 5558 a0 = _mm_cmpge_ps ( a0, b0); 5559 return (*(__m128i*)&a0); 5560 } 5561 5562 //********Vector compare absolute less-than or equal ****************** 5563 //******************************************************************** 5564 _NEON2SSESTORAGE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b); // VACGE.F32 d0, d0, d0 5565 _NEON2SSE_INLINE uint32x2_t vcale_f32(float32x2_t a, float32x2_t b) 5566 { 5567 uint32x2_t res64; 5568 __m128i c7fffffff; 5569 __m128 a0, b0; 5570 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5571 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); 5572 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); 5573 a0 = _mm_cmple_ps (a0, b0); 5574 return64f(a0); 5575 } 5576 5577 _NEON2SSESTORAGE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 5578 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 5579 { 5580 __m128i c7fffffff; 5581 __m128 a0, b0; 5582 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5583 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 5584 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 5585 a0 = _mm_cmple_ps (a0, b0); 5586 return (*(__m128i*)&a0); 5587 } 5588 5589 //******** Vector compare absolute 
greater-than ****************** 5590 //****************************************************************** 5591 _NEON2SSESTORAGE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 5592 _NEON2SSE_INLINE uint32x2_t vcagt_f32(float32x2_t a, float32x2_t b) 5593 { 5594 uint32x2_t res64; 5595 __m128i c7fffffff; 5596 __m128 a0, b0; 5597 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5598 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); 5599 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); 5600 a0 = _mm_cmpgt_ps (a0, b0); 5601 return64f(a0); 5602 } 5603 5604 _NEON2SSESTORAGE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 5605 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 5606 { 5607 __m128i c7fffffff; 5608 __m128 a0, b0; 5609 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5610 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 5611 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 5612 a0 = _mm_cmpgt_ps (a0, b0); 5613 return (*(__m128i*)&a0); 5614 } 5615 5616 //***************Vector compare absolute less-than *********************** 5617 //************************************************************************* 5618 _NEON2SSESTORAGE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b); // VACGT.F32 d0, d0, d0 5619 _NEON2SSE_INLINE uint32x2_t vcalt_f32(float32x2_t a, float32x2_t b) 5620 { 5621 uint32x2_t res64; 5622 __m128i c7fffffff; 5623 __m128 a0, b0; 5624 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5625 a0 = _mm_and_ps (_pM128(a), *(__m128*)&c7fffffff); 5626 b0 = _mm_and_ps (_pM128(b), *(__m128*)&c7fffffff); 5627 a0 = _mm_cmplt_ps (a0, b0); 5628 return64f(a0); 5629 } 5630 5631 _NEON2SSESTORAGE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 5632 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 5633 { 5634 __m128i c7fffffff; 5635 __m128 a0, b0; 5636 c7fffffff = _mm_set1_epi32 (0x7fffffff); 5637 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 5638 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 5639 a0 = _mm_cmplt_ps (a0, b0); 5640 return (*(__m128i*)&a0); 5641 } 5642 5643 //*************************Vector test bits************************************ 5644 //***************************************************************************** 5645 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them 5646 with the corresponding element of a second vector. If the result is not zero, the 5647 corresponding element in the destination vector is set to all ones. Otherwise, it is set to 5648 all zeros. 
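For example, testing a byte lane 0x0F against 0xF0 gives 0x00 (no common bits set), while testing 0x0F against 0x01 gives 0xFF.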
*/ 5649 5650 _NEON2SSESTORAGE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b); // VTST.8 d0, d0, d0 5651 _NEON2SSE_INLINE uint8x8_t vtst_s8(int8x8_t a, int8x8_t b) 5652 { 5653 int8x8_t res64; 5654 return64(vtstq_s8(_pM128i(a), _pM128i(b))); 5655 } 5656 5657 5658 _NEON2SSESTORAGE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b); // VTST.16 d0, d0, d0 5659 _NEON2SSE_INLINE uint16x4_t vtst_s16(int16x4_t a, int16x4_t b) 5660 { 5661 int16x4_t res64; 5662 return64(vtstq_s16(_pM128i(a), _pM128i(b))); 5663 } 5664 5665 5666 _NEON2SSESTORAGE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b); // VTST.32 d0, d0, d0 5667 _NEON2SSE_INLINE uint32x2_t vtst_s32(int32x2_t a, int32x2_t b) 5668 { 5669 int32x2_t res64; 5670 return64(vtstq_s32(_pM128i(a), _pM128i(b))); 5671 } 5672 5673 5674 _NEON2SSESTORAGE uint8x8_t vtst_u8(uint8x8_t a, uint8x8_t b); // VTST.8 d0, d0, d0 5675 #define vtst_u8 vtst_s8 5676 5677 _NEON2SSESTORAGE uint16x4_t vtst_u16(uint16x4_t a, uint16x4_t b); // VTST.16 d0, d0, d0 5678 #define vtst_u16 vtst_s16 5679 5680 _NEON2SSESTORAGE uint32x2_t vtst_u32(uint32x2_t a, uint32x2_t b); // VTST.32 d0, d0, d0 5681 #define vtst_u32 vtst_s32 5682 5683 5684 _NEON2SSESTORAGE uint8x8_t vtst_p8(poly8x8_t a, poly8x8_t b); // VTST.8 d0, d0, d0 5685 #define vtst_p8 vtst_u8 5686 5687 _NEON2SSESTORAGE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 5688 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0 5689 { 5690 __m128i zero, one, res; 5691 zero = _mm_setzero_si128 (); 5692 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff 5693 res = _mm_and_si128 (a, b); 5694 res = _mm_cmpeq_epi8 (res, zero); 5695 return _mm_xor_si128(res, one); //invert result 5696 } 5697 5698 _NEON2SSESTORAGE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 5699 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0 5700 { 5701 __m128i zero, one, res; 5702 zero = _mm_setzero_si128 (); 5703 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff 5704 res = _mm_and_si128 (a, b); 5705 res = _mm_cmpeq_epi16 (res, zero); 5706 return _mm_xor_si128(res, one); //invert result 5707 } 5708 5709 _NEON2SSESTORAGE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 5710 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0 5711 { 5712 __m128i zero, one, res; 5713 zero = _mm_setzero_si128 (); 5714 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff 5715 res = _mm_and_si128 (a, b); 5716 res = _mm_cmpeq_epi32 (res, zero); 5717 return _mm_xor_si128(res, one); //invert result 5718 } 5719 5720 _NEON2SSESTORAGE uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 5721 #define vtstq_u8 vtstq_s8 5722 5723 _NEON2SSESTORAGE uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 5724 #define vtstq_u16 vtstq_s16 5725 5726 _NEON2SSESTORAGE uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 5727 #define vtstq_u32 vtstq_s32 5728 5729 _NEON2SSESTORAGE uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 5730 #define vtstq_p8 vtstq_u8 5731 5732 //****************** Absolute difference ******************** 5733 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |***** 5734 //************************************************************ 5735 _NEON2SSESTORAGE int8x8_t vabd_s8(int8x8_t a, int8x8_t b); // VABD.S8 d0,d0,d0 5736 _NEON2SSE_INLINE int8x8_t vabd_s8(int8x8_t a, int8x8_t b) 5737 { 5738 int8x8_t res64; 5739 return64(vabdq_s8(_pM128i(a), 
_pM128i(b))); 5740 } 5741 5742 _NEON2SSESTORAGE int16x4_t vabd_s16(int16x4_t a, int16x4_t b); // VABD.S16 d0,d0,d0 5743 _NEON2SSE_INLINE int16x4_t vabd_s16(int16x4_t a, int16x4_t b) 5744 { 5745 int16x4_t res64; 5746 return64(vabdq_s16(_pM128i(a), _pM128i(b))); 5747 } 5748 5749 _NEON2SSESTORAGE int32x2_t vabd_s32(int32x2_t a, int32x2_t b); // VABD.S32 d0,d0,d0 5750 _NEON2SSE_INLINE int32x2_t vabd_s32(int32x2_t a, int32x2_t b) 5751 {//need to deal with an intermediate overflow 5752 int32x2_t res; 5753 res.m64_i32[0] = (a.m64_i32[0] > b.m64_i32[0]) ? a.m64_i32[0] - b.m64_i32[0]: b.m64_i32[0] - a.m64_i32[0]; 5754 res.m64_i32[1] = (a.m64_i32[1] > b.m64_i32[1]) ? a.m64_i32[1] - b.m64_i32[1]: b.m64_i32[1] - a.m64_i32[1]; 5755 return res; 5756 } 5757 5758 _NEON2SSESTORAGE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b); // VABD.U8 d0,d0,d0 5759 _NEON2SSE_INLINE uint8x8_t vabd_u8(uint8x8_t a, uint8x8_t b) 5760 { 5761 uint8x8_t res64; 5762 return64(vabdq_u8(_pM128i(a), _pM128i(b))); 5763 } 5764 5765 _NEON2SSESTORAGE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b); // VABD.s16 d0,d0,d0 5766 _NEON2SSE_INLINE uint16x4_t vabd_u16(uint16x4_t a, uint16x4_t b) 5767 { 5768 uint16x4_t res64; 5769 return64(vabdq_u16(_pM128i(a), _pM128i(b))); 5770 } 5771 5772 _NEON2SSESTORAGE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b); // VABD.U32 d0,d0,d0 5773 _NEON2SSE_INLINE uint32x2_t vabd_u32(uint32x2_t a, uint32x2_t b) 5774 { 5775 uint32x2_t res64; 5776 return64(vabdq_u32(_pM128i(a), _pM128i(b))); 5777 } 5778 5779 _NEON2SSESTORAGE float32x2_t vabd_f32(float32x2_t a, float32x2_t b); // VABD.F32 d0,d0,d0 5780 _NEON2SSE_INLINE float32x2_t vabd_f32(float32x2_t a, float32x2_t b) 5781 { 5782 float32x4_t res; 5783 __m64_128 res64; 5784 res = vabdq_f32(_pM128(a), _pM128(b)); 5785 _M64f(res64, res); 5786 return res64; 5787 } 5788 5789 _NEON2SSESTORAGE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 5790 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0 5791 { //need to deal with an intermediate overflow 5792 __m128i cmp, difab, difba; 5793 cmp = vcgtq_s8(a,b); 5794 difab = _mm_sub_epi8(a,b); 5795 difba = _mm_sub_epi8(b,a); 5796 difab = _mm_and_si128(cmp, difab); 5797 difba = _mm_andnot_si128(cmp, difba); 5798 return _mm_or_si128(difab, difba); 5799 } 5800 5801 _NEON2SSESTORAGE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 5802 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0 5803 {//need to deal with an intermediate overflow 5804 __m128i cmp, difab, difba; 5805 cmp = vcgtq_s16(a,b); 5806 difab = _mm_sub_epi16(a,b); 5807 difba = _mm_sub_epi16 (b,a); 5808 difab = _mm_and_si128(cmp, difab); 5809 difba = _mm_andnot_si128(cmp, difba); 5810 return _mm_or_si128(difab, difba); 5811 } 5812 5813 _NEON2SSESTORAGE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 5814 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0 5815 {//need to deal with an intermediate overflow 5816 __m128i cmp, difab, difba; 5817 cmp = vcgtq_s32(a,b); 5818 difab = _mm_sub_epi32(a,b); 5819 difba = _mm_sub_epi32(b,a); 5820 difab = _mm_and_si128(cmp, difab); 5821 difba = _mm_andnot_si128(cmp, difba); 5822 return _mm_or_si128(difab, difba); 5823 } 5824 5825 _NEON2SSESTORAGE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 5826 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned 5827 { 5828 __m128i difab, difba; 5829 difab = _mm_subs_epu8(a,b); 5830 difba = 
_mm_subs_epu8 (b,a); 5831 return _mm_or_si128(difab, difba); 5832 } 5833 5834 _NEON2SSESTORAGE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0 5835 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b) 5836 { 5837 __m128i difab, difba; 5838 difab = _mm_subs_epu16(a,b); 5839 difba = _mm_subs_epu16 (b,a); 5840 return _mm_or_si128(difab, difba); 5841 } 5842 5843 _NEON2SSESTORAGE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 5844 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) 5845 { 5846 __m128i cmp, difab, difba; 5847 cmp = vcgtq_u32(a,b); 5848 difab = _mm_sub_epi32(a,b); 5849 difba = _mm_sub_epi32 (b,a); 5850 difab = _mm_and_si128(cmp, difab); 5851 difba = _mm_andnot_si128(cmp, difba); 5852 return _mm_or_si128(difab, difba); 5853 } 5854 5855 _NEON2SSESTORAGE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 5856 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0 5857 { 5858 __m128i c1; 5859 __m128 res; 5860 c1 = _mm_set1_epi32(0x7fffffff); 5861 res = _mm_sub_ps (a, b); 5862 return _mm_and_ps (res, *(__m128*)&c1); 5863 } 5864 5865 //************ Absolute difference - long ************************** 5866 //******************************************************************** 5867 _NEON2SSESTORAGE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b); // VABDL.S8 q0,d0,d0 5868 _NEON2SSE_INLINE int16x8_t vabdl_s8(int8x8_t a, int8x8_t b) // VABDL.S8 q0,d0,d0 5869 { 5870 __m128i a16, b16; 5871 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE4.1, 5872 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, 5873 return vabdq_s16(a16, b16); 5874 5875 } 5876 5877 _NEON2SSESTORAGE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b); // VABDL.S16 q0,d0,d0 5878 _NEON2SSE_INLINE int32x4_t vabdl_s16(int16x4_t a, int16x4_t b) // VABDL.S16 q0,d0,d0 5879 { 5880 __m128i a32, b32; 5881 a32 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE4.1 5882 b32 = _MM_CVTEPI16_EPI32 (_pM128i(b)); //SSE4.1, 5883 return vabdq_s32(a32, b32); 5884 } 5885 5886 _NEON2SSESTORAGE int64x2_t vabdl_s32(int32x2_t a, int32x2_t b); // VABDL.S32 q0,d0,d0 5887 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabdl_s32(int32x2_t a, int32x2_t b),_NEON2SSE_REASON_SLOW_SERIAL) 5888 { 5889 //no optimal SIMD solution, serial looks faster 5890 _NEON2SSE_ALIGN_16 int64_t res[2]; 5891 if(a.m64_i32[0] > b.m64_i32[0]) res[0] = ( int64_t) a.m64_i32[0] - ( int64_t) b.m64_i32[0]; 5892 else res[0] = ( int64_t) b.m64_i32[0] - ( int64_t) a.m64_i32[0]; 5893 if(a.m64_i32[1] > b.m64_i32[1]) res[1] = ( int64_t) a.m64_i32[1] - ( int64_t) b.m64_i32[1]; 5894 else res[1] = ( int64_t) b.m64_i32[1] - ( int64_t) a.m64_i32[1]; 5895 return _mm_load_si128((__m128i*)res); 5896 } 5897 5898 _NEON2SSESTORAGE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b); // VABDL.U8 q0,d0,d0 5899 _NEON2SSE_INLINE uint16x8_t vabdl_u8(uint8x8_t a, uint8x8_t b) 5900 { 5901 __m128i res; 5902 res = vsubl_u8(a,b); 5903 return _mm_abs_epi16(res); 5904 } 5905 5906 _NEON2SSESTORAGE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b); // VABDL.s16 q0,d0,d0 5907 _NEON2SSE_INLINE uint32x4_t vabdl_u16(uint16x4_t a, uint16x4_t b) 5908 { 5909 __m128i res; 5910 res = vsubl_u16(a,b); 5911 return _mm_abs_epi32(res); 5912 } 5913 5914 _NEON2SSESTORAGE uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b); // VABDL.U32 q0,d0,d0 5915 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabdl_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 5916 { 5917 _NEON2SSE_ALIGN_16 uint64_t res[2]; 
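    //no packed 64-bit absolute difference in SSE, so each lane is computed in scalar code below and the pair is reloaded as a single 128-bit vector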
5918 if(a.m64_u32[0] > b.m64_u32[0]) res[0] = ( uint64_t) a.m64_u32[0] - ( uint64_t) b.m64_u32[0]; 5919 else res[0] = ( uint64_t) b.m64_u32[0] - ( uint64_t) a.m64_u32[0]; 5920 if(a.m64_u32[1] > b.m64_u32[1]) res[1] = ( uint64_t) a.m64_u32[1] - ( uint64_t) b.m64_u32[1]; 5921 else res[1] = ( uint64_t) b.m64_u32[1] - ( uint64_t) a.m64_u32[1]; 5922 return _mm_load_si128((__m128i*)res); 5923 } 5924 5925 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ************* 5926 //********************************************************************************************* 5927 _NEON2SSESTORAGE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VABA.S8 d0,d0,d0 5928 _NEON2SSE_INLINE int8x8_t vaba_s8(int8x8_t a, int8x8_t b, int8x8_t c) 5929 { 5930 int8x8_t res64; 5931 return64(vabaq_s8(_pM128i(a),_pM128i(b), _pM128i(c))); 5932 } 5933 5934 _NEON2SSESTORAGE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c); // VABA.S16 d0,d0,d0 5935 _NEON2SSE_INLINE int16x4_t vaba_s16(int16x4_t a, int16x4_t b, int16x4_t c) 5936 { 5937 int16x4_t res64; 5938 return64(vabaq_s16(_pM128i(a), _pM128i(b), _pM128i(c))); 5939 } 5940 5941 _NEON2SSESTORAGE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c); // VABA.S32 d0,d0,d0 5942 _NEON2SSE_INLINE int32x2_t vaba_s32(int32x2_t a, int32x2_t b, int32x2_t c) 5943 { 5944 int32x2_t res64; 5945 return64(vabaq_s32(_pM128i(a), _pM128i(b), _pM128i(c))); 5946 } 5947 5948 _NEON2SSESTORAGE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VABA.U8 d0,d0,d0 5949 _NEON2SSE_INLINE uint8x8_t vaba_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) 5950 { 5951 int8x8_t res64; 5952 return64(vabaq_u8(_pM128i(a),_pM128i(b), _pM128i(c))); 5953 } 5954 5955 5956 _NEON2SSESTORAGE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VABA.s16 d0,d0,d0 5957 _NEON2SSE_INLINE uint16x4_t vaba_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c) 5958 { 5959 int16x4_t res64; 5960 return64(vabaq_u16(_pM128i(a), _pM128i(b), _pM128i(c))); 5961 } 5962 5963 _NEON2SSESTORAGE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VABA.U32 d0,d0,d0 5964 _NEON2SSE_INLINE uint32x2_t vaba_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c) 5965 { 5966 uint32x2_t res64; 5967 return64(vabaq_u32(_pM128i(a), _pM128i(b), _pM128i(c))); 5968 } 5969 5970 _NEON2SSESTORAGE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 5971 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0 5972 { 5973 int8x16_t sub; 5974 sub = vabdq_s8(b, c); 5975 return vaddq_s8( a, sub); 5976 } 5977 5978 _NEON2SSESTORAGE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 5979 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0 5980 { 5981 int16x8_t sub; 5982 sub = vabdq_s16(b, c); 5983 return vaddq_s16( a, sub); 5984 } 5985 5986 _NEON2SSESTORAGE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 5987 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0 5988 { 5989 int32x4_t sub; 5990 sub = vabdq_s32(b, c); 5991 return vaddq_s32( a, sub); 5992 } 5993 5994 _NEON2SSESTORAGE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 5995 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) 5996 { 5997 uint8x16_t sub; 5998 sub = vabdq_u8(b, c); 5999 return vaddq_u8( a, sub); 6000 } 6001 6002 _NEON2SSESTORAGE uint16x8_t 
vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0 6003 _NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) 6004 { 6005 uint16x8_t sub; 6006 sub = vabdq_u16(b, c); 6007 return vaddq_u16( a, sub); 6008 } 6009 6010 _NEON2SSESTORAGE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 6011 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) 6012 { 6013 uint32x4_t sub; 6014 sub = vabdq_u32(b, c); 6015 return vaddq_u32( a, sub); 6016 } 6017 6018 //************** Absolute difference and accumulate - long ******************************** 6019 //************************************************************************************* 6020 _NEON2SSESTORAGE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c); // VABAL.S8 q0,d0,d0 6021 _NEON2SSE_INLINE int16x8_t vabal_s8(int16x8_t a, int8x8_t b, int8x8_t c) // VABAL.S8 q0,d0,d0 6022 { 6023 __m128i b16, c16, res; 6024 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); //SSE4.1, 6025 c16 = _MM_CVTEPI8_EPI16 (_pM128i(c)); //SSE4.1, 6026 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); 6027 return _mm_add_epi16 (a, res); 6028 } 6029 6030 _NEON2SSESTORAGE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c); // VABAL.S16 q0,d0,d0 6031 _NEON2SSE_INLINE int32x4_t vabal_s16(int32x4_t a, int16x4_t b, int16x4_t c) // VABAL.S16 q0,d0,d0 6032 { 6033 __m128i b32, c32, res; 6034 b32 = _MM_CVTEPI16_EPI32(_pM128i(b)); //SSE4.1 6035 c32 = _MM_CVTEPI16_EPI32(_pM128i(c)); //SSE4.1 6036 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); 6037 return _mm_add_epi32 (a, res); 6038 } 6039 6040 _NEON2SSESTORAGE int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c); // VABAL.S32 q0,d0,d0 6041 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (int64x2_t vabal_s32(int64x2_t a, int32x2_t b, int32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) 6042 { 6043 __m128i res; 6044 res = vabdl_s32(b,c); 6045 return _mm_add_epi64(a, res); 6046 } 6047 6048 _NEON2SSESTORAGE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c); // VABAL.U8 q0,d0,d0 6049 _NEON2SSE_INLINE uint16x8_t vabal_u8(uint16x8_t a, uint8x8_t b, uint8x8_t c) 6050 { 6051 __m128i b16, c16, res; 6052 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); //SSE4.1, 6053 c16 = _MM_CVTEPU8_EPI16 (_pM128i(c)); //SSE4.1, 6054 res = _mm_abs_epi16 (_mm_sub_epi16 (b16, c16) ); 6055 return _mm_add_epi16 (a, res); 6056 } 6057 6058 _NEON2SSESTORAGE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c); // VABAL.s16 q0,d0,d0 6059 _NEON2SSE_INLINE uint32x4_t vabal_u16(uint32x4_t a, uint16x4_t b, uint16x4_t c) 6060 { 6061 __m128i b32, c32, res; 6062 b32 = _MM_CVTEPU16_EPI32(_pM128i(b)); //SSE4.1 6063 c32 = _MM_CVTEPU16_EPI32(_pM128i(c)); //SSE4.1 6064 res = _mm_abs_epi32 (_mm_sub_epi32 (b32, c32) ); 6065 return _mm_add_epi32 (a, res); 6066 } 6067 6068 _NEON2SSESTORAGE uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c); // VABAL.U32 q0,d0,d0 6069 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING (uint64x2_t vabal_u32(uint64x2_t a, uint32x2_t b, uint32x2_t c), _NEON2SSE_REASON_SLOW_SERIAL) 6070 { 6071 __m128i res; 6072 res = vabdl_u32(b,c); 6073 return _mm_add_epi64(a, res); 6074 } 6075 6076 //*********************************************************************************** 6077 //**************** Maximum and minimum operations ********************************** 6078 //*********************************************************************************** 6079 //************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? 
Va[i] : Vb[i] ******* 6080 //*********************************************************************************** 6081 _NEON2SSESTORAGE int8x8_t vmax_s8(int8x8_t a, int8x8_t b); // VMAX.S8 d0,d0,d0 6082 _NEON2SSE_INLINE int8x8_t vmax_s8(int8x8_t a, int8x8_t b) 6083 { 6084 int8x8_t res64; 6085 __m128i res; 6086 res = _MM_MAX_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits 6087 return64(res); 6088 } 6089 6090 _NEON2SSESTORAGE int16x4_t vmax_s16(int16x4_t a, int16x4_t b); // VMAX.S16 d0,d0,d0 6091 _NEON2SSE_INLINE int16x4_t vmax_s16(int16x4_t a, int16x4_t b) 6092 { 6093 int16x4_t res64; 6094 return64(_mm_max_epi16(_pM128i(a),_pM128i(b))); 6095 } 6096 6097 _NEON2SSESTORAGE int32x2_t vmax_s32(int32x2_t a, int32x2_t b); // VMAX.S32 d0,d0,d0 6098 _NEON2SSE_INLINE int32x2_t vmax_s32(int32x2_t a, int32x2_t b) 6099 { 6100 int32x2_t res64; 6101 __m128i res; 6102 res = _MM_MAX_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits 6103 return64(res); 6104 } 6105 6106 _NEON2SSESTORAGE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b); // VMAX.U8 d0,d0,d0 6107 _NEON2SSE_INLINE uint8x8_t vmax_u8(uint8x8_t a, uint8x8_t b) 6108 { 6109 uint8x8_t res64; 6110 return64(_mm_max_epu8(_pM128i(a),_pM128i(b))); 6111 } 6112 6113 6114 _NEON2SSESTORAGE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b); // VMAX.s16 d0,d0,d0 6115 _NEON2SSE_INLINE uint16x4_t vmax_u16(uint16x4_t a, uint16x4_t b) 6116 { 6117 uint16x4_t res64; 6118 return64(_MM_MAX_EPU16(_pM128i(a),_pM128i(b))); 6119 } 6120 6121 6122 _NEON2SSESTORAGE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b); // VMAX.U32 d0,d0,d0 6123 _NEON2SSE_INLINE uint32x2_t vmax_u32(uint32x2_t a, uint32x2_t b) 6124 { 6125 uint32x2_t res64; 6126 __m128i res; 6127 res = _MM_MAX_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial 6128 return64(res); 6129 } 6130 6131 _NEON2SSESTORAGE float32x2_t vmax_f32(float32x2_t a, float32x2_t b); // VMAX.F32 d0,d0,d0 6132 _NEON2SSE_INLINE float32x2_t vmax_f32(float32x2_t a, float32x2_t b) 6133 { 6134 //serial solution looks faster than SIMD one 6135 float32x2_t res; 6136 res.m64_f32[0] = (a.m64_f32[0] > b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; 6137 res.m64_f32[1] = (a.m64_f32[1] > b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; 6138 return res; 6139 } 6140 6141 _NEON2SSESTORAGE int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 6142 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1 6143 6144 _NEON2SSESTORAGE int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 6145 #define vmaxq_s16 _mm_max_epi16 6146 6147 _NEON2SSESTORAGE int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 6148 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1 6149 6150 _NEON2SSESTORAGE uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 6151 #define vmaxq_u8 _mm_max_epu8 6152 6153 _NEON2SSESTORAGE uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0 6154 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1 6155 6156 _NEON2SSESTORAGE uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 6157 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1 6158 6159 6160 _NEON2SSESTORAGE float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 6161 #define vmaxq_f32 _mm_max_ps 6162 6163 6164 _NEON2SSESTORAGE float64x2_t vmaxq_f64(float64x2_t a, float64x2_t b); // VMAX.F64 q0,q0,q0 6165 #define vmaxq_f64 _mm_max_pd 6166 6167 6168 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] ******************************** 6169 //*********************************************************************************************************** 6170 _NEON2SSESTORAGE int8x8_t vmin_s8(int8x8_t a, int8x8_t b); // VMIN.S8 d0,d0,d0 6171 _NEON2SSE_INLINE int8x8_t vmin_s8(int8x8_t a, int8x8_t b) 6172 { 6173 int8x8_t res64; 6174 __m128i res; 6175 res = _MM_MIN_EPI8(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits 6176 return64(res); 6177 } 6178 6179 _NEON2SSESTORAGE int16x4_t vmin_s16(int16x4_t a, int16x4_t b); // VMIN.S16 d0,d0,d0 6180 _NEON2SSE_INLINE int16x4_t vmin_s16(int16x4_t a, int16x4_t b) 6181 { 6182 int16x4_t res64; 6183 return64(_mm_min_epi16(_pM128i(a),_pM128i(b))); 6184 } 6185 6186 6187 _NEON2SSESTORAGE int32x2_t vmin_s32(int32x2_t a, int32x2_t b); // VMIN.S32 d0,d0,d0 6188 _NEON2SSE_INLINE int32x2_t vmin_s32(int32x2_t a, int32x2_t b) 6189 { 6190 int32x2_t res64; 6191 __m128i res; 6192 res = _MM_MIN_EPI32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits 6193 return64(res); 6194 } 6195 6196 _NEON2SSESTORAGE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b); // VMIN.U8 d0,d0,d0 6197 _NEON2SSE_INLINE uint8x8_t vmin_u8(uint8x8_t a, uint8x8_t b) 6198 { 6199 uint8x8_t res64; 6200 return64(_mm_min_epu8(_pM128i(a),_pM128i(b))); 6201 } 6202 6203 6204 _NEON2SSESTORAGE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b); // VMIN.s16 d0,d0,d0 6205 _NEON2SSE_INLINE uint16x4_t vmin_u16(uint16x4_t a, uint16x4_t b) 6206 { 6207 uint16x4_t res64; 6208 return64(_MM_MIN_EPU16(_pM128i(a),_pM128i(b))); 6209 } 6210 6211 6212 _NEON2SSESTORAGE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b); // VMIN.U32 d0,d0,d0 6213 _NEON2SSE_INLINE uint32x2_t vmin_u32(uint32x2_t a, uint32x2_t b) 6214 { 6215 uint32x2_t res64; 6216 __m128i res; 6217 res = _MM_MIN_EPU32(_pM128i(a),_pM128i(b)); //SSE4.1, use only lower 64 bits, may be not effective compared with serial 6218 return64(res); 6219 } 6220 6221 _NEON2SSESTORAGE float32x2_t vmin_f32(float32x2_t a, float32x2_t b); // VMIN.F32 d0,d0,d0 6222 _NEON2SSE_INLINE float32x2_t vmin_f32(float32x2_t a, float32x2_t b) 6223 { 6224 //serial solution looks faster than SIMD one 6225 float32x2_t res; 6226 res.m64_f32[0] = (a.m64_f32[0] < b.m64_f32[0]) ? a.m64_f32[0] : b.m64_f32[0]; 6227 res.m64_f32[1] = (a.m64_f32[1] < b.m64_f32[1]) ? a.m64_f32[1] : b.m64_f32[1]; 6228 return res; 6229 } 6230 6231 _NEON2SSESTORAGE int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 6232 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1 6233 6234 _NEON2SSESTORAGE int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 6235 #define vminq_s16 _mm_min_epi16 6236 6237 _NEON2SSESTORAGE int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 6238 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1 6239 6240 _NEON2SSESTORAGE uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 6241 #define vminq_u8 _mm_min_epu8 6242 6243 _NEON2SSESTORAGE uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0 6244 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1 6245 6246 _NEON2SSESTORAGE uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 6247 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1 6248 6249 _NEON2SSESTORAGE float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 6250 #define vminq_f32 _mm_min_ps 6251 6252 6253 _NEON2SSESTORAGE float64x2_t vminq_f64(float64x2_t a, float64x2_t b); // VMIN.F64 q0,q0,q0 6254 #define vminq_f64 _mm_min_pd 6255 6256 6257 //************* Pairwise addition operations. 
************************************** 6258 //************************************************************************************ 6259 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector 6260 _NEON2SSESTORAGE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b); // VPADD.I8 d0,d0,d0 6261 _NEON2SSE_INLINE int8x8_t vpadd_s8(int8x8_t a, int8x8_t b) // VPADD.I8 d0,d0,d0 6262 { 6263 //no 8 bit hadd in IA32, need to go to 16 bit and then pack 6264 int8x8_t res64; 6265 __m128i a16, b16, res; 6266 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 6267 b16 = _MM_CVTEPI8_EPI16 (_pM128i(b)); // SSE 4.1 6268 res = _mm_hadd_epi16 (a16, b16); 6269 res = _mm_shuffle_epi8 (res, *(__m128i*) mask8_16_even_odd); //return to 8 bit, use low 64 bits 6270 return64(res); 6271 } 6272 6273 _NEON2SSESTORAGE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b); // VPADD.I16 d0,d0,d0 6274 _NEON2SSE_INLINE int16x4_t vpadd_s16(int16x4_t a, int16x4_t b) 6275 { 6276 int16x4_t res64; 6277 __m128i hadd128; 6278 hadd128 = _mm_hadd_epi16 (_pM128i(a), _pM128i(b)); 6279 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); 6280 return64(hadd128); 6281 } 6282 6283 6284 _NEON2SSESTORAGE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b); // VPADD.I32 d0,d0,d0 6285 _NEON2SSE_INLINE int32x2_t vpadd_s32(int32x2_t a, int32x2_t b) 6286 { 6287 int32x2_t res64; 6288 __m128i hadd128; 6289 hadd128 = _mm_hadd_epi32 (_pM128i(a), _pM128i(b)); 6290 hadd128 = _mm_shuffle_epi32 (hadd128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); 6291 return64(hadd128); 6292 } 6293 6294 6295 _NEON2SSESTORAGE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b); // VPADD.I8 d0,d0,d0 6296 _NEON2SSE_INLINE uint8x8_t vpadd_u8(uint8x8_t a, uint8x8_t b) // VPADD.I8 d0,d0,d0 6297 { 6298 // no 8 bit hadd in IA32, need to go to 16 bit and then pack 6299 uint8x8_t res64; 6300 // no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work 6301 __m128i mask8, a16, b16, res; 6302 mask8 = _mm_set1_epi16(0xff); 6303 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 6304 b16 = _MM_CVTEPU8_EPI16 (_pM128i(b)); // SSE 4.1 6305 res = _mm_hadd_epi16 (a16, b16); 6306 res = _mm_and_si128(res, mask8); //to avoid saturation 6307 res = _mm_packus_epi16 (res,res); //use low 64 bits 6308 return64(res); 6309 } 6310 6311 _NEON2SSESTORAGE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b); // VPADD.I16 d0,d0,d0 6312 _NEON2SSE_INLINE uint16x4_t vpadd_u16(uint16x4_t a, uint16x4_t b) // VPADD.I16 d0,d0,d0 6313 { 6314 // solution may be not optimal, serial execution may be faster 6315 // no unsigned _mm_hadd_ functions in IA32, need to move from unsigned to signed 6316 uint16x4_t res64; 6317 __m128i c32767, cfffe, as, bs, res; 6318 c32767 = _mm_set1_epi16 (32767); 6319 cfffe = _mm_set1_epi16 ((int16_t)0xfffe); 6320 as = _mm_sub_epi16 (_pM128i(a), c32767); 6321 bs = _mm_sub_epi16 (_pM128i(b), c32767); 6322 res = _mm_hadd_epi16 (as, bs); 6323 res = _mm_add_epi16 (res, cfffe); 6324 res = _mm_shuffle_epi32 (res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); 6325 return64(res); 6326 } 6327 6328 _NEON2SSESTORAGE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b); // VPADD.I32 d0,d0,d0 6329 _NEON2SSE_INLINE uint32x2_t vpadd_u32(uint32x2_t a, uint32x2_t b) //serial may be faster 6330 { 6331 //hadd doesn't work for unsigned values 6332 uint32x2_t res64; 6333 __m128i ab, ab_sh, res; 6334 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //a0 a1 b0 b1 6335 ab_sh = _mm_shuffle_epi32(ab, 1 | (0 << 2) | (3 << 4) | (2 << 6)); //a1, a0, b1, 
b0 6336 res = _mm_add_epi32(ab, ab_sh); 6337 res = _mm_shuffle_epi32(res, 0 | (2 << 2) | (1 << 4) | (3 << 6)); 6338 return64(res); 6339 } 6340 6341 _NEON2SSESTORAGE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b); // VPADD.F32 d0,d0,d0 6342 _NEON2SSE_INLINE float32x2_t vpadd_f32(float32x2_t a, float32x2_t b) 6343 { 6344 __m128 hadd128; 6345 __m64_128 res64; 6346 hadd128 = _mm_hadd_ps (_pM128(a), _pM128(b)); 6347 hadd128 = _mm_shuffle_ps (hadd128, hadd128, _MM_SHUFFLE(3,1, 2, 0)); //use low 64 bits 6348 _M64f(res64, hadd128); 6349 return res64; 6350 } 6351 6352 6353 //************************** Long pairwise add ********************************** 6354 //********************************************************************************* 6355 //Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width, 6356 // and places the final results in the destination vector. 6357 6358 _NEON2SSESTORAGE int16x4_t vpaddl_s8(int8x8_t a); // VPADDL.S8 d0,d0 6359 _NEON2SSE_INLINE int16x4_t vpaddl_s8(int8x8_t a) // VPADDL.S8 d0,d0 6360 { 6361 //no 8 bit hadd in IA32, need to go to 16 bit anyway 6362 __m128i a16; 6363 int16x4_t res64; 6364 a16 = _MM_CVTEPI8_EPI16 (_pM128i(a)); // SSE 4.1 6365 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits 6366 return64(a16); 6367 } 6368 6369 _NEON2SSESTORAGE int32x2_t vpaddl_s16(int16x4_t a); // VPADDL.S16 d0,d0 6370 _NEON2SSE_INLINE int32x2_t vpaddl_s16(int16x4_t a) // VPADDL.S16 d0,d0 6371 { 6372 // solution may be not optimal, serial execution may be faster 6373 int32x2_t res64; 6374 __m128i r32_1; 6375 r32_1 = _MM_CVTEPI16_EPI32 (_pM128i(a)); 6376 r32_1 = _mm_hadd_epi32(r32_1, r32_1); //use low 64 bits 6377 return64(r32_1); 6378 } 6379 6380 _NEON2SSESTORAGE int64x1_t vpaddl_s32(int32x2_t a); // VPADDL.S32 d0,d0 6381 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vpaddl_s32(int32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster 6382 { 6383 int64x1_t res; 6384 res.m64_i64[0] = (int64_t)a.m64_i32[0] + (int64_t)a.m64_i32[1]; 6385 return res; 6386 } 6387 6388 _NEON2SSESTORAGE uint16x4_t vpaddl_u8(uint8x8_t a); // VPADDL.U8 d0,d0 6389 _NEON2SSE_INLINE uint16x4_t vpaddl_u8(uint8x8_t a) // VPADDL.U8 d0,d0 6390 { 6391 // no 8 bit hadd in IA32, need to go to 16 bit 6392 // no unsigned _mm_hadd_ functions in IA32, but 8 unsigned is less then 16 signed, so it should work 6393 uint16x4_t res64; 6394 __m128i a16; 6395 a16 = _MM_CVTEPU8_EPI16 (_pM128i(a)); // SSE 4.1 use low 64 bits 6396 a16 = _mm_hadd_epi16 (a16, a16); //use low 64 bits 6397 return64(a16); 6398 } 6399 6400 _NEON2SSESTORAGE uint32x2_t vpaddl_u16(uint16x4_t a); // VPADDL.s16 d0,d0 6401 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpaddl_u16(uint16x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6402 { 6403 //serial solution looks faster than a SIMD one 6404 uint32x2_t res; 6405 res.m64_u32[0] = (uint32_t)a.m64_u16[0] + (uint32_t)a.m64_u16[1]; 6406 res.m64_u32[1] = (uint32_t)a.m64_u16[2] + (uint32_t)a.m64_u16[3]; 6407 return res; 6408 } 6409 6410 _NEON2SSESTORAGE uint64x1_t vpaddl_u32(uint32x2_t a); // VPADDL.U32 d0,d0 6411 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vpaddl_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution looks faster 6412 { 6413 uint64x1_t res; 6414 res.m64_u64[0] = (uint64_t)a.m64_u32[0] + (uint64_t)a.m64_u32[1]; 6415 return res; 6416 } 6417 6418 _NEON2SSESTORAGE int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 6419 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 
q0,q0 6420 { 6421 //no 8 bit hadd in IA32, need to go to 16 bit 6422 __m128i r16_1, r16_2; 6423 r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 6424 //swap hi and low part of r to process the remaining data 6425 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 6426 r16_2 = _MM_CVTEPI8_EPI16 (r16_2); 6427 return _mm_hadd_epi16 (r16_1, r16_2); 6428 } 6429 6430 _NEON2SSESTORAGE int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 6431 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0 6432 { 6433 //no 8 bit hadd in IA32, need to go to 16 bit 6434 __m128i r32_1, r32_2; 6435 r32_1 = _MM_CVTEPI16_EPI32(a); 6436 //swap hi and low part of r to process the remaining data 6437 r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 6438 r32_2 = _MM_CVTEPI16_EPI32 (r32_2); 6439 return _mm_hadd_epi32 (r32_1, r32_2); 6440 } 6441 6442 _NEON2SSESTORAGE int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 6443 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0 6444 { 6445 _NEON2SSE_ALIGN_16 int32_t atmp[4]; 6446 _NEON2SSE_ALIGN_16 int64_t res[2]; 6447 _mm_store_si128((__m128i*)atmp, a); 6448 res[0] = (int64_t)atmp[0] + (int64_t)atmp[1]; 6449 res[1] = (int64_t)atmp[2] + (int64_t)atmp[3]; 6450 return _mm_load_si128((__m128i*)res); 6451 } 6452 6453 _NEON2SSESTORAGE uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 6454 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0 6455 { 6456 //no 8 bit hadd in IA32, need to go to 16 bit 6457 __m128i r16_1, r16_2; 6458 r16_1 = _MM_CVTEPU8_EPI16(a); 6459 //swap hi and low part of r to process the remaining data 6460 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 6461 r16_2 = _MM_CVTEPU8_EPI16 (r16_2); 6462 return _mm_hadd_epi16 (r16_1, r16_2); 6463 } 6464 6465 _NEON2SSESTORAGE uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0 6466 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6467 { 6468 //serial solution looks faster than a SIMD one 6469 _NEON2SSE_ALIGN_16 uint16_t atmp[8]; 6470 _NEON2SSE_ALIGN_16 uint32_t res[4]; 6471 _mm_store_si128((__m128i*)atmp, a); 6472 res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1]; 6473 res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3]; 6474 res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5]; 6475 res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7]; 6476 return _mm_load_si128((__m128i*)res); 6477 } 6478 6479 _NEON2SSESTORAGE uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 6480 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6481 { 6482 _NEON2SSE_ALIGN_16 uint32_t atmp[4]; 6483 _NEON2SSE_ALIGN_16 uint64_t res[2]; 6484 _mm_store_si128((__m128i*)atmp, a); 6485 res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1]; 6486 res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3]; 6487 return _mm_load_si128((__m128i*)res); 6488 } 6489 6490 //************************ Long pairwise add and accumulate ************************** 6491 //**************************************************************************************** 6492 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector, 6493 // and accumulates the values of the results into the elements of the destination (wide) vector 6494 _NEON2SSESTORAGE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b); // VPADAL.S8 d0,d0 6495 _NEON2SSE_INLINE int16x4_t vpadal_s8(int16x4_t a, int8x8_t b) 6496 { 6497 int16x4_t res64; 6498 
return64(vpadalq_s8(_pM128i(a), _pM128i(b))); 6499 } 6500 6501 _NEON2SSESTORAGE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b); // VPADAL.S16 d0,d0 6502 _NEON2SSE_INLINE int32x2_t vpadal_s16(int32x2_t a, int16x4_t b) 6503 { 6504 int32x2_t res64; 6505 return64(vpadalq_s16(_pM128i(a), _pM128i(b))); 6506 } 6507 6508 6509 _NEON2SSESTORAGE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b); // VPADAL.S32 d0,d0 6510 _NEON2SSE_INLINE int64x1_t vpadal_s32(int64x1_t a, int32x2_t b) 6511 { 6512 int64x1_t res; 6513 res.m64_i64[0] = (int64_t)b.m64_i32[0] + (int64_t)b.m64_i32[1] + a.m64_i64[0]; 6514 return res; 6515 } 6516 6517 _NEON2SSESTORAGE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b); // VPADAL.U8 d0,d0 6518 _NEON2SSE_INLINE uint16x4_t vpadal_u8(uint16x4_t a, uint8x8_t b) 6519 { 6520 uint16x4_t res64; 6521 return64(vpadalq_u8(_pM128i(a), _pM128i(b))); 6522 } 6523 6524 6525 _NEON2SSESTORAGE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b); // VPADAL.s16 d0,d0 6526 _NEON2SSE_INLINE uint32x2_t vpadal_u16(uint32x2_t a, uint16x4_t b) 6527 { 6528 uint32x2_t res64; 6529 return64(vpadalq_u16(_pM128i(a), _pM128i(b))); 6530 } 6531 6532 _NEON2SSESTORAGE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b); // VPADAL.U32 d0,d0 6533 _NEON2SSE_INLINE uint64x1_t vpadal_u32(uint64x1_t a, uint32x2_t b) 6534 { 6535 uint64x1_t res; 6536 res.m64_u64[0] = (uint64_t)b.m64_u32[0] + (uint64_t)b.m64_u32[1] + a.m64_u64[0]; 6537 return res; 6538 } 6539 6540 _NEON2SSESTORAGE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 6541 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0 6542 { 6543 int16x8_t pad; 6544 pad = vpaddlq_s8(b); 6545 return _mm_add_epi16 (a, pad); 6546 } 6547 6548 _NEON2SSESTORAGE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 6549 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0 6550 { 6551 int32x4_t pad; 6552 pad = vpaddlq_s16(b); 6553 return _mm_add_epi32(a, pad); 6554 } 6555 6556 _NEON2SSESTORAGE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 6557 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) 6558 { 6559 int64x2_t pad; 6560 pad = vpaddlq_s32(b); 6561 return _mm_add_epi64 (a, pad); 6562 } 6563 6564 _NEON2SSESTORAGE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 6565 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0 6566 { 6567 uint16x8_t pad; 6568 pad = vpaddlq_u8(b); 6569 return _mm_add_epi16 (a, pad); 6570 } 6571 6572 _NEON2SSESTORAGE uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0 6573 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6574 { 6575 uint32x4_t pad; 6576 pad = vpaddlq_u16(b); 6577 return _mm_add_epi32(a, pad); 6578 } //no optimal SIMD solution, serial is faster 6579 6580 _NEON2SSESTORAGE uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 6581 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6582 { 6583 //no optimal SIMD solution, serial is faster 6584 uint64x2_t pad; 6585 pad = vpaddlq_u32(b); 6586 return _mm_add_epi64(a, pad); 6587 } //no optimal SIMD solution, serial is faster 6588 6589 //********** Folding maximum ************************************* 6590 //******************************************************************* 6591 //VPMAX (Vector Pairwise Maximum) compares adjacent 
pairs of elements in two vectors, 6592 //and copies the larger of each pair into the corresponding element in the destination 6593 // no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison 6594 _NEON2SSESTORAGE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b); // VPMAX.S8 d0,d0,d0 6595 _NEON2SSE_INLINE int8x8_t vpmax_s8(int8x8_t a, int8x8_t b) // VPMAX.S8 d0,d0,d0 6596 { 6597 int8x8_t res64; 6598 __m128i ab, ab1, max; 6599 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 6600 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 6601 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6602 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding 6603 max = _MM_MAX_EPI8 (ab, ab1); // SSE4.1 6604 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data 6605 return64(max); //we need 64 bits only 6606 } 6607 6608 _NEON2SSESTORAGE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b); // VPMAX.S16 d0,d0,d0 6609 _NEON2SSE_INLINE int16x4_t vpmax_s16(int16x4_t a, int16x4_t b) // VPMAX.S16 d0,d0,d0 6610 { 6611 //solution may be not optimal compared with the serial one 6612 int16x4_t res64; 6613 __m128i ab, ab1, max; 6614 _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number 6615 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6616 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask 6617 max = _mm_max_epi16 (ab, ab1); 6618 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used 6619 return64(max); 6620 } 6621 6622 _NEON2SSESTORAGE int32x2_t vpmax_s32(int32x2_t a, int32x2_t b); // VPMAX.S32 d0,d0,d0 6623 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmax_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6624 { 6625 //serial solution looks faster than SIMD one 6626 int32x2_t res; 6627 res.m64_i32[0] = (a.m64_i32[0] < a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; 6628 res.m64_i32[1] = (b.m64_i32[0] < b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; 6629 return res; 6630 } 6631 6632 _NEON2SSESTORAGE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b); // VPMAX.U8 d0,d0,d0 6633 _NEON2SSE_INLINE uint8x8_t vpmax_u8(uint8x8_t a, uint8x8_t b) // VPMAX.U8 d0,d0,d0 6634 { 6635 uint8x8_t res64; 6636 __m128i ab, ab1, max; 6637 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 6638 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 6639 ab = _mm_unpacklo_epi64 (_pM128i(a), _pM128i(b)); //ab 6640 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding 6641 max = _mm_max_epu8 (ab, ab1); // SSE4.1 6642 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_odd); //remove repetitive data 6643 return64(max); 6644 } 6645 6646 _NEON2SSESTORAGE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b); // VPMAX.s16 d0,d0,d0 6647 _NEON2SSE_INLINE uint16x4_t vpmax_u16(uint16x4_t a, uint16x4_t b) // VPMAX.s16 d0,d0,d0 6648 { 6649 //solution may be not optimal compared with the serial one 6650 uint16x4_t res64; 6651 __m128i ab, ab1, max; 6652 _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number 6653 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6654 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask 6655 max = _MM_MAX_EPU16 (ab, ab1); 6656 max = _mm_shuffle_epi8 (max, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used 6657 return64(max); 6658 } 6659 6660 _NEON2SSESTORAGE uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b); // VPMAX.U32 d0,d0,d0 6661 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmax_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6662 { 6663 //serial solution looks faster than SIMD one 6664 uint32x2_t res; 6665 res.m64_u32[0] = (a.m64_u32[0] < a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0]; 6666 res.m64_u32[1] = (b.m64_u32[0] < b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0]; 6667 return res; 6668 } 6669 6670 _NEON2SSESTORAGE float32x2_t vpmax_f32(float32x2_t a, float32x2_t b); // VPMAX.F32 d0,d0,d0 6671 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmax_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6672 { 6673 //serial solution looks faster than SIMD one 6674 float32x2_t res; 6675 res.m64_f32[0] = (a.m64_f32[0] < a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; 6676 res.m64_f32[1] = (b.m64_f32[0] < b.m64_f32[1]) ? 
b.m64_f32[1] : b.m64_f32[0]; 6677 return res; 6678 } 6679 6680 // ***************** Folding minimum **************************** 6681 // ************************************************************** 6682 //vpmin -> takes minimum of adjacent pairs 6683 _NEON2SSESTORAGE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b); // VPMIN.S8 d0,d0,d0 6684 _NEON2SSE_INLINE int8x8_t vpmin_s8(int8x8_t a, int8x8_t b) // VPMIN.S8 d0,d0,d0 6685 { 6686 int8x8_t res64; 6687 __m128i ab, ab1, min; 6688 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 6689 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 6690 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6691 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical min finding 6692 min = _MM_MIN_EPI8 (ab, ab1); // SSE4.1 6693 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data 6694 return64(min); 6695 } 6696 6697 _NEON2SSESTORAGE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b); // VPMIN.S16 d0,d0,d0 6698 _NEON2SSE_INLINE int16x4_t vpmin_s16(int16x4_t a, int16x4_t b) // VPMIN.S16 d0,d0,d0 6699 { 6700 //solution may be not optimal compared with the serial one 6701 int16x4_t res64; 6702 __m128i ab, ab1, min; 6703 _NEON2SSE_ALIGN_16 static const int8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number 6704 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6705 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical max finding, use 8bit fn and the corresponding mask 6706 min = _mm_min_epi16 (ab, ab1); 6707 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used 6708 return64(min); 6709 } 6710 6711 _NEON2SSESTORAGE int32x2_t vpmin_s32(int32x2_t a, int32x2_t b); // VPMIN.S32 d0,d0,d0 6712 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vpmin_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6713 { 6714 //serial solution looks faster than SIMD one 6715 int32x2_t res; 6716 res.m64_i32[0] = (a.m64_i32[0] > a.m64_i32[1]) ? a.m64_i32[1] : a.m64_i32[0]; 6717 res.m64_i32[1] = (b.m64_i32[0] > b.m64_i32[1]) ? 
b.m64_i32[1] : b.m64_i32[0]; 6718 return res; 6719 } 6720 6721 _NEON2SSESTORAGE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b); // VPMIN.U8 d0,d0,d0 6722 _NEON2SSE_INLINE uint8x8_t vpmin_u8(uint8x8_t a, uint8x8_t b) // VPMIN.U8 d0,d0,d0 6723 { 6724 uint8x8_t res64; 6725 __m128i ab, ab1, min; 6726 _NEON2SSE_ALIGN_16 static const uint8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 6727 _NEON2SSE_ALIGN_16 static const uint8_t mask8_odd[16] = { 1, 3, 5, 7, 9, 11, 13, 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}; 6728 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6729 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask8_sab); //horisontal pairs swap for vertical max finding 6730 min = _mm_min_epu8 (ab, ab1); // SSE4.1 6731 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_odd); //remove repetitive data 6732 return64(min); 6733 } 6734 6735 _NEON2SSESTORAGE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b); // VPMIN.s16 d0,d0,d0 6736 _NEON2SSE_INLINE uint16x4_t vpmin_u16(uint16x4_t a, uint16x4_t b) // VPMIN.s16 d0,d0,d0 6737 { 6738 //solution may be not optimal compared with the serial one 6739 uint16x4_t res64; 6740 __m128i ab, ab1, min; 6741 _NEON2SSE_ALIGN_16 static const uint8_t mask16_sab[16] = { 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13}; //each chars pair is considerd to be 16 bit number 6742 ab = _mm_unpacklo_epi64 ( _pM128i(a), _pM128i(b)); //ab 6743 ab1 = _mm_shuffle_epi8 (ab, *(__m128i*) mask16_sab); //horisontal pairs swap for vertical min finding, use 8bit fn and the corresponding mask 6744 min = _MM_MIN_EPU16 (ab, ab1); 6745 min = _mm_shuffle_epi8 (min, *(__m128i*) mask8_32_even_odd); //remove repetitive data, only the low part of mask is used 6746 return64(min); 6747 } 6748 6749 _NEON2SSESTORAGE uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b); // VPMIN.U32 d0,d0,d0 6750 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vpmin_u32(uint32x2_t a, uint32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6751 { 6752 //serial solution looks faster than SIMD one 6753 uint32x2_t res; 6754 res.m64_u32[0] = (a.m64_u32[0] > a.m64_u32[1]) ? a.m64_u32[1] : a.m64_u32[0]; 6755 res.m64_u32[1] = (b.m64_u32[0] > b.m64_u32[1]) ? b.m64_u32[1] : b.m64_u32[0]; 6756 return res; 6757 } 6758 6759 _NEON2SSESTORAGE float32x2_t vpmin_f32(float32x2_t a, float32x2_t b); // VPMIN.F32 d0,d0,d0 6760 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vpmin_f32(float32x2_t a, float32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6761 { 6762 //serial solution looks faster than SIMD one 6763 float32x2_t res; 6764 res.m64_f32[0] = (a.m64_f32[0] > a.m64_f32[1]) ? a.m64_f32[1] : a.m64_f32[0]; 6765 res.m64_f32[1] = (b.m64_f32[0] > b.m64_f32[1]) ? 
b.m64_f32[1] : b.m64_f32[0]; 6766 return res; 6767 } 6768 6769 //*************************************************************** 6770 //*********** Reciprocal/Sqrt ************************************ 6771 //*************************************************************** 6772 //****************** Reciprocal estimate ******************************* 6773 //the ARM NEON and x86 SIMD results may be slightly different 6774 _NEON2SSESTORAGE float32x2_t vrecpe_f32(float32x2_t a); // VRECPE.F32 d0,d0 6775 _NEON2SSE_INLINE float32x2_t vrecpe_f32(float32x2_t a) //use low 64 bits 6776 { 6777 float32x4_t res; 6778 __m64_128 res64; 6779 res = _mm_rcp_ps(_pM128(a)); 6780 _M64f(res64, res); 6781 return res64; 6782 } 6783 6784 _NEON2SSESTORAGE uint32x2_t vrecpe_u32(uint32x2_t a); // VRECPE.U32 d0,d0 6785 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrecpe_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6786 { 6787 //Input is fixed point number!!! No reciprocal for ints in IA32 available 6788 uint32x2_t res; 6789 float resf, r; 6790 int i, q, s; 6791 for (i =0; i<2; i++){ 6792 if((a.m64_u32[i] & 0x80000000) == 0) { 6793 res.m64_u32[i] = 0xffffffff; 6794 }else{ 6795 resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); 6796 q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ 6797 r = (float)(1.0 / (((float)q + 0.5) / 512.0)); /* reciprocal r */ 6798 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ 6799 r = (float)s / 256.0; 6800 res.m64_u32[i] = r * (uint32_t)(1 << 31); 6801 } 6802 } 6803 return res; 6804 } 6805 6806 _NEON2SSESTORAGE float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 6807 #define vrecpeq_f32 _mm_rcp_ps 6808 6809 6810 _NEON2SSESTORAGE uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 6811 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6812 { 6813 //Input is fixed point number!!! 
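//Note: the operand is interpreted here as an unsigned 0.32 fixed point fraction (atmp[i] / 2^32);
//per the ARM reference manual, inputs below 0.5 (bit 31 clear) produce 0xffffffff - that case is handled by the mask computed after the loop below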
6814 //We implement the recip_estimate function as described in ARMv7 reference manual (VRECPE instruction) but use float instead of double 6815 _NEON2SSE_ALIGN_16 uint32_t atmp[4]; 6816 _NEON2SSE_ALIGN_16 uint32_t res[4]; 6817 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000,0x80000000, 0x80000000,0x80000000}; 6818 float resf, r; 6819 int i, q, s; 6820 __m128i res128, mask, zero; 6821 _mm_store_si128((__m128i*)atmp, a); 6822 zero = _mm_setzero_si128(); 6823 for (i =0; i<4; i++){ 6824 resf = (atmp[i] * (0.5f / (uint32_t) (1 << 31))); // 2.3283064365386963E-10 ~(0.5f / (uint32_t) (1 << 31)) 6825 q = (int)(resf * 512.0); /* a in units of 1/512 rounded down */ 6826 r = 1.0 / (((float)q + 0.5) / 512.0); /* reciprocal r */ 6827 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ 6828 r = (float)s / 256.0; 6829 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); 6830 } 6831 res128 = _mm_load_si128((__m128i*)res); 6832 mask = _mm_and_si128(a, *(__m128i*)c80000000); 6833 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x7fffffff 6834 return _mm_or_si128(res128, mask); 6835 } 6836 6837 //**********Reciprocal square root estimate **************** 6838 //********************************************************** 6839 //no reciprocal square root for ints in IA32 available, neither for unsigned int to float4 lanes conversion, so a serial solution looks faster 6840 //but the particular implementation for vrsqrte_u32 may vary for various ARM compilers 6841 ////the ARM NEON and x86 SIMD results may be slightly different 6842 _NEON2SSESTORAGE float32x2_t vrsqrte_f32(float32x2_t a); // VRSQRTE.F32 d0,d0 6843 _NEON2SSE_INLINE float32x2_t vrsqrte_f32(float32x2_t a) //use low 64 bits 6844 { 6845 float32x4_t res; 6846 __m64_128 res64; 6847 res = _mm_rsqrt_ps(_pM128(a)); 6848 _M64f(res64, res); 6849 return res64; 6850 } 6851 6852 _NEON2SSESTORAGE uint32x2_t vrsqrte_u32(uint32x2_t a); // VRSQRTE.U32 d0,d0 6853 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrsqrte_u32(uint32x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6854 { 6855 //Input is fixed point number!!! 6856 //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double 6857 uint32x2_t res; 6858 __m128 tmp; 6859 float r, resf, coeff; 6860 int i,q0, s; 6861 for (i =0; i<2; i++){ 6862 if((a.m64_u32[i] & 0xc0000000) == 0) { //a <=0x3fffffff 6863 res.m64_u32[i] = 0xffffffff; 6864 }else{ 6865 resf = (float) (a.m64_u32[i] * (0.5f / (uint32_t)(1 << 31))); 6866 coeff = (resf < 0.5)? 512.0 : 256.0 ; /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ 6867 q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ 6868 r = ((float)q0 + 0.5) / coeff; 6869 tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ 6870 _mm_store_ss(&r, tmp); 6871 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ 6872 r = (float)(s / 256.0); 6873 res.m64_u32[i] = r * (((uint32_t)1) << 31); 6874 } 6875 } 6876 return res; 6877 } 6878 6879 _NEON2SSESTORAGE float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 6880 #define vrsqrteq_f32 _mm_rsqrt_ps 6881 6882 _NEON2SSESTORAGE uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 6883 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrsqrteq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) 6884 { 6885 //Input is fixed point number!!! 
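//Note: as in vrecpeq_u32 above, the operand is an unsigned 0.32 fixed point fraction;
//inputs below 0.25 (bits 31:30 both clear, hence the 0xc0000000 mask below) produce 0xffffffff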
6886 //We implement the recip_sqrt_estimate function as described in ARMv7 reference manual (VRSQRTE instruction) but use float instead of double 6887 _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4]; 6888 _NEON2SSE_ALIGN_16 static const uint32_t c_c0000000[4] = {0xc0000000,0xc0000000, 0xc0000000,0xc0000000}; 6889 __m128 tmp; 6890 __m128i res128, mask, zero; 6891 float r, resf, coeff; 6892 int i,q0, s; 6893 _mm_store_si128((__m128i*)atmp, a); 6894 zero = _mm_setzero_si128(); 6895 for (i =0; i<4; i++){ 6896 resf = (float) (atmp[i] * (0.5f / (uint32_t)(1 << 31))); 6897 coeff = (float)((resf < 0.5)? 512.0 : 256.0); /* range 0.25 <= resf < 0.5 or range 0.5 <= resf < 1.0*/ 6898 q0 = (int)(resf * coeff); /* a in units of 1/512 rounded down */ 6899 r = ((float)q0 + 0.5) / coeff; 6900 tmp = _mm_rsqrt_ss(_mm_load_ss( &r));/* reciprocal root r */ 6901 _mm_store_ss(&r, tmp); 6902 s = (int)(256.0 * r + 0.5); /* r in units of 1/256 rounded to nearest */ 6903 r = (float)s / 256.0; 6904 res[i] = (uint32_t) (r * (((uint32_t)1) << 31) ); 6905 } 6906 res128 = _mm_load_si128((__m128i*)res); 6907 mask = _mm_and_si128(a, *(__m128i*)c_c0000000); 6908 mask = _mm_cmpeq_epi32(zero, mask); //0xffffffff if atmp[i] <= 0x3fffffff 6909 return _mm_or_si128(res128, mask); 6910 } 6911 //************ Reciprocal estimate/step and 1/sqrt estimate/step *************************** 6912 //****************************************************************************************** 6913 //******VRECPS (Vector Reciprocal Step) *************************************************** 6914 //multiplies the elements of one vector by the corresponding elements of another vector, 6915 //subtracts each of the results from 2, and places the final results into the elements of the destination vector. 6916 6917 _NEON2SSESTORAGE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b); // VRECPS.F32 d0, d0, d0 6918 _NEON2SSE_INLINE float32x2_t vrecps_f32(float32x2_t a, float32x2_t b) 6919 { 6920 float32x4_t res; 6921 __m64_128 res64; 6922 res = vrecpsq_f32(_pM128(a), _pM128(b)); 6923 _M64f(res64, res); 6924 return res64; 6925 } 6926 6927 _NEON2SSESTORAGE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 6928 _NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0 6929 { 6930 __m128 f2, mul; 6931 f2 = _mm_set1_ps(2.); 6932 mul = _mm_mul_ps(a,b); 6933 return _mm_sub_ps(f2,mul); 6934 } 6935 6936 //*****************VRSQRTS (Vector Reciprocal Square Root Step) ***************************** 6937 //multiplies the elements of one vector by the corresponding elements of another vector, 6938 //subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector. 
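//Note (usage sketch only, not part of the NEON API mapping itself): the estimate and step intrinsics above and below
//are normally combined into Newton-Raphson iterations, each iteration roughly doubling the number of correct bits.
//Assuming vmulq_f32 (the standard NEON multiply, provided elsewhere in this header) and some input vector d,
//a refined reciprocal looks like:
//    float32x4_t x = vrecpeq_f32(d);                        //initial estimate of 1/d
//    x = vmulq_f32(vrecpsq_f32(d, x), x);                   //x = x*(2 - d*x), first refinement step
//    x = vmulq_f32(vrecpsq_f32(d, x), x);                   //second refinement step
//and a refined reciprocal square root like:
//    float32x4_t y = vrsqrteq_f32(d);                       //initial estimate of 1/sqrt(d)
//    y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(d, y), y), y);    //y = y*(3 - d*y*y)/2
//    y = vmulq_f32(vrsqrtsq_f32(vmulq_f32(d, y), y), y);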
6939 6940 _NEON2SSESTORAGE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b); // VRSQRTS.F32 d0, d0, d0 6941 _NEON2SSE_INLINE float32x2_t vrsqrts_f32(float32x2_t a, float32x2_t b) 6942 { 6943 float32x2_t res; 6944 res.m64_f32[0] = (3 - a.m64_f32[0] * b.m64_f32[0]) / 2; 6945 res.m64_f32[1] = (3 - a.m64_f32[1] * b.m64_f32[1]) / 2; 6946 return res; 6947 } 6948 6949 _NEON2SSESTORAGE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 6950 _NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0 6951 { 6952 __m128 f3, f05, mul; 6953 f3 = _mm_set1_ps(3.); 6954 f05 = _mm_set1_ps(0.5); 6955 mul = _mm_mul_ps(a,b); 6956 f3 = _mm_sub_ps(f3,mul); 6957 return _mm_mul_ps (f3, f05); 6958 } 6959 //******************************************************************************************** 6960 //***************************** Shifts by signed variable *********************************** 6961 //******************************************************************************************** 6962 //***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) *********************** 6963 //******************************************************************************************** 6964 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution 6965 //helper macro. It matches ARM implementation for big shifts 6966 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ 6967 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ 6968 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 6969 for (i = 0; i<LEN; i++) { \ 6970 if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \ 6971 else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \ 6972 return _mm_load_si128((__m128i*)res); 6973 6974 #define SERIAL_SHIFT_64(TYPE, SIGN, LEN) \ 6975 int ## TYPE ## x ## LEN ## _t res; int i, lanesize = sizeof(int ## TYPE ## _t) << 3; \ 6976 for (i = 0; i<LEN; i++) { \ 6977 if( (b.m64_i ## TYPE[i] >= lanesize)||(b.m64_i ## TYPE[i] <= -lanesize) ) res.m64_ ## SIGN ## TYPE[i] = 0; \ 6978 else res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] >=0) ? 
a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i] : a.m64_ ## SIGN ## TYPE[i] >> (-b.m64_i ## TYPE[i]); } \ 6979 return res; 6980 6981 _NEON2SSESTORAGE int8x8_t vshl_s8(int8x8_t a, int8x8_t b); // VSHL.S8 d0,d0,d0 6982 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6983 { 6984 SERIAL_SHIFT_64(8, i, 8) 6985 } 6986 6987 _NEON2SSESTORAGE int16x4_t vshl_s16(int16x4_t a, int16x4_t b); // VSHL.S16 d0,d0,d0 6988 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6989 { 6990 SERIAL_SHIFT_64(16, i, 4) 6991 } 6992 6993 _NEON2SSESTORAGE int32x2_t vshl_s32(int32x2_t a, int32x2_t b); // VSHL.S32 d0,d0,d0 6994 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 6995 { 6996 SERIAL_SHIFT_64(32, i, 2) 6997 } 6998 6999 _NEON2SSESTORAGE int64x1_t vshl_s64(int64x1_t a, int64x1_t b); // VSHL.S64 d0,d0,d0 7000 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7001 { 7002 SERIAL_SHIFT_64(64, i, 1) 7003 } 7004 7005 _NEON2SSESTORAGE uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b); // VSHL.U8 d0,d0,d0 7006 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7007 { 7008 SERIAL_SHIFT_64(8, u, 8) 7009 } 7010 7011 _NEON2SSESTORAGE uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b); // VSHL.s16 d0,d0,d0 7012 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7013 { 7014 SERIAL_SHIFT_64(16, u, 4) 7015 } 7016 7017 _NEON2SSESTORAGE uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b); // VSHL.U32 d0,d0,d0 7018 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7019 { 7020 SERIAL_SHIFT_64(32, u, 2) 7021 } 7022 7023 _NEON2SSESTORAGE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b); // VSHL.U64 d0,d0,d0 7024 _NEON2SSE_INLINE uint64x1_t vshl_u64(uint64x1_t a, int64x1_t b) //if we use the SERIAL_SHIFT macro need to have the special processing for large numbers 7025 { 7026 SERIAL_SHIFT_64(64, u, 1) 7027 } 7028 7029 _NEON2SSESTORAGE int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 7030 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7031 { 7032 SERIAL_SHIFT(int8_t, int8_t, 16, 16) 7033 } 7034 7035 _NEON2SSESTORAGE int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 7036 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7037 { 7038 SERIAL_SHIFT(int16_t, int16_t, 8, 8) 7039 } 7040 7041 _NEON2SSESTORAGE int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 7042 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7043 { 7044 SERIAL_SHIFT(int32_t, int32_t, 4, 4) 7045 } 7046 7047 _NEON2SSESTORAGE int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 7048 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7049 { 7050 SERIAL_SHIFT(int64_t, int64_t, 2, 2) 7051 } 7052 7053 _NEON2SSESTORAGE uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 7054 _NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7055 { 7056 SERIAL_SHIFT(uint8_t, int8_t, 16, 16) 7057 } 7058 7059 _NEON2SSESTORAGE uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0 7060 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7061 { 7062 SERIAL_SHIFT(uint16_t, int16_t, 8, 8) 7063 } 7064 7065 _NEON2SSESTORAGE uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 7066 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7067 { 7068 SERIAL_SHIFT(uint32_t, int32_t, 4, 4) 7069 } 7070 7071 _NEON2SSESTORAGE uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 7072 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7073 { 7074 SERIAL_SHIFT(uint64_t, int64_t, 2, 2) 7075 } 7076 7077 7078 //*********** Vector saturating shift left: (negative values shift right) ********************** 7079 //******************************************************************************************** 7080 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution 7081 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ 7082 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ 7083 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ 7084 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 7085 for (i = 0; i<LEN; i++) { \ 7086 if (atmp[i] ==0) res[i] = 0; \ 7087 else{ \ 7088 if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \ 7089 else{ \ 7090 if (btmp[i]>lanesize_1) { \ 7091 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 7092 }else{ \ 7093 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ 7094 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ 7095 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 7096 else res[i] = atmp[i] << btmp[i]; }}}} \ 7097 return _mm_load_si128((__m128i*)res); 7098 7099 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ 7100 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ 7101 TYPE lanesize = (sizeof(TYPE) << 3); \ 7102 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 7103 for (i = 0; i<LEN; i++) { \ 7104 if (atmp[i] ==0) {res[i] = 0; \ 7105 }else{ \ 7106 if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \ 7107 else{ \ 7108 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ 7109 else{ \ 7110 limit = (TYPE) 1 << (lanesize - btmp[i]); \ 7111 res[i] = ( atmp[i] >= limit) ? 
res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ 7112 return _mm_load_si128((__m128i*)res); 7113 7114 #define SERIAL_SATURATING_SHIFT_SIGNED_64(TYPE, LEN) \ 7115 int ## TYPE ## x ## LEN ## _t res; int ## TYPE ## _t limit; int i; \ 7116 int lanesize_1 = (sizeof( int ## TYPE ## _t) << 3) - 1; \ 7117 for (i = 0; i<LEN; i++) { \ 7118 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \ 7119 else{ \ 7120 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \ 7121 else{ \ 7122 if (b.m64_i ## TYPE[i]>lanesize_1) { \ 7123 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \ 7124 }else{ \ 7125 limit = (int ## TYPE ## _t) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \ 7126 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \ 7127 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t) 1 << lanesize_1) - 1; \ 7128 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ 7129 return res; 7130 7131 #define SERIAL_SATURATING_SHIFT_UNSIGNED_64(TYPE, LEN) \ 7132 int ## TYPE ## x ## LEN ## _t res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \ 7133 int ## TYPE ## _t lanesize = (sizeof(int ## TYPE ## _t) << 3); \ 7134 for (i = 0; i<LEN; i++) { \ 7135 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \ 7136 }else{ \ 7137 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i])); \ 7138 else{ \ 7139 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \ 7140 else{ \ 7141 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \ 7142 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? 
res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_u ## TYPE[i]; }}}} \ 7143 return res; 7144 7145 _NEON2SSESTORAGE int8x8_t vqshl_s8(int8x8_t a, int8x8_t b); // VQSHL.S8 d0,d0,d0 7146 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7147 { 7148 SERIAL_SATURATING_SHIFT_SIGNED_64(8,8) 7149 } 7150 7151 _NEON2SSESTORAGE int16x4_t vqshl_s16(int16x4_t a, int16x4_t b); // VQSHL.S16 d0,d0,d0 7152 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7153 { 7154 SERIAL_SATURATING_SHIFT_SIGNED_64(16,4) 7155 } 7156 7157 _NEON2SSESTORAGE int32x2_t vqshl_s32(int32x2_t a, int32x2_t b); // VQSHL.S32 d0,d0,d0 7158 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7159 { 7160 SERIAL_SATURATING_SHIFT_SIGNED_64(32,2) 7161 } 7162 7163 _NEON2SSESTORAGE int64x1_t vqshl_s64(int64x1_t a, int64x1_t b); // VQSHL.S64 d0,d0,d0 7164 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7165 { 7166 SERIAL_SATURATING_SHIFT_SIGNED_64(64,1) 7167 } 7168 7169 _NEON2SSESTORAGE uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b); // VQSHL.U8 d0,d0,d0 7170 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7171 { 7172 SERIAL_SATURATING_SHIFT_UNSIGNED_64(8,8) 7173 } 7174 7175 _NEON2SSESTORAGE uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b); // VQSHL.s16 d0,d0,d0 7176 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7177 { 7178 SERIAL_SATURATING_SHIFT_UNSIGNED_64(16,4) 7179 } 7180 7181 _NEON2SSESTORAGE uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b); // VQSHL.U32 d0,d0,d0 7182 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7183 { 7184 SERIAL_SATURATING_SHIFT_UNSIGNED_64(32,2) 7185 } 7186 7187 _NEON2SSESTORAGE uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b); // VQSHL.U64 d0,d0,d0 7188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7189 { 7190 SERIAL_SATURATING_SHIFT_UNSIGNED_64(64,1) 7191 } 7192 7193 _NEON2SSESTORAGE int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 7194 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7195 { 7196 SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16) 7197 } 7198 7199 _NEON2SSESTORAGE int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 7200 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7201 { 7202 SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8) 7203 } 7204 7205 _NEON2SSESTORAGE int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 7206 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7207 { 7208 SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4) 7209 } 7210 7211 _NEON2SSESTORAGE int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 7212 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7213 { 7214 SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2) 
7215 } 7216 7217 _NEON2SSESTORAGE uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 7218 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7219 { 7220 SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16) 7221 } 7222 7223 _NEON2SSESTORAGE uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0 7224 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7225 { 7226 SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8) 7227 } 7228 7229 _NEON2SSESTORAGE uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 7230 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7231 { 7232 SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4) 7233 } 7234 7235 _NEON2SSESTORAGE uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 7236 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7237 { 7238 SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2) 7239 } 7240 7241 7242 //******** Vector rounding shift left: (negative values shift right) ********** 7243 //**************************************************************************** 7244 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution 7245 //rounding makes sense for right shifts only. 7246 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ 7247 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ 7248 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 7249 for (i = 0; i<LEN; i++) { \ 7250 if( btmp[i] >= 0) { \ 7251 if(btmp[i] >= lanesize) res[i] = 0; \ 7252 else res[i] = (atmp[i] << btmp[i]); \ 7253 }else{ \ 7254 res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \ 7255 (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \ 7256 (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \ 7257 return _mm_load_si128((__m128i*)res); 7258 7259 7260 #define SERIAL_ROUNDING_SHIFT_64(TYPE, SIGN, LEN) \ 7261 int ## TYPE ## x ## LEN ## _t res; int i; int lanesize = sizeof(int ## TYPE ## _t) << 3; \ 7262 for (i = 0; i<LEN; i++) { \ 7263 if( b.m64_i ## TYPE[i] >= 0) { \ 7264 if(b.m64_i ## TYPE[i] >= lanesize) res.m64_ ## SIGN ## TYPE[i] = 0; \ 7265 else res.m64_ ## SIGN ## TYPE[i] = (a.m64_ ## SIGN ## TYPE[i] << b.m64_i ## TYPE[i]); \ 7266 }else{ \ 7267 res.m64_ ## SIGN ## TYPE[i] = (b.m64_i ## TYPE[i] < -lanesize) ? res.m64_ ## SIGN ## TYPE[i] = 0 : \ 7268 (b.m64_i ## TYPE[i] == -lanesize) ? 
(a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) : \ 7269 (a.m64_ ## SIGN ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_ ## SIGN ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); }} \ 7270 return res; 7271 7272 7273 _NEON2SSESTORAGE int8x8_t vrshl_s8(int8x8_t a, int8x8_t b); // VRSHL.S8 d0,d0,d0 7274 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7275 { 7276 SERIAL_ROUNDING_SHIFT_64(8,i,8) 7277 } 7278 7279 _NEON2SSESTORAGE int16x4_t vrshl_s16(int16x4_t a, int16x4_t b); // VRSHL.S16 d0,d0,d0 7280 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7281 { 7282 SERIAL_ROUNDING_SHIFT_64(16,i,4) 7283 } 7284 7285 _NEON2SSESTORAGE int32x2_t vrshl_s32(int32x2_t a, int32x2_t b); // VRSHL.S32 d0,d0,d0 7286 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7287 { 7288 SERIAL_ROUNDING_SHIFT_64(32,i,2) 7289 } 7290 7291 _NEON2SSESTORAGE int64x1_t vrshl_s64(int64x1_t a, int64x1_t b); // VRSHL.S64 d0,d0,d0 7292 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7293 { 7294 SERIAL_ROUNDING_SHIFT_64(64,i,1) 7295 } 7296 7297 _NEON2SSESTORAGE uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b); // VRSHL.U8 d0,d0,d0 7298 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7299 { 7300 SERIAL_ROUNDING_SHIFT_64(8,u,8) 7301 } 7302 7303 _NEON2SSESTORAGE uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b); // VRSHL.s16 d0,d0,d0 7304 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7305 { 7306 SERIAL_ROUNDING_SHIFT_64(16,u,4) 7307 } 7308 7309 _NEON2SSESTORAGE uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b); // VRSHL.U32 d0,d0,d0 7310 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7311 { 7312 SERIAL_ROUNDING_SHIFT_64(32,u,2) 7313 } 7314 7315 _NEON2SSESTORAGE uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b); // VRSHL.U64 d0,d0,d0 7316 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7317 { 7318 SERIAL_ROUNDING_SHIFT_64(64,u,1) 7319 } 7320 7321 _NEON2SSESTORAGE int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 7322 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7323 { 7324 SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16) 7325 } 7326 7327 _NEON2SSESTORAGE int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 7328 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7329 { 7330 SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8) 7331 } 7332 7333 _NEON2SSESTORAGE int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 7334 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7335 { 7336 SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4) 7337 } 7338 7339 _NEON2SSESTORAGE int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 7340 _NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7341 { 7342 SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2) 7343 } 7344 7345 _NEON2SSESTORAGE uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 7346 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7347 { 7348 SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16) 7349 } 7350 7351 _NEON2SSESTORAGE uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0 7352 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7353 { 7354 SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8) 7355 } 7356 7357 _NEON2SSESTORAGE uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 7358 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7359 { 7360 SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4) 7361 } 7362 7363 _NEON2SSESTORAGE uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 7364 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7365 { 7366 SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2) 7367 } 7368 7369 7370 //********** Vector saturating rounding shift left: (negative values shift right) **************** 7371 //************************************************************************************************* 7372 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution 7373 //Saturation happens for left shifts only while rounding makes sense for right shifts only. 7374 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ 7375 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ 7376 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ 7377 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 7378 for (i = 0; i<LEN; i++) { \ 7379 if (atmp[i] ==0) res[i] = 0; \ 7380 else{ \ 7381 if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ 7382 else{ \ 7383 if (btmp[i]>lanesize_1) { \ 7384 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 7385 }else{ \ 7386 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ 7387 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ 7388 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 7389 else res[i] = atmp[i] << btmp[i]; }}}} \ 7390 return _mm_load_si128((__m128i*)res); 7391 7392 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ 7393 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ 7394 int lanesize = (sizeof(TYPE) << 3); \ 7395 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 7396 for (i = 0; i<LEN; i++) { \ 7397 if (atmp[i] ==0) {res[i] = 0; \ 7398 }else{ \ 7399 if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ 7400 else{ \ 7401 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ 7402 else{ \ 7403 limit = (TYPE) 1 << (lanesize - btmp[i]); \ 7404 res[i] = ( atmp[i] >= limit) ? 
res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ 7405 return _mm_load_si128((__m128i*)res); 7406 7407 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(TYPE, LEN) \ 7408 __m64_128 res; int ## TYPE ## _t limit; int i; \ 7409 int lanesize_1 = (sizeof(int ## TYPE ## _t ) << 3) - 1; \ 7410 for (i = 0; i<LEN; i++) { \ 7411 if (a.m64_i ## TYPE[i] ==0) res.m64_i ## TYPE[i] = 0; \ 7412 else{ \ 7413 if(b.m64_i ## TYPE[i] <0) res.m64_i ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize_1)) ? 0 : (a.m64_i ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_i ## TYPE[i] & ((int ## TYPE ## _t ) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \ 7414 else{ \ 7415 if (b.m64_i ## TYPE[i]>lanesize_1) { \ 7416 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \ 7417 }else{ \ 7418 limit = (int ## TYPE ## _t ) 1 << (lanesize_1 - b.m64_i ## TYPE[i]); \ 7419 if((a.m64_i ## TYPE[i] >= limit)||(a.m64_i ## TYPE[i] <= -limit)) \ 7420 res.m64_i ## TYPE[i] = ((_UNSIGNED_T(int ## TYPE ## _t ))a.m64_i ## TYPE[i] >> lanesize_1 ) + ((int ## TYPE ## _t ) 1 << lanesize_1) - 1; \ 7421 else res.m64_i ## TYPE[i] = a.m64_i ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ 7422 return res; 7423 7424 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(TYPE, LEN) \ 7425 __m64_128 res; _UNSIGNED_T(int ## TYPE ## _t) limit; int i; \ 7426 int lanesize = (sizeof(int ## TYPE ## _t) << 3); \ 7427 for (i = 0; i<LEN; i++) { \ 7428 if (a.m64_u ## TYPE[i] ==0) {res.m64_u ## TYPE[i] = 0; \ 7429 }else{ \ 7430 if(b.m64_i ## TYPE[i] < 0) res.m64_u ## TYPE[i] = (b.m64_i ## TYPE[i] < (-lanesize)) ? 0 : (a.m64_u ## TYPE[i] >> (-(b.m64_i ## TYPE[i]))) + ( (a.m64_u ## TYPE[i] & ((int ## TYPE ## _t) 1 << (-(b.m64_i ## TYPE[i]) - 1))) >> (-(b.m64_i ## TYPE[i]) - 1) ); \ 7431 else{ \ 7432 if (b.m64_i ## TYPE[i]>lanesize) res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0); \ 7433 else{ \ 7434 limit = (int ## TYPE ## _t) 1 << (lanesize - b.m64_i ## TYPE[i]); \ 7435 res.m64_u ## TYPE[i] = ( a.m64_u ## TYPE[i] >= limit) ? 
res.m64_u ## TYPE[i] = ~((int ## TYPE ## _t) 0) : a.m64_u ## TYPE[i] << b.m64_i ## TYPE[i]; }}}} \ 7436 return res; 7437 7438 _NEON2SSESTORAGE int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b); // VQRSHL.S8 d0,d0,d0 7439 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vqrshl_s8(int8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7440 { 7441 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(8,8) 7442 } 7443 7444 _NEON2SSESTORAGE int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b); // VQRSHL.S16 d0,d0,d0 7445 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vqrshl_s16(int16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7446 { 7447 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(16,4) 7448 } 7449 7450 _NEON2SSESTORAGE int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b); // VQRSHL.S32 d0,d0,d0 7451 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshl_s32(int32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7452 { 7453 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(32,2) 7454 } 7455 7456 _NEON2SSESTORAGE int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b); // VQRSHL.S64 d0,d0,d0 7457 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqrshl_s64(int64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7458 { 7459 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED_64(64,1) 7460 } 7461 7462 _NEON2SSESTORAGE uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b); // VQRSHL.U8 d0,d0,d0 7463 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vqrshl_u8(uint8x8_t a, int8x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7464 { 7465 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(8,8) 7466 } 7467 7468 _NEON2SSESTORAGE uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b); // VQRSHL.s16 d0,d0,d0 7469 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vqrshl_u16(uint16x4_t a, int16x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7470 { 7471 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(16,4) 7472 } 7473 7474 _NEON2SSESTORAGE uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b); // VQRSHL.U32 d0,d0,d0 7475 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshl_u32(uint32x2_t a, int32x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7476 { 7477 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(32,2) 7478 } 7479 7480 _NEON2SSESTORAGE uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b); // VQRSHL.U64 d0,d0,d0 7481 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqrshl_u64(uint64x1_t a, int64x1_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7482 { 7483 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED_64(64,1) 7484 } 7485 7486 _NEON2SSESTORAGE int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 7487 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7488 { 7489 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16) 7490 } 7491 7492 _NEON2SSESTORAGE int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 7493 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7494 { 7495 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8) 7496 } 7497 7498 _NEON2SSESTORAGE int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 7499 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7500 { 7501 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4) 7502 } 7503 7504 _NEON2SSESTORAGE int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 7505 _NEON2SSE_INLINE 
_NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7506 { 7507 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2) 7508 } 7509 7510 _NEON2SSESTORAGE uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 7511 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7512 { 7513 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16) 7514 } 7515 7516 _NEON2SSESTORAGE uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0 7517 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7518 { 7519 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8) 7520 } 7521 7522 _NEON2SSESTORAGE uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 7523 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7524 { 7525 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4) 7526 } 7527 7528 _NEON2SSESTORAGE uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 7529 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 7530 { 7531 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2) 7532 } 7533 7534 // ********************************************************************************* 7535 // ***************************** Shifts by a constant ***************************** 7536 // ********************************************************************************* 7537 //**************** Vector shift right by constant************************************* 7538 //************************************************************************************ 7539 _NEON2SSESTORAGE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VSHR.S8 d0,d0,#8 7540 _NEON2SSE_INLINE int8x8_t vshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VSHR.S8 d0,d0,#8 7541 { 7542 //no 8 bit shift available, go to 16 bit 7543 int8x8_t res64; 7544 __m128i r; 7545 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 7546 r = _mm_srai_epi16 (r, b); //SSE2 7547 r = _mm_packs_epi16 (r,r); //we need 64 bits only 7548 return64(r); 7549 } 7550 7551 _NEON2SSESTORAGE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VSHR.S16 d0,d0,#16 7552 _NEON2SSE_INLINE int16x4_t vshr_n_s16(int16x4_t a, __constrange(1,16) int b) 7553 { 7554 int16x4_t res64; 7555 return64(_mm_srai_epi16(_pM128i(a), b)); 7556 } 7557 7558 7559 _NEON2SSESTORAGE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VSHR.S32 d0,d0,#32 7560 _NEON2SSE_INLINE int32x2_t vshr_n_s32(int32x2_t a, __constrange(1,32) int b) 7561 { 7562 int32x2_t res64; 7563 return64(_mm_srai_epi32(_pM128i(a), b)); 7564 } 7565 7566 _NEON2SSESTORAGE int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VSHR.S64 d0,d0,#64 7567 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) 7568 { 7569 //no arithmetic shift for 64bit values, serial solution used 7570 int64x1_t res; 7571 if(b>=64) res.m64_i64[0] = 0; 7572 else res.m64_i64[0] = (*(int64_t*)&a) >> b; 7573 return res; 7574 } 7575 7576 _NEON2SSESTORAGE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VSHR.U8 d0,d0,#8 7577 _NEON2SSE_INLINE uint8x8_t vshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VSHR.U8 d0,d0,#8 7578 
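//worked example: a = {0x80, 0x40, 0x20, ...}, b = 4  ->  {0x08, 0x04, 0x02, ...}, i.e. a plain per-byte logical shift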
{ 7579 //no 8 bit shift available, go to 16 bit 7580 uint8x8_t res64; 7581 __m128i r; 7582 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 7583 r = _mm_srli_epi16 (r, b); //for unsigned variables we use the logical shift not arithmetical one 7584 r = _mm_packus_epi16 (r,r); //we need 64 bits only 7585 return64(r); 7586 } 7587 7588 _NEON2SSESTORAGE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VSHR.s16 d0,d0,#16 7589 _NEON2SSE_INLINE uint16x4_t vshr_n_u16(uint16x4_t a, __constrange(1,16) int b) 7590 { 7591 uint16x4_t res64; 7592 return64(_mm_srli_epi16(_pM128i(a), b)); 7593 } 7594 7595 7596 _NEON2SSESTORAGE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VSHR.U32 d0,d0,#32 7597 _NEON2SSE_INLINE uint32x2_t vshr_n_u32(uint32x2_t a, __constrange(1,32) int b) 7598 { 7599 uint32x2_t res64; 7600 return64(_mm_srli_epi32(_pM128i(a), b)); 7601 } 7602 7603 7604 _NEON2SSESTORAGE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VSHR.U64 d0,d0,#64 7605 _NEON2SSE_INLINE uint64x1_t vshr_n_u64(uint64x1_t a, __constrange(1,64) int b) 7606 { 7607 uint64x1_t res64; 7608 return64(_mm_srli_epi64(_pM128i(a), b)); 7609 } 7610 7611 7612 _NEON2SSESTORAGE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 7613 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8 7614 { 7615 //no 8 bit shift available, go to 16 bit trick 7616 __m128i zero, mask0, a_sign, r, a_sign_mask; 7617 _NEON2SSE_ALIGN_16 static const int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff}; 7618 zero = _mm_setzero_si128(); 7619 mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift 7620 a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0 7621 r = _mm_srai_epi16 (a, b); 7622 a_sign_mask = _mm_and_si128 (mask0, a_sign); 7623 r = _mm_andnot_si128 (mask0, r); 7624 return _mm_or_si128 (r, a_sign_mask); 7625 } 7626 7627 _NEON2SSESTORAGE int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 7628 #define vshrq_n_s16 _mm_srai_epi16 7629 7630 _NEON2SSESTORAGE int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 7631 #define vshrq_n_s32 _mm_srai_epi32 7632 7633 _NEON2SSESTORAGE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 7634 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b) 7635 { 7636 //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD 7637 __m128i c1, signmask,a0, res64; 7638 _NEON2SSE_ALIGN_16 static const uint64_t mask[] = {0x8000000000000000, 0x8000000000000000}; 7639 c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff 7640 signmask = _mm_slli_epi64 (c1, (64 - b)); 7641 a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit 7642 a0 = _MM_CMPEQ_EPI64 (a, a0); 7643 signmask = _mm_and_si128(a0, signmask); 7644 res64 = _mm_srli_epi64 (a, b); 7645 return _mm_or_si128(res64, signmask); 7646 } 7647 7648 _NEON2SSESTORAGE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 7649 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8 7650 { 7651 //no 8 bit shift available, need the special trick 7652 __m128i mask0, r; 7653 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00}; 7654 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be 
"spoiled" by 16 bit shift 7655 r = _mm_srli_epi16 ( a, b); 7656 return _mm_and_si128 (r, mask0); 7657 } 7658 7659 _NEON2SSESTORAGE uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16 7660 #define vshrq_n_u16 _mm_srli_epi16 7661 7662 _NEON2SSESTORAGE uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 7663 #define vshrq_n_u32 _mm_srli_epi32 7664 7665 _NEON2SSESTORAGE uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 7666 #define vshrq_n_u64 _mm_srli_epi64 7667 7668 //*************************** Vector shift left by constant ************************* 7669 //********************************************************************************* 7670 _NEON2SSESTORAGE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 7671 _NEON2SSE_INLINE int8x8_t vshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VSHL.I8 d0,d0,#0 7672 { 7673 //no 8 bit shift available, go to 16 bit 7674 int8x8_t res64; 7675 __m128i r; 7676 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 7677 r = _mm_slli_epi16 (r, b); //SSE2 7678 r = _mm_shuffle_epi8 (r, *(__m128i*) mask8_16_even_odd); //return to 8 bit, we need 64 bits only 7679 return64(r); 7680 } 7681 7682 _NEON2SSESTORAGE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 7683 _NEON2SSE_INLINE int16x4_t vshl_n_s16(int16x4_t a, __constrange(0,15) int b) 7684 { 7685 int16x4_t res64; 7686 return64(_mm_slli_epi16(_pM128i(a), b)); 7687 } 7688 7689 7690 _NEON2SSESTORAGE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 7691 _NEON2SSE_INLINE int32x2_t vshl_n_s32(int32x2_t a, __constrange(0,31) int b) 7692 { 7693 int32x2_t res64; 7694 return64(_mm_slli_epi32(_pM128i(a), b)); 7695 } 7696 7697 7698 _NEON2SSESTORAGE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 7699 _NEON2SSE_INLINE int64x1_t vshl_n_s64(int64x1_t a, __constrange(0,63) int b) 7700 { 7701 int64x1_t res64; 7702 return64(_mm_slli_epi64(_pM128i(a), b)); 7703 } 7704 7705 7706 _NEON2SSESTORAGE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VSHL.I8 d0,d0,#0 7707 _NEON2SSE_INLINE uint8x8_t vshl_n_u8(uint8x8_t a, __constrange(0,7) int b) 7708 { 7709 //no 8 bit shift available, go to 16 bit 7710 uint8x8_t res64; 7711 __m128i mask8; 7712 __m128i r; 7713 mask8 = _mm_set1_epi16(0xff); 7714 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 7715 r = _mm_slli_epi16 (r, b); //SSE2 7716 r = _mm_and_si128(r, mask8); //to avoid saturation 7717 r = _mm_packus_epi16 (r,r); //we need 64 bits only 7718 return64(r); 7719 } 7720 7721 _NEON2SSESTORAGE uint16x4_t vshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VSHL.I16 d0,d0,#0 7722 #define vshl_n_u16 vshl_n_s16 7723 7724 7725 _NEON2SSESTORAGE uint32x2_t vshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VSHL.I32 d0,d0,#0 7726 #define vshl_n_u32 vshl_n_s32 7727 7728 _NEON2SSESTORAGE uint64x1_t vshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VSHL.I64 d0,d0,#0 7729 #define vshl_n_u64 vshl_n_s64 7730 7731 _NEON2SSESTORAGE int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 7732 #define vshlq_n_s8 vshlq_n_u8 7733 7734 _NEON2SSESTORAGE int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 7735 #define vshlq_n_s16 _mm_slli_epi16 7736 7737 _NEON2SSESTORAGE int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 7738 #define vshlq_n_s32 _mm_slli_epi32 7739 7740 
_NEON2SSESTORAGE int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 7741 #define vshlq_n_s64 _mm_slli_epi64 7742 7743 _NEON2SSESTORAGE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 7744 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) 7745 { 7746 //no 8 bit shift available, need the special trick 7747 __m128i mask0, r; 7748 _NEON2SSE_ALIGN_16 static const uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff}; 7749 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift 7750 r = _mm_slli_epi16 ( a, b); 7751 return _mm_and_si128 (r, mask0); 7752 } 7753 7754 _NEON2SSESTORAGE uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 7755 #define vshlq_n_u16 vshlq_n_s16 7756 7757 _NEON2SSESTORAGE uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 7758 #define vshlq_n_u32 vshlq_n_s32 7759 7760 _NEON2SSESTORAGE uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 7761 #define vshlq_n_u64 vshlq_n_s64 7762 7763 //************* Vector rounding shift right by constant ****************** 7764 //************************************************************************* 7765 //No corresponding x86 intrinsics exist, need to do some tricks 7766 _NEON2SSESTORAGE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b); // VRSHR.S8 d0,d0,#8 7767 _NEON2SSE_INLINE int8x8_t vrshr_n_s8(int8x8_t a, __constrange(1,8) int b) // VRSHR.S8 d0,d0,#8 7768 { 7769 //no 8 bit shift available, go to 16 bit 7770 int8x8_t res64; 7771 __m128i r, maskb; 7772 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 7773 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit 7774 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0 7775 r = _mm_srai_epi16 (r, b); 7776 r = _mm_add_epi16 (r, maskb); //actual rounding 7777 r = _mm_packs_epi16 (r,r); ////we need 64 bits only 7778 return64(r); 7779 } 7780 7781 _NEON2SSESTORAGE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b); // VRSHR.S16 d0,d0,#16 7782 _NEON2SSE_INLINE int16x4_t vrshr_n_s16(int16x4_t a, __constrange(1,16) int b) 7783 { 7784 int16x4_t res64; 7785 return64(vrshrq_n_s16(_pM128i(a), b)); 7786 } 7787 7788 7789 _NEON2SSESTORAGE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b); // VRSHR.S32 d0,d0,#32 7790 _NEON2SSE_INLINE int32x2_t vrshr_n_s32(int32x2_t a, __constrange(1,32) int b) 7791 { 7792 int32x2_t res64; 7793 return64(vrshrq_n_s32(_pM128i(a), b)); 7794 } 7795 7796 7797 _NEON2SSESTORAGE int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b); // VRSHR.S64 d0,d0,#64 7798 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrshr_n_s64(int64x1_t a, __constrange(1,64) int b), _NEON2SSE_REASON_SLOW_SERIAL) 7799 { 7800 //serial solution is faster 7801 int64x1_t res; 7802 int64_t a_i64 = *( int64_t*)&a; 7803 if(b==64) { 7804 res.m64_i64[0] = 0; //for some compilers rounding happens and we need to use(a_i64 & _SIGNBIT64)>>63; 7805 } else { 7806 int64_t maskb = a_i64 & (( int64_t)1 << (b - 1)); 7807 res.m64_i64[0] = (a_i64 >> b) + (maskb >> (b - 1)); 7808 } 7809 return res; 7810 } 7811 7812 _NEON2SSESTORAGE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b); // VRSHR.U8 d0,d0,#8 7813 _NEON2SSE_INLINE uint8x8_t vrshr_n_u8(uint8x8_t a, __constrange(1,8) int b) // VRSHR.U8 d0,d0,#8 7814 { 7815 //no 8 bit shift available, go to 16 bit, solution may be not optimal compared with the serial 
one 7816 uint8x8_t res64; 7817 __m128i r, maskb; 7818 r = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 7819 maskb = _mm_slli_epi16 (r, (16 - b)); //to get rounding (b-1)th bit 7820 maskb = _mm_srli_epi16 (maskb, 15); //1 or 0 7821 r = _mm_srli_epi16 (r, b); 7822 r = _mm_add_epi16 (r, maskb); //actual rounding 7823 r = _mm_packus_epi16 (r,r); ////we need 64 bits only 7824 return64(r); 7825 } 7826 7827 _NEON2SSESTORAGE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b); // VRSHR.s16 d0,d0,#16 7828 _NEON2SSE_INLINE uint16x4_t vrshr_n_u16(uint16x4_t a, __constrange(1,16) int b) 7829 { 7830 uint16x4_t res64; 7831 return64(vrshrq_n_u16(_pM128i(a), b)); 7832 } 7833 7834 7835 _NEON2SSESTORAGE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b); // VRSHR.U32 d0,d0,#32 7836 _NEON2SSE_INLINE uint32x2_t vrshr_n_u32(uint32x2_t a, __constrange(1,32) int b) 7837 { 7838 uint32x2_t res64; 7839 return64(vrshrq_n_u32(_pM128i(a), b)); 7840 } 7841 7842 7843 _NEON2SSESTORAGE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b); // VRSHR.U64 d0,d0,#64 7844 _NEON2SSE_INLINE uint64x1_t vrshr_n_u64(uint64x1_t a, __constrange(1,64) int b) 7845 { 7846 uint64x1_t res64; 7847 return64(vrshrq_n_u64(_pM128i(a), b)); 7848 } 7849 7850 _NEON2SSESTORAGE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 7851 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8 7852 { 7853 //no 8 bit shift available, go to 16 bit trick 7854 __m128i r, mask1, maskb; 7855 _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 7856 r = vshrq_n_s8 (a, b); 7857 mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding 7858 maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding 7859 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 7860 return _mm_add_epi8(r, maskb); //actual rounding 7861 } 7862 7863 _NEON2SSESTORAGE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 7864 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 7865 { 7866 __m128i maskb, r; 7867 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit 7868 maskb = _mm_srli_epi16(maskb, 15); //1 or 0 7869 r = _mm_srai_epi16 (a, b); 7870 return _mm_add_epi16 (r, maskb); //actual rounding 7871 } 7872 7873 _NEON2SSESTORAGE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 7874 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 7875 { 7876 __m128i maskb, r; 7877 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit 7878 maskb = _mm_srli_epi32 (maskb,31); //1 or 0 7879 r = _mm_srai_epi32(a, b); 7880 return _mm_add_epi32 (r, maskb); //actual rounding 7881 } 7882 7883 _NEON2SSESTORAGE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 7884 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b) 7885 { 7886 //solution may be not optimal compared with a serial one 7887 __m128i maskb; 7888 int64x2_t r; 7889 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit 7890 maskb = _mm_srli_epi64 (maskb,63); //1 or 0 7891 r = vshrq_n_s64(a, b); 7892 return _mm_add_epi64 (r, maskb); //actual rounding 7893 } 7894 7895 _NEON2SSESTORAGE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 
q0,q0,#8 7896 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8 7897 { 7898 //no 8 bit shift available, go to 16 bit trick 7899 __m128i r, mask1, maskb; 7900 _NEON2SSE_ALIGN_16 static const uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 7901 r = vshrq_n_u8 (a, b); 7902 mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding 7903 maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding 7904 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 7905 return _mm_add_epi8(r, maskb); //actual rounding 7906 } 7907 7908 _NEON2SSESTORAGE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16 7909 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 7910 { 7911 __m128i maskb, r; 7912 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit 7913 maskb = _mm_srli_epi16(maskb, 15); //1 or 0 7914 r = _mm_srli_epi16 (a, b); 7915 return _mm_add_epi16 (r, maskb); //actual rounding 7916 } 7917 7918 _NEON2SSESTORAGE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 7919 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 7920 { 7921 __m128i maskb, r; 7922 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit 7923 maskb = _mm_srli_epi32 (maskb,31); //1 or 0 7924 r = _mm_srli_epi32(a, b); 7925 return _mm_add_epi32 (r, maskb); //actual rounding 7926 } 7927 7928 _NEON2SSESTORAGE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 7929 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b) 7930 { 7931 //solution may be not optimal compared with a serial one 7932 __m128i maskb, r; 7933 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit 7934 maskb = _mm_srli_epi64 (maskb,63); //1 or 0 7935 r = _mm_srli_epi64(a, b); 7936 return _mm_add_epi64 (r, maskb); //actual rounding 7937 } 7938 7939 //************* Vector shift right by constant and accumulate ********* 7940 //********************************************************************* 7941 _NEON2SSESTORAGE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRA.S8 d0,d0,#8 7942 _NEON2SSE_INLINE int8x8_t vsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VSRA.S8 d0,d0,#8 7943 { 7944 int8x8_t shift; 7945 shift = vshr_n_s8(b, c); 7946 return vadd_s8( a, shift); 7947 } 7948 7949 _NEON2SSESTORAGE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRA.S16 d0,d0,#16 7950 _NEON2SSE_INLINE int16x4_t vsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VSRA.S16 d0,d0,#16 7951 { 7952 int16x4_t shift; 7953 shift = vshr_n_s16( b, c); 7954 return vadd_s16(a, shift); 7955 } 7956 7957 _NEON2SSESTORAGE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRA.S32 d0,d0,#32 7958 _NEON2SSE_INLINE int32x2_t vsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VSRA.S32 d0,d0,#32 7959 { 7960 //may be not optimal compared with the serial execution 7961 int32x2_t shift; 7962 shift = vshr_n_s32(b, c); 7963 return vadd_s32( a, shift); 7964 } 7965 7966 _NEON2SSESTORAGE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRA.S64 d0,d0,#64 7967 _NEON2SSE_INLINE int64x1_t vsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) 7968 { 
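//shift-right-accumulate: each 64-bit lane becomes a + (b >> c) with an arithmetic shift, reusing the serial vshr_n_s64 above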
7969 //may be not optimal compared with a serial solution 7970 int64x1_t shift; 7971 shift = vshr_n_s64(b, c); 7972 return vadd_s64( a, shift); 7973 } 7974 7975 _NEON2SSESTORAGE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRA.U8 d0,d0,#8 7976 _NEON2SSE_INLINE uint8x8_t vsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VSRA.U8 d0,d0,#8 7977 { 7978 uint8x8_t shift; 7979 shift = vshr_n_u8(b, c); 7980 return vadd_u8(a, shift); 7981 } 7982 7983 _NEON2SSESTORAGE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRA.s16 d0,d0,#16 7984 _NEON2SSE_INLINE uint16x4_t vsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VSRA.s16 d0,d0,#16 7985 { 7986 uint16x4_t shift; 7987 shift = vshr_n_u16(b, c); 7988 return vadd_u16(a,shift); 7989 } 7990 7991 _NEON2SSESTORAGE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRA.U32 d0,d0,#32 7992 _NEON2SSE_INLINE uint32x2_t vsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VSRA.U32 d0,d0,#32 7993 { 7994 //may be not optimal compared with the serial execution 7995 uint32x2_t shift; 7996 shift = vshr_n_u32(b, c); 7997 return vadd_u32( a, shift); 7998 } 7999 8000 _NEON2SSESTORAGE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRA.U64 d0,d0,#64 8001 _NEON2SSE_INLINE uint64x1_t vsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c) // VSRA.U64 d0,d0,#64 8002 { 8003 //may be not optimal compared with the serial execution 8004 uint64x1_t shift; 8005 shift = vshr_n_u64(b, c); 8006 return vadd_u64(a, shift); 8007 } 8008 8009 _NEON2SSESTORAGE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 8010 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8 8011 { 8012 int8x16_t shift; 8013 shift = vshrq_n_s8(b, c); 8014 return vaddq_s8(a, shift); 8015 } 8016 8017 _NEON2SSESTORAGE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 8018 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16 8019 { 8020 int16x8_t shift; 8021 shift = vshrq_n_s16(b, c); 8022 return vaddq_s16(a, shift); 8023 } 8024 8025 _NEON2SSESTORAGE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 8026 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32 8027 { 8028 int32x4_t shift; 8029 shift = vshrq_n_s32(b, c); 8030 return vaddq_s32(a, shift); 8031 } 8032 8033 _NEON2SSESTORAGE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 8034 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64 8035 { 8036 int64x2_t shift; 8037 shift = vshrq_n_s64(b, c); 8038 return vaddq_s64( a, shift); 8039 } 8040 8041 _NEON2SSESTORAGE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 8042 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8 8043 { 8044 uint8x16_t shift; 8045 shift = vshrq_n_u8(b, c); 8046 return vaddq_u8(a, shift); 8047 } 8048 8049 _NEON2SSESTORAGE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16 8050 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, 
__constrange(1,16) int c) // VSRA.s16 q0,q0,#16 8051 { 8052 uint16x8_t shift; 8053 shift = vshrq_n_u16(b, c); 8054 return vaddq_u16(a, shift); 8055 } 8056 8057 _NEON2SSESTORAGE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 8058 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32 8059 { 8060 uint32x4_t shift; 8061 shift = vshrq_n_u32(b, c); 8062 return vaddq_u32(a, shift); 8063 } 8064 8065 _NEON2SSESTORAGE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 8066 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64 8067 { 8068 uint64x2_t shift; 8069 shift = vshrq_n_u64(b, c); 8070 return vaddq_u64(a, shift); 8071 } 8072 8073 //************* Vector rounding shift right by constant and accumulate **************************** 8074 //************************************************************************************************ 8075 _NEON2SSESTORAGE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VRSRA.S8 d0,d0,#8 8076 _NEON2SSE_INLINE int8x8_t vrsra_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) // VRSRA.S8 d0,d0,#8 8077 { 8078 int8x8_t shift; 8079 shift = vrshr_n_s8(b, c); 8080 return vadd_s8( a, shift); 8081 } 8082 8083 _NEON2SSESTORAGE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VRSRA.S16 d0,d0,#16 8084 _NEON2SSE_INLINE int16x4_t vrsra_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) // VRSRA.S16 d0,d0,#16 8085 { 8086 int16x4_t shift; 8087 shift = vrshr_n_s16( b, c); 8088 return vadd_s16(a, shift); 8089 } 8090 8091 _NEON2SSESTORAGE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VRSRA.S32 d0,d0,#32 8092 _NEON2SSE_INLINE int32x2_t vrsra_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) // VRSRA.S32 d0,d0,#32 8093 { 8094 //may be not optimal compared with the serial execution 8095 int32x2_t shift; 8096 shift = vrshr_n_s32(b, c); 8097 return vadd_s32( a, shift); 8098 } 8099 8100 _NEON2SSESTORAGE int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VRSRA.S64 d0,d0,#64 8101 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vrsra_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution 8102 { 8103 int64x1_t shift; 8104 shift = vrshr_n_s64(b, c); 8105 return vadd_s64( a, shift); 8106 } 8107 8108 _NEON2SSESTORAGE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VRSRA.U8 d0,d0,#8 8109 _NEON2SSE_INLINE uint8x8_t vrsra_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c) // VRSRA.U8 d0,d0,#8 8110 { 8111 uint8x8_t shift; 8112 shift = vrshr_n_u8(b, c); 8113 return vadd_u8(a, shift); 8114 } 8115 8116 _NEON2SSESTORAGE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VRSRA.s16 d0,d0,#16 8117 _NEON2SSE_INLINE uint16x4_t vrsra_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c) // VRSRA.s16 d0,d0,#16 8118 { 8119 uint16x4_t shift; 8120 shift = vrshr_n_u16(b, c); 8121 return vadd_u16(a,shift); 8122 } 8123 8124 _NEON2SSESTORAGE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VRSRA.U32 d0,d0,#32 8125 _NEON2SSE_INLINE uint32x2_t vrsra_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c) // VRSRA.U32 d0,d0,#32 8126 { 8127 //may be not optimal compared with the serial execution 8128 
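//each lane becomes a + ((b + (1 << (c - 1))) >> c); the rounding bit is extracted separately inside vrshr_n_u32, so no oversized intermediate is needed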
uint32x2_t shift; 8129 shift = vrshr_n_u32(b, c); 8130 return vadd_u32( a, shift); 8131 } 8132 8133 _NEON2SSESTORAGE uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VRSRA.U64 d0,d0,#64 8134 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vrsra_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution 8135 { 8136 //may be not optimal compared with the serial execution 8137 uint64x1_t shift; 8138 shift = vrshr_n_u64(b, c); 8139 return vadd_u64( a, shift); 8140 } 8141 8142 _NEON2SSESTORAGE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 8143 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8 8144 { 8145 int8x16_t shift; 8146 shift = vrshrq_n_s8(b, c); 8147 return vaddq_s8(a, shift); 8148 } 8149 8150 _NEON2SSESTORAGE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 8151 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16 8152 { 8153 int16x8_t shift; 8154 shift = vrshrq_n_s16(b, c); 8155 return vaddq_s16(a, shift); 8156 } 8157 8158 _NEON2SSESTORAGE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 8159 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32 8160 { 8161 int32x4_t shift; 8162 shift = vrshrq_n_s32(b, c); 8163 return vaddq_s32(a, shift); 8164 } 8165 8166 _NEON2SSESTORAGE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 8167 _NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) 8168 { 8169 int64x2_t shift; 8170 shift = vrshrq_n_s64(b, c); 8171 return vaddq_s64(a, shift); 8172 } 8173 8174 _NEON2SSESTORAGE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 8175 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8 8176 { 8177 uint8x16_t shift; 8178 shift = vrshrq_n_u8(b, c); 8179 return vaddq_u8(a, shift); 8180 } 8181 8182 _NEON2SSESTORAGE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16 8183 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16 8184 { 8185 uint16x8_t shift; 8186 shift = vrshrq_n_u16(b, c); 8187 return vaddq_u16(a, shift); 8188 } 8189 8190 _NEON2SSESTORAGE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 8191 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32 8192 { 8193 uint32x4_t shift; 8194 shift = vrshrq_n_u32(b, c); 8195 return vaddq_u32(a, shift); 8196 } 8197 8198 _NEON2SSESTORAGE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 8199 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) 8200 { 8201 uint64x2_t shift; 8202 shift = vrshrq_n_u64(b, c); 8203 return vaddq_u64(a, shift); 8204 } 8205 8206 //**********************Vector saturating shift left by constant ***************************** 8207 //******************************************************************************************** 8208 //we don't check const ranges assuming 
they are met 8209 _NEON2SSESTORAGE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHL.S8 d0,d0,#0 8210 _NEON2SSE_INLINE int8x8_t vqshl_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHL.S8 d0,d0,#0 8211 { 8212 //no 8 bit shift available in IA32 SIMD, go to 16 bit. It also provides the auto saturation (in packs function) 8213 int8x8_t res64; 8214 __m128i a128, r128; 8215 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 8216 r128 = _mm_slli_epi16 (a128, b); 8217 r128 = _mm_packs_epi16 (r128,r128); //saturated s8, use 64 low bits only 8218 return64(r128); 8219 } 8220 8221 _NEON2SSESTORAGE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHL.S16 d0,d0,#0 8222 _NEON2SSE_INLINE int16x4_t vqshl_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHL.S16 d0,d0,#0 8223 { 8224 // go to 32 bit to get the auto saturation (in packs function) 8225 int16x4_t res64; 8226 __m128i a128, r128; 8227 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 8228 r128 = _mm_slli_epi32 (a128, b); //shift_res 8229 r128 = _mm_packs_epi32 (r128,r128); //saturated s16, use 64 low bits only 8230 return64(r128); 8231 } 8232 8233 _NEON2SSESTORAGE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHL.S32 d0,d0,#0 8234 _NEON2SSE_INLINE int32x2_t vqshl_n_s32(int32x2_t a, __constrange(0,31) int b) 8235 { 8236 //serial execution may be faster 8237 int32x2_t res64; 8238 return64(vqshlq_n_s32 (_pM128i(a), b)); 8239 } 8240 8241 8242 _NEON2SSESTORAGE int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHL.S64 d0,d0,#0 8243 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x1_t vqshl_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 8244 { 8245 // no effective SIMD solution here 8246 int64x1_t res; 8247 int64_t bmask; 8248 int64_t a_i64 = *( int64_t*)&a; 8249 bmask = ( int64_t)1 << (63 - b); //positive 8250 if (a_i64 >= bmask) { 8251 res.m64_i64[0] = ~(_SIGNBIT64); 8252 } else { 8253 res.m64_i64[0] = (a_i64 <= -bmask) ? 
_SIGNBIT64 : a_i64 << b; 8254 } 8255 return res; 8256 } 8257 8258 8259 _NEON2SSESTORAGE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b); // VQSHL.U8 d0,d0,#0 8260 _NEON2SSE_INLINE uint8x8_t vqshl_n_u8(uint8x8_t a, __constrange(0,7) int b) // VQSHL.U8 d0,d0,#0 8261 { 8262 //no 8 bit shift available in IA32 SIMD, go to 16 bit 8263 uint8x8_t res64; 8264 __m128i a128, r128; 8265 a128 = _MM_CVTEPU8_EPI16 (_pM128i(a)); //SSE 4.1 8266 r128 = _mm_slli_epi16 (a128, b); //shift_res 8267 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only 8268 return64(r128); 8269 } 8270 8271 _NEON2SSESTORAGE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b); // VQSHL.s16 d0,d0,#0 8272 _NEON2SSE_INLINE uint16x4_t vqshl_n_u16(uint16x4_t a, __constrange(0,15) int b) // VQSHL.s16 d0,d0,#0 8273 { 8274 // go to 32 bit to get the auto saturation (in packus function) 8275 uint16x4_t res64; 8276 __m128i a128, r128; 8277 a128 = _MM_CVTEPU16_EPI32 (_pM128i(a)); //SSE 4.1 8278 r128 = _mm_slli_epi32 (a128, b); //shift_res 8279 r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16 8280 return64(r128); 8281 } 8282 8283 _NEON2SSESTORAGE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b); // VQSHL.U32 d0,d0,#0 8284 _NEON2SSE_INLINE uint32x2_t vqshl_n_u32(uint32x2_t a, __constrange(0,31) int b) 8285 { 8286 uint32x2_t res64; 8287 return64(vqshlq_n_u32(_pM128i(a), b)); 8288 } 8289 8290 _NEON2SSESTORAGE uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b); // VQSHL.U64 d0,d0,#0 8291 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshl_n_u64(uint64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 8292 { 8293 // no effective SIMD solution here 8294 uint64x1_t res; 8295 uint64_t bmask; 8296 uint64_t a_i64 = *(uint64_t*)&a; 8297 bmask = ( uint64_t)1 << (64 - b); 8298 res.m64_u64[0] = (a_i64 >= bmask)&&(b>0) ? 
0xffffffffffffffff : a_i64 << b; //if b=0 we are fine with any a 8299 return res; 8300 } 8301 8302 _NEON2SSESTORAGE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 8303 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0 8304 { 8305 // go to 16 bit to get the auto saturation (in packs function) 8306 __m128i a128, r128_1, r128_2; 8307 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 8308 r128_1 = _mm_slli_epi16 (a128, b); 8309 //swap hi and low part of a128 to process the remaining data 8310 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 8311 a128 = _MM_CVTEPI8_EPI16 (a128); 8312 r128_2 = _mm_slli_epi16 (a128, b); 8313 return _mm_packs_epi16 (r128_1, r128_2); //saturated s8 8314 } 8315 8316 _NEON2SSESTORAGE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 8317 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0 8318 { 8319 // manual saturation solution looks LESS optimal than 32 bits conversion one 8320 // go to 32 bit to get the auto saturation (in packs function) 8321 __m128i a128, r128_1, r128_2; 8322 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 8323 r128_1 = _mm_slli_epi32 (a128, b); //shift_res 8324 //swap hi and low part of a128 to process the remaining data 8325 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 8326 a128 = _MM_CVTEPI16_EPI32 (a128); 8327 r128_2 = _mm_slli_epi32 (a128, b); 8328 return _mm_packs_epi32 (r128_1, r128_2); //saturated s16 8329 } 8330 8331 _NEON2SSESTORAGE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 8332 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0 8333 { 8334 // no 64 bit saturation option available, special tricks necessary 8335 __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask; 8336 c1 = _mm_cmpeq_epi32(a,a); //0xff..ff 8337 maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones 8338 saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise 8339 c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not 8340 shift_res = _mm_slli_epi32 (a, b); 8341 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); 8342 //result with positive numbers saturated 8343 shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask); 8344 //treat negative numbers 8345 maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros 8346 saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise 8347 c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not 8348 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); 8349 return _mm_or_si128 (c7ffffff_mask, shift_res_mask); 8350 } 8351 8352 _NEON2SSESTORAGE int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 8353 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 8354 { 8355 // no effective SIMD solution here 8356 _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2]; 8357 int64_t bmask; 8358 int i; 8359 bmask = ( int64_t)1 << (63 - b); //positive 8360 _mm_store_si128((__m128i*)atmp, a); 8361 for (i = 0; i<2; i++) { 8362 if (atmp[i] >= bmask) { 8363 res[i] = ~(_SIGNBIT64); 8364 } else { 8365 res[i] = (atmp[i] <= -bmask) 
? _SIGNBIT64 : atmp[i] << b; 8366 } 8367 } 8368 return _mm_load_si128((__m128i*)res); 8369 } 8370 8371 _NEON2SSESTORAGE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 8372 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0 8373 { 8374 // go to 16 bit to get the auto saturation (in packs function) 8375 __m128i a128, r128_1, r128_2; 8376 a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1 8377 r128_1 = _mm_slli_epi16 (a128, b); 8378 //swap hi and low part of a128 to process the remaining data 8379 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 8380 a128 = _MM_CVTEPU8_EPI16 (a128); 8381 r128_2 = _mm_slli_epi16 (a128, b); 8382 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 8383 } 8384 8385 _NEON2SSESTORAGE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0 8386 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0 8387 { 8388 // manual saturation solution looks more optimal than 32 bits conversion one 8389 __m128i cb, c8000, a_signed, saturation_mask, shift_res; 8390 cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); 8391 c8000 = _mm_set1_epi16 ((int16_t)0x8000); 8392 //no unsigned shorts comparison in SSE, only signed available, so need the trick 8393 a_signed = _mm_sub_epi16(a, c8000); //go to signed 8394 saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); 8395 shift_res = _mm_slli_epi16 (a, b); 8396 return _mm_or_si128 (shift_res, saturation_mask); 8397 } 8398 8399 _NEON2SSESTORAGE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 8400 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0 8401 { 8402 // manual saturation solution, no 64 bit saturation option, the serial version may be faster 8403 __m128i cb, c80000000, a_signed, saturation_mask, shift_res; 8404 cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 ); 8405 c80000000 = _mm_set1_epi32 (0x80000000); 8406 //no unsigned ints comparison in SSE, only signed available, so need the trick 8407 a_signed = _mm_sub_epi32(a, c80000000); //go to signed 8408 saturation_mask = _mm_cmpgt_epi32 (a_signed, cb); 8409 shift_res = _mm_slli_epi32 (a, b); 8410 return _mm_or_si128 (shift_res, saturation_mask); 8411 } 8412 8413 _NEON2SSESTORAGE uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 8414 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 8415 { 8416 // no effective SIMD solution here 8417 _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2]; 8418 uint64_t bmask; 8419 int i; 8420 bmask = ( uint64_t)1 << (64 - b); 8421 _mm_store_si128((__m128i*)atmp, a); 8422 for (i = 0; i<2; i++) { 8423 res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a 8424 } 8425 return _mm_load_si128((__m128i*)res); 8426 } 8427 8428 //**************Vector signed->unsigned saturating shift left by constant ************* 8429 //************************************************************************************* 8430 _NEON2SSESTORAGE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b); // VQSHLU.S8 d0,d0,#0 8431 _NEON2SSE_INLINE uint8x8_t vqshlu_n_s8(int8x8_t a, __constrange(0,7) int b) // VQSHLU.S8 d0,d0,#0 8432 { 8433 //no 8 bit shift available in IA32 SIMD, go to 16 bit. 
It also provides the auto saturation (in packs function) 8434 uint8x8_t res64; 8435 __m128i a128, r128; 8436 a128 = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 8437 r128 = _mm_slli_epi16 (a128, b); 8438 r128 = _mm_packus_epi16 (r128,r128); //saturated u8, use 64 low bits only 8439 return64(r128); 8440 } 8441 8442 _NEON2SSESTORAGE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b); // VQSHLU.S16 d0,d0,#0 8443 _NEON2SSE_INLINE uint16x4_t vqshlu_n_s16(int16x4_t a, __constrange(0,15) int b) // VQSHLU.S16 d0,d0,#0 8444 { 8445 uint16x4_t res64; 8446 __m128i a128, r128; 8447 a128 = _MM_CVTEPI16_EPI32 (_pM128i(a)); //SSE 4.1 8448 r128 = _mm_slli_epi32 (a128, b); //shift_res 8449 r128 = _MM_PACKUS1_EPI32 (r128); //saturated s16, use 64 low bits only 8450 return64(r128); 8451 } 8452 8453 _NEON2SSESTORAGE uint32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b); // VQSHLU.S32 d0,d0,#0 8454 _NEON2SSE_INLINE int32x2_t vqshlu_n_s32(int32x2_t a, __constrange(0,31) int b) 8455 { 8456 int32x2_t res64; 8457 return64( vqshluq_n_s32(_pM128i(a), b)); 8458 } 8459 8460 _NEON2SSESTORAGE uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b); // VQSHLU.S64 d0,d0,#0 8461 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x1_t vqshlu_n_s64(int64x1_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) // no effective SIMD solution here, serial execution looks faster 8462 { 8463 uint64x1_t res; 8464 uint64_t limit; 8465 if (a.m64_i64[0]<=0) { 8466 res.m64_u64[0] = 0; 8467 } else { 8468 limit = (uint64_t) 1 << (64 - b); 8469 res.m64_u64[0] = ( ((uint64_t)a.m64_i64[0]) >= limit) ? res.m64_u64[0] = ~((uint64_t)0) : a.m64_i64[0] << b; 8470 } 8471 return res; 8472 } 8473 8474 _NEON2SSESTORAGE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 8475 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0 8476 { 8477 __m128i a128, r128_1, r128_2; 8478 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 8479 r128_1 = _mm_slli_epi16 (a128, b); 8480 //swap hi and low part of a128 to process the remaining data 8481 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 8482 a128 = _MM_CVTEPI8_EPI16 (a128); 8483 r128_2 = _mm_slli_epi16 (a128, b); 8484 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 8485 } 8486 8487 _NEON2SSESTORAGE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 8488 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0 8489 { 8490 // manual saturation solution looks LESS optimal than 32 bits conversion one 8491 __m128i a128, r128_1, r128_2; 8492 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 8493 r128_1 = _mm_slli_epi32 (a128, b); //shift_res 8494 //swap hi and low part of a128 to process the remaining data 8495 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 8496 a128 = _MM_CVTEPI16_EPI32 (a128); 8497 r128_2 = _mm_slli_epi32 (a128, b); 8498 return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16 8499 } 8500 8501 _NEON2SSESTORAGE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 8502 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0 8503 { 8504 //solution may be not optimal compared with the serial one 8505 __m128i zero, maskA, maskGT0, a0, a_masked, a_shift; 8506 zero = _mm_setzero_si128(); 8507 maskA = _mm_cmpeq_epi32(a, a); 8508 maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros 8509 //saturate negative numbers to zero 8510 
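//lanes that are <= 0 must produce 0 in VQSHLU, so first build a mask of the strictly positive lanes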
maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers) 8511 a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now 8512 //saturate positive to 0xffffffff 8513 a_masked = _mm_and_si128 (a0, maskA); 8514 a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise 8515 a_shift = _mm_slli_epi32 (a0, b); 8516 return _mm_or_si128 (a_shift, a_masked); //actual saturation 8517 } 8518 8519 _NEON2SSESTORAGE uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 8520 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 8521 { 8522 // no effective SIMD solution here, serial execution looks faster 8523 _NEON2SSE_ALIGN_16 int64_t atmp[2]; 8524 _NEON2SSE_ALIGN_16 uint64_t res[2]; 8525 uint64_t limit; 8526 int i; 8527 _mm_store_si128((__m128i*)atmp, a); 8528 for (i = 0; i<2; i++) { 8529 if (atmp[i]<=0) { 8530 res[i] = 0; 8531 } else { 8532 limit = (uint64_t) 1 << (64 - b); 8533 res[i] = ( ((uint64_t)atmp[i]) >= limit) ? res[i] = ~((uint64_t)0) : atmp[i] << b; 8534 } 8535 } 8536 return _mm_load_si128((__m128i*)res); 8537 } 8538 8539 //************** Vector narrowing shift right by constant ************** 8540 //********************************************************************** 8541 _NEON2SSESTORAGE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 8542 _NEON2SSE_INLINE int8x8_t vshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 8543 { 8544 int8x8_t res64; 8545 __m128i r16; 8546 r16 = vshrq_n_s16(a,b); 8547 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems 8548 return64(r16); 8549 } 8550 8551 _NEON2SSESTORAGE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 8552 _NEON2SSE_INLINE int16x4_t vshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 8553 { 8554 int16x4_t res64; 8555 __m128i r32; 8556 r32 = vshrq_n_s32(a,b); 8557 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. 
Impossible to use _mm_packs because of negative saturation problems 8558 return64(r32); 8559 } 8560 8561 _NEON2SSESTORAGE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 8562 _NEON2SSE_INLINE int32x2_t vshrn_n_s64(int64x2_t a, __constrange(1,32) int b) 8563 { 8564 int32x2_t res64; 8565 __m128i r64; 8566 r64 = vshrq_n_s64(a,b); 8567 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 8568 return64(r64); 8569 } 8570 8571 _NEON2SSESTORAGE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VSHRN.I16 d0,q0,#8 8572 _NEON2SSE_INLINE uint8x8_t vshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VSHRN.I16 d0,q0,#8 8573 { 8574 uint8x8_t res64; 8575 __m128i mask, r16; 8576 mask = _mm_set1_epi16(0xff); 8577 r16 = vshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) 8578 r16 = _mm_and_si128(r16, mask); //to avoid saturation 8579 r16 = _mm_packus_epi16 (r16,r16); //narrow, use low 64 bits only 8580 return64(r16); 8581 } 8582 8583 _NEON2SSESTORAGE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VSHRN.I32 d0,q0,#16 8584 _NEON2SSE_INLINE uint16x4_t vshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VSHRN.I32 d0,q0,#16 8585 { 8586 uint16x4_t res64; 8587 __m128i mask, r32; 8588 mask = _mm_set1_epi32(0xffff); 8589 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 16) 8590 r32 = _mm_and_si128(r32, mask); //to avoid saturation 8591 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only 8592 return64(r32); 8593 } 8594 8595 _NEON2SSESTORAGE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VSHRN.I64 d0,q0,#32 8596 _NEON2SSE_INLINE uint32x2_t vshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) 8597 { 8598 uint32x2_t res64; 8599 __m128i r64; 8600 r64 = vshrq_n_u64(a,b); 8601 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 8602 return64(r64); 8603 } 8604 8605 //************** Vector signed->unsigned narrowing saturating shift right by constant ******** 8606 //********************************************************************************************* 8607 _NEON2SSESTORAGE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRUN.S16 d0,q0,#8 8608 _NEON2SSE_INLINE uint8x8_t vqshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRUN.S16 d0,q0,#8 8609 { 8610 uint8x8_t res64; 8611 __m128i r16; 8612 r16 = vshrq_n_s16(a,b); 8613 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only 8614 return64(r16); 8615 } 8616 8617 _NEON2SSESTORAGE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRUN.S32 d0,q0,#16 8618 _NEON2SSE_INLINE uint16x4_t vqshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRUN.S32 d0,q0,#16 8619 { 8620 uint16x4_t res64; 8621 __m128i r32; 8622 r32 = vshrq_n_s32(a,b); 8623 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow(signed to unsigned), use low 64 bits only 8624 return64(r32); 8625 } 8626 8627 _NEON2SSESTORAGE uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRUN.S64 d0,q0,#32 8628 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster 8629 { 8630 _NEON2SSE_ALIGN_16 int64_t atmp[2]; 8631 
    uint32x2_t res;
    int64_t res64;
    _mm_store_si128((__m128i*)atmp, a);
    if (atmp[0] < 0) {
        res.m64_u32[0] = 0;
    } else {
        res64 = (atmp[0] >> b);
        res.m64_u32[0] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t) res64;
    }
    if (atmp[1] < 0) {
        res.m64_u32[1] = 0;
    } else {
        res64 = (atmp[1] >> b);
        res.m64_u32[1] = (res64 > (int64_t)0xffffffff) ? 0xffffffff : (uint32_t)res64;
    }
    return res;
}

//**** Vector signed->unsigned rounding narrowing saturating shift right by constant *****
_NEON2SSESTORAGE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRUN.S16 d0,q0,#8
_NEON2SSE_INLINE uint8x8_t vqrshrun_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRUN.S16 d0,q0,#8
{
    //solution may be not optimal compared with the serial one
    __m128i r16;
    uint8x8_t res64;
    r16 = vrshrq_n_s16(a,b);
    r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow (signed to unsigned), use low 64 bits only
    return64(r16);
}

_NEON2SSESTORAGE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRUN.S32 d0,q0,#16
_NEON2SSE_INLINE uint16x4_t vqrshrun_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRUN.S32 d0,q0,#16
{
    //solution may be not optimal compared with the serial one
    __m128i r32;
    uint16x4_t res64;
    r32 = vrshrq_n_s32(a,b);
    r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow (signed to unsigned), use low 64 bits only
    return64(r32);
}

_NEON2SSESTORAGE uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRUN.S64 d0,q0,#32
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vqrshrun_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_SERIAL) //serial solution is faster
{
    _NEON2SSE_ALIGN_16 int64_t atmp[2];
    uint32x2_t res;
    int64_t res64;
    _mm_store_si128((__m128i*)atmp, a);
    if (atmp[0] < 0) {
        res.m64_u32[0] = 0;
    } else {
        res64 = (atmp[0] >> b) + ( (atmp[0] & ((int64_t)1 << (b - 1))) >> (b - 1) );
        res.m64_u32[0] = (uint32_t) ((res64 > (int64_t)0xffffffff ) ? 0xffffffff : res64);
    }
    if (atmp[1] < 0) {
        res.m64_u32[1] = 0;
    } else {
        res64 = (atmp[1] >> b) + ( (atmp[1] & ((int64_t)1 << (b - 1))) >> (b - 1) ); //rounding bit must come from atmp[1], not atmp[0]
        res.m64_u32[1] = (uint32_t)((res64 > (int64_t)0xffffffff ) ?
0xffffffff : res64); 8690 } 8691 return res; 8692 } 8693 8694 //***** Vector narrowing saturating shift right by constant ****** 8695 //***************************************************************** 8696 _NEON2SSESTORAGE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQSHRN.S16 d0,q0,#8 8697 _NEON2SSE_INLINE int8x8_t vqshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQSHRN.S16 d0,q0,#8 8698 { 8699 int8x8_t res64; 8700 __m128i r16; 8701 r16 = vshrq_n_s16(a,b); 8702 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only 8703 return64(r16); 8704 } 8705 8706 _NEON2SSESTORAGE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQSHRN.S32 d0,q0,#16 8707 _NEON2SSE_INLINE int16x4_t vqshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQSHRN.S32 d0,q0,#16 8708 { 8709 int16x4_t res64; 8710 __m128i r32; 8711 r32 = vshrq_n_s32(a,b); 8712 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only 8713 return64(r32); 8714 } 8715 8716 _NEON2SSESTORAGE int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQSHRN.S64 d0,q0,#32 8717 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 8718 { 8719 //no optimal SIMD solution found 8720 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2]; 8721 int32x2_t res; 8722 _mm_store_si128((__m128i*)atmp, a); 8723 res64[0] = (atmp[0] >> b); 8724 res64[1] = (atmp[1] >> b); 8725 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; 8726 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; 8727 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; 8728 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; 8729 res.m64_i32[0] = (int32_t)res64[0]; 8730 res.m64_i32[1] = (int32_t)res64[1]; 8731 return res; 8732 } 8733 8734 _NEON2SSESTORAGE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQSHRN.s16 d0,q0,#8 8735 _NEON2SSE_INLINE uint8x8_t vqshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQSHRN.s16 d0,q0,#8 8736 { 8737 uint8x8_t res64; 8738 __m128i r16; 8739 r16 = vshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) 8740 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only 8741 return64(r16); 8742 } 8743 8744 _NEON2SSESTORAGE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQSHRN.U32 d0,q0,#16 8745 _NEON2SSE_INLINE uint16x4_t vqshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQSHRN.U32 d0,q0,#16 8746 { 8747 uint16x4_t res64; 8748 __m128i r32; 8749 r32 = vshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) 8750 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only 8751 return64(r32); 8752 } 8753 8754 _NEON2SSESTORAGE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQSHRN.U64 d0,q0,#32 8755 _NEON2SSE_INLINE uint32x2_t vqshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) 8756 { 8757 //serial solution may be faster 8758 uint32x2_t res64; 8759 __m128i r64, res_hi, zero; 8760 zero = _mm_setzero_si128(); 8761 r64 = vshrq_n_u64(a,b); 8762 res_hi = _mm_srli_epi64(r64, 32); 8763 res_hi = _mm_cmpgt_epi32(res_hi, zero); 8764 r64 = _mm_or_si128(r64, res_hi); 8765 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 8766 return64(r64); 8767 } 8768 8769 8770 //********* Vector rounding narrowing shift right by constant 
************************* 8771 //**************************************************************************************** 8772 _NEON2SSESTORAGE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 8773 _NEON2SSE_INLINE int8x8_t vrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 8774 { 8775 int8x8_t res64; 8776 __m128i r16; 8777 r16 = vrshrq_n_s16(a,b); 8778 r16 = _mm_shuffle_epi8 (r16, *(__m128i*) mask8_16_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems 8779 return64(r16); 8780 } 8781 8782 _NEON2SSESTORAGE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 8783 _NEON2SSE_INLINE int16x4_t vrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 8784 { 8785 int16x4_t res64; 8786 __m128i r32; 8787 r32 = vrshrq_n_s32(a,b); 8788 r32 = _mm_shuffle_epi8 (r32, *(__m128i*) mask8_32_even_odd); //narrow, use low 64 bits only. Impossible to use _mm_packs because of negative saturation problems 8789 return64(r32); 8790 } 8791 8792 _NEON2SSESTORAGE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 8793 _NEON2SSE_INLINE int32x2_t vrshrn_n_s64(int64x2_t a, __constrange(1,32) int b) 8794 { 8795 int32x2_t res64; 8796 __m128i r64; 8797 r64 = vrshrq_n_s64(a,b); 8798 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 8799 return64(r64); 8800 } 8801 8802 _NEON2SSESTORAGE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VRSHRN.I16 d0,q0,#8 8803 _NEON2SSE_INLINE uint8x8_t vrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VRSHRN.I16 d0,q0,#8 8804 { 8805 uint8x8_t res64; 8806 __m128i mask, r16; 8807 mask = _mm_set1_epi16(0xff); 8808 r16 = vrshrq_n_s16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) 8809 r16 = _mm_and_si128(r16, mask); //to avoid saturation 8810 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only 8811 return64(r16); 8812 } 8813 8814 _NEON2SSESTORAGE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VRSHRN.I32 d0,q0,#16 8815 _NEON2SSE_INLINE uint16x4_t vrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VRSHRN.I32 d0,q0,#16 8816 { 8817 uint16x4_t res64; 8818 __m128i mask, r32; 8819 mask = _mm_set1_epi32(0xffff); 8820 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) 8821 r32 = _mm_and_si128(r32, mask); //to avoid saturation 8822 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only 8823 return64(r32); 8824 } 8825 8826 _NEON2SSESTORAGE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VRSHRN.I64 d0,q0,#32 8827 _NEON2SSE_INLINE uint32x2_t vrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) //serial solution may be faster 8828 { 8829 uint32x2_t res64; 8830 __m128i r64; 8831 r64 = vrshrq_n_u64(a,b); 8832 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 8833 return64(r64); 8834 } 8835 8836 //************* Vector rounding narrowing saturating shift right by constant ************ 8837 //**************************************************************************************** 8838 _NEON2SSESTORAGE int8x8_t vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b); // VQRSHRN.S16 d0,q0,#8 8839 _NEON2SSE_INLINE int8x8_t 
vqrshrn_n_s16(int16x8_t a, __constrange(1,8) int b) // VQRSHRN.S16 d0,q0,#8 8840 { 8841 int8x8_t res64; 8842 __m128i r16; 8843 r16 = vrshrq_n_s16(a,b); 8844 r16 = _mm_packs_epi16 (r16,r16); //saturate and narrow, use low 64 bits only 8845 return64(r16); 8846 } 8847 8848 _NEON2SSESTORAGE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b); // VQRSHRN.S32 d0,q0,#16 8849 _NEON2SSE_INLINE int16x4_t vqrshrn_n_s32(int32x4_t a, __constrange(1,16) int b) // VQRSHRN.S32 d0,q0,#16 8850 { 8851 int16x4_t res64; 8852 __m128i r32; 8853 r32 = vrshrq_n_s32(a,b); 8854 r32 = _mm_packs_epi32 (r32,r32); //saturate and narrow, use low 64 bits only 8855 return64(r32); 8856 } 8857 8858 _NEON2SSESTORAGE int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b); // VQRSHRN.S64 d0,q0,#32 8859 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrshrn_n_s64(int64x2_t a, __constrange(1,32) int b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 8860 { 8861 //no optimal SIMD solution found 8862 _NEON2SSE_ALIGN_16 int64_t res64[2], atmp[2], maskb[2]; 8863 int32x2_t res; 8864 _mm_store_si128((__m128i*)atmp, a); 8865 maskb[0] = atmp[0] & (( int64_t)1 << (b - 1)); 8866 res64[0] = (atmp[0] >> b) + (maskb[0] >> (b - 1)); //rounded result 8867 maskb[1] = atmp[1] & (( int64_t)1 << (b - 1)); 8868 res64[1] = (atmp[1] >> b) + (maskb[1] >> (b - 1)); //rounded result 8869 if(res64[0]>SINT_MAX) res64[0] = SINT_MAX; 8870 if(res64[0]<SINT_MIN) res64[0] = SINT_MIN; 8871 if(res64[1]>SINT_MAX) res64[1] = SINT_MAX; 8872 if(res64[1]<SINT_MIN) res64[1] = SINT_MIN; 8873 res.m64_i32[0] = (int32_t)res64[0]; 8874 res.m64_i32[1] = (int32_t)res64[1]; 8875 return res; 8876 } 8877 8878 _NEON2SSESTORAGE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b); // VQRSHRN.s16 d0,q0,#8 8879 _NEON2SSE_INLINE uint8x8_t vqrshrn_n_u16(uint16x8_t a, __constrange(1,8) int b) // VQRSHRN.s16 d0,q0,#8 8880 { 8881 uint8x8_t res64; 8882 __m128i r16; 8883 r16 = vrshrq_n_u16(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _mm_packus_epi16 (signed 16 to unsigned 8) 8884 r16 = _mm_packus_epi16 (r16,r16); //saturate and narrow, use low 64 bits only 8885 return64(r16); 8886 } 8887 8888 _NEON2SSESTORAGE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b); // VQRSHRN.U32 d0,q0,#16 8889 _NEON2SSE_INLINE uint16x4_t vqrshrn_n_u32(uint32x4_t a, __constrange(1,16) int b) // VQRSHRN.U32 d0,q0,#16 8890 { 8891 uint16x4_t res64; 8892 __m128i r32; 8893 r32 = vrshrq_n_u32(a,b); //after right shift b>=1 unsigned var fits into signed range, so we could use _MM_PACKUS_EPI32 (signed 32 to unsigned 8) 8894 r32 = _MM_PACKUS1_EPI32 (r32); //saturate and narrow, use low 64 bits only 8895 return64(r32); 8896 } 8897 8898 _NEON2SSESTORAGE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b); // VQRSHRN.U64 d0,q0,#32 8899 _NEON2SSE_INLINE uint32x2_t vqrshrn_n_u64(uint64x2_t a, __constrange(1,32) int b) 8900 { 8901 //serial solution may be faster 8902 uint32x2_t res64; 8903 __m128i r64, res_hi, zero; 8904 zero = _mm_setzero_si128(); 8905 r64 = vrshrq_n_u64(a,b); 8906 res_hi = _mm_srli_epi64(r64, 32); 8907 res_hi = _mm_cmpgt_epi32(res_hi, zero); 8908 r64 = _mm_or_si128(r64, res_hi); 8909 r64 = _mm_shuffle_epi32(r64, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 8910 return64(r64); 8911 } 8912 8913 //************** Vector widening shift left by constant **************** 8914 //************************************************************************ 8915 _NEON2SSESTORAGE int16x8_t 
vshll_n_s8(int8x8_t a, __constrange(0,8) int b); // VSHLL.S8 q0,d0,#0 8916 _NEON2SSE_INLINE int16x8_t vshll_n_s8(int8x8_t a, __constrange(0,8) int b) // VSHLL.S8 q0,d0,#0 8917 { 8918 __m128i r; 8919 r = _MM_CVTEPI8_EPI16 (_pM128i(a)); //SSE 4.1 8920 return _mm_slli_epi16 (r, b); 8921 } 8922 8923 _NEON2SSESTORAGE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b); // VSHLL.S16 q0,d0,#0 8924 _NEON2SSE_INLINE int32x4_t vshll_n_s16(int16x4_t a, __constrange(0,16) int b) // VSHLL.S16 q0,d0,#0 8925 { 8926 __m128i r; 8927 r = _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1, 8928 return _mm_slli_epi32 (r, b); 8929 } 8930 8931 _NEON2SSESTORAGE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b); // VSHLL.S32 q0,d0,#0 8932 _NEON2SSE_INLINE int64x2_t vshll_n_s32(int32x2_t a, __constrange(0,32) int b) // VSHLL.S32 q0,d0,#0 8933 { 8934 __m128i r; 8935 r = _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1, 8936 return _mm_slli_epi64 (r, b); 8937 } 8938 8939 _NEON2SSESTORAGE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b); // VSHLL.U8 q0,d0,#0 8940 _NEON2SSE_INLINE uint16x8_t vshll_n_u8(uint8x8_t a, __constrange(0,8) int b) // VSHLL.U8 q0,d0,#0 8941 { 8942 //no uint8 to uint16 conversion available, manual conversion used 8943 __m128i zero, r; 8944 zero = _mm_setzero_si128 (); 8945 r = _mm_unpacklo_epi8(_pM128i(a), zero); 8946 return _mm_slli_epi16 (r, b); 8947 } 8948 8949 _NEON2SSESTORAGE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b); // VSHLL.s16 q0,d0,#0 8950 _NEON2SSE_INLINE uint32x4_t vshll_n_u16(uint16x4_t a, __constrange(0,16) int b) // VSHLL.s16 q0,d0,#0 8951 { 8952 //no uint16 to uint32 conversion available, manual conversion used 8953 __m128i zero, r; 8954 zero = _mm_setzero_si128 (); 8955 r = _mm_unpacklo_epi16(_pM128i(a), zero); 8956 return _mm_slli_epi32 (r, b); 8957 } 8958 8959 _NEON2SSESTORAGE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b); // VSHLL.U32 q0,d0,#0 8960 _NEON2SSE_INLINE uint64x2_t vshll_n_u32(uint32x2_t a, __constrange(0,32) int b) // VSHLL.U32 q0,d0,#0 8961 { 8962 //no uint32 to uint64 conversion available, manual conversion used 8963 __m128i zero, r; 8964 zero = _mm_setzero_si128 (); 8965 r = _mm_unpacklo_epi32(_pM128i(a), zero); 8966 return _mm_slli_epi64 (r, b); 8967 } 8968 8969 //************************************************************************************ 8970 //**************************** Shifts with insert ************************************ 8971 //************************************************************************************ 8972 //takes each element in a vector, shifts them by an immediate value, 8973 //and inserts the results in the destination vector. Bits shifted out of the each element are lost. 8974 8975 //**************** Vector shift right and insert ************************************ 8976 //Actually the "c" left bits from "a" are the only bits remained from "a" after the shift. 8977 //All other bits are taken from b shifted. 
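//A minimal scalar sketch of one 8-bit lane of the operation described above (illustration only;
//the helper name vsri8_one_lane and its parameters are hypothetical, they are not part of this header):
/*
    uint8_t vsri8_one_lane(uint8_t a, uint8_t b, int c) // c in [1,8]
    {
        uint8_t mask_a = (uint8_t)(0xFF << (8 - c));          // c ones at the top: keeps the left c bits of a
        return (uint8_t)((a & mask_a) | (uint8_t)(b >> c));   // all remaining bits come from b shifted right by c
    }
*/
//The SIMD implementations below do the same per element, building the "a" mask and OR-ing it with the logically shifted b.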
8978 _NEON2SSESTORAGE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 8979 _NEON2SSE_INLINE int8x8_t vsri_n_s8(int8x8_t a, int8x8_t b, __constrange(1,8) int c) 8980 { 8981 int8x8_t res64; 8982 return64(vsriq_n_s8(_pM128i(a),_pM128i(b), c)); 8983 } 8984 8985 8986 _NEON2SSESTORAGE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 8987 _NEON2SSE_INLINE int16x4_t vsri_n_s16(int16x4_t a, int16x4_t b, __constrange(1,16) int c) 8988 { 8989 int16x4_t res64; 8990 return64(vsriq_n_s16(_pM128i(a),_pM128i(b), c)); 8991 } 8992 8993 8994 _NEON2SSESTORAGE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 8995 _NEON2SSE_INLINE int32x2_t vsri_n_s32(int32x2_t a, int32x2_t b, __constrange(1,32) int c) 8996 { 8997 int32x2_t res64; 8998 return64(vsriq_n_s32(_pM128i(a),_pM128i(b), c)); 8999 } 9000 9001 9002 _NEON2SSESTORAGE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 9003 _NEON2SSE_INLINE int64x1_t vsri_n_s64(int64x1_t a, int64x1_t b, __constrange(1,64) int c) 9004 { 9005 int64x1_t res; 9006 if (c ==64) 9007 res = a; 9008 else{ 9009 res.m64_i64[0] = (b.m64_u64[0] >> c) | ((a.m64_i64[0] >> (64 - c)) << (64 - c)); //treat b as unsigned for shift to get leading zeros 9010 } 9011 return res; 9012 } 9013 9014 _NEON2SSESTORAGE uint8x8_t vsri_n_u8(uint8x8_t a, uint8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 9015 #define vsri_n_u8 vsri_n_s8 9016 9017 _NEON2SSESTORAGE uint16x4_t vsri_n_u16(uint16x4_t a, uint16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 9018 #define vsri_n_u16 vsri_n_s16 9019 9020 _NEON2SSESTORAGE uint32x2_t vsri_n_u32(uint32x2_t a, uint32x2_t b, __constrange(1,32) int c); // VSRI.32 d0,d0,#32 9021 #define vsri_n_u32 vsri_n_s32 9022 9023 9024 _NEON2SSESTORAGE uint64x1_t vsri_n_u64(uint64x1_t a, uint64x1_t b, __constrange(1,64) int c); // VSRI.64 d0,d0,#64 9025 #define vsri_n_u64 vsri_n_s64 9026 9027 _NEON2SSESTORAGE poly8x8_t vsri_n_p8(poly8x8_t a, poly8x8_t b, __constrange(1,8) int c); // VSRI.8 d0,d0,#8 9028 #define vsri_n_p8 vsri_n_u8 9029 9030 _NEON2SSESTORAGE poly16x4_t vsri_n_p16(poly16x4_t a, poly16x4_t b, __constrange(1,16) int c); // VSRI.16 d0,d0,#16 9031 #define vsri_n_p16 vsri_n_u16 9032 9033 _NEON2SSESTORAGE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 9034 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8 9035 { 9036 __m128i maskA, a_masked; 9037 uint8x16_t b_shift; 9038 _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used 9039 maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros 9040 a_masked = _mm_and_si128 (a, maskA); 9041 b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift 9042 return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a) 9043 } 9044 9045 _NEON2SSESTORAGE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 9046 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16 9047 { 9048 //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a 9049 uint16x8_t b_shift; 9050 uint16x8_t a_c; 9051 b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift 9052 a_c = vshrq_n_u16( a, (16 - c)); 9053 a_c = 
_mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a 9054 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) 9055 } 9056 9057 _NEON2SSESTORAGE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 9058 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32 9059 { 9060 //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a 9061 uint32x4_t b_shift; 9062 uint32x4_t a_c; 9063 b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift 9064 a_c = vshrq_n_u32( a, (32 - c)); 9065 a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a 9066 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) 9067 } 9068 9069 _NEON2SSESTORAGE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 9070 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) 9071 { 9072 //serial solution may be faster 9073 uint64x2_t b_shift; 9074 uint64x2_t a_c; 9075 b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift 9076 a_c = _mm_srli_epi64(a, (64 - c)); 9077 a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a 9078 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) 9079 } 9080 9081 _NEON2SSESTORAGE uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 9082 #define vsriq_n_u8 vsriq_n_s8 9083 9084 _NEON2SSESTORAGE uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 9085 #define vsriq_n_u16 vsriq_n_s16 9086 9087 _NEON2SSESTORAGE uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 9088 #define vsriq_n_u32 vsriq_n_s32 9089 9090 _NEON2SSESTORAGE uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 9091 #define vsriq_n_u64 vsriq_n_s64 9092 9093 _NEON2SSESTORAGE poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 9094 #define vsriq_n_p8 vsriq_n_u8 9095 9096 _NEON2SSESTORAGE poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 9097 #define vsriq_n_p16 vsriq_n_u16 9098 9099 //***** Vector shift left and insert ********************************************* 9100 //********************************************************************************* 9101 //Actually the "c" right bits from "a" are the only bits remained from "a" after the shift. 9102 //All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted". 
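//A minimal scalar sketch of one 8-bit lane of the operation described above (illustration only;
//the helper name vsli8_one_lane and its parameters are hypothetical, they are not part of this header):
/*
    uint8_t vsli8_one_lane(uint8_t a, uint8_t b, int c) // c in [0,7]
    {
        uint8_t mask_a = (uint8_t)(0xFF >> (8 - c));          // c ones at the bottom: keeps the right c bits of a
        return (uint8_t)((uint8_t)(b << c) | (a & mask_a));   // all remaining bits come from b shifted left by c
    }
*/
//The SIMD implementations below do the same per element, combining the shifted b with the masked low bits of a.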
9103 _NEON2SSESTORAGE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 9104 _NEON2SSE_INLINE int8x8_t vsli_n_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c) 9105 { 9106 int8x8_t res64; 9107 return64(vsliq_n_s8(_pM128i(a),_pM128i(b), c)); 9108 } 9109 9110 9111 _NEON2SSESTORAGE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 9112 _NEON2SSE_INLINE int16x4_t vsli_n_s16(int16x4_t a, int16x4_t b, __constrange(0,15) int c) 9113 { 9114 int16x4_t res64; 9115 return64(vsliq_n_s16(_pM128i(a),_pM128i(b), c)); 9116 } 9117 9118 9119 _NEON2SSESTORAGE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 9120 _NEON2SSE_INLINE int32x2_t vsli_n_s32(int32x2_t a, int32x2_t b, __constrange(0,31) int c) 9121 { 9122 int32x2_t res64; 9123 return64(vsliq_n_s32(_pM128i(a),_pM128i(b), c)); 9124 } 9125 9126 _NEON2SSESTORAGE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 9127 _NEON2SSE_INLINE int64x1_t vsli_n_s64(int64x1_t a, int64x1_t b, __constrange(0,63) int c) 9128 { 9129 int64x1_t res; 9130 res.m64_i64[0] = (b.m64_i64[0] << c) | ((a.m64_u64[0] << (64 - c)) >> (64 - c)); //need to treat a as unsigned to get leading zeros 9131 return res; 9132 } 9133 9134 9135 _NEON2SSESTORAGE uint8x8_t vsli_n_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 9136 #define vsli_n_u8 vsli_n_s8 9137 9138 _NEON2SSESTORAGE uint16x4_t vsli_n_u16(uint16x4_t a, uint16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 9139 #define vsli_n_u16 vsli_n_s16 9140 9141 _NEON2SSESTORAGE uint32x2_t vsli_n_u32(uint32x2_t a, uint32x2_t b, __constrange(0,31) int c); // VSLI.32 d0,d0,#0 9142 #define vsli_n_u32 vsli_n_s32 9143 9144 _NEON2SSESTORAGE uint64x1_t vsli_n_u64(uint64x1_t a, uint64x1_t b, __constrange(0,63) int c); // VSLI.64 d0,d0,#0 9145 #define vsli_n_u64 vsli_n_s64 9146 9147 _NEON2SSESTORAGE poly8x8_t vsli_n_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VSLI.8 d0,d0,#0 9148 #define vsli_n_p8 vsli_n_u8 9149 9150 _NEON2SSESTORAGE poly16x4_t vsli_n_p16(poly16x4_t a, poly16x4_t b, __constrange(0,15) int c); // VSLI.16 d0,d0,#0 9151 #define vsli_n_p16 vsli_n_u16 9152 9153 _NEON2SSESTORAGE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 9154 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0 9155 { 9156 __m128i maskA, a_masked; 9157 int8x16_t b_shift; 9158 _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask 9159 maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones 9160 b_shift = vshlq_n_s8( b, c); 9161 a_masked = _mm_and_si128 (a, maskA); 9162 return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a) 9163 } 9164 9165 _NEON2SSESTORAGE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 9166 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0 9167 { 9168 //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a 9169 int16x8_t b_shift; 9170 int16x8_t a_c; 9171 b_shift = vshlq_n_s16( b, c); 9172 a_c = vshlq_n_s16( a, (16 - c)); 9173 a_c = _mm_srli_epi16(a_c, (16 - c)); 9174 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) 9175 } 9176 9177 _NEON2SSESTORAGE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, 
__constrange(0,31) int c); // VSLI.32 q0,q0,#0 9178 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0 9179 { 9180 //solution may be not optimal compared with the serial one 9181 //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a 9182 int32x4_t b_shift; 9183 int32x4_t a_c; 9184 b_shift = vshlq_n_s32( b, c); 9185 a_c = vshlq_n_s32( a, (32 - c)); 9186 a_c = _mm_srli_epi32(a_c, (32 - c)); 9187 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) 9188 } 9189 9190 _NEON2SSESTORAGE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 9191 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0 9192 { 9193 //solution may be not optimal compared with the serial one 9194 //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a 9195 int64x2_t b_shift; 9196 int64x2_t a_c; 9197 b_shift = vshlq_n_s64( b, c); 9198 a_c = vshlq_n_s64( a, (64 - c)); 9199 a_c = _mm_srli_epi64(a_c, (64 - c)); 9200 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) 9201 } 9202 9203 _NEON2SSESTORAGE uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 9204 #define vsliq_n_u8 vsliq_n_s8 9205 9206 _NEON2SSESTORAGE uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 9207 #define vsliq_n_u16 vsliq_n_s16 9208 9209 _NEON2SSESTORAGE uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 9210 #define vsliq_n_u32 vsliq_n_s32 9211 9212 _NEON2SSESTORAGE uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 9213 #define vsliq_n_u64 vsliq_n_s64 9214 9215 _NEON2SSESTORAGE poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 9216 #define vsliq_n_p8 vsliq_n_u8 9217 9218 _NEON2SSESTORAGE poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 9219 #define vsliq_n_p16 vsliq_n_u16 9220 9221 // *********************************************************************************************** 9222 // ****************** Loads and stores of a single vector *************************************** 9223 // *********************************************************************************************** 9224 //Performs loads and stores of a single vector of some type. 9225 //******************************* Loads ******************************************************** 9226 // *********************************************************************************************** 9227 //We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);. 9228 //also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous. 9229 // it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access 9230 //If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; 9231 #define LOAD_SI128(ptr) \ 9232 ( ((uintptr_t)(ptr) & 15) == 0 ) ? 
_mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)) 9233 9234 _NEON2SSESTORAGE uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 9235 #define vld1q_u8 LOAD_SI128 9236 9237 _NEON2SSESTORAGE uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 9238 #define vld1q_u16 LOAD_SI128 9239 9240 _NEON2SSESTORAGE uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 9241 #define vld1q_u32 LOAD_SI128 9242 9243 _NEON2SSESTORAGE uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 9244 #define vld1q_u64 LOAD_SI128 9245 9246 _NEON2SSESTORAGE int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 9247 #define vld1q_s8 LOAD_SI128 9248 9249 _NEON2SSESTORAGE int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 9250 #define vld1q_s16 LOAD_SI128 9251 9252 _NEON2SSESTORAGE int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 9253 #define vld1q_s32 LOAD_SI128 9254 9255 _NEON2SSESTORAGE int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 9256 #define vld1q_s64 LOAD_SI128 9257 9258 _NEON2SSESTORAGE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] 9259 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers 9260 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0] 9261 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); 9262 __m128 f2; 9263 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]); 9264 }*/ 9265 9266 _NEON2SSESTORAGE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 9267 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) 9268 { 9269 if( (((uintptr_t)(ptr)) & 15 ) == 0 ) //16 bits aligned 9270 return _mm_load_ps(ptr); 9271 else 9272 return _mm_loadu_ps(ptr); 9273 } 9274 9275 _NEON2SSESTORAGE poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 9276 #define vld1q_p8 LOAD_SI128 9277 9278 _NEON2SSESTORAGE poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 9279 #define vld1q_p16 LOAD_SI128 9280 9281 _NEON2SSESTORAGE uint8x8_t vld1_u8(__transfersize(8) uint8_t const * ptr); // VLD1.8 {d0}, [r0] 9282 #define vld1_u8(ptr) *((__m64_128*)(ptr)) //was _mm_loadl_epi64((__m128i*)(ptr)) 9283 9284 _NEON2SSESTORAGE uint16x4_t vld1_u16(__transfersize(4) uint16_t const * ptr); // VLD1.16 {d0}, [r0] 9285 #define vld1_u16 vld1_u8 9286 9287 _NEON2SSESTORAGE uint32x2_t vld1_u32(__transfersize(2) uint32_t const * ptr); // VLD1.32 {d0}, [r0] 9288 #define vld1_u32 vld1_u8 9289 9290 9291 _NEON2SSESTORAGE uint64x1_t vld1_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 9292 #define vld1_u64 vld1_u8 9293 9294 _NEON2SSESTORAGE int8x8_t vld1_s8(__transfersize(8) int8_t const * ptr); // VLD1.8 {d0}, [r0] 9295 #define vld1_s8 vld1_u8 9296 9297 _NEON2SSESTORAGE int16x4_t vld1_s16(__transfersize(4) int16_t const * ptr); // VLD1.16 {d0}, [r0] 9298 #define vld1_s16 vld1_u16 9299 9300 _NEON2SSESTORAGE int32x2_t vld1_s32(__transfersize(2) int32_t const * ptr); // VLD1.32 {d0}, [r0] 9301 #define vld1_s32 vld1_u32 9302 9303 _NEON2SSESTORAGE int64x1_t vld1_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 9304 #define vld1_s64 vld1_u64 9305 
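//A hedged usage sketch (illustration only; the helper name and pointer parameters are hypothetical and not part of this header):
//the q-form loads above expand to full 128-bit SSE loads via LOAD_SI128, while the d-form loads read 8 bytes into the __m64_128 emulation type.
/*
    static void load_example(const uint8_t * p16, const uint8_t * p8)
    {
        uint8x16_t q = vld1q_u8(p16); // 16 bytes, aligned or unaligned pointer handled by LOAD_SI128
        uint8x8_t  d = vld1_u8(p8);   // 8 bytes read through *((__m64_128*)(ptr))
        (void)q; (void)d;
    }
*/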
9306 _NEON2SSESTORAGE float16x4_t vld1_f16(__transfersize(4) __fp16 const * ptr); // VLD1.16 {d0}, [r0] 9307 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); 9308 9309 _NEON2SSESTORAGE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr); // VLD1.32 {d0}, [r0] 9310 _NEON2SSE_INLINE float32x2_t vld1_f32(__transfersize(2) float32_t const * ptr) 9311 { 9312 float32x2_t res; 9313 res.m64_f32[0] = *(ptr); 9314 res.m64_f32[1] = *(ptr + 1); 9315 return res; 9316 } 9317 9318 _NEON2SSESTORAGE poly8x8_t vld1_p8(__transfersize(8) poly8_t const * ptr); // VLD1.8 {d0}, [r0] 9319 #define vld1_p8 vld1_u8 9320 9321 _NEON2SSESTORAGE poly16x4_t vld1_p16(__transfersize(4) poly16_t const * ptr); // VLD1.16 {d0}, [r0] 9322 #define vld1_p16 vld1_u16 9323 9324 9325 _NEON2SSESTORAGE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 9326 _NEON2SSE_INLINE float64x2_t vld1q_f64(__transfersize(4) float64_t const * ptr) 9327 { 9328 if ((((uintptr_t)(ptr)) & 15) == 0) //16 bits aligned 9329 return _mm_load_pd(ptr); 9330 else 9331 return _mm_loadu_pd(ptr); 9332 } 9333 9334 9335 //*********************************************************************************************************** 9336 //******* Lane load functions - insert the data at vector's given position (lane) ************************* 9337 //*********************************************************************************************************** 9338 _NEON2SSESTORAGE uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] 9339 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) 9340 9341 _NEON2SSESTORAGE uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 9342 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) 9343 9344 _NEON2SSESTORAGE uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 9345 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) 9346 9347 _NEON2SSESTORAGE uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] 9348 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p; 9349 9350 9351 _NEON2SSESTORAGE int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] 9352 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) 9353 9354 _NEON2SSESTORAGE int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 9355 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) 9356 9357 _NEON2SSESTORAGE int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 9358 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) 9359 9360 _NEON2SSESTORAGE float16x8_t vld1q_lane_f16(__transfersize(1) __fp16 const * ptr, float16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 9361 //current IA SIMD doesn't support float16 9362 9363 _NEON2SSESTORAGE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, 
__constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 9364 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane) 9365 { 9366 //we need to deal with ptr 16bit NOT aligned case 9367 __m128 p; 9368 p = _mm_set1_ps(*(ptr)); 9369 return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane)); 9370 } 9371 9372 _NEON2SSESTORAGE int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] 9373 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane) 9374 9375 _NEON2SSESTORAGE poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] 9376 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) 9377 9378 _NEON2SSESTORAGE poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 9379 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) 9380 9381 _NEON2SSESTORAGE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] 9382 _NEON2SSE_INLINE uint8x8_t vld1_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x8_t vec, __constrange(0,7) int lane) 9383 { 9384 uint8x8_t res; 9385 res = vec; 9386 res.m64_u8[lane] = *(ptr); 9387 return res; 9388 } 9389 9390 _NEON2SSESTORAGE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] 9391 _NEON2SSE_INLINE uint16x4_t vld1_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x4_t vec, __constrange(0,3) int lane) 9392 { 9393 uint16x4_t res; 9394 res = vec; 9395 res.m64_u16[lane] = *(ptr); 9396 return res; 9397 } 9398 9399 _NEON2SSESTORAGE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] 9400 _NEON2SSE_INLINE uint32x2_t vld1_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x2_t vec, __constrange(0,1) int lane) 9401 { 9402 uint32x2_t res; 9403 res = vec; 9404 res.m64_u32[lane] = *(ptr); 9405 return res; 9406 } 9407 9408 _NEON2SSESTORAGE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] 9409 _NEON2SSE_INLINE uint64x1_t vld1_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x1_t vec, __constrange(0,0) int lane) 9410 { 9411 uint64x1_t res; 9412 res.m64_u64[0] = *(ptr); 9413 return res; 9414 } 9415 9416 9417 _NEON2SSESTORAGE int8x8_t vld1_lane_s8(__transfersize(1) int8_t const * ptr, int8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] 9418 #define vld1_lane_s8(ptr, vec, lane) vld1_lane_u8((uint8_t*)ptr, vec, lane) 9419 9420 _NEON2SSESTORAGE int16x4_t vld1_lane_s16(__transfersize(1) int16_t const * ptr, int16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] 9421 #define vld1_lane_s16(ptr, vec, lane) vld1_lane_u16((uint16_t*)ptr, vec, lane) 9422 9423 _NEON2SSESTORAGE int32x2_t vld1_lane_s32(__transfersize(1) int32_t const * ptr, int32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] 9424 #define vld1_lane_s32(ptr, vec, lane) vld1_lane_u32((uint32_t*)ptr, vec, lane) 9425 9426 _NEON2SSESTORAGE float16x4_t vld1_lane_f16(__transfersize(1) __fp16 const * ptr, float16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] 9427 //current IA SIMD doesn't support 
float16 9428 9429 _NEON2SSESTORAGE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane); // VLD1.32 {d0[0]}, [r0] 9430 _NEON2SSE_INLINE float32x2_t vld1_lane_f32(__transfersize(1) float32_t const * ptr, float32x2_t vec, __constrange(0,1) int lane) 9431 { 9432 float32x2_t res; 9433 res = vec; 9434 res.m64_f32[lane] = *(ptr); 9435 return res; 9436 } 9437 9438 _NEON2SSESTORAGE int64x1_t vld1_lane_s64(__transfersize(1) int64_t const * ptr, int64x1_t vec, __constrange(0,0) int lane); // VLD1.64 {d0}, [r0] 9439 #define vld1_lane_s64(ptr, vec, lane) vld1_lane_u64((uint64_t*)ptr, vec, lane) 9440 9441 _NEON2SSESTORAGE poly8x8_t vld1_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x8_t vec, __constrange(0,7) int lane); // VLD1.8 {d0[0]}, [r0] 9442 #define vld1_lane_p8 vld1_lane_u8 9443 9444 _NEON2SSESTORAGE poly16x4_t vld1_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x4_t vec, __constrange(0,3) int lane); // VLD1.16 {d0[0]}, [r0] 9445 #define vld1_lane_p16 vld1_lane_s16 9446 9447 // ****************** Load single value ( set all lanes of vector with same value from memory)********************** 9448 // ****************************************************************************************************************** 9449 _NEON2SSESTORAGE uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] 9450 #define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr)) 9451 9452 _NEON2SSESTORAGE uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] 9453 #define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr)) 9454 9455 _NEON2SSESTORAGE uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] 9456 #define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr)) 9457 9458 _NEON2SSESTORAGE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 9459 _NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr) 9460 { 9461 _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)}; 9462 return LOAD_SI128(val); 9463 } 9464 9465 _NEON2SSESTORAGE int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] 9466 #define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr)) 9467 9468 _NEON2SSESTORAGE int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] 9469 #define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr)) 9470 9471 _NEON2SSESTORAGE int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] 9472 #define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr)) 9473 9474 _NEON2SSESTORAGE int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 9475 #define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr) 9476 9477 _NEON2SSESTORAGE float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] 9478 //current IA SIMD doesn't support float16, need to go to 32 bits 9479 9480 _NEON2SSESTORAGE float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] 9481 #define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr)) 9482 9483 _NEON2SSESTORAGE poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] 9484 #define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr)) 9485 9486 _NEON2SSESTORAGE poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] 9487 #define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr)) 9488 9489 _NEON2SSESTORAGE uint8x8_t 
vld1_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] 9490 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vld1_dup_u8(__transfersize(1) uint8_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) 9491 { 9492 uint8x8_t res; 9493 int i; 9494 for(i = 0; i<8; i++) { 9495 res.m64_u8[i] = *(ptr); 9496 } 9497 return res; 9498 } 9499 9500 _NEON2SSESTORAGE uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] 9501 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vld1_dup_u16(__transfersize(1) uint16_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) 9502 { 9503 uint16x4_t res; 9504 int i; 9505 for(i = 0; i<4; i++) { 9506 res.m64_u16[i] = *(ptr); 9507 } 9508 return res; 9509 } 9510 9511 _NEON2SSESTORAGE uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] 9512 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vld1_dup_u32(__transfersize(1) uint32_t const * ptr), _NEON2SSE_REASON_SLOW_SERIAL) 9513 { 9514 uint32x2_t res; 9515 res.m64_u32[0] = *(ptr); 9516 res.m64_u32[1] = *(ptr); 9517 return res; 9518 } 9519 9520 _NEON2SSESTORAGE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 9521 _NEON2SSE_INLINE uint64x1_t vld1_dup_u64(__transfersize(1) uint64_t const * ptr) 9522 { 9523 uint64x1_t res; 9524 res.m64_u64[0] = *(ptr); 9525 return res; 9526 } 9527 9528 _NEON2SSESTORAGE int8x8_t vld1_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] 9529 #define vld1_dup_s8(ptr) vld1_dup_u8((uint8_t*)ptr) 9530 9531 9532 _NEON2SSESTORAGE int16x4_t vld1_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] 9533 #define vld1_dup_s16(ptr) vld1_dup_u16((uint16_t*)ptr) 9534 9535 9536 _NEON2SSESTORAGE int32x2_t vld1_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] 9537 #define vld1_dup_s32(ptr) vld1_dup_u32((uint32_t*)ptr) 9538 9539 9540 _NEON2SSESTORAGE int64x1_t vld1_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 9541 #define vld1_dup_s64(ptr) vld1_dup_u64((uint64_t*)ptr) 9542 9543 _NEON2SSESTORAGE float16x4_t vld1_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] 9544 //current IA SIMD doesn't support float16 9545 9546 _NEON2SSESTORAGE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] 9547 _NEON2SSE_INLINE float32x2_t vld1_dup_f32(__transfersize(1) float32_t const * ptr) 9548 { 9549 float32x2_t res; 9550 res.m64_f32[0] = *(ptr); 9551 res.m64_f32[1] = res.m64_f32[0]; 9552 return res; // use last 64bits only 9553 } 9554 9555 _NEON2SSESTORAGE poly8x8_t vld1_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] 9556 #define vld1_dup_p8 vld1_dup_u8 9557 9558 9559 _NEON2SSESTORAGE poly16x4_t vld1_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] 9560 #define vld1_dup_p16 vld1_dup_u16 9561 9562 9563 //************************************************************************************* 9564 //********************************* Store ********************************************** 9565 //************************************************************************************* 9566 // If ptr is 16bit aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val); 9567 //here we assume the case of NOT 16bit aligned ptr possible. 
If it is aligned we could to use _mm_store_si128 like shown in the following macro 9568 #define STORE_SI128(ptr, val) \ 9569 (((uintptr_t)(ptr) & 15) == 0 ) ? _mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); 9570 9571 _NEON2SSESTORAGE void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] 9572 #define vst1q_u8 STORE_SI128 9573 9574 _NEON2SSESTORAGE void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] 9575 #define vst1q_u16 STORE_SI128 9576 9577 _NEON2SSESTORAGE void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] 9578 #define vst1q_u32 STORE_SI128 9579 9580 _NEON2SSESTORAGE void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] 9581 #define vst1q_u64 STORE_SI128 9582 9583 _NEON2SSESTORAGE void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] 9584 #define vst1q_s8 STORE_SI128 9585 9586 _NEON2SSESTORAGE void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] 9587 #define vst1q_s16 STORE_SI128 9588 9589 _NEON2SSESTORAGE void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] 9590 #define vst1q_s32 STORE_SI128 9591 9592 _NEON2SSESTORAGE void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] 9593 #define vst1q_s64 STORE_SI128 9594 9595 _NEON2SSESTORAGE void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] 9596 // IA32 SIMD doesn't work with 16bit floats currently 9597 9598 _NEON2SSESTORAGE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] 9599 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val) 9600 { 9601 if( ((uintptr_t)(ptr) & 15) == 0 ) //16 bits aligned 9602 _mm_store_ps (ptr, val); 9603 else 9604 _mm_storeu_ps (ptr, val); 9605 } 9606 9607 _NEON2SSESTORAGE void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] 9608 #define vst1q_p8 vst1q_u8 9609 9610 _NEON2SSESTORAGE void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] 9611 #define vst1q_p16 vst1q_u16 9612 9613 _NEON2SSESTORAGE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val); // VST1.8 {d0}, [r0] 9614 _NEON2SSE_INLINE void vst1_u8(__transfersize(8) uint8_t * ptr, uint8x8_t val) 9615 { 9616 int i; 9617 for (i = 0; i<8; i++) { 9618 *(ptr + i) = ((uint8_t*)&val)[i]; 9619 } 9620 //_mm_storel_epi64((__m128i*)ptr, val); 9621 return; 9622 } 9623 9624 _NEON2SSESTORAGE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val); // VST1.16 {d0}, [r0] 9625 _NEON2SSE_INLINE void vst1_u16(__transfersize(4) uint16_t * ptr, uint16x4_t val) 9626 { 9627 int i; 9628 for (i = 0; i<4; i++) { 9629 *(ptr + i) = ((uint16_t*)&val)[i]; 9630 } 9631 //_mm_storel_epi64((__m128i*)ptr, val); 9632 return; 9633 } 9634 9635 _NEON2SSESTORAGE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val); // VST1.32 {d0}, [r0] 9636 _NEON2SSE_INLINE void vst1_u32(__transfersize(2) uint32_t * ptr, uint32x2_t val) 9637 { 9638 int i; 9639 for (i = 0; i<2; i++) { 9640 *(ptr + i) = ((uint32_t*)&val)[i]; 9641 } 9642 //_mm_storel_epi64((__m128i*)ptr, val); 9643 return; 9644 } 9645 9646 _NEON2SSESTORAGE void vst1_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val); // VST1.64 {d0}, [r0] 9647 _NEON2SSE_INLINE void vst1_u64(__transfersize(1) uint64_t * ptr, 
uint64x1_t val) 9648 { 9649 *(ptr) = *((uint64_t*)&val); 9650 //_mm_storel_epi64((__m128i*)ptr, val); 9651 return; 9652 } 9653 9654 _NEON2SSESTORAGE void vst1_s8(__transfersize(8) int8_t * ptr, int8x8_t val); // VST1.8 {d0}, [r0] 9655 #define vst1_s8(ptr,val) vst1_u8((uint8_t*)ptr,val) 9656 9657 _NEON2SSESTORAGE void vst1_s16(__transfersize(4) int16_t * ptr, int16x4_t val); // VST1.16 {d0}, [r0] 9658 #define vst1_s16(ptr,val) vst1_u16((uint16_t*)ptr,val) 9659 9660 _NEON2SSESTORAGE void vst1_s32(__transfersize(2) int32_t * ptr, int32x2_t val); // VST1.32 {d0}, [r0] 9661 #define vst1_s32(ptr,val) vst1_u32((uint32_t*)ptr,val) 9662 9663 _NEON2SSESTORAGE void vst1_s64(__transfersize(1) int64_t * ptr, int64x1_t val); // VST1.64 {d0}, [r0] 9664 #define vst1_s64(ptr,val) vst1_u64((uint64_t*)ptr,val) 9665 9666 _NEON2SSESTORAGE void vst1_f16(__transfersize(4) __fp16 * ptr, float16x4_t val); // VST1.16 {d0}, [r0] 9667 //current IA SIMD doesn't support float16 9668 9669 _NEON2SSESTORAGE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val); // VST1.32 {d0}, [r0] 9670 _NEON2SSE_INLINE void vst1_f32(__transfersize(2) float32_t * ptr, float32x2_t val) 9671 { 9672 *(ptr) = val.m64_f32[0]; 9673 *(ptr + 1) = val.m64_f32[1]; 9674 return; 9675 } 9676 9677 _NEON2SSESTORAGE void vst1_p8(__transfersize(8) poly8_t * ptr, poly8x8_t val); // VST1.8 {d0}, [r0] 9678 #define vst1_p8 vst1_u8 9679 9680 _NEON2SSESTORAGE void vst1_p16(__transfersize(4) poly16_t * ptr, poly16x4_t val); // VST1.16 {d0}, [r0] 9681 #define vst1_p16 vst1_u16 9682 9683 //***********Store a lane of a vector into memory (extract given lane) ********************* 9684 //****************************************************************************************** 9685 _NEON2SSESTORAGE void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] 9686 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = (uint8_t) _MM_EXTRACT_EPI8 (val, lane) 9687 9688 _NEON2SSESTORAGE void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 9689 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = (uint16_t) _MM_EXTRACT_EPI16 (val, lane) 9690 9691 _NEON2SSESTORAGE void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] 9692 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = (uint32_t) _MM_EXTRACT_EPI32 (val, lane) 9693 9694 _NEON2SSESTORAGE void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] 9695 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = (uint64_t) _MM_EXTRACT_EPI64 (val, lane) 9696 9697 _NEON2SSESTORAGE void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] 9698 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = (int8_t) _MM_EXTRACT_EPI8 (val, lane) 9699 9700 _NEON2SSESTORAGE void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 9701 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = (int16_t) _MM_EXTRACT_EPI16 (val, lane) 9702 9703 _NEON2SSESTORAGE void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] 9704 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) 9705 9706 _NEON2SSESTORAGE void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // 
VST1.64 {d0}, [r0] 9707 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) 9708 9709 _NEON2SSESTORAGE void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 9710 //current IA SIMD doesn't support float16 9711 9712 _NEON2SSESTORAGE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] 9713 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane) 9714 { 9715 int32_t ilane; 9716 ilane = _MM_EXTRACT_PS(val,lane); 9717 *(ptr) = *((float*)&ilane); 9718 } 9719 9720 _NEON2SSESTORAGE void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] 9721 #define vst1q_lane_p8 vst1q_lane_u8 9722 9723 _NEON2SSESTORAGE void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 9724 #define vst1q_lane_p16 vst1q_lane_s16 9725 9726 _NEON2SSESTORAGE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] 9727 _NEON2SSE_INLINE void vst1_lane_u8(__transfersize(1) uint8_t * ptr, uint8x8_t val, __constrange(0,7) int lane) 9728 { 9729 *(ptr) = val.m64_u8[lane]; 9730 } 9731 9732 _NEON2SSESTORAGE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] 9733 _NEON2SSE_INLINE void vst1_lane_u16(__transfersize(1) uint16_t * ptr, uint16x4_t val, __constrange(0,3) int lane) 9734 { 9735 *(ptr) = val.m64_u16[lane]; 9736 } 9737 9738 _NEON2SSESTORAGE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] 9739 _NEON2SSE_INLINE void vst1_lane_u32(__transfersize(1) uint32_t * ptr, uint32x2_t val, __constrange(0,1) int lane) 9740 { 9741 *(ptr) = val.m64_u32[lane]; 9742 } 9743 9744 _NEON2SSESTORAGE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] 9745 _NEON2SSE_INLINE void vst1_lane_u64(__transfersize(1) uint64_t * ptr, uint64x1_t val, __constrange(0,0) int lane) 9746 { 9747 *(ptr) = val.m64_u64[0]; 9748 } 9749 9750 _NEON2SSESTORAGE void vst1_lane_s8(__transfersize(1) int8_t * ptr, int8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] 9751 #define vst1_lane_s8(ptr, val, lane) vst1_lane_u8((uint8_t*)ptr, val, lane) 9752 9753 _NEON2SSESTORAGE void vst1_lane_s16(__transfersize(1) int16_t * ptr, int16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] 9754 #define vst1_lane_s16(ptr, val, lane) vst1_lane_u16((uint16_t*)ptr, val, lane) 9755 9756 _NEON2SSESTORAGE void vst1_lane_s32(__transfersize(1) int32_t * ptr, int32x2_t val, __constrange(0,1) int lane); // VST1.32 {d0[0]}, [r0] 9757 #define vst1_lane_s32(ptr, val, lane) vst1_lane_u32((uint32_t*)ptr, val, lane) 9758 9759 9760 _NEON2SSESTORAGE void vst1_lane_s64(__transfersize(1) int64_t * ptr, int64x1_t val, __constrange(0,0) int lane); // VST1.64 {d0}, [r0] 9761 #define vst1_lane_s64(ptr, val, lane) vst1_lane_u64((uint64_t*)ptr, val, lane) 9762 9763 9764 _NEON2SSESTORAGE void vst1_lane_f16(__transfersize(1) __fp16 * ptr, float16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] 9765 //current IA SIMD doesn't support float16 9766 9767 _NEON2SSESTORAGE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) 
int lane); // VST1.32 {d0[0]}, [r0] 9768 _NEON2SSE_INLINE void vst1_lane_f32(__transfersize(1) float32_t * ptr, float32x2_t val, __constrange(0,1) int lane) 9769 { 9770 *(ptr) = val.m64_f32[lane]; 9771 } 9772 9773 _NEON2SSESTORAGE void vst1_lane_p8(__transfersize(1) poly8_t * ptr, poly8x8_t val, __constrange(0,7) int lane); // VST1.8 {d0[0]}, [r0] 9774 #define vst1_lane_p8 vst1_lane_u8 9775 9776 _NEON2SSESTORAGE void vst1_lane_p16(__transfersize(1) poly16_t * ptr, poly16x4_t val, __constrange(0,3) int lane); // VST1.16 {d0[0]}, [r0] 9777 #define vst1_lane_p16 vst1_lane_s16 9778 9779 //*********************************************************************************************** 9780 //**************** Loads and stores of an N-element structure ********************************** 9781 //*********************************************************************************************** 9782 //These intrinsics load or store an n-element structure. The array structures are defined in the beginning 9783 //We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions" 9784 //****************** 2 elements load ********************************************* 9785 _NEON2SSESTORAGE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 9786 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0] 9787 { 9788 uint8x16x2_t v; 9789 v.val[0] = vld1q_u8(ptr); 9790 v.val[1] = vld1q_u8((ptr + 16)); 9791 v = vuzpq_s8(v.val[0], v.val[1]); 9792 return v; 9793 } 9794 9795 _NEON2SSESTORAGE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 9796 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0] 9797 { 9798 uint16x8x2_t v; 9799 v.val[0] = vld1q_u16( ptr); 9800 v.val[1] = vld1q_u16( (ptr + 8)); 9801 v = vuzpq_s16(v.val[0], v.val[1]); 9802 return v; 9803 } 9804 9805 _NEON2SSESTORAGE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 9806 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0] 9807 { 9808 uint32x4x2_t v; 9809 v.val[0] = vld1q_u32 ( ptr); 9810 v.val[1] = vld1q_u32 ( (ptr + 4)); 9811 v = vuzpq_s32(v.val[0], v.val[1]); 9812 return v; 9813 } 9814 9815 _NEON2SSESTORAGE int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); 9816 #define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr) 9817 9818 _NEON2SSESTORAGE int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 9819 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr) 9820 9821 _NEON2SSESTORAGE int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 9822 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr) 9823 9824 9825 _NEON2SSESTORAGE float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] 9826 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 9827 9828 _NEON2SSESTORAGE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 9829 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0] 9830 { 9831 float32x4x2_t v; 9832 v.val[0] = vld1q_f32 (ptr); 9833 v.val[1] = vld1q_f32 ((ptr + 4)); 9834 v = vuzpq_f32(v.val[0], v.val[1]); 9835 return v; 9836 } 9837 9838 _NEON2SSESTORAGE poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 9839 #define vld2q_p8 vld2q_u8 9840 9841 _NEON2SSESTORAGE poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 9842 #define vld2q_p16 vld2q_u16 9843 9844 _NEON2SSESTORAGE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 9845 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr) 9846 { 9847 uint8x8x2_t v; 9848 __m128i ld128; 9849 ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit 9850 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_16_even_odd); 9851 vst1q_u8((v.val), ld128); // v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 9852 return v; 9853 } 9854 9855 _NEON2SSESTORAGE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 9856 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr) 9857 { 9858 _NEON2SSE_ALIGN_16 uint16x4x2_t v; 9859 __m128i ld128; 9860 ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit 9861 ld128 = _mm_shuffle_epi8(ld128, *(__m128i*) mask8_32_even_odd); 9862 vst1q_u16((v.val), ld128); 9863 return v; 9864 } 9865 9866 _NEON2SSESTORAGE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 9867 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr) 9868 { 9869 _NEON2SSE_ALIGN_16 uint32x2x2_t v; 9870 __m128i ld128; 9871 ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit 9872 ld128 = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); 9873 vst1q_u32((v.val), ld128); 9874 return v; 9875 } 9876 9877 _NEON2SSESTORAGE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 9878 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr) 9879 { 9880 uint64x1x2_t v; 9881 v.val[0].m64_u64[0] = *(ptr); 9882 v.val[1].m64_u64[0] = *(ptr + 1); 9883 return v; 9884 } 9885 9886 _NEON2SSESTORAGE int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 9887 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr) 9888 9889 _NEON2SSESTORAGE int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 9890 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr) 9891 9892 _NEON2SSESTORAGE int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 9893 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr) 9894 9895 _NEON2SSESTORAGE int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 9896 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr) 9897 9898 _NEON2SSESTORAGE float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] 9899 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1_f16 for example 9900 9901 _NEON2SSESTORAGE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 9902 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr) 9903 { 9904 float32x2x2_t v; 9905 v.val[0].m64_f32[0] = *(ptr); 9906 v.val[0].m64_f32[1] = *(ptr + 2); 9907 v.val[1].m64_f32[0] = *(ptr + 1); 9908 v.val[1].m64_f32[1] = *(ptr + 3); 9909 return v; 9910 } 9911 9912 _NEON2SSESTORAGE poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 9913 #define vld2_p8 vld2_u8 9914 9915 _NEON2SSESTORAGE poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 9916 #define vld2_p16 vld2_u16 9917 9918 //******************** Triplets *************************************** 9919 //********************************************************************* 9920 _NEON2SSESTORAGE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 9921 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0] 9922 { 9923 //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 -> 9924 //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13 9925 //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14, 9926 //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15 9927 uint8x16x3_t v; 9928 __m128i tmp0, tmp1,tmp2, tmp3; 9929 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; 9930 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13}; 9931 _NEON2SSE_ALIGN_16 static const int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15}; 9932 9933 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15 9934 v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15 9935 v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15 9936 9937 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11 9938 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13 9939 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15 9940 9941 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15 9942 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x 9943 tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, 9944 tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 9945 v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, 9946 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13, 9947 9948 tmp3 = _mm_slli_si128(tmp0, 5); //0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, 9949 tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 9950 v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 9951 v.val[1] = _mm_slli_si128(v.val[1], 5); //0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, 9952 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13, 9953 v.val[1] = _mm_slli_si128(v.val[1],5); //0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, 9954 v.val[1] = _mm_srli_si128(v.val[1], 5); //a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 9955 tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 9956 tmp3 = _mm_slli_si128(tmp3,11); //0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, 9957 v.val[1] = _mm_or_si128(v.val[1],tmp3); 
//a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14, 9958 9959 tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, 9960 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, 9961 v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 9962 v.val[2] = _mm_slli_si128(v.val[2],5); //0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 9963 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, 9964 tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, 9965 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15, 9966 return v; 9967 } 9968 9969 _NEON2SSESTORAGE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 9970 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0] 9971 { 9972 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 9973 uint16x8x3_t v; 9974 __m128i tmp0, tmp1,tmp2, tmp3; 9975 _NEON2SSE_ALIGN_16 static const int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; 9976 _NEON2SSE_ALIGN_16 static const int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13}; 9977 _NEON2SSE_ALIGN_16 static const int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15}; 9978 9979 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7, 9980 v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7 9981 v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7 9982 9983 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5, 9984 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6 9985 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7 9986 9987 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6, 9988 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x 9989 tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7 9990 tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0 9991 v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5, 9992 v.val[0] = _mm_or_si128(v.val[0],tmp3); //a0,a3,a6,b1,b4,b7,c2,c5 9993 9994 tmp3 = _mm_slli_si128(tmp0, 4); //0,0,a0,a3,a6,a1,a4,a7 9995 tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0 9996 v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0 9997 v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6, 9998 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,b0,b3,b6, 9999 v.val[1] = _mm_slli_si128(v.val[1],6); //0,0,0,a1,a4,a7,b2,b5, 10000 v.val[1] = _mm_srli_si128(v.val[1], 6); //a1,a4,a7,b2,b5,0,0,0, 10001 tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0 10002 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6, 10003 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6, 10004 10005 tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0 10006 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7, 10007 v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0 10008 v.val[2] = _mm_slli_si128(v.val[2],4); //0,0, b0,b3,b6,0,0,0 10009 v.val[2] = _mm_or_si128(v.val[2],tmp3); //0,0, b0,b3,b6,c1,c4,c7, 10010 tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0 10011 v.val[2] = _mm_or_si128(v.val[2],tmp0); //a2,a5,b0,b3,b6,c1,c4,c7, 10012 return v; 10013 } 10014 10015 _NEON2SSESTORAGE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 10016 _NEON2SSE_INLINE uint32x4x3_t 
vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] 10017 { 10018 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, 10019 uint32x4x3_t v; 10020 __m128i tmp0, tmp1,tmp2, tmp3; 10021 v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3, 10022 v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3 10023 v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3, 10024 10025 tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2 10026 tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1 10027 tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3 10028 10029 tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2 10030 v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1 10031 tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1 10032 v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0, 10033 v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2 10034 v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3 10035 return v; 10036 } 10037 10038 _NEON2SSESTORAGE int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 10039 #define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr)) 10040 10041 _NEON2SSESTORAGE int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 10042 #define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr)) 10043 10044 _NEON2SSESTORAGE int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 10045 #define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr)) 10046 10047 _NEON2SSESTORAGE float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 10048 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 10049 10050 _NEON2SSESTORAGE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 10051 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] 10052 { 10053 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, 10054 float32x4x3_t v; 10055 __m128 tmp0, tmp1,tmp2, tmp3; 10056 v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3, 10057 v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3 10058 v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3, 10059 10060 tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2 10061 tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1 10062 tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3 10063 tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2 10064 10065 v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1 10066 tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1 10067 v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0, 10068 v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2 10069 v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3 10070 return v; 10071 } 10072 10073 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 10074 #define vld3q_p8 vld3q_u8 10075 10076 _NEON2SSESTORAGE poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 10077 #define vld3q_p16 vld3q_u16 10078 10079 _NEON2SSESTORAGE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 10080 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) // VLD3.8 {d0, d1, d2}, [r0] 10081 { 10082 //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 10083 uint8x8x3_t v; 10084 __m128i val0, val1, val2, tmp0, tmp1; 10085 _NEON2SSE_ALIGN_16 static const int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14}; 10086 _NEON2SSE_ALIGN_16 static const int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0}; 10087 val0 = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7 10088 val2 = _mm_loadl_epi64((__m128i*)(ptr + 16)); //c0,c1,c2,c3,...c7 10089 10090 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6, 10091 tmp1 = _mm_shuffle_epi8(val2, *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x 10092 val0 = _mm_slli_si128(tmp0,10); 10093 val0 = _mm_srli_si128(val0,10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0 10094 val2 = _mm_slli_si128(tmp1,6); //0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x 10095 val0 = _mm_or_si128(val0,val2); //a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x 10096 _M64(v.val[0], val0); 10097 val1 = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5, 10098 val1 = _mm_srli_si128(val1,11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0, 10099 val2 = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0 10100 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0 10101 val1 = _mm_or_si128(val1,val2); //a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x 10102 _M64(v.val[1], val1); 10103 10104 tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0, 10105 val2 = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0 
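//assemble the third de-interleaved vector below: c1,c4,c7 is shifted up to bytes 5..7 and OR-ed with a2,a5,b0,b3,b6 kept in the low bytes of tmp0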
10106 val2 = _mm_slli_si128(val2,5); //0,0,0,0,0,c1,c4,c7, 10107 val2 = _mm_or_si128(tmp0, val2); //a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x 10108 _M64(v.val[2], val2); 10109 return v; 10110 } 10111 10112 _NEON2SSESTORAGE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 10113 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0] 10114 { 10115 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, 10116 uint16x4x3_t v; 10117 __m128i val0, val1, val2, tmp0, tmp1; 10118 _NEON2SSE_ALIGN_16 static const int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; 10119 val0 = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3 10120 val2 = _mm_loadl_epi64((__m128i*)(ptr + 8)); //c0,c1,c2,c3, x,x,x,x 10121 10122 tmp0 = _mm_shuffle_epi8(val0, *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1 10123 tmp1 = _mm_shufflelo_epi16(val2, 201); //11 00 10 01 : c1, c2, c0, c3, 10124 val0 = _mm_slli_si128(tmp0,10); 10125 val0 = _mm_srli_si128(val0,10); //a0, a3, b2, 0,0, 0,0, 10126 val2 = _mm_slli_si128(tmp1,14); //0,0,0,0,0,0,0,c1 10127 val2 = _mm_srli_si128(val2,8); //0,0,0,c1,0,0,0,0 10128 val0 = _mm_or_si128(val0,val2); //a0, a3, b2, c1, x,x,x,x 10129 _M64(v.val[0], val0); 10130 10131 val1 = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3 10132 val1 = _mm_srli_si128(val1,10); //a1, b0, b3, 0,0, 0,0, 10133 val2 = _mm_srli_si128(tmp1,2); //c2, 0,0,0,0,0,0,0, 10134 val2 = _mm_slli_si128(val2,6); //0,0,0,c2,0,0,0,0 10135 val1 = _mm_or_si128(val1,val2); //a1, b0, b3, c2, x,x,x,x 10136 _M64(v.val[1], val1); 10137 10138 tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0 10139 tmp1 = _mm_srli_si128(tmp1,4); 10140 tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3, 10141 val2 = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3, 10142 _M64(v.val[2], val2); 10143 return v; 10144 } 10145 10146 _NEON2SSESTORAGE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 10147 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0] 10148 { 10149 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 10150 uint32x2x3_t v; 10151 __m128i val0, val1, val2; 10152 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, 10153 val2 = _mm_loadl_epi64((__m128i*) (ptr + 4)); //c0,c1, x,x 10154 10155 val0 = _mm_shuffle_epi32(val0, 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0 10156 _M64(v.val[0], val0); 10157 val2 = _mm_slli_si128(val2, 8); //x, x,c0,c1, 10158 val1 = _mm_unpackhi_epi32(val0,val2); //a1,c0, b0, c1 10159 _M64(v.val[1], val1); 10160 val2 = _mm_srli_si128(val1, 8); //b0, c1, x, x, 10161 _M64(v.val[2], val2); 10162 return v; 10163 } 10164 _NEON2SSESTORAGE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 10165 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] 10166 { 10167 uint64x1x3_t v; 10168 v.val[0].m64_u64[0] = *(ptr); 10169 v.val[1].m64_u64[0] = *(ptr + 1); 10170 v.val[2].m64_u64[0] = *(ptr + 2); 10171 return v; 10172 } 10173 10174 _NEON2SSESTORAGE int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 10175 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr) 10176 10177 _NEON2SSESTORAGE int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 10178 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr) 10179 10180 _NEON2SSESTORAGE int32x2x3_t vld3_s32(__transfersize(6) int32_t 
const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 10181 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr) 10182 10183 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 10184 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr) 10185 10186 _NEON2SSESTORAGE float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 10187 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 10188 10189 _NEON2SSESTORAGE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 10190 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr) 10191 { 10192 //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 10193 float32x2x3_t v; 10194 v.val[0].m64_f32[0] = *(ptr); 10195 v.val[0].m64_f32[1] = *(ptr + 3); 10196 10197 v.val[1].m64_f32[0] = *(ptr + 1); 10198 v.val[1].m64_f32[1] = *(ptr + 4); 10199 10200 v.val[2].m64_f32[0] = *(ptr + 2); 10201 v.val[2].m64_f32[1] = *(ptr + 5); 10202 return v; 10203 } 10204 10205 _NEON2SSESTORAGE poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 10206 #define vld3_p8 vld3_u8 10207 10208 _NEON2SSESTORAGE poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 10209 #define vld3_p16 vld3_u16 10210 10211 //*************** Quadruples load ******************************** 10212 //***************************************************************** 10213 _NEON2SSESTORAGE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 10214 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0] 10215 { 10216 uint8x16x4_t v; 10217 __m128i tmp3, tmp2, tmp1, tmp0; 10218 10219 v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15 10220 v.val[1] = vld1q_u8 ( (ptr + 16)); //b0, b1,b2,...b7.... b15 10221 v.val[2] = vld1q_u8 ( (ptr + 32)); //c0, c1,c2,...c7....c15 10222 v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15 10223 10224 tmp0 = _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 10225 tmp1 = _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... 
c7,d7 10226 tmp2 = _mm_unpackhi_epi8(v.val[0],v.val[1]); //a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 10227 tmp3 = _mm_unpackhi_epi8(v.val[2],v.val[3]); //c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 10228 10229 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 10230 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 10231 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 10232 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 10233 10234 tmp0 = _mm_unpacklo_epi32(v.val[0], v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9 10235 tmp1 = _mm_unpackhi_epi32(v.val[0], v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11 10236 tmp2 = _mm_unpacklo_epi32(v.val[1], v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13, 10237 tmp3 = _mm_unpackhi_epi32(v.val[1], v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15 10238 10239 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12 10240 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13 10241 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14 10242 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15 10243 return v; 10244 } 10245 10246 _NEON2SSESTORAGE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 10247 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0] 10248 { 10249 uint16x8x4_t v; 10250 __m128i tmp3, tmp2, tmp1, tmp0; 10251 tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7 10252 tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7 10253 tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7 10254 tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7 10255 v.val[0] = _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3, 10256 v.val[1] = _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3, 10257 v.val[2] = _mm_unpackhi_epi16(tmp0,tmp1); //a4,b4, a5,b5, a6,b6, a7,b7 10258 v.val[3] = _mm_unpackhi_epi16(tmp2,tmp3); //c4,d4, c5,d5, c6,d6, c7,d7 10259 tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]); //a0,a4, b0,b4, a1,a5, b1,b5 10260 tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7 10261 tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5 10262 tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]); //c2,c6, d2,d6, c3,c7, d3,d7 10263 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4, 10264 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5 10265 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6, 10266 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7 10267 return v; 10268 } 10269 10270 _NEON2SSESTORAGE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 10271 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] 10272 { 10273 uint32x4x4_t v; 10274 __m128i tmp3, tmp2, tmp1, tmp0; 10275 v.val[0] = vld1q_u32 (ptr); 10276 v.val[1] = vld1q_u32 ((ptr + 4)); 10277 v.val[2] = vld1q_u32 ((ptr + 8)); 10278 v.val[3] = vld1q_u32 ((ptr + 12)); 10279 tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]); 10280 tmp1 
= _mm_unpacklo_epi32(v.val[2],v.val[3]); 10281 tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]); 10282 tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]); 10283 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1); 10284 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1); 10285 v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3); 10286 v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3); 10287 return v; 10288 } 10289 10290 _NEON2SSESTORAGE int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 10291 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr) 10292 10293 _NEON2SSESTORAGE int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 10294 #define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr) 10295 10296 _NEON2SSESTORAGE int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 10297 #define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr) 10298 10299 _NEON2SSESTORAGE float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 10300 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 10301 10302 _NEON2SSESTORAGE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 10303 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] 10304 { 10305 float32x4x4_t v; 10306 __m128 tmp3, tmp2, tmp1, tmp0; 10307 10308 v.val[0] = vld1q_f32 ((float*) ptr); 10309 v.val[1] = vld1q_f32 ((float*) (ptr + 4)); 10310 v.val[2] = vld1q_f32 ((float*) (ptr + 8)); 10311 v.val[3] = vld1q_f32 ((float*) (ptr + 12)); 10312 tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]); 10313 tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]); 10314 tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]); 10315 tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]); 10316 v.val[0] = _mm_movelh_ps(tmp0, tmp2); 10317 v.val[1] = _mm_movehl_ps(tmp2, tmp0); 10318 v.val[2] = _mm_movelh_ps(tmp1, tmp3); 10319 v.val[3] = _mm_movehl_ps(tmp3, tmp1); 10320 return v; 10321 } 10322 10323 _NEON2SSESTORAGE poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 10324 #define vld4q_p8 vld4q_u8 10325 10326 _NEON2SSESTORAGE poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 10327 #define vld4q_p16 vld4q_s16 10328 10329 _NEON2SSESTORAGE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 10330 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0] 10331 { 10332 uint8x8x4_t v; 10333 __m128i sh0, sh1; 10334 __m128i val0, val2; 10335 _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; 10336 10337 val0 = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1] 10338 val2 = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3] 10339 10340 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_8); 10341 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_8); 10342 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29 10343 vst1q_u8(&v.val[0], val0 ); 10344 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31 10345 vst1q_u8(&v.val[2], val2 ); 10346 return v; 10347 } 10348 10349 _NEON2SSESTORAGE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 10350 _NEON2SSE_INLINE uint16x4x4_t 
vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0] 10351 { 10352 uint16x4x4_t v; 10353 __m128i sh0, sh1; 10354 __m128i val0, val2; 10355 _NEON2SSE_ALIGN_16 static const int8_t mask4_16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7 10356 val0 = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1] 10357 val2 = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3] 10358 sh0 = _mm_shuffle_epi8(val0, *(__m128i*)mask4_16); 10359 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask4_16); 10360 val0 = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13 10361 vst1q_u16(&v.val[0], val0 ); 10362 val2 = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15 10363 vst1q_u16(&v.val[2], val2 ); 10364 return v; 10365 } 10366 10367 _NEON2SSESTORAGE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 10368 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr) 10369 { 10370 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 10371 uint32x2x4_t v; 10372 __m128i val0, val01, val2; 10373 val0 = vld1q_u32 (ptr); //a0,a1, b0,b1, 10374 val2 = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1 10375 val01 = _mm_unpacklo_epi32(val0,val2); //a0, c0, a1,c1, 10376 val2 = _mm_unpackhi_epi32(val0,val2); //b0,d0, b1, d1 10377 vst1q_u32(&v.val[0], val01); 10378 vst1q_u32(&v.val[2], val2 ); 10379 return v; 10380 } 10381 10382 _NEON2SSESTORAGE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 10383 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] 10384 { 10385 uint64x1x4_t v; 10386 v.val[0].m64_u64[0] = *(ptr); //load first 64-bits in val[0] and val[1] 10387 v.val[1].m64_u64[0] = *(ptr + 1); //load first 64-bits in val[0] and val[1] 10388 v.val[2].m64_u64[0] = *(ptr + 2); //load third and forth 64-bits in val[2], val[3] 10389 v.val[3].m64_u64[0] = *(ptr + 3); //load third and forth 64-bits in val[2], val[3] 10390 return v; 10391 } 10392 10393 _NEON2SSESTORAGE int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 10394 #define vld4_s8(ptr) vld4_u8((uint8_t*)ptr) 10395 10396 _NEON2SSESTORAGE int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 10397 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr) 10398 10399 _NEON2SSESTORAGE int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 10400 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr) 10401 10402 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 10403 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr) 10404 10405 _NEON2SSESTORAGE float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 10406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 10407 10408 _NEON2SSESTORAGE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 10409 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0] 10410 { 10411 //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 10412 float32x2x4_t res; 10413 res.val[0].m64_f32[0] = *(ptr); 10414 res.val[0].m64_f32[1] = *(ptr + 4); 10415 res.val[1].m64_f32[0] = *(ptr + 1); 10416 res.val[1].m64_f32[1] = *(ptr + 5); 10417 res.val[2].m64_f32[0] = *(ptr + 2); 10418 res.val[2].m64_f32[1] = *(ptr + 6); 10419 res.val[3].m64_f32[0] = *(ptr + 3); 10420 res.val[3].m64_f32[1] = *(ptr + 7); 10421 return res; 10422 } 10423 10424 _NEON2SSESTORAGE poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 10425 #define vld4_p8 vld4_u8 10426 10427 _NEON2SSESTORAGE poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 10428 #define vld4_p16 vld4_u16 10429 10430 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes ******************* 10431 //******************************************************************************************************************* 10432 _NEON2SSESTORAGE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 10433 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0] 10434 { 10435 uint8x8x2_t v; 10436 __m128i val0, val1; 10437 val0 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x 10438 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x, 10439 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x 10440 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, 10441 vst1q_u8(v.val, val0); 10442 return v; 10443 } 10444 10445 _NEON2SSESTORAGE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 10446 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0] 10447 { 10448 uint16x4x2_t v; 10449 __m128i val0, val1; 10450 val1 = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x 10451 val0 = _mm_shufflelo_epi16(val1, 0); //00 00 00 00 (all 0) 10452 _M64(v.val[0], val0); 10453 val1 = _mm_shufflelo_epi16(val1, 85); //01 01 01 01 (all 1) 10454 _M64(v.val[1], val1); 10455 return v; 10456 } 10457 10458 _NEON2SSESTORAGE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 10459 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] 10460 { 10461 uint32x2x2_t v; 10462 __m128i val0; 10463 val0 = LOAD_SI128(ptr); //0,1,x,x 10464 val0 = _mm_shuffle_epi32(val0, 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1 10465 vst1q_u32(v.val, val0); 10466 return v; 10467 } 10468 10469 _NEON2SSESTORAGE uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 10470 #define vld2_dup_u64 vld2_u64 10471 10472 _NEON2SSESTORAGE int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 10473 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr) 10474 10475 _NEON2SSESTORAGE int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 10476 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr) 10477 10478 _NEON2SSESTORAGE int32x2x2_t 
vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 10479 #define vld2_dup_s32(ptr) vld2_dup_u32((uint32_t*)ptr) 10480 10481 _NEON2SSESTORAGE int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 10482 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr) 10483 10484 _NEON2SSESTORAGE float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 10485 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 10486 10487 _NEON2SSESTORAGE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 10488 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] 10489 { 10490 float32x2x2_t v; 10491 v.val[0].m64_f32[0] = *(ptr); //0,0 10492 v.val[0].m64_f32[1] = *(ptr); //0,0 10493 v.val[1].m64_f32[0] = *(ptr + 1); //1,1 10494 v.val[1].m64_f32[1] = *(ptr + 1); //1,1 10495 return v; 10496 } 10497 10498 _NEON2SSESTORAGE poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 10499 #define vld2_dup_p8 vld2_dup_u8 10500 10501 _NEON2SSESTORAGE poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 10502 #define vld2_dup_p16 vld2_dup_s16 10503 10504 //************* Duplicate (or propagate)triplets: ******************* 10505 //******************************************************************** 10506 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes 10507 _NEON2SSESTORAGE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 10508 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0] 10509 { 10510 uint8x8x3_t v; 10511 __m128i val0, val1, val2; 10512 val0 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x 10513 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x, 10514 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x, 10515 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, 10516 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x, 10517 vst1q_u8(v.val, val0); 10518 _M64(v.val[2], val2); 10519 return v; 10520 } 10521 10522 _NEON2SSESTORAGE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 10523 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0] 10524 { 10525 uint16x4x3_t v; 10526 __m128i val0, val1, val2; 10527 val2 = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x 10528 val0 = _mm_shufflelo_epi16(val2, 0); //00 00 00 00 (all 0) 10529 val1 = _mm_shufflelo_epi16(val2, 85); //01 01 01 01 (all 1) 10530 val2 = _mm_shufflelo_epi16(val2, 170); //10 10 10 10 (all 2) 10531 _M64(v.val[0], val0); 10532 _M64(v.val[1], val1); 10533 _M64(v.val[2], val2); 10534 return v; 10535 } 10536 10537 _NEON2SSESTORAGE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 10538 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] 10539 { 10540 uint32x2x3_t v; 10541 __m128i val0, val1, val2; 10542 val2 = LOAD_SI128(ptr); //0,1,2,x 10543 val0 = _mm_shuffle_epi32(val2, 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2 10544 
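//val0 now holds 0,0,2,2: its low half gives the duplicated ptr[0] pair for val[0], and its high half (2,2) is reused below to form val[2]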
val1 = _mm_shuffle_epi32(val2, 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2 10545 val2 = _mm_srli_si128(val0, 8); //2,2,0x0,0x0 10546 _M64(v.val[0], val0); 10547 _M64(v.val[1], val1); 10548 _M64(v.val[2], val2); 10549 return v; 10550 } 10551 10552 _NEON2SSESTORAGE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 10553 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] 10554 { 10555 uint64x1x3_t v; 10556 v.val[0].m64_u64[0] = *(ptr); 10557 v.val[1].m64_u64[0] = *(ptr + 1); 10558 v.val[2].m64_u64[0] = *(ptr + 2); 10559 return v; 10560 } 10561 10562 _NEON2SSESTORAGE int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 10563 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr) 10564 10565 _NEON2SSESTORAGE int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 10566 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr) 10567 10568 _NEON2SSESTORAGE int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 10569 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr) 10570 10571 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 10572 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr) 10573 10574 10575 _NEON2SSESTORAGE float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 10576 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 10577 10578 _NEON2SSESTORAGE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 10579 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] 10580 { 10581 float32x2x3_t v; 10582 int i; 10583 for (i = 0; i<3; i++) { 10584 v.val[i].m64_f32[0] = *(ptr + i); 10585 v.val[i].m64_f32[1] = *(ptr + i); 10586 } 10587 return v; 10588 } 10589 10590 _NEON2SSESTORAGE poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 10591 #define vld3_dup_p8 vld3_dup_u8 10592 10593 _NEON2SSESTORAGE poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 10594 #define vld3_dup_p16 vld3_dup_s16 10595 10596 10597 //************* Duplicate (or propagate) quadruples: ******************* 10598 //*********************************************************************** 10599 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes 10600 _NEON2SSESTORAGE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 10601 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 10602 { 10603 uint8x8x4_t v; 10604 __m128i val0, val1, val2; 10605 val0 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x 10606 val1 = _mm_unpacklo_epi8(val0,val0); //0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x, 10607 val1 = _mm_unpacklo_epi16(val1,val1); //0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3 10608 val0 = _mm_unpacklo_epi32(val1,val1); //0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, 10609 val2 = _mm_unpackhi_epi32(val1,val1); // 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3 10610 vst1q_u8(&v.val[0], val0); 10611 vst1q_u8(&v.val[2], val2); 10612 return v; 10613 } 10614 
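//Usage sketch (illustrative addition, not part of the original mapping): vld4_dup_u8 broadcasts four
//consecutive bytes, e.g. the first interleaved R,G,B,A pixel of an image, into the four result vectors.
//Note that this SSE implementation reads a full 16-byte block via LOAD_SI128, so the source buffer
//(here the hypothetical rgba_image) is assumed to have at least 16 readable bytes:
//    const uint8_t * rgba_image; //interleaved R0,G0,B0,A0, R1,G1,B1,A1, ...
//    uint8x8x4_t c = vld4_dup_u8(rgba_image); //c.val[0] = R0 in all lanes, c.val[1] = G0, c.val[2] = B0, c.val[3] = A0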
10615 _NEON2SSESTORAGE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 10616 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 10617 { 10618 uint16x4x4_t v; 10619 __m128i val0, val1, val2, val3; 10620 val3 = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x 10621 val0 = _mm_shufflelo_epi16(val3, 0); //00 00 00 00 (all 0) 10622 val1 = _mm_shufflelo_epi16(val3, 85); //01 01 01 01 (all 1) 10623 val2 = _mm_shufflelo_epi16(val3, 170); //10 10 10 10 (all 2) 10624 val3 = _mm_shufflelo_epi16(val3, 255); //11 11 11 11 (all 3) 10625 _M64(v.val[0], val0); 10626 _M64(v.val[1], val1); 10627 _M64(v.val[2], val2); 10628 _M64(v.val[3], val3); 10629 return v; 10630 } 10631 10632 _NEON2SSESTORAGE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 10633 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 10634 { 10635 uint32x2x4_t v; 10636 __m128i val0, val1, val2, val3; 10637 val3 = LOAD_SI128(ptr); //0,1,2,3 10638 val0 = _mm_shuffle_epi32(val3, 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3 10639 val1 = _mm_shuffle_epi32(val3, 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3 10640 val2 = _mm_shuffle_epi32(val3, 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3 10641 val3 = _mm_shuffle_epi32(val3, 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2 10642 _M64(v.val[0], val0); 10643 _M64(v.val[1], val1); 10644 _M64(v.val[2], val2); 10645 _M64(v.val[3], val3); 10646 return v; 10647 } 10648 10649 _NEON2SSESTORAGE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 10650 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] 10651 { 10652 uint64x1x4_t v; 10653 v.val[0].m64_u64[0] = *(ptr); 10654 v.val[1].m64_u64[0] = *(ptr + 1); 10655 v.val[2].m64_u64[0] = *(ptr + 2); 10656 v.val[3].m64_u64[0] = *(ptr + 3); 10657 return v; 10658 } 10659 10660 _NEON2SSESTORAGE int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 10661 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr) 10662 10663 _NEON2SSESTORAGE int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 10664 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr) 10665 10666 _NEON2SSESTORAGE int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 10667 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr) 10668 10669 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 10670 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr) 10671 10672 _NEON2SSESTORAGE float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 10673 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example

_NEON2SSESTORAGE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
_NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0]
{
    float32x2x4_t v;
    int i;
    for (i = 0; i<4; i++) {
        v.val[i].m64_f32[0] = *(ptr + i);
        v.val[i].m64_f32[1] = *(ptr + i);
    }
    return v;
}

_NEON2SSESTORAGE poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p8 vld4_dup_u8

_NEON2SSESTORAGE poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0]
#define vld4_dup_p16 vld4_dup_u16


//**********************************************************************************
//*******************Lane loads for an N-element structures ***********************
//**********************************************************************************
//********************** Lane pairs ************************************************
//performs vld1_lane_xx: loads ptr[0] into src->val[0] and ptr[1] into src->val[1] at the given lane position
//we assume src is 16-byte aligned

//!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error
//to fix it all the functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined

//uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0]
{
    uint16x8x2_t v;
    v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane)

//uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0]
{
    uint32x4x2_t v;
    v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane);
    return v;
}
#define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane)

//int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane)
{
    int16x8x2_t v;
    v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane);
    v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane);
    return v;
}
#define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane)

//int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], d2[0]}, [r0]
_NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane) 10736 { 10737 int32x4x2_t v; 10738 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); 10739 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); 10740 return v; 10741 } 10742 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane) 10743 10744 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] 10745 //current IA SIMD doesn't support float16 10746 10747 //float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] 10748 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] 10749 { 10750 float32x4x2_t v; 10751 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); 10752 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); 10753 return v; 10754 } 10755 #define vld2q_lane_f32(ptr,src,lane) vld2q_lane_f32_ptr(ptr,&src,lane) 10756 10757 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] 10758 #define vld2q_lane_p16 vld2q_lane_u16 10759 10760 _NEON2SSESTORAGE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] 10761 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0] 10762 { 10763 uint8x8x2_t v; 10764 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane); 10765 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane); 10766 return v; 10767 } 10768 10769 _NEON2SSESTORAGE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] 10770 _NEON2SSE_INLINE uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3) int lane) 10771 { 10772 uint16x4x2_t v; 10773 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane); 10774 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane); 10775 return v; 10776 } 10777 10778 _NEON2SSESTORAGE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0] 10779 _NEON2SSE_INLINE uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1) int lane) 10780 { 10781 uint32x2x2_t v; 10782 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane); 10783 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane); 10784 return v; 10785 } 10786 10787 _NEON2SSESTORAGE int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] 10788 #define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane) 10789 10790 _NEON2SSESTORAGE int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0] 10791 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane) 10792 10793 _NEON2SSESTORAGE int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0] 10794 #define 
vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane)

//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
//current IA SIMD doesn't support float16

_NEON2SSESTORAGE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0]
_NEON2SSE_INLINE float32x2x2_t vld2_lane_f32(__transfersize(2) float32_t const * ptr, float32x2x2_t src,__constrange(0,1) int lane)
{
    float32x2x2_t v;
    v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
    v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
    return v;
}

//poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0]
#define vld2_lane_p8 vld2_lane_u8

//poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0]
_NEON2SSESTORAGE poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0]
#define vld2_lane_p16 vld2_lane_u16

//*********** Lane triplets **********************
//*************************************************
//does vld1_lane_xx: loads ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at the given lane position
//we assume src is 16-byte aligned

//uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
    uint16x8x3_t v;
    v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
    return v;
}
#define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane)

//uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0]
{
    uint32x4x3_t v;
    v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
    return v;
}
#define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane)

//int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
_NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0]
{
    int16x8x3_t v;
    v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0],
lane); 10848 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); 10849 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); 10850 return v; 10851 } 10852 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane) 10853 10854 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 10855 _NEON2SSE_INLINE int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 10856 { 10857 int32x4x3_t v; 10858 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); 10859 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); 10860 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); 10861 return v; 10862 } 10863 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane) 10864 10865 _NEON2SSESTORAGE float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 10866 //current IA SIMD doesn't support float16 10867 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane) 10868 10869 10870 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 10871 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 10872 { 10873 float32x4x3_t v; 10874 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); 10875 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); 10876 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); 10877 return v; 10878 } 10879 #define vld3q_lane_f32(ptr,src,lane) vld3q_lane_f32_ptr(ptr,&src,lane) 10880 10881 _NEON2SSESTORAGE poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 10882 #define vld3q_lane_p16 vld3q_lane_u16 10883 10884 _NEON2SSESTORAGE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 10885 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 10886 { 10887 uint8x8x3_t v; 10888 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane); 10889 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane); 10890 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane); 10891 return v; 10892 } 10893 10894 _NEON2SSESTORAGE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 10895 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 10896 { 10897 uint16x4x3_t v; 10898 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane); 10899 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane); 10900 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane); 10901 return v; 10902 } 10903 10904 _NEON2SSESTORAGE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 10905 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * 
ptr, uint32x2x3_t src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
    //need to merge into 128 bit anyway
    uint32x2x3_t v;
    v.val[0] = vld1_lane_u32(ptr, src.val[0], lane);
    v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane);
    v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane);
    return v;
}

_NEON2SSESTORAGE int8x8x3_t vld3_lane_s8(__transfersize(3) int8_t const * ptr, int8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s8(ptr, src, lane) vld3_lane_u8(( uint8_t*) ptr, src, lane)

_NEON2SSESTORAGE int16x4x3_t vld3_lane_s16(__transfersize(3) int16_t const * ptr, int16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s16(ptr, src, lane) vld3_lane_u16(( uint16_t*) ptr, src, lane)

_NEON2SSESTORAGE int32x2x3_t vld3_lane_s32(__transfersize(3) int32_t const * ptr, int32x2x3_t src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_s32(ptr, src, lane) vld3_lane_u32(( uint32_t*) ptr, src, lane)

_NEON2SSESTORAGE float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
//current IA SIMD doesn't support float16

_NEON2SSESTORAGE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
_NEON2SSE_INLINE float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
{
    float32x2x3_t v;
    v.val[0] = vld1_lane_f32(ptr, src.val[0], lane);
    v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane);
    v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane);
    return v;
}

_NEON2SSESTORAGE poly8x8x3_t vld3_lane_p8(__transfersize(3) poly8_t const * ptr, poly8x8x3_t src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p8 vld3_lane_u8

_NEON2SSESTORAGE poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
#define vld3_lane_p16 vld3_lane_u16

//******************* Lane Quadruples load ***************************
//*********************************************************************
//does vld1_lane_xx: loads ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at the given lane position
//we assume src is 16-byte aligned

//uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
_NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
{
    uint16x8x4_t v;
    v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
    v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
    v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
    v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
    return v;
}
#define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)

//uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr,
uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10961 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane) 10962 { 10963 uint32x4x4_t v; 10964 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); 10965 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); 10966 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); 10967 v.val[3] = _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane); 10968 return v; 10969 } 10970 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane) 10971 10972 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10973 _NEON2SSESTORAGE int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10974 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane) 10975 10976 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10977 _NEON2SSESTORAGE int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10978 #define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane) 10979 10980 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10981 _NEON2SSESTORAGE float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10982 //current IA SIMD doesn't support float16 10983 10984 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10985 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane) 10986 { 10987 float32x4x4_t v; 10988 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); 10989 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); 10990 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); 10991 v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane); 10992 return v; 10993 } 10994 #define vld4q_lane_f32(ptr,val,lane) vld4q_lane_f32_ptr(ptr,&val,lane) 10995 10996 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10997 _NEON2SSESTORAGE poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 10998 #define vld4q_lane_p16 vld4q_lane_u16 10999 11000 _NEON2SSESTORAGE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11001 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane) 11002 { 11003 uint8x8x4_t v; 11004 v.val[0] = vld1_lane_u8(ptr, src.val[0], lane); 11005 v.val[1] = vld1_lane_u8((ptr + 1), src.val[1], lane); 11006 v.val[2] = vld1_lane_u8((ptr + 2), src.val[2], lane); 11007 
v.val[3] = vld1_lane_u8((ptr + 3), src.val[3], lane); 11008 return v; 11009 } 11010 11011 _NEON2SSESTORAGE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11012 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3) int lane) 11013 { 11014 uint16x4x4_t v; 11015 v.val[0] = vld1_lane_u16(ptr, src.val[0], lane); 11016 v.val[1] = vld1_lane_u16((ptr + 1), src.val[1], lane); 11017 v.val[2] = vld1_lane_u16((ptr + 2), src.val[2], lane); 11018 v.val[3] = vld1_lane_u16((ptr + 3), src.val[3], lane); 11019 return v; 11020 } 11021 11022 _NEON2SSESTORAGE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11023 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1) int lane) 11024 { 11025 uint32x2x4_t v; 11026 v.val[0] = vld1_lane_u32(ptr, src.val[0], lane); 11027 v.val[1] = vld1_lane_u32((ptr + 1), src.val[1], lane); 11028 v.val[2] = vld1_lane_u32((ptr + 2), src.val[2], lane); 11029 v.val[3] = vld1_lane_u32((ptr + 3), src.val[3], lane); 11030 return v; 11031 } 11032 11033 _NEON2SSESTORAGE int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11034 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane) 11035 11036 _NEON2SSESTORAGE int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11037 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane) 11038 11039 _NEON2SSESTORAGE int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11040 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane) 11041 11042 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11043 _NEON2SSESTORAGE float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); 11044 //current IA SIMD doesn't support float16 11045 11046 _NEON2SSESTORAGE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11047 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane) 11048 { 11049 //serial solution may be faster 11050 float32x2x4_t v; 11051 v.val[0] = vld1_lane_f32(ptr, src.val[0], lane); 11052 v.val[1] = vld1_lane_f32((ptr + 1), src.val[1], lane); 11053 v.val[2] = vld1_lane_f32((ptr + 2), src.val[2], lane); 11054 v.val[3] = vld1_lane_f32((ptr + 3), src.val[3], lane); 11055 return v; 11056 } 11057 11058 _NEON2SSESTORAGE poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11059 #define vld4_lane_p8 vld4_lane_u8 11060 11061 _NEON2SSESTORAGE poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11062 #define 
vld4_lane_p16 vld4_lane_u16 11063 11064 //******************* Store duplets ********************************************* 11065 //******************************************************************************** 11066 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0] 11067 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val) 11068 { 11069 uint8x16x2_t v; 11070 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); 11071 v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]); 11072 vst1q_u8 (ptr, v.val[0]); 11073 vst1q_u8 ((ptr + 16), v.val[1]); 11074 } 11075 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val) 11076 11077 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0] 11078 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val) 11079 { 11080 uint16x8x2_t v; 11081 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); 11082 v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]); 11083 vst1q_u16 (ptr, v.val[0]); 11084 vst1q_u16 ((ptr + 8), v.val[1]); 11085 } 11086 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val) 11087 11088 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0] 11089 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val) 11090 { 11091 uint32x4x2_t v; 11092 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); 11093 v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]); 11094 vst1q_u32 (ptr, v.val[0]); 11095 vst1q_u32 ((ptr + 4), v.val[1]); 11096 } 11097 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val) 11098 11099 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0] 11100 _NEON2SSESTORAGE void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); 11101 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val) 11102 11103 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0] 11104 _NEON2SSESTORAGE void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); 11105 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val) 11106 11107 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0] 11108 _NEON2SSESTORAGE void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); 11109 #define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val) 11110 11111 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0] 11112 _NEON2SSESTORAGE void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); 11113 // IA32 SIMD doesn't work with 16bit floats currently 11114 11115 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0] 11116 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val) 11117 { 11118 float32x4x2_t v; 11119 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); 11120 v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]); 11121 vst1q_f32 (ptr, v.val[0]); 11122 vst1q_f32 ((ptr + 4), v.val[1]); 11123 } 11124 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val) 11125 11126 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0] 11127 _NEON2SSESTORAGE void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); 11128 #define vst2q_p8 vst2q_u8 11129 11130 //void vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t 
val);// VST2.16 {d0, d2}, [r0] 11131 _NEON2SSESTORAGE void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); 11132 #define vst2q_p16 vst2q_u16 11133 11134 _NEON2SSESTORAGE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0] 11135 _NEON2SSE_INLINE void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val) 11136 { 11137 __m128i v0; 11138 v0 = _mm_unpacklo_epi8(_pM128i(val.val[0]), _pM128i(val.val[1])); 11139 vst1q_u8 (ptr, v0); 11140 } 11141 11142 _NEON2SSESTORAGE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0] 11143 _NEON2SSE_INLINE void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val) 11144 { 11145 __m128i v0; 11146 v0 = _mm_unpacklo_epi16(_pM128i(val.val[0]), _pM128i(val.val[1])); 11147 vst1q_u16 (ptr, v0); 11148 } 11149 11150 _NEON2SSESTORAGE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0] 11151 _NEON2SSE_INLINE void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val) 11152 { 11153 __m128i v0; 11154 v0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); 11155 vst1q_u32 (ptr, v0); 11156 } 11157 11158 _NEON2SSESTORAGE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0] 11159 _NEON2SSE_INLINE void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val) 11160 { 11161 *(ptr) = val.val[0].m64_u64[0]; 11162 *(ptr + 1) = val.val[1].m64_u64[0]; 11163 } 11164 11165 _NEON2SSESTORAGE void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0] 11166 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val) 11167 11168 _NEON2SSESTORAGE void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] 11169 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val) 11170 11171 _NEON2SSESTORAGE void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] 11172 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val) 11173 11174 _NEON2SSESTORAGE void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); 11175 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val) 11176 11177 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0] 11178 //current IA SIMD doesn't support float16 11179 11180 _NEON2SSESTORAGE void vst2_f32(__transfersize(4) float32_t * ptr, float32x2x2_t val); // VST2.32 {d0, d1}, [r0] 11181 _NEON2SSE_INLINE void vst2_f32(__transfersize(4) float32_t* ptr, float32x2x2_t val) 11182 { 11183 *(ptr) = val.val[0].m64_f32[0]; 11184 *(ptr + 1) = val.val[1].m64_f32[0]; 11185 *(ptr + 2) = val.val[0].m64_f32[1]; 11186 *(ptr + 3) = val.val[1].m64_f32[1]; 11187 } 11188 11189 _NEON2SSESTORAGE void vst2_p8(__transfersize(16) poly8_t * ptr, poly8x8x2_t val); // VST2.8 {d0, d1}, [r0] 11190 #define vst2_p8 vst2_u8 11191 11192 _NEON2SSESTORAGE void vst2_p16(__transfersize(8) poly16_t * ptr, poly16x4x2_t val); // VST2.16 {d0, d1}, [r0] 11193 #define vst2_p16 vst2_u16 11194 11195 //******************** Triplets store ***************************************** 11196 //****************************************************************************** 11197 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0] 11198 _NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val) 11199 { 11200 uint8x16x3_t v; 11201 __m128i v0,v1,v2, cff, bldmask; 11202 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0, 1, 
0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10}; 11203 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10}; 11204 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff}; 11205 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff}; 11206 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff}; 11207 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15}; 11208 11209 v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22 11210 v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46 11211 v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34 11212 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding 11213 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding 11214 cff = _mm_cmpeq_epi8(v0, v0); //all ff 11215 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff); 11216 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); 11217 vst1q_u8(ptr, v.val[0]); 11218 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding 11219 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding 11220 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff); 11221 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); 11222 vst1q_u8((ptr + 16), v.val[1]); 11223 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding 11224 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding 11225 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff); 11226 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); 11227 vst1q_u8((ptr + 32), v.val[2]); 11228 } 11229 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val) 11230 11231 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0] 11232 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val) 11233 { 11234 uint16x8x3_t v; 11235 __m128i v0,v1,v2, cff, bldmask; 11236 _NEON2SSE_ALIGN_16 static const uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11}; 11237 _NEON2SSE_ALIGN_16 static const uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9}; 11238 _NEON2SSE_ALIGN_16 static const uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff}; 11239 _NEON2SSE_ALIGN_16 static const uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff}; 11240 _NEON2SSE_ALIGN_16 static const uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff}; 11241 _NEON2SSE_ALIGN_16 static const uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15}; 11242 11243 v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10 11244 v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22, 11245 v1 = _mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19 11246 
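    //the three 128-bit outputs are assembled one at a time below: a byte shuffle of the interleaved a/b data leaves gaps at the masked (0xff) positions,
    //a second shuffle drops the val[2] (c) elements into exactly those positions, and _MM_BLENDV_EPI8 merges the two using the mask-vs-all-ones comparison as the selector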
v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding 11247 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding 11248 cff = _mm_cmpeq_epi16(v0, v0); //all ff 11249 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff); 11250 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); 11251 vst1q_u16(ptr, v.val[0]); 11252 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding 11253 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding 11254 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff); 11255 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); 11256 vst1q_u16((ptr + 8), v.val[1]); 11257 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding 11258 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding 11259 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff); 11260 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); 11261 vst1q_u16((ptr + 16), v.val[2]); 11262 } 11263 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val) 11264 11265 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] 11266 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val) 11267 { 11268 //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3 11269 uint32x4x3_t v; 11270 __m128i tmp0, tmp1,tmp2; 11271 tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1 11272 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3 11273 tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1 11274 v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2, 11275 v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3 11276 v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3 11277 tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1 11278 v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1, 11279 11280 vst1q_u32(ptr, v.val[0]); 11281 vst1q_u32((ptr + 4), v.val[1]); 11282 vst1q_u32((ptr + 8), v.val[2]); 11283 } 11284 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val) 11285 11286 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val); 11287 _NEON2SSESTORAGE void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); 11288 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val) 11289 11290 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val); 11291 _NEON2SSESTORAGE void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); 11292 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val) 11293 11294 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val); 11295 _NEON2SSESTORAGE void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); 11296 #define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val) 11297 11298 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] 11299 _NEON2SSESTORAGE void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); 11300 // IA32 SIMD doesn't work with 16bit floats currently 11301 11302 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// 
VST3.32 {d0, d2, d4}, [r0] 11303 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val) 11304 { 11305 float32x4x3_t v; 11306 __m128 tmp0, tmp1,tmp2; 11307 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1 11308 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3 11309 tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1 11310 v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2, 11311 v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3 11312 v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3 11313 tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1 11314 v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1, 11315 11316 vst1q_f32( ptr, v.val[0]); 11317 vst1q_f32( (ptr + 4), v.val[1]); 11318 vst1q_f32( (ptr + 8), v.val[2]); 11319 } 11320 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val) 11321 11322 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0] 11323 _NEON2SSESTORAGE void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); 11324 #define vst3q_p8 vst3q_u8 11325 11326 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] 11327 _NEON2SSESTORAGE void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); 11328 #define vst3q_p16 vst3q_u16 11329 11330 _NEON2SSESTORAGE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] 11331 _NEON2SSE_INLINE void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val) 11332 { 11333 __m128i tmp, sh0, sh1, val0, val2; 11334 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5}; 11335 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0}; 11336 _NEON2SSE_ALIGN_16 static const uint8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0}; 11337 _NEON2SSE_ALIGN_16 static const uint8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0}; 11338 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1]) ); 11339 sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15) 11340 val2 = _pM128i(val.val[2]); 11341 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); 11342 val0 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel); 11343 vst1q_u8(ptr, val0); //store as 128 bit structure 11344 sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15) 11345 sh1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); 11346 val2 = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel); 11347 _M64((*(__m64_128*)(ptr + 16)), val2); //need it to fit into *ptr memory 11348 } 11349 11350 _NEON2SSESTORAGE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] 11351 _NEON2SSE_INLINE void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val) 11352 { 11353 __m128i tmp, val0, val1, val2; 11354 _NEON2SSE_ALIGN_16 static const int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13}; 11355 _NEON2SSE_ALIGN_16 static const int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0}; 11356 _NEON2SSE_ALIGN_16 static const uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1] 11357 _NEON2SSE_ALIGN_16 static const uint16_t mask1f[8] = {0xffff, 0, 0, 
0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0] 11358 tmp = _mm_unpacklo_epi64(_pM128i(val.val[0]), _pM128i(val.val[1])); 11359 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); 11360 val2 = _pM128i(val.val[2]); 11361 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask0); 11362 val0 = _MM_BLENDV_EPI8(val1, val0, *(__m128i*)mask0f); 11363 vst1q_u16(ptr, val0); //store as 128 bit structure 11364 val0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); 11365 val1 = _mm_shuffle_epi8(val2, *(__m128i*)mask1); 11366 val1 = _MM_BLENDV_EPI8(val0, val1, *(__m128i*)mask1f); //change the operands order 11367 _M64((*(__m64_128*)(ptr + 8)), val1); //need it to fit into *ptr memory 11368 } 11369 11370 _NEON2SSESTORAGE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0] 11371 _NEON2SSE_INLINE void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val) 11372 { 11373 //val.val[0]:0,3,val.val[1]:1,4; val.val[2]:2,5,x,x; 11374 __m128i val0, val1; 11375 val0 = _mm_unpacklo_epi64(_pM128i(val.val[1]), _pM128i(val.val[2])); //val[0]: 1,4,2,5 11376 val0 = _mm_shuffle_epi32(val0, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5 11377 val1 = _mm_srli_si128(val0, 8); //4,5, x,x 11378 _M64((*(__m64_128*)(ptr + 4)), val1); 11379 val0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), val0); //0,1,3,2 11380 val0 = _mm_shuffle_epi32(val0, 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3 11381 vst1q_u32(ptr, val0); //store as 128 bit structure 11382 } 11383 11384 _NEON2SSESTORAGE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val);// VST1.64 {d0, d1, d2}, [r0] 11385 _NEON2SSE_INLINE void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val) 11386 { 11387 *(ptr) = val.val[0].m64_u64[0]; 11388 *(ptr + 1) = val.val[1].m64_u64[0]; 11389 *(ptr + 2) = val.val[2].m64_u64[0]; 11390 } 11391 11392 _NEON2SSESTORAGE void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val); // VST3.8 {d0, d1, d2}, [r0] 11393 #define vst3_s8(ptr, val) vst3_u8((uint8_t*)ptr, val) 11394 11395 _NEON2SSESTORAGE void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val); // VST3.16 {d0, d1, d2}, [r0] 11396 #define vst3_s16(ptr, val) vst3_u16((uint16_t*)ptr, val) 11397 11398 _NEON2SSESTORAGE void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] 11399 #define vst3_s32(ptr, val) vst3_u32((uint32_t*)ptr, val) 11400 11401 _NEON2SSESTORAGE void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val); // VST1.64 {d0, d1, d2}, [r0] 11402 #define vst3_s64(ptr, val) vst3_u64((uint64_t*)ptr, val) 11403 11404 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] 11405 _NEON2SSESTORAGE void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 11406 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 11407 11408 _NEON2SSESTORAGE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val);// VST3.32 {d0, d1, d2}, [r0] 11409 _NEON2SSE_INLINE void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val) 11410 { 11411 //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; -> 0,2, 4,1, 3,5 11412 *(ptr) = val.val[0].m64_f32[0]; 11413 *(ptr + 1) = val.val[1].m64_f32[0]; 11414 *(ptr + 2) = val.val[2].m64_f32[0]; 11415 *(ptr + 3) = val.val[0].m64_f32[1]; 11416 *(ptr + 4) = val.val[1].m64_f32[1]; 11417 *(ptr + 5) = val.val[2].m64_f32[1]; 11418 } 11419 11420 _NEON2SSESTORAGE void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] 11421 #define vst3_p8 vst3_u8 11422 11423 _NEON2SSESTORAGE void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] 11424 #define vst3_p16 vst3_u16 11425 11426 //*************** Quadruples store ******************************** 11427 //********************************************************************* 11428 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0] 11429 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val) 11430 { 11431 __m128i tmp1, tmp2, res; 11432 tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 11433 tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 11434 res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15 11435 vst1q_u8(ptr, res); 11436 res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31 11437 vst1q_u8((ptr + 16), res); 11438 tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // 11439 tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // 11440 res = _mm_unpacklo_epi16(tmp1, tmp2); // 11441 vst1q_u8((ptr + 32), res); 11442 res = _mm_unpackhi_epi16(tmp1, tmp2); // 11443 vst1q_u8((ptr + 48), res); 11444 } 11445 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val) 11446 11447 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0] 11448 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val) 11449 { 11450 uint16x8x4_t v; 11451 __m128i tmp1, tmp2; 11452 tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 11453 tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 11454 v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2); 11455 v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2); 11456 tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 11457 tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 11458 v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2); 11459 v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2); 11460 vst1q_u16(ptr, v.val[0]); 11461 vst1q_u16((ptr + 8), v.val[1]); 11462 vst1q_u16((ptr + 16),v.val[2]); 11463 vst1q_u16((ptr + 24), v.val[3]); 11464 } 11465 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, &val) 11466 11467 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] 11468 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val) 11469 { 11470 uint16x8x4_t v; 11471 __m128i tmp1, tmp2; 11472 tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 11473 tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 
14,15 11474 v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); 11475 v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); 11476 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 11477 tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 11478 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2); 11479 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2); 11480 vst1q_u32(ptr, v.val[0]); 11481 vst1q_u32((ptr + 4), v.val[1]); 11482 vst1q_u32((ptr + 8), v.val[2]); 11483 vst1q_u32((ptr + 12), v.val[3]); 11484 } 11485 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val) 11486 11487 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val); 11488 _NEON2SSESTORAGE void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); 11489 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val) 11490 11491 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val); 11492 _NEON2SSESTORAGE void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); 11493 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val) 11494 11495 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val); 11496 _NEON2SSESTORAGE void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); 11497 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val) 11498 11499 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] 11500 _NEON2SSESTORAGE void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); 11501 // IA32 SIMD doesn't work with 16bit floats currently 11502 11503 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] 11504 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val) 11505 { 11506 __m128 tmp3, tmp2, tmp1, tmp0; 11507 float32x4x4_t v; 11508 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); 11509 tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]); 11510 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); 11511 tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]); 11512 v.val[0] = _mm_movelh_ps(tmp0, tmp2); 11513 v.val[1] = _mm_movehl_ps(tmp2, tmp0); 11514 v.val[2] = _mm_movelh_ps(tmp1, tmp3); 11515 v.val[3] = _mm_movehl_ps(tmp3, tmp1); 11516 vst1q_f32(ptr, v.val[0]); 11517 vst1q_f32((ptr + 4), v.val[1]); 11518 vst1q_f32((ptr + 8), v.val[2]); 11519 vst1q_f32((ptr + 12), v.val[3]); 11520 } 11521 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val) 11522 11523 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0] 11524 _NEON2SSESTORAGE void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); 11525 #define vst4q_p8 vst4q_u8 11526 11527 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] 11528 _NEON2SSESTORAGE void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); 11529 #define vst4q_p16 vst4q_s16 11530 11531 _NEON2SSESTORAGE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] 11532 _NEON2SSE_INLINE void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val) 11533 { 11534 __m128i sh0, sh1, val0, val2; 11535 sh0 = _mm_unpacklo_epi8(_pM128i(val.val[0]),_pM128i(val.val[1])); // a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7, 11536 sh1 = _mm_unpacklo_epi8(_pM128i(val.val[2]),_pM128i(val.val[3])); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7 11537 val0 = _mm_unpacklo_epi16(sh0,sh1); // 
a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3, 11538 val2 = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7 11539 vst1q_u8(ptr, val0); 11540 vst1q_u8((ptr + 16), val2); 11541 } 11542 11543 _NEON2SSESTORAGE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] 11544 _NEON2SSE_INLINE void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val) 11545 { 11546 __m128i sh0, sh1, val0, val2; 11547 sh0 = _mm_unpacklo_epi16(_pM128i(val.val[0]),_pM128i(val.val[1])); //a0,a1,b0,b1,c0,c1,d0,d1, 11548 sh1 = _mm_unpacklo_epi16(_pM128i(val.val[2]),_pM128i(val.val[3])); //a2,a3,b2,b3,c2,c3,d2,d3 11549 val0 = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3 11550 val2 = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3 11551 vst1q_u16(ptr, val0); //store as 128 bit structure 11552 vst1q_u16((ptr + 8), val2); 11553 } 11554 11555 _NEON2SSESTORAGE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0] 11556 _NEON2SSE_INLINE void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val) 11557 { 11558 //0,4, 1,5, 2,6, 3,7 11559 __m128i sh0, sh1, val0, val1; 11560 sh0 = _mm_unpacklo_epi32(_pM128i(val.val[0]), _pM128i(val.val[1])); //0,1,4,5 11561 sh1 = _mm_unpacklo_epi32(_pM128i(val.val[2]), _pM128i(val.val[3])); //2,3,6,7 11562 val0 = _mm_unpacklo_epi64(sh0,sh1); // 11563 val1 = _mm_unpackhi_epi64(sh0,sh1); // 11564 vst1q_u32(ptr, val0); //store as 128 bit structure 11565 vst1q_u32((ptr + 4), val1); 11566 } 11567 11568 _NEON2SSESTORAGE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val);// VST1.64 {d0, d1, d2, d3}, [r0] 11569 _NEON2SSE_INLINE void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val) 11570 { 11571 *(ptr) = val.val[0].m64_u64[0]; 11572 *(ptr + 1) = val.val[1].m64_u64[0]; 11573 *(ptr + 2) = val.val[2].m64_u64[0]; 11574 *(ptr + 3) = val.val[3].m64_u64[0]; 11575 } 11576 11577 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0] 11578 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val) 11579 11580 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0] 11581 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val) 11582 11583 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0] 11584 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val) 11585 11586 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] 11587 _NEON2SSESTORAGE void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); 11588 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val) 11589 11590 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] 11591 _NEON2SSESTORAGE void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); 11592 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 11593 11594 _NEON2SSESTORAGE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val);// VST4.32 {d0, d1, d2, d3}, [r0] 11595 _NEON2SSE_INLINE void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val) 11596 { 11597 //0,4, 1,5, 2,6, 3,7 -> 0,1, 2,3, 4,5, 6,7 11598 *(ptr) = val.val[0].m64_f32[0]; 11599 *(ptr + 1) = val.val[1].m64_f32[0]; 11600 *(ptr + 2) = val.val[2].m64_f32[0]; 11601 *(ptr + 3) = val.val[3].m64_f32[0]; 11602 *(ptr + 4) = val.val[0].m64_f32[1]; 11603 *(ptr + 5) = val.val[1].m64_f32[1]; 11604 *(ptr + 6) = val.val[2].m64_f32[1]; 11605 *(ptr + 7) = val.val[3].m64_f32[1]; 11606 } 11607 11608 _NEON2SSESTORAGE void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] 11609 #define vst4_p8 vst4_u8 11610 11611 _NEON2SSESTORAGE void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] 11612 #define vst4_p16 vst4_u16 11613 11614 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors ********************* 11615 //******************************************************************************************************************** 11616 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0] 11617 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane) 11618 { 11619 vst1q_lane_s16(ptr, val->val[0], lane); 11620 vst1q_lane_s16((ptr + 1), val->val[1], lane); 11621 } 11622 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane) 11623 11624 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] 11625 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane) 11626 { 11627 vst1q_lane_u32(ptr, val->val[0], lane); 11628 vst1q_lane_u32((ptr + 1), val->val[1], lane); 11629 } 11630 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane) 11631 11632 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] 11633 _NEON2SSESTORAGE void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); 11634 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane) 11635 11636 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0] 11637 _NEON2SSESTORAGE void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); 11638 #define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane) 11639 11640 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] 11641 _NEON2SSESTORAGE void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); 11642 //current IA SIMD doesn't support float16 11643 11644 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] 11645 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* val, __constrange(0,3) int lane) 11646 { 11647 vst1q_lane_f32(ptr, val->val[0], lane); 11648 vst1q_lane_f32((ptr + 1), 
val->val[1], lane); 11649 } 11650 #define vst2q_lane_f32(ptr,src,lane) vst2q_lane_f32_ptr(ptr,&src,lane) 11651 11652 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] 11653 _NEON2SSESTORAGE void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); 11654 #define vst2q_lane_p16 vst2q_lane_s16 11655 11656 _NEON2SSESTORAGE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] 11657 _NEON2SSE_INLINE void vst2_lane_u8(__transfersize(2) uint8_t * ptr, uint8x8x2_t val, __constrange(0,7) int lane) // VST2.8 {d0[0], d1[0]}, [r0] 11658 { 11659 *(ptr) = val.val[0].m64_u8[lane]; 11660 *(ptr + 1) = val.val[1].m64_u8[lane]; 11661 } 11662 11663 _NEON2SSESTORAGE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] 11664 _NEON2SSE_INLINE void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane) 11665 { 11666 *(ptr) = val.val[0].m64_u16[lane]; 11667 *(ptr + 1) = val.val[1].m64_u16[lane]; 11668 } 11669 11670 _NEON2SSESTORAGE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] 11671 _NEON2SSE_INLINE void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane) 11672 { 11673 *(ptr) = val.val[0].m64_u32[lane]; 11674 *(ptr + 1) = val.val[1].m64_u32[lane]; 11675 } 11676 11677 _NEON2SSESTORAGE void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] 11678 #define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane) 11679 11680 _NEON2SSESTORAGE void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] 11681 #define vst2_lane_s16(ptr, val, lane) vst2_lane_u16((uint16_t*)ptr, val, lane) 11682 11683 _NEON2SSESTORAGE void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] 11684 #define vst2_lane_s32(ptr, val, lane) vst2_lane_u32((uint32_t*)ptr, val, lane) 11685 11686 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] 11687 //current IA SIMD doesn't support float16 11688 11689 _NEON2SSESTORAGE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] 11690 _NEON2SSE_INLINE void vst2_lane_f32(__transfersize(2) float32_t * ptr, float32x2x2_t val, __constrange(0,1) int lane) 11691 { 11692 *(ptr) = val.val[0].m64_f32[lane]; 11693 *(ptr + 1) = val.val[1].m64_f32[lane]; 11694 } 11695 11696 _NEON2SSESTORAGE void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] 11697 #define vst2_lane_p8 vst2_lane_u8 11698 11699 _NEON2SSESTORAGE void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] 11700 #define vst2_lane_p16 vst2_lane_u16 11701 11702 //************************* Triple lanes stores ******************************************************* 11703 //******************************************************************************************************* 11704 //void 
vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 11705 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane) 11706 { 11707 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane); 11708 vst1q_lane_u16((ptr + 2), val->val[2], lane); 11709 } 11710 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane) 11711 11712 //void vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] 11713 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane) 11714 { 11715 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane); 11716 vst1q_lane_u32((ptr + 2), val->val[2], lane); 11717 } 11718 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane) 11719 11720 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 11721 _NEON2SSESTORAGE void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); 11722 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane) 11723 11724 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0] 11725 _NEON2SSESTORAGE void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); 11726 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane) 11727 11728 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 11729 _NEON2SSESTORAGE void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); 11730 //current IA SIMD doesn't support float16 11731 11732 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] 11733 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane) 11734 { 11735 vst1q_lane_f32(ptr, val->val[0], lane); 11736 vst1q_lane_f32((ptr + 1), val->val[1], lane); 11737 vst1q_lane_f32((ptr + 2), val->val[2], lane); 11738 } 11739 #define vst3q_lane_f32(ptr,val,lane) vst3q_lane_f32_ptr(ptr,&val,lane) 11740 11741 //void vst3q_lane_p16(__transfersize(3) poly16_t * ptr, poly16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 11742 _NEON2SSESTORAGE void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); 11743 #define vst3q_lane_p16 vst3q_lane_s16 11744 11745 _NEON2SSESTORAGE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] 11746 _NEON2SSE_INLINE void vst3_lane_u8(__transfersize(3) uint8_t * ptr, uint8x8x3_t val, __constrange(0,7) int lane) 11747 { 11748 *(ptr) = val.val[0].m64_u8[lane]; 11749 *(ptr + 1) = val.val[1].m64_u8[lane]; 11750 *(ptr + 2) = val.val[2].m64_u8[lane]; 11751 } 11752 11753 _NEON2SSESTORAGE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 11754 _NEON2SSE_INLINE void vst3_lane_u16(__transfersize(3) uint16_t * ptr, uint16x4x3_t 
val, __constrange(0,3) int lane) 11755 { 11756 *(ptr) = val.val[0].m64_u16[lane]; 11757 *(ptr + 1) = val.val[1].m64_u16[lane]; 11758 *(ptr + 2) = val.val[2].m64_u16[lane]; 11759 } 11760 11761 _NEON2SSESTORAGE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] 11762 _NEON2SSE_INLINE void vst3_lane_u32(__transfersize(3) uint32_t * ptr, uint32x2x3_t val, __constrange(0,1) int lane) 11763 { 11764 *(ptr) = val.val[0].m64_u32[lane]; 11765 *(ptr + 1) = val.val[1].m64_u32[lane]; 11766 *(ptr + 2) = val.val[2].m64_u32[lane]; 11767 } 11768 11769 _NEON2SSESTORAGE void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] 11770 #define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane) 11771 11772 _NEON2SSESTORAGE void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 11773 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane) 11774 11775 _NEON2SSESTORAGE void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] 11776 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane) 11777 11778 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 11779 _NEON2SSESTORAGE void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); 11780 //current IA SIMD doesn't support float16 11781 11782 _NEON2SSESTORAGE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] 11783 _NEON2SSE_INLINE void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane) 11784 { 11785 *(ptr) = val.val[0].m64_f32[lane]; 11786 *(ptr + 1) = val.val[1].m64_f32[lane]; 11787 *(ptr + 2) = val.val[2].m64_f32[lane]; 11788 } 11789 11790 _NEON2SSESTORAGE void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] 11791 #define vst3_lane_p8 vst3_lane_u8 11792 11793 _NEON2SSESTORAGE void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 11794 #define vst3_lane_p16 vst3_lane_u16 11795 11796 //******************************** Quadruple lanes stores *********************************************** 11797 //******************************************************************************************************* 11798 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11799 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane) 11800 { 11801 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane); 11802 vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane); 11803 } 11804 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane) 11805 11806 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11807 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, 
__constrange(0,3) int lane) 11808 { 11809 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane); 11810 vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane); 11811 } 11812 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane) 11813 11814 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11815 _NEON2SSESTORAGE void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); 11816 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane) 11817 11818 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11819 _NEON2SSESTORAGE void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); 11820 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane) 11821 11822 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11823 _NEON2SSESTORAGE void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); 11824 //current IA SIMD doesn't support float16 11825 11826 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11827 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane) 11828 { 11829 vst1q_lane_f32(ptr, val->val[0], lane); 11830 vst1q_lane_f32((ptr + 1), val->val[1], lane); 11831 vst1q_lane_f32((ptr + 2), val->val[2], lane); 11832 vst1q_lane_f32((ptr + 3), val->val[3], lane); 11833 } 11834 #define vst4q_lane_f32(ptr,val,lane) vst4q_lane_f32_ptr(ptr,&val,lane) 11835 11836 //void vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 11837 _NEON2SSESTORAGE void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); 11838 #define vst4q_lane_p16 vst4q_lane_u16 11839 11840 _NEON2SSESTORAGE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11841 _NEON2SSE_INLINE void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane) 11842 { 11843 *(ptr) = val.val[0].m64_u8[lane]; 11844 *(ptr + 1) = val.val[1].m64_u8[lane]; 11845 *(ptr + 2) = val.val[2].m64_u8[lane]; 11846 *(ptr + 3) = val.val[3].m64_u8[lane]; 11847 } 11848 11849 _NEON2SSESTORAGE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11850 _NEON2SSE_INLINE void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane) 11851 { 11852 *(ptr) = val.val[0].m64_u16[lane]; 11853 *(ptr + 1) = val.val[1].m64_u16[lane]; 11854 *(ptr + 2) = val.val[2].m64_u16[lane]; 11855 *(ptr + 3) = val.val[3].m64_u16[lane]; 11856 } 11857 11858 _NEON2SSESTORAGE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11859 _NEON2SSE_INLINE void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane) 11860 { 11861 *(ptr) = 
val.val[0].m64_u32[lane]; 11862 *(ptr + 1) = val.val[1].m64_u32[lane]; 11863 *(ptr + 2) = val.val[2].m64_u32[lane]; 11864 *(ptr + 3) = val.val[3].m64_u32[lane]; 11865 } 11866 11867 _NEON2SSESTORAGE void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11868 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane) 11869 11870 _NEON2SSESTORAGE void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11871 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane) 11872 11873 _NEON2SSESTORAGE void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane);// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11874 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane) 11875 11876 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11877 _NEON2SSESTORAGE void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); 11878 //current IA SIMD doesn't support float16 11879 11880 _NEON2SSESTORAGE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane); // VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11881 _NEON2SSE_INLINE void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane) 11882 { 11883 *(ptr) = val.val[0].m64_f32[lane]; 11884 *(ptr + 1) = val.val[1].m64_f32[lane]; 11885 *(ptr + 2) = val.val[2].m64_f32[lane]; 11886 *(ptr + 3) = val.val[3].m64_f32[lane]; 11887 } 11888 11889 _NEON2SSESTORAGE void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11890 #define vst4_lane_p8 vst4_lane_u8 11891 11892 _NEON2SSESTORAGE void vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 11893 #define vst4_lane_p16 vst4_lane_u16 11894 11895 //************************************************************************************************** 11896 //************************ Extract lanes from a vector ******************************************** 11897 //************************************************************************************************** 11898 //These intrinsics extract a single lane (element) from a vector. 
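//A brief usage sketch (illustrative only, not part of the NEON API mapping below): the lane argument
//must be a compile-time constant within the __constrange limits of the given type.
//    uint8x8_t  d8  = vdup_n_u8(42);            //all eight 8-bit lanes hold 42
//    uint8_t    x   = vget_lane_u8(d8, 3);      //x == 42, reads lane 3 of the 64-bit vector
//    uint32x4_t q32 = vdupq_n_u32(7);
//    uint32_t   y   = vgetq_lane_u32(q32, 0);   //y == 7, the q-register forms map to _MM_EXTRACT_* below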
11899 _NEON2SSESTORAGE uint8_t vget_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] 11900 #define vget_lane_u8(vec, lane) vec.m64_u8[lane] 11901 11902 _NEON2SSESTORAGE uint16_t vget_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] 11903 #define vget_lane_u16(vec, lane) vec.m64_u16[lane] 11904 11905 11906 _NEON2SSESTORAGE uint32_t vget_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] 11907 #define vget_lane_u32(vec, lane) vec.m64_u32[lane] 11908 11909 _NEON2SSESTORAGE int8_t vget_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VMOV.S8 r0, d0[0] 11910 #define vget_lane_s8(vec, lane) vec.m64_i8[lane] 11911 11912 _NEON2SSESTORAGE int16_t vget_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VMOV.S16 r0, d0[0] 11913 #define vget_lane_s16(vec, lane) vec.m64_i16[lane] 11914 11915 _NEON2SSESTORAGE int32_t vget_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] 11916 #define vget_lane_s32(vec, lane) vec.m64_i32[lane] 11917 11918 _NEON2SSESTORAGE poly8_t vget_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VMOV.U8 r0, d0[0] 11919 #define vget_lane_p8 vget_lane_u8 11920 11921 _NEON2SSESTORAGE poly16_t vget_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VMOV.s16 r0, d0[0] 11922 #define vget_lane_p16 vget_lane_u16 11923 11924 _NEON2SSESTORAGE float32_t vget_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 r0, d0[0] 11925 #define vget_lane_f32(vec, lane) vec.m64_f32[lane] 11926 11927 _NEON2SSESTORAGE uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 11928 #define vgetq_lane_u8 (uint8_t) _MM_EXTRACT_EPI8 11929 11930 _NEON2SSESTORAGE uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] 11931 #define vgetq_lane_u16 (uint16_t) _MM_EXTRACT_EPI16 11932 11933 _NEON2SSESTORAGE uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 11934 #define vgetq_lane_u32 (uint32_t) _MM_EXTRACT_EPI32 11935 11936 _NEON2SSESTORAGE int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] 11937 #define vgetq_lane_s8 _MM_EXTRACT_EPI8 11938 11939 _NEON2SSESTORAGE int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] 11940 #define vgetq_lane_s16 _MM_EXTRACT_EPI16 11941 11942 _NEON2SSESTORAGE int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 11943 #define vgetq_lane_s32 _MM_EXTRACT_EPI32 11944 11945 _NEON2SSESTORAGE poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 11946 #define vgetq_lane_p8 vgetq_lane_u8 11947 11948 _NEON2SSESTORAGE poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] 11949 #define vgetq_lane_p16 vgetq_lane_u16 11950 11951 _NEON2SSESTORAGE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 11952 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane) 11953 { 11954 int32_t ilane; 11955 ilane = _MM_EXTRACT_PS(vec,lane); 11956 return *(float*)&ilane; 11957 } 11958 11959 _NEON2SSESTORAGE int64_t vget_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 11960 #define vget_lane_s64(vec, lane) vec.m64_i64[0] 11961 11962 _NEON2SSESTORAGE uint64_t vget_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV r0,r0,d0 11963 #define vget_lane_u64(vec, lane) vec.m64_u64[0] 11964 11965 11966 
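//Illustrative sketch (ordinary user code that includes this header is assumed): the d-register 64-bit
//getters above have a single lane, so the lane argument is kept only for NEON source compatibility and
//lane 0 is always returned; vgetq_lane_f32 returns the lane bit-exactly by extracting it as an integer
//and reinterpreting the bits as a float.
//    int64x1_t d = vdup_n_s64(-5);
//    int64_t   v = vget_lane_s64(d, 0);         //v == -5, expands to d.m64_i64[0]
//    float32x4_t q = vdupq_n_f32(1.5f);
//    float32_t   f = vgetq_lane_f32(q, 2);      //f == 1.5f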
_NEON2SSESTORAGE int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 11967 #define vgetq_lane_s64 _MM_EXTRACT_EPI64 11968 11969 _NEON2SSESTORAGE uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 11970 #define vgetq_lane_u64 (uint64_t) _MM_EXTRACT_EPI64 11971 11972 // ***************** Set lanes within a vector ******************************************** 11973 // ************************************************************************************** 11974 //These intrinsics set a single lane (element) within a vector. 11975 //same functions as vld1_lane_xx ones, but take the value to be set directly. 11976 11977 _NEON2SSESTORAGE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 11978 _NEON2SSE_INLINE uint8x8_t vset_lane_u8(uint8_t value, uint8x8_t vec, __constrange(0,7) int lane) 11979 { 11980 uint8_t val; 11981 val = value; 11982 return vld1_lane_u8(&val, vec, lane); 11983 } 11984 11985 _NEON2SSESTORAGE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 11986 _NEON2SSE_INLINE uint16x4_t vset_lane_u16(uint16_t value, uint16x4_t vec, __constrange(0,3) int lane) 11987 { 11988 uint16_t val; 11989 val = value; 11990 return vld1_lane_u16(&val, vec, lane); 11991 } 11992 11993 _NEON2SSESTORAGE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 11994 _NEON2SSE_INLINE uint32x2_t vset_lane_u32(uint32_t value, uint32x2_t vec, __constrange(0,1) int lane) 11995 { 11996 uint32_t val; 11997 val = value; 11998 return vld1_lane_u32(&val, vec, lane); 11999 } 12000 12001 _NEON2SSESTORAGE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 12002 _NEON2SSE_INLINE int8x8_t vset_lane_s8(int8_t value, int8x8_t vec, __constrange(0,7) int lane) 12003 { 12004 int8_t val; 12005 val = value; 12006 return vld1_lane_s8(&val, vec, lane); 12007 } 12008 12009 _NEON2SSESTORAGE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 12010 _NEON2SSE_INLINE int16x4_t vset_lane_s16(int16_t value, int16x4_t vec, __constrange(0,3) int lane) 12011 { 12012 int16_t val; 12013 val = value; 12014 return vld1_lane_s16(&val, vec, lane); 12015 } 12016 12017 _NEON2SSESTORAGE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 12018 _NEON2SSE_INLINE int32x2_t vset_lane_s32(int32_t value, int32x2_t vec, __constrange(0,1) int lane) 12019 { 12020 int32_t val; 12021 val = value; 12022 return vld1_lane_s32(&val, vec, lane); 12023 } 12024 12025 _NEON2SSESTORAGE poly8x8_t vset_lane_p8(poly8_t value, poly8x8_t vec, __constrange(0,7) int lane); // VMOV.8 d0[0],r0 12026 #define vset_lane_p8 vset_lane_u8 12027 12028 _NEON2SSESTORAGE poly16x4_t vset_lane_p16(poly16_t value, poly16x4_t vec, __constrange(0,3) int lane); // VMOV.16 d0[0],r0 12029 #define vset_lane_p16 vset_lane_u16 12030 12031 _NEON2SSESTORAGE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane); // VMOV.32 d0[0],r0 12032 _NEON2SSE_INLINE float32x2_t vset_lane_f32(float32_t value, float32x2_t vec, __constrange(0,1) int lane) 12033 { 12034 float32_t val; 12035 val = value; 12036 return vld1_lane_f32(&val, vec, lane); 12037 } 12038 12039 _NEON2SSESTORAGE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 12040 _NEON2SSE_INLINE uint8x16_t 
vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane) 12041 { 12042 uint8_t val; 12043 val = value; 12044 return vld1q_lane_u8(&val, vec, lane); 12045 } 12046 12047 _NEON2SSESTORAGE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 12048 _NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane) 12049 { 12050 uint16_t val; 12051 val = value; 12052 return vld1q_lane_u16(&val, vec, lane); 12053 } 12054 12055 _NEON2SSESTORAGE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 12056 _NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane) 12057 { 12058 uint32_t val; 12059 val = value; 12060 return vld1q_lane_u32(&val, vec, lane); 12061 } 12062 12063 _NEON2SSESTORAGE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 12064 _NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane) 12065 { 12066 int8_t val; 12067 val = value; 12068 return vld1q_lane_s8(&val, vec, lane); 12069 } 12070 12071 _NEON2SSESTORAGE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 12072 _NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane) 12073 { 12074 int16_t val; 12075 val = value; 12076 return vld1q_lane_s16(&val, vec, lane); 12077 } 12078 12079 _NEON2SSESTORAGE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 12080 _NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane) 12081 { 12082 int32_t val; 12083 val = value; 12084 return vld1q_lane_s32(&val, vec, lane); 12085 } 12086 12087 _NEON2SSESTORAGE poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 12088 #define vsetq_lane_p8 vsetq_lane_u8 12089 12090 _NEON2SSESTORAGE poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 12091 #define vsetq_lane_p16 vsetq_lane_u16 12092 12093 _NEON2SSESTORAGE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 12094 _NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane) 12095 { 12096 float32_t val; 12097 val = value; 12098 return vld1q_lane_f32(&val, vec, lane); 12099 } 12100 12101 _NEON2SSESTORAGE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 12102 _NEON2SSE_INLINE int64x1_t vset_lane_s64(int64_t value, int64x1_t vec, __constrange(0,0) int lane) 12103 { 12104 int64_t val; 12105 val = value; 12106 return vld1_lane_s64(&val, vec, lane); 12107 } 12108 12109 _NEON2SSESTORAGE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,r0,r0 12110 _NEON2SSE_INLINE uint64x1_t vset_lane_u64(uint64_t value, uint64x1_t vec, __constrange(0,0) int lane) 12111 { 12112 uint64_t val; 12113 val = value; 12114 return vld1_lane_u64(&val, vec, lane); 12115 } 12116 12117 _NEON2SSESTORAGE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 12118 _NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane) 12119 { 12120 uint64_t val; 12121 val = value; 12122 return 
vld1q_lane_s64(&val, vec, lane); 12123 } 12124 12125 _NEON2SSESTORAGE uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 12126 #define vsetq_lane_u64 vsetq_lane_s64 12127 12128 // ******************************************************************************* 12129 // **************** Initialize a vector from bit pattern *************************** 12130 // ******************************************************************************* 12131 //These intrinsics create a vector from a literal bit pattern. 12132 _NEON2SSESTORAGE int8x8_t vcreate_s8(uint64_t a); // VMOV d0,r0,r0 12133 _NEON2SSE_INLINE int8x8_t vcreate_s8(uint64_t a) 12134 { 12135 return (*(__m64_128*)&(a)); //a macro can't be used here because 'a' may be an immediate value 12136 } 12137 12138 _NEON2SSESTORAGE int16x4_t vcreate_s16(uint64_t a); // VMOV d0,r0,r0 12139 #define vcreate_s16 vcreate_s8 12140 12141 _NEON2SSESTORAGE int32x2_t vcreate_s32(uint64_t a); // VMOV d0,r0,r0 12142 #define vcreate_s32 vcreate_s8 12143 12144 _NEON2SSESTORAGE float16x4_t vcreate_f16(uint64_t a); // VMOV d0,r0,r0 12145 //no IA32 SIMD available 12146 12147 _NEON2SSESTORAGE float32x2_t vcreate_f32(uint64_t a); // VMOV d0,r0,r0 12148 _NEON2SSE_INLINE float32x2_t vcreate_f32(uint64_t a) 12149 { 12150 return (*(__m64_128*)&(a)); //a macro can't be used here because 'a' may be an immediate value 12151 } 12152 12153 _NEON2SSESTORAGE uint8x8_t vcreate_u8(uint64_t a); // VMOV d0,r0,r0 12154 #define vcreate_u8 vcreate_s8 12155 12156 _NEON2SSESTORAGE uint16x4_t vcreate_u16(uint64_t a); // VMOV d0,r0,r0 12157 #define vcreate_u16 vcreate_s16 12158 12159 _NEON2SSESTORAGE uint32x2_t vcreate_u32(uint64_t a); // VMOV d0,r0,r0 12160 #define vcreate_u32 vcreate_s32 12161 12162 _NEON2SSESTORAGE uint64x1_t vcreate_u64(uint64_t a); // VMOV d0,r0,r0 12163 #define vcreate_u64 vcreate_s8 12164 12165 12166 _NEON2SSESTORAGE poly8x8_t vcreate_p8(uint64_t a); // VMOV d0,r0,r0 12167 #define vcreate_p8 vcreate_u8 12168 12169 _NEON2SSESTORAGE poly16x4_t vcreate_p16(uint64_t a); // VMOV d0,r0,r0 12170 #define vcreate_p16 vcreate_u16 12171 12172 _NEON2SSESTORAGE int64x1_t vcreate_s64(uint64_t a); // VMOV d0,r0,r0 12173 #define vcreate_s64 vcreate_u64 12174 12175 //********************* Set all lanes to same value ******************************** 12176 //********************************************************************************* 12177 //These intrinsics set all lanes to the same value. 
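//A short usage sketch (illustrative only): vdup_n_xx / vdupq_n_xx broadcast one scalar into every lane.
//The 128-bit (q) forms map directly to _mm_set1_* below, while most 64-bit (d) forms are filled lane by
//lane serially and therefore carry a _NEON2SSE_PERFORMANCE_WARNING.
//    int16x8_t q = vdupq_n_s16(3);      //q = {3,3,3,3,3,3,3,3}
//    uint8x8_t d = vdup_n_u8(0xFF);     //d = {0xFF x 8}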
12178 _NEON2SSESTORAGE uint8x8_t vdup_n_u8(uint8_t value); // VDUP.8 d0,r0 12179 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x8_t vdup_n_u8(uint8_t value), _NEON2SSE_REASON_SLOW_SERIAL) 12180 { 12181 uint8x8_t res; 12182 int i; 12183 for (i = 0; i<8; i++) { 12184 res.m64_u8[i] = value; 12185 } 12186 return res; 12187 } 12188 12189 _NEON2SSESTORAGE uint16x4_t vdup_n_u16(uint16_t value); // VDUP.16 d0,r0 12190 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x4_t vdup_n_u16(uint16_t value), _NEON2SSE_REASON_SLOW_SERIAL) 12191 { 12192 uint16x4_t res; 12193 int i; 12194 for (i = 0; i<4; i++) { 12195 res.m64_u16[i] = value; 12196 } 12197 return res; 12198 } 12199 12200 _NEON2SSESTORAGE uint32x2_t vdup_n_u32(uint32_t value); // VDUP.32 d0,r0 12201 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x2_t vdup_n_u32(uint32_t value), _NEON2SSE_REASON_SLOW_SERIAL) 12202 { 12203 uint32x2_t res; 12204 res.m64_u32[0] = value; 12205 res.m64_u32[1] = value; 12206 return res; 12207 } 12208 12209 _NEON2SSESTORAGE int8x8_t vdup_n_s8(int8_t value); // VDUP.8 d0,r0 12210 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vdup_n_s8(int8_t value), _NEON2SSE_REASON_SLOW_SERIAL) 12211 { 12212 int8x8_t res; 12213 int i; 12214 for (i = 0; i<8; i++) { 12215 res.m64_i8[i] = value; 12216 } 12217 return res; 12218 } 12219 12220 _NEON2SSESTORAGE int16x4_t vdup_n_s16(int16_t value); // VDUP.16 d0,r0 12221 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x4_t vdup_n_s16(int16_t value), _NEON2SSE_REASON_SLOW_SERIAL) 12222 { 12223 int16x4_t res; 12224 int i; 12225 for (i = 0; i<4; i++) { 12226 res.m64_i16[i] = value; 12227 } 12228 return res; 12229 } 12230 12231 _NEON2SSESTORAGE int32x2_t vdup_n_s32(int32_t value); // VDUP.32 d0,r0 12232 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vdup_n_s32(int32_t value), _NEON2SSE_REASON_SLOW_SERIAL) 12233 { 12234 int32x2_t res; 12235 res.m64_i32[0] = value; 12236 res.m64_i32[1] = value; 12237 return res; 12238 } 12239 12240 _NEON2SSESTORAGE poly8x8_t vdup_n_p8(poly8_t value); // VDUP.8 d0,r0 12241 #define vdup_n_p8 vdup_n_u8 12242 12243 _NEON2SSESTORAGE poly16x4_t vdup_n_p16(poly16_t value); // VDUP.16 d0,r0 12244 #define vdup_n_p16 vdup_n_s16 12245 12246 _NEON2SSESTORAGE float32x2_t vdup_n_f32(float32_t value); // VDUP.32 d0,r0 12247 _NEON2SSE_INLINE float32x2_t vdup_n_f32(float32_t value) 12248 { 12249 float32x2_t res; 12250 res.m64_f32[0] = value; 12251 res.m64_f32[1] = value; 12252 return res; 12253 } 12254 12255 _NEON2SSESTORAGE uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 12256 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value)) 12257 12258 _NEON2SSESTORAGE uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 12259 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value)) 12260 12261 _NEON2SSESTORAGE uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 12262 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value)) 12263 12264 _NEON2SSESTORAGE int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 12265 #define vdupq_n_s8 _mm_set1_epi8 12266 12267 _NEON2SSESTORAGE int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 12268 #define vdupq_n_s16 _mm_set1_epi16 12269 12270 _NEON2SSESTORAGE int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 12271 #define vdupq_n_s32 _mm_set1_epi32 12272 12273 _NEON2SSESTORAGE poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 12274 #define vdupq_n_p8 vdupq_n_u8 12275 12276 _NEON2SSESTORAGE poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 12277 #define vdupq_n_p16 vdupq_n_u16 
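//Note with a small illustrative example: the unsigned q-register forms above cast the argument to the
//exact element type before calling _mm_set1_epi8/16/32 (which take char/short/int parameters), which
//reproduces the argument conversion the real NEON prototype would perform, so only the low bits of a
//wider value are broadcast.
//    uint8x16_t v = vdupq_n_u8(200);    //every byte holds 0xC8 despite _mm_set1_epi8 taking a char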
12278 12279 _NEON2SSESTORAGE float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 12280 #define vdupq_n_f32 _mm_set1_ps 12281 12282 _NEON2SSESTORAGE int64x1_t vdup_n_s64(int64_t value); // VMOV d0,r0,r0 12283 _NEON2SSE_INLINE int64x1_t vdup_n_s64(int64_t value) 12284 { 12285 int64x1_t res; 12286 res.m64_i64[0] = value; 12287 return res; 12288 } 12289 12290 _NEON2SSESTORAGE uint64x1_t vdup_n_u64(uint64_t value); // VMOV d0,r0,r0 12291 _NEON2SSE_INLINE uint64x1_t vdup_n_u64(uint64_t value) 12292 { 12293 uint64x1_t res; 12294 res.m64_u64[0] = value; 12295 return res; 12296 } 12297 12298 _NEON2SSESTORAGE int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 12299 _NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value) 12300 { 12301 _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate 12302 return LOAD_SI128(value2); 12303 } 12304 12305 _NEON2SSESTORAGE uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 12306 _NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value) 12307 { 12308 _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate 12309 return LOAD_SI128(val); 12310 } 12311 12312 //**** Set all lanes to same value ************************ 12313 //Same functions as above - just aliases.******************** 12314 //Probably they reflect the fact that the 128-bit versions use the VMOV instruction ********** 12315 _NEON2SSESTORAGE uint8x8_t vmov_n_u8(uint8_t value); // VDUP.8 d0,r0 12316 #define vmov_n_u8 vdup_n_s8 12317 12318 _NEON2SSESTORAGE uint16x4_t vmov_n_u16(uint16_t value); // VDUP.16 d0,r0 12319 #define vmov_n_u16 vdup_n_s16 12320 12321 _NEON2SSESTORAGE uint32x2_t vmov_n_u32(uint32_t value); // VDUP.32 d0,r0 12322 #define vmov_n_u32 vdup_n_u32 12323 12324 _NEON2SSESTORAGE int8x8_t vmov_n_s8(int8_t value); // VDUP.8 d0,r0 12325 #define vmov_n_s8 vdup_n_s8 12326 12327 _NEON2SSESTORAGE int16x4_t vmov_n_s16(int16_t value); // VDUP.16 d0,r0 12328 #define vmov_n_s16 vdup_n_s16 12329 12330 _NEON2SSESTORAGE int32x2_t vmov_n_s32(int32_t value); // VDUP.32 d0,r0 12331 #define vmov_n_s32 vdup_n_s32 12332 12333 _NEON2SSESTORAGE poly8x8_t vmov_n_p8(poly8_t value); // VDUP.8 d0,r0 12334 #define vmov_n_p8 vdup_n_u8 12335 12336 _NEON2SSESTORAGE poly16x4_t vmov_n_p16(poly16_t value); // VDUP.16 d0,r0 12337 #define vmov_n_p16 vdup_n_s16 12338 12339 _NEON2SSESTORAGE float32x2_t vmov_n_f32(float32_t value); // VDUP.32 d0,r0 12340 #define vmov_n_f32 vdup_n_f32 12341 12342 _NEON2SSESTORAGE uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 12343 #define vmovq_n_u8 vdupq_n_u8 12344 12345 _NEON2SSESTORAGE uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 12346 #define vmovq_n_u16 vdupq_n_s16 12347 12348 _NEON2SSESTORAGE uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 12349 #define vmovq_n_u32 vdupq_n_u32 12350 12351 _NEON2SSESTORAGE int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 12352 #define vmovq_n_s8 vdupq_n_s8 12353 12354 _NEON2SSESTORAGE int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 12355 #define vmovq_n_s16 vdupq_n_s16 12356 12357 _NEON2SSESTORAGE int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 12358 #define vmovq_n_s32 vdupq_n_s32 12359 12360 _NEON2SSESTORAGE poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 12361 #define vmovq_n_p8 vdupq_n_u8 12362 12363 _NEON2SSESTORAGE poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 12364 #define vmovq_n_p16 vdupq_n_s16 12365 12366 _NEON2SSESTORAGE float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 12367 #define vmovq_n_f32 vdupq_n_f32 12368 
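//Illustrative equivalence sketch: the vmov_n_xx / vmovq_n_xx names are plain aliases, so both spellings
//produce the same result and the same code.
//    uint32x4_t a = vmovq_n_u32(0xDEADBEEF);
//    uint32x4_t b = vdupq_n_u32(0xDEADBEEF);    //a and b hold identical bit patterns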
12369 _NEON2SSESTORAGE int64x1_t vmov_n_s64(int64_t value); // VMOV d0,r0,r0 12370 #define vmov_n_s64 vdup_n_s64 12371 12372 _NEON2SSESTORAGE uint64x1_t vmov_n_u64(uint64_t value); // VMOV d0,r0,r0 12373 #define vmov_n_u64 vdup_n_u64 12374 12375 _NEON2SSESTORAGE int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 12376 #define vmovq_n_s64 vdupq_n_s64 12377 12378 _NEON2SSESTORAGE uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 12379 #define vmovq_n_u64 vdupq_n_u64 12380 12381 //**************Set all lanes to the value of one lane of a vector ************* 12382 //**************************************************************************** 12383 //here shuffle is better solution than lane extraction followed by set1 function 12384 _NEON2SSESTORAGE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] 12385 _NEON2SSE_INLINE uint8x8_t vdup_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) 12386 { 12387 uint8x8_t res; 12388 uint8_t valane; 12389 int i = 0; 12390 valane = vec.m64_u8[lane]; 12391 for (i = 0; i<8; i++) { 12392 res.m64_u8[i] = valane; 12393 } 12394 return res; 12395 } 12396 12397 _NEON2SSESTORAGE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] 12398 _NEON2SSE_INLINE uint16x4_t vdup_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) 12399 { 12400 uint16x4_t res; 12401 uint16_t valane; 12402 valane = vec.m64_u16[lane]; 12403 res.m64_u16[0] = valane; 12404 res.m64_u16[1] = valane; 12405 res.m64_u16[2] = valane; 12406 res.m64_u16[3] = valane; 12407 return res; 12408 } 12409 12410 _NEON2SSESTORAGE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] 12411 _NEON2SSE_INLINE uint32x2_t vdup_lane_u32(uint32x2_t vec, __constrange(0,1) int lane) 12412 { 12413 uint32x2_t res; 12414 res.m64_u32[0] = vec.m64_u32[lane]; 12415 res.m64_u32[1] = res.m64_u32[0]; 12416 return res; 12417 } 12418 12419 _NEON2SSESTORAGE int8x8_t vdup_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] 12420 #define vdup_lane_s8 vdup_lane_u8 12421 12422 _NEON2SSESTORAGE int16x4_t vdup_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] 12423 #define vdup_lane_s16 vdup_lane_u16 12424 12425 _NEON2SSESTORAGE int32x2_t vdup_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] 12426 #define vdup_lane_s32 vdup_lane_u32 12427 12428 _NEON2SSESTORAGE poly8x8_t vdup_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 d0,d0[0] 12429 #define vdup_lane_p8 vdup_lane_u8 12430 12431 _NEON2SSESTORAGE poly16x4_t vdup_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 d0,d0[0] 12432 #define vdup_lane_p16 vdup_lane_s16 12433 12434 _NEON2SSESTORAGE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 d0,d0[0] 12435 _NEON2SSE_INLINE float32x2_t vdup_lane_f32(float32x2_t vec, __constrange(0,1) int lane) 12436 { 12437 float32x2_t res; 12438 res.m64_f32[0] = vec.m64_f32[lane]; 12439 res.m64_f32[1] = res.m64_f32[0]; 12440 return res; 12441 } 12442 12443 _NEON2SSESTORAGE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] 12444 _NEON2SSE_INLINE uint8x16_t vdupq_lane_u8(uint8x8_t vec, __constrange(0,7) int lane) // VDUP.8 q0,d0[0] 12445 { 12446 const int8_t lane8 = (int8_t) lane; 12447 _NEON2SSE_ALIGN_16 int8_t lanemask8[16] = {lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8, lane8}; 12448 return _mm_shuffle_epi8 
(_pM128i(vec), *(__m128i*) lanemask8); 12449 } 12450 12451 _NEON2SSESTORAGE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] 12452 _NEON2SSE_INLINE uint16x8_t vdupq_lane_u16(uint16x4_t vec, __constrange(0,3) int lane) // VDUP.16 q0,d0[0] 12453 { 12454 //we could use 8bit shuffle for 16 bit as well 12455 const int8_t lane16 = ((int8_t) lane) << 1; 12456 const int8_t lane16_1 = lane16 + 1; 12457 _NEON2SSE_ALIGN_16 int8_t lanemask_e16[16] = {lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, 12458 lane16, lane16_1, lane16, lane16_1, lane16, lane16_1, lane16, lane16_1}; 12459 return _mm_shuffle_epi8 (_pM128i(vec), *(__m128i*)lanemask_e16); 12460 } 12461 12462 _NEON2SSESTORAGE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] 12463 _NEON2SSE_INLINE uint32x4_t vdupq_lane_u32(uint32x2_t vec, __constrange(0,1) int lane) 12464 { 12465 //need to use function not macro to make it gcc friendly and meet the immediate const requirement for _mm_shuffle_epi32 12466 if (lane == 1) 12467 return _mm_shuffle_epi32 (_pM128i(vec), (1 | (1 << 2) | (1 << 4) | (1 << 6)) ); 12468 else 12469 return _mm_shuffle_epi32 (_pM128i(vec), 0); 12470 } 12471 12472 _NEON2SSESTORAGE int8x16_t vdupq_lane_s8(int8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] 12473 #define vdupq_lane_s8 vdupq_lane_u8 12474 12475 _NEON2SSESTORAGE int16x8_t vdupq_lane_s16(int16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] 12476 #define vdupq_lane_s16 vdupq_lane_u16 12477 12478 _NEON2SSESTORAGE int32x4_t vdupq_lane_s32(int32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] 12479 #define vdupq_lane_s32 vdupq_lane_u32 12480 12481 _NEON2SSESTORAGE poly8x16_t vdupq_lane_p8(poly8x8_t vec, __constrange(0,7) int lane); // VDUP.8 q0,d0[0] 12482 #define vdupq_lane_p8 vdupq_lane_u8 12483 12484 _NEON2SSESTORAGE poly16x8_t vdupq_lane_p16(poly16x4_t vec, __constrange(0,3) int lane); // VDUP.16 q0,d0[0] 12485 #define vdupq_lane_p16 vdupq_lane_s16 12486 12487 _NEON2SSESTORAGE float32x4_t vdupq_lane_f32(float32x2_t vec, __constrange(0,1) int lane); // VDUP.32 q0,d0[0] 12488 #define vdupq_lane_f32(vec, lane) _mm_load1_ps((vec.m64_f32 + lane)) 12489 12490 _NEON2SSESTORAGE int64x1_t vdup_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 12491 #define vdup_lane_s64(vec,lane) vec 12492 12493 _NEON2SSESTORAGE uint64x1_t vdup_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV d0,d0 12494 #define vdup_lane_u64(vec,lane) vec 12495 12496 _NEON2SSESTORAGE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 12497 _NEON2SSE_INLINE int64x2_t vdupq_lane_s64(int64x1_t vec, __constrange(0,0) int lane) 12498 { 12499 __m128i vec128; 12500 vec128 = _pM128i(vec); 12501 return _mm_unpacklo_epi64(vec128,vec128); 12502 } 12503 12504 _NEON2SSESTORAGE uint64x2_t vdupq_lane_u64(uint64x1_t vec, __constrange(0,0) int lane); // VMOV q0,q0 12505 #define vdupq_lane_u64 vdupq_lane_s64 12506 12507 // ******************************************************************** 12508 // ******************** Combining vectors ***************************** 12509 // ******************************************************************** 12510 //These intrinsics join two 64 bit vectors into a single 128bit vector. 
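//A minimal usage sketch (illustrative only): the first argument becomes the low 64 bits of the result and
//the second argument the high 64 bits, matching the single _mm_unpacklo_epi64 used below.
//    int16x4_t lo = vdup_n_s16(1);
//    int16x4_t hi = vdup_n_s16(2);
//    int16x8_t q  = vcombine_s16(lo, hi);   //lanes 0..3 hold 1, lanes 4..7 hold 2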
12511 _NEON2SSESTORAGE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high); // VMOV d0,d0 12512 _NEON2SSE_INLINE int8x16_t vcombine_s8(int8x8_t low, int8x8_t high) 12513 { 12514 return _mm_unpacklo_epi64 (_pM128i(low), _pM128i(high) ); 12515 } 12516 12517 _NEON2SSESTORAGE int16x8_t vcombine_s16(int16x4_t low, int16x4_t high); // VMOV d0,d0 12518 #define vcombine_s16 vcombine_s8 12519 12520 _NEON2SSESTORAGE int32x4_t vcombine_s32(int32x2_t low, int32x2_t high); // VMOV d0,d0 12521 #define vcombine_s32 vcombine_s8 12522 12523 _NEON2SSESTORAGE int64x2_t vcombine_s64(int64x1_t low, int64x1_t high); // VMOV d0,d0 12524 #define vcombine_s64 vcombine_s8 12525 12526 _NEON2SSESTORAGE float16x8_t vcombine_f16(float16x4_t low, float16x4_t high); // VMOV d0,d0 12527 //current IA SIMD doesn't support float16 12528 12529 _NEON2SSESTORAGE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high); // VMOV d0,d0 12530 _NEON2SSE_INLINE float32x4_t vcombine_f32(float32x2_t low, float32x2_t high) 12531 { 12532 __m128i res; 12533 res = _mm_unpacklo_epi64(_pM128i(low), _pM128i(high) ); 12534 return _M128(res); 12535 } 12536 12537 _NEON2SSESTORAGE uint8x16_t vcombine_u8(uint8x8_t low, uint8x8_t high); // VMOV d0,d0 12538 #define vcombine_u8 vcombine_s8 12539 12540 _NEON2SSESTORAGE uint16x8_t vcombine_u16(uint16x4_t low, uint16x4_t high); // VMOV d0,d0 12541 #define vcombine_u16 vcombine_s16 12542 12543 _NEON2SSESTORAGE uint32x4_t vcombine_u32(uint32x2_t low, uint32x2_t high); // VMOV d0,d0 12544 #define vcombine_u32 vcombine_s32 12545 12546 _NEON2SSESTORAGE uint64x2_t vcombine_u64(uint64x1_t low, uint64x1_t high); // VMOV d0,d0 12547 #define vcombine_u64 vcombine_s64 12548 12549 _NEON2SSESTORAGE poly8x16_t vcombine_p8(poly8x8_t low, poly8x8_t high); // VMOV d0,d0 12550 #define vcombine_p8 vcombine_u8 12551 12552 _NEON2SSESTORAGE poly16x8_t vcombine_p16(poly16x4_t low, poly16x4_t high); // VMOV d0,d0 12553 #define vcombine_p16 vcombine_u16 12554 12555 //********************************************************************** 12556 //************************* Splitting vectors ************************** 12557 //********************************************************************** 12558 //**************** Get high part ****************************************** 12559 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors 12560 _NEON2SSESTORAGE int8x8_t vget_high_s8(int8x16_t a); // VMOV d0,d0 12561 _NEON2SSE_INLINE int8x8_t vget_high_s8(int8x16_t a) 12562 { 12563 int8x8_t res64; 12564 __m128i res; 12565 res = _mm_unpackhi_epi64(a,a); //SSE2 12566 return64(res); 12567 } 12568 12569 _NEON2SSESTORAGE int16x4_t vget_high_s16(int16x8_t a); // VMOV d0,d0 12570 _NEON2SSE_INLINE int16x4_t vget_high_s16(int16x8_t a) 12571 { 12572 int16x4_t res64; 12573 __m128i res; 12574 res = _mm_unpackhi_epi64(a,a); //SSE2 12575 return64(res); 12576 } 12577 12578 _NEON2SSESTORAGE int32x2_t vget_high_s32(int32x4_t a); // VMOV d0,d0 12579 _NEON2SSE_INLINE int32x2_t vget_high_s32(int32x4_t a) 12580 { 12581 int32x2_t res64; 12582 __m128i res; 12583 res = _mm_unpackhi_epi64(a,a); //SSE2 12584 return64(res); 12585 } 12586 12587 _NEON2SSESTORAGE int64x1_t vget_high_s64(int64x2_t a); // VMOV d0,d0 12588 _NEON2SSE_INLINE int64x1_t vget_high_s64(int64x2_t a) 12589 { 12590 int64x1_t res64; 12591 __m128i res; 12592 res = _mm_unpackhi_epi64(a,a); //SSE2 12593 return64(res); 12594 } 12595 12596 _NEON2SSESTORAGE float16x4_t vget_high_f16(float16x8_t a); // VMOV d0,d0 12597 // IA32 SIMD doesn't work with 16bit floats currently 12598 
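//Illustrative round-trip sketch: splitting a q-register into its halves and recombining them restores the
//original vector, since vget_low_xx takes the low 64 bits and vget_high_xx the high 64 bits.
//    int16x8_t q  = vdupq_n_s16(9);
//    int16x4_t lo = vget_low_s16(q);         //low half
//    int16x4_t hi = vget_high_s16(q);        //high half (unpackhi of the vector with itself)
//    int16x8_t r  = vcombine_s16(lo, hi);    //r has the same contents as q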
12599 _NEON2SSESTORAGE float32x2_t vget_high_f32(float32x4_t a); // VMOV d0,d0 12600 _NEON2SSE_INLINE float32x2_t vget_high_f32(float32x4_t a) 12601 { 12602 __m128i res; 12603 __m64_128 res64; 12604 res = _mm_unpackhi_epi64(_M128i(a),_M128i(a)); 12605 return64(res); 12606 } 12607 12608 _NEON2SSESTORAGE uint8x8_t vget_high_u8(uint8x16_t a); // VMOV d0,d0 12609 #define vget_high_u8 vget_high_s8 12610 12611 _NEON2SSESTORAGE uint16x4_t vget_high_u16(uint16x8_t a); // VMOV d0,d0 12612 #define vget_high_u16 vget_high_s16 12613 12614 _NEON2SSESTORAGE uint32x2_t vget_high_u32(uint32x4_t a); // VMOV d0,d0 12615 #define vget_high_u32 vget_high_s32 12616 12617 _NEON2SSESTORAGE uint64x1_t vget_high_u64(uint64x2_t a); // VMOV d0,d0 12618 #define vget_high_u64 vget_high_s64 12619 12620 _NEON2SSESTORAGE poly8x8_t vget_high_p8(poly8x16_t a); // VMOV d0,d0 12621 #define vget_high_p8 vget_high_u8 12622 12623 _NEON2SSESTORAGE poly16x4_t vget_high_p16(poly16x8_t a); // VMOV d0,d0 12624 #define vget_high_p16 vget_high_u16 12625 12626 //********************** Get low part ********************** 12627 //********************************************************** 12628 _NEON2SSESTORAGE int8x8_t vget_low_s8(int8x16_t a); // VMOV d0,d0 12629 _NEON2SSE_INLINE int8x8_t vget_low_s8(int8x16_t a) // VMOV d0,d0 12630 { 12631 int16x4_t res64; 12632 return64(a); 12633 } 12634 12635 _NEON2SSESTORAGE int16x4_t vget_low_s16(int16x8_t a); // VMOV d0,d0 12636 _NEON2SSE_INLINE int16x4_t vget_low_s16(int16x8_t a) // VMOV d0,d0 12637 { 12638 int16x4_t res64; 12639 return64(a); 12640 } 12641 12642 _NEON2SSESTORAGE int32x2_t vget_low_s32(int32x4_t a); // VMOV d0,d0 12643 _NEON2SSE_INLINE int32x2_t vget_low_s32(int32x4_t a) // VMOV d0,d0 12644 { 12645 int32x2_t res64; 12646 return64(a); 12647 } 12648 12649 _NEON2SSESTORAGE int64x1_t vget_low_s64(int64x2_t a); // VMOV d0,d0 12650 _NEON2SSE_INLINE int64x1_t vget_low_s64(int64x2_t a) // VMOV d0,d0 12651 { 12652 int64x1_t res64; 12653 return64 (a); 12654 } 12655 12656 _NEON2SSESTORAGE float16x4_t vget_low_f16(float16x8_t a); // VMOV d0,d0 12657 // IA32 SIMD doesn't work with 16bit floats currently 12658 12659 _NEON2SSESTORAGE float32x2_t vget_low_f32(float32x4_t a); // VMOV d0,d0 12660 _NEON2SSE_INLINE float32x2_t vget_low_f32(float32x4_t a) 12661 { 12662 float32x2_t res64; 12663 _M64f(res64, a); 12664 return res64; 12665 } 12666 12667 _NEON2SSESTORAGE uint8x8_t vget_low_u8(uint8x16_t a); // VMOV d0,d0 12668 #define vget_low_u8 vget_low_s8 12669 12670 _NEON2SSESTORAGE uint16x4_t vget_low_u16(uint16x8_t a); // VMOV d0,d0 12671 #define vget_low_u16 vget_low_s16 12672 12673 _NEON2SSESTORAGE uint32x2_t vget_low_u32(uint32x4_t a); // VMOV d0,d0 12674 #define vget_low_u32 vget_low_s32 12675 12676 _NEON2SSESTORAGE uint64x1_t vget_low_u64(uint64x2_t a); // VMOV d0,d0 12677 #define vget_low_u64 vget_low_s64 12678 12679 _NEON2SSESTORAGE poly8x8_t vget_low_p8(poly8x16_t a); // VMOV d0,d0 12680 #define vget_low_p8 vget_low_u8 12681 12682 _NEON2SSESTORAGE poly16x4_t vget_low_p16(poly16x8_t a); // VMOV d0,d0 12683 #define vget_low_p16 vget_low_s16 12684 12685 //************************************************************************** 12686 //************************ Converting vectors ********************************** 12687 //************************************************************************** 12688 //************* Convert from float *************************************** 12689 // need to set _MM_SET_ROUNDING_MODE ( x) accordingly 12690 _NEON2SSESTORAGE int32x2_t vcvt_s32_f32(float32x2_t a); 
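//Rounding-mode sketch (illustrative only): ARM VCVT to integer (without a rounding suffix) truncates
//toward zero, while the _mm_cvtps_epi32 used by some of the conversions below honours the current MXCSR
//rounding mode, so callers that need bit-exact NEON behaviour may select the mode explicitly first:
//    _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//    int32x2_t r = vcvt_s32_f32(v);    //v is a float32x2_t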
// VCVT.S32.F32 d0, d0 12691 _NEON2SSE_INLINE int32x2_t vcvt_s32_f32(float32x2_t a) 12692 { 12693 int32x2_t res64; 12694 __m128i res; 12695 res = _mm_cvtps_epi32(_pM128(a)); //use low 64 bits of result only 12696 return64(res); 12697 } 12698 12699 _NEON2SSESTORAGE uint32x2_t vcvt_u32_f32(float32x2_t a); // VCVT.U32.F32 d0, d0 12700 _NEON2SSE_INLINE uint32x2_t vcvt_u32_f32(float32x2_t a) 12701 { 12702 uint32x2_t res64; 12703 __m128i res; 12704 res = vcvtq_u32_f32(_pM128(a)); 12705 return64(res); 12706 } 12707 12708 _NEON2SSESTORAGE int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 12709 _NEON2SSE_INLINE int32x4_t vcvtq_s32_f32(float32x4_t a) 12710 { 12711 __m128 dif; 12712 __m128i res; 12713 //_mm_cvttps_epi32 incorrectly treats the case a > =2.14748364e+009, therefore the special processing is necessary 12714 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f }; 12715 dif = _mm_cmpge_ps(a, *(__m128*)fmax); 12716 res = _mm_cvttps_epi32(a); 12717 return _mm_xor_si128(res, _M128i(dif)); 12718 } 12719 12720 _NEON2SSESTORAGE uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 12721 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0 12722 { 12723 //No single instruction SSE solution but we could implement it as following: 12724 __m128i res1, res2, zero, mask; 12725 __m128 max, min, dif; 12726 _NEON2SSE_ALIGN_16 static const float32_t fmax[] = { 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f, 2.14748364e+009f }; 12727 _NEON2SSE_ALIGN_16 static const float32_t fmax_unsigned[] = { 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f, 4.29496729e+009f }; 12728 zero = _mm_setzero_si128(); 12729 mask = _mm_cmpgt_epi32(_M128i(a), zero); 12730 min = _mm_and_ps(_M128(mask), a); 12731 max = _mm_min_ps(min, *(__m128*)fmax_unsigned); //clamped in 0 - 4.29496729+009 12732 12733 dif = _mm_sub_ps(max, *(__m128*)fmax); 12734 mask = _mm_cmpgt_epi32(_M128i(dif),zero); 12735 dif = _mm_and_ps(_M128(mask), dif); 12736 12737 res1 = _mm_cvttps_epi32(dif); 12738 res2 = vcvtq_s32_f32(max); 12739 return _mm_add_epi32(res1, res2); 12740 } 12741 12742 // ***** Convert to the fixed point with the number of fraction bits specified by b *********** 12743 //************************************************************************************************* 12744 _NEON2SSESTORAGE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.S32.F32 d0, d0, #32 12745 _NEON2SSE_INLINE int32x2_t vcvt_n_s32_f32(float32x2_t a, __constrange(1,32) int b) 12746 { 12747 int32x2_t res64; 12748 return64(vcvtq_n_s32_f32(_pM128(a),b)); 12749 } 12750 12751 _NEON2SSESTORAGE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b); // VCVT.U32.F32 d0, d0, #32 12752 _NEON2SSE_INLINE uint32x2_t vcvt_n_u32_f32(float32x2_t a, __constrange(1,32) int b) 12753 { 12754 uint32x2_t res; 12755 float convconst; 12756 convconst = (float)((uint32_t)1 << b); 12757 res.m64_u32[0] = (uint32_t) (a.m64_f32[0] * convconst); 12758 res.m64_u32[1] = (uint32_t) (a.m64_f32[1] * convconst); 12759 return res; 12760 } 12761 12762 _NEON2SSESTORAGE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 12763 _NEON2SSE_INLINE int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b) 12764 { 12765 float convconst; 12766 _NEON2SSE_ALIGN_16 static const uint32_t cmask[] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 12767 __m128 cconst128; 12768 __m128i mask, res; 12769 
convconst = (float)(1 << b); 12770 cconst128 = vdupq_n_f32(convconst); 12771 res = _mm_cvttps_epi32(_mm_mul_ps(a,cconst128)); 12772 mask = _mm_cmpeq_epi32 (res, *(__m128i*)cmask); 12773 return _mm_xor_si128 (res, mask); //res saturated for 0x80000000 12774 } 12775 12776 _NEON2SSESTORAGE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 12777 _NEON2SSE_INLINE uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b) 12778 { 12779 float convconst; 12780 __m128 cconst128; 12781 convconst = (float)(1 << b); 12782 cconst128 = vdupq_n_f32(convconst); 12783 return vcvtq_u32_f32(_mm_mul_ps(a,cconst128)); 12784 } 12785 12786 12787 _NEON2SSESTORAGE int32x4_t vcvtnq_s32_f32(float32x4_t a); // VCVTN.S32.F32 q0, q0 12788 _NEON2SSE_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t a) 12789 { 12790 return _mm_cvtps_epi32(a); 12791 } 12792 12793 //***************** Convert to float ************************* 12794 //************************************************************* 12795 _NEON2SSESTORAGE float32x2_t vcvt_f32_s32(int32x2_t a); // VCVT.F32.S32 d0, d0 12796 _NEON2SSE_INLINE float32x2_t vcvt_f32_s32(int32x2_t a) //use low 64 bits 12797 { 12798 float32x2_t res; 12799 res.m64_f32[0] = (float) a.m64_i32[0]; 12800 res.m64_f32[1] = (float) a.m64_i32[1]; 12801 return res; 12802 } 12803 12804 _NEON2SSESTORAGE float32x2_t vcvt_f32_u32(uint32x2_t a); // VCVT.F32.U32 d0, d0 12805 _NEON2SSE_INLINE float32x2_t vcvt_f32_u32(uint32x2_t a) 12806 { 12807 float32x2_t res; 12808 res.m64_f32[0] = (float) a.m64_u32[0]; 12809 res.m64_f32[1] = (float) a.m64_u32[1]; 12810 return res; 12811 } 12812 12813 _NEON2SSESTORAGE float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 12814 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a) 12815 12816 _NEON2SSESTORAGE float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 12817 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0 12818 { 12819 //solution may be not optimal 12820 __m128 two16, fHi, fLo; 12821 __m128i hi, lo; 12822 two16 = _mm_set1_ps((float)0x10000); //2^16 12823 // Avoid double rounding by doing two exact conversions 12824 // of high and low 16-bit segments 12825 hi = _mm_srli_epi32(a, 16); 12826 lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16); 12827 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16); 12828 fLo = _mm_cvtepi32_ps(lo); 12829 // do single rounding according to current rounding mode 12830 return _mm_add_ps(fHi, fLo); 12831 } 12832 12833 // ***** Convert to the float from fixed point with the number of fraction bits specified by b *********** 12834 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b); // VCVT.F32.S32 d0, d0, #32 12835 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_s32(int32x2_t a, __constrange(1,32) int b) 12836 { 12837 float32x2_t res; 12838 float convconst; 12839 convconst = (float)(1. / ((uint32_t)1 << b)); 12840 res.m64_f32[0] = a.m64_i32[0] * convconst; 12841 res.m64_f32[1] = a.m64_i32[1] * convconst; 12842 return res; 12843 } 12844 12845 _NEON2SSESTORAGE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b); // VCVT.F32.U32 d0, d0, #32 12846 _NEON2SSE_INLINE float32x2_t vcvt_n_f32_u32(uint32x2_t a, __constrange(1,32) int b) // VCVT.F32.U32 d0, d0, #32 12847 { 12848 float32x2_t res; 12849 float convconst; 12850 convconst = (float)(1. 
/ ((uint32_t)1 << b)); 12851 res.m64_f32[0] = a.m64_u32[0] * convconst; 12852 res.m64_f32[1] = a.m64_u32[1] * convconst; 12853 return res; 12854 } 12855 12856 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 12857 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b) 12858 { 12859 float convconst; 12860 __m128 cconst128, af; 12861 convconst = (float)(1. / ((uint32_t)1 << b)); 12862 af = _mm_cvtepi32_ps(a); 12863 cconst128 = vdupq_n_f32(convconst); 12864 return _mm_mul_ps(af,cconst128); 12865 } 12866 12867 _NEON2SSESTORAGE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 12868 _NEON2SSE_INLINE float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b) 12869 { 12870 float convconst; 12871 __m128 cconst128, af; 12872 convconst = (float)(1. / (1 << b)); 12873 af = vcvtq_f32_u32(a); 12874 cconst128 = vdupq_n_f32(convconst); 12875 return _mm_mul_ps(af,cconst128); 12876 } 12877 12878 //**************Convert between floats *********************** 12879 //************************************************************ 12880 _NEON2SSESTORAGE float16x4_t vcvt_f16_f32(float32x4_t a); // VCVT.F16.F32 d0, q0 12881 //Intel SIMD doesn't support 16-bit floats currently 12882 12883 _NEON2SSESTORAGE float32x4_t vcvt_f32_f16(float16x4_t a); // VCVT.F32.F16 q0, d0 12884 //Intel SIMD doesn't support 16-bit floats currently; the only solution is to store the 16-bit floats and load them as 32-bit ones 12885 12886 //************Vector narrow integer conversion (truncation) ****************** 12887 //**************************************************************************** 12888 _NEON2SSESTORAGE int8x8_t vmovn_s16(int16x8_t a); // VMOVN.I16 d0,q0 12889 _NEON2SSE_INLINE int8x8_t vmovn_s16(int16x8_t a) // VMOVN.I16 d0,q0 12890 { 12891 int8x8_t res64; 12892 __m128i res; 12893 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_16_even_odd); //use 64 low bits only 12894 return64(res); 12895 } 12896 12897 _NEON2SSESTORAGE int16x4_t vmovn_s32(int32x4_t a); // VMOVN.I32 d0,q0 12898 _NEON2SSE_INLINE int16x4_t vmovn_s32(int32x4_t a) // VMOVN.I32 d0,q0 12899 { 12900 int16x4_t res64; 12901 __m128i res; 12902 res = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //use 64 low bits only 12903 return64(res); 12904 } 12905 12906 _NEON2SSESTORAGE int32x2_t vmovn_s64(int64x2_t a); // VMOVN.I64 d0,q0 12907 _NEON2SSE_INLINE int32x2_t vmovn_s64(int64x2_t a) 12908 { 12909 //may not be efficient compared with a serial implementation 12910 int32x2_t res64; 12911 __m128i res; 12912 res = _mm_shuffle_epi32 (a, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //use 64 low bits only, _MM_SHUFFLE(3, 1, 2, 0) 12913 return64(res); 12914 } 12915 12916 _NEON2SSESTORAGE uint8x8_t vmovn_u16(uint16x8_t a); // VMOVN.I16 d0,q0 12917 #define vmovn_u16 vmovn_s16 12918 12919 _NEON2SSESTORAGE uint16x4_t vmovn_u32(uint32x4_t a); // VMOVN.I32 d0,q0 12920 #define vmovn_u32 vmovn_s32 12921 12922 _NEON2SSESTORAGE uint32x2_t vmovn_u64(uint64x2_t a); // VMOVN.I64 d0,q0 12923 #define vmovn_u64 vmovn_s64 12924 12925 //**************** Vector long move *********************** 12926 //*********************************************************** 12927 _NEON2SSESTORAGE int16x8_t vmovl_s8(int8x8_t a); // VMOVL.S8 q0,d0 12928 _NEON2SSE_INLINE int16x8_t vmovl_s8(int8x8_t a) 12929 { 12930 return _MM_CVTEPI8_EPI16(_pM128i(a)); //SSE4.1 12931 } 12932 12933 _NEON2SSESTORAGE int32x4_t vmovl_s16(int16x4_t a); // VMOVL.S16 q0,d0 12934 _NEON2SSE_INLINE int32x4_t 
vmovl_s16(int16x4_t a) 12935 { 12936 return _MM_CVTEPI16_EPI32(_pM128i(a)); //SSE4.1 12937 } 12938 12939 _NEON2SSESTORAGE int64x2_t vmovl_s32(int32x2_t a); // VMOVL.S32 q0,d0 12940 _NEON2SSE_INLINE int64x2_t vmovl_s32(int32x2_t a) 12941 { 12942 return _MM_CVTEPI32_EPI64(_pM128i(a)); //SSE4.1 12943 } 12944 12945 _NEON2SSESTORAGE uint16x8_t vmovl_u8(uint8x8_t a); // VMOVL.U8 q0,d0 12946 _NEON2SSE_INLINE uint16x8_t vmovl_u8(uint8x8_t a) 12947 { 12948 return _MM_CVTEPU8_EPI16(_pM128i(a)); //SSE4.1 12949 } 12950 12951 _NEON2SSESTORAGE uint32x4_t vmovl_u16(uint16x4_t a); // VMOVL.s16 q0,d0 12952 _NEON2SSE_INLINE uint32x4_t vmovl_u16(uint16x4_t a) 12953 { 12954 return _MM_CVTEPU16_EPI32(_pM128i(a)); //SSE4.1 12955 } 12956 12957 _NEON2SSESTORAGE uint64x2_t vmovl_u32(uint32x2_t a); // VMOVL.U32 q0,d0 12958 _NEON2SSE_INLINE uint64x2_t vmovl_u32(uint32x2_t a) 12959 { 12960 return _MM_CVTEPU32_EPI64(_pM128i(a)); //SSE4.1 12961 } 12962 12963 //*************Vector saturating narrow integer***************** 12964 //************************************************************** 12965 _NEON2SSESTORAGE int8x8_t vqmovn_s16(int16x8_t a); // VQMOVN.S16 d0,q0 12966 _NEON2SSE_INLINE int8x8_t vqmovn_s16(int16x8_t a) 12967 { 12968 int8x8_t res64; 12969 __m128i res; 12970 res = _mm_packs_epi16(a, a); 12971 return64(res); 12972 } 12973 12974 _NEON2SSESTORAGE int16x4_t vqmovn_s32(int32x4_t a); // VQMOVN.S32 d0,q0 12975 _NEON2SSE_INLINE int16x4_t vqmovn_s32(int32x4_t a) 12976 { 12977 int16x4_t res64; 12978 __m128i res; 12979 res = _mm_packs_epi32(a, a); 12980 return64(res); 12981 } 12982 12983 _NEON2SSESTORAGE int32x2_t vqmovn_s64(int64x2_t a); // VQMOVN.S64 d0,q0 12984 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqmovn_s64(int64x2_t a),_NEON2SSE_REASON_SLOW_SERIAL) //no effective SIMD solution 12985 { 12986 int32x2_t res; 12987 _NEON2SSE_ALIGN_16 int64_t atmp[2]; 12988 _mm_store_si128((__m128i*)atmp, a); 12989 if(atmp[0]>SINT_MAX) atmp[0] = SINT_MAX; 12990 if(atmp[0]<SINT_MIN) atmp[0] = SINT_MIN; 12991 if(atmp[1]>SINT_MAX) atmp[1] = SINT_MAX; 12992 if(atmp[1]<SINT_MIN) atmp[1] = SINT_MIN; 12993 res.m64_i32[0] = (int32_t)atmp[0]; 12994 res.m64_i32[1] = (int32_t)atmp[1]; 12995 return res; 12996 } 12997 12998 _NEON2SSESTORAGE uint8x8_t vqmovn_u16(uint16x8_t a); // VQMOVN.s16 d0,q0 12999 _NEON2SSE_INLINE uint8x8_t vqmovn_u16(uint16x8_t a) // VQMOVN.s16 d0,q0 13000 { 13001 //no uint16 to uint8 conversion in SSE, need truncate to max signed first. Also trying to avoid _mm_shuffle_epi8 because of its big latency for old Atom CPUs 13002 uint8x8_t res64; 13003 __m128i c7fff, a_trunc, mask_trunc; 13004 c7fff = _mm_set1_epi16 (0x7fff); // 15-th bit set to zero 13005 a_trunc = _mm_and_si128(a, c7fff); // a truncated to max signed 13006 mask_trunc = _mm_cmpgt_epi16(a_trunc, a); //if after the shift we have bigger value than before then the 15-th bit had been set initially. 
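//worked example for one lane (illustrative): a = 0x8001 -> a_trunc = 0x0001; as signed 16-bit values
//0x0001 > 0x8001, so the mask below becomes 0xffff, a_trunc is forced up to 0x7fff and the final
//_mm_packus_epi16 saturates it to 0xff, exactly as NEON VQMOVN.U16 would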
    mask_trunc = _mm_and_si128(mask_trunc, c7fff); //zero or c7fff if the 15-th bit had been set initially
    a_trunc = _mm_or_si128(a_trunc, mask_trunc);
    a_trunc = _mm_packus_epi16 (a_trunc, a_trunc); //use low 64bits only
    return64(a_trunc);
}

_NEON2SSESTORAGE uint16x4_t vqmovn_u32(uint32x4_t a); // VQMOVN.U32 d0,q0
_NEON2SSE_INLINE uint16x4_t vqmovn_u32(uint32x4_t a) // VQMOVN.U32 d0,q0
{
#ifdef USE_SSE4
    //no uint32 to uint16 conversion in SSE, need truncate to max signed first
    uint16x4_t res64;
    __m128i c7fffffff, a_trunc, mask_trunc;
    c7fffffff = _mm_set1_epi32((uint32_t)0x7fffffff); // 31-st bit set to zero
    a_trunc = _mm_and_si128(a, c7fffffff); // a truncated to max signed
    mask_trunc = _mm_cmpgt_epi32(a_trunc, a); //if the truncated value is bigger than the original (as signed) then the 31-st bit had been set initially
    mask_trunc = _mm_and_si128(mask_trunc, c7fffffff); //zero or c7fffffff if the 31-st bit had been set initially
    a_trunc = _mm_or_si128(a_trunc, mask_trunc);
    a_trunc = _MM_PACKUS1_EPI32 (a_trunc); //use low 64bits only
    return64(a_trunc);
#else
    uint16x4_t res64;
    __m128i res_hi, mask;
    mask = _mm_setzero_si128();
    res_hi = _mm_srli_epi32(a, 16);
    res_hi = _mm_cmpeq_epi16(res_hi, mask);
    mask = _mm_cmpeq_epi16(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get >16 bits numbers
    res_hi = _mm_or_si128(a, mask); //saturated res
    res_hi = _mm_shuffle_epi8 (res_hi, *(__m128i*) mask8_32_even_odd); //go to 16 bits
    return64(res_hi);
#endif
}

_NEON2SSESTORAGE uint32x2_t vqmovn_u64(uint64x2_t a); // VQMOVN.U64 d0,q0
_NEON2SSE_INLINE uint32x2_t vqmovn_u64(uint64x2_t a)
{
    //serial solution may be faster
    uint32x2_t res64;
    __m128i res_hi, mask;
    mask = _mm_setzero_si128();
    res_hi = _mm_srli_epi64(a, 32);
    res_hi = _mm_cmpeq_epi32(res_hi, mask);
    mask = _mm_cmpeq_epi32(mask,mask); //all fff
    mask = _mm_andnot_si128(res_hi,mask); //invert res_hi to get >32 bits numbers
    res_hi = _mm_or_si128(a, mask);
    res_hi = _mm_shuffle_epi32(res_hi, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits
    return64(res_hi);
}
//************* Vector saturating narrow integer signed->unsigned **************
//*****************************************************************************
_NEON2SSESTORAGE uint8x8_t vqmovun_s16(int16x8_t a); // VQMOVUN.S16 d0,q0
_NEON2SSE_INLINE uint8x8_t vqmovun_s16(int16x8_t a)
{
    uint8x8_t res64;
    __m128i res;
    res = _mm_packus_epi16(a, a); //use low 64bits only
    return64(res);
}

_NEON2SSESTORAGE uint16x4_t vqmovun_s32(int32x4_t a); // VQMOVUN.S32 d0,q0
_NEON2SSE_INLINE uint16x4_t vqmovun_s32(int32x4_t a)
{
    uint16x4_t res64;
    __m128i res;
    res = _MM_PACKUS1_EPI32(a); //use low 64bits only
    return64(res);
}

_NEON2SSESTORAGE uint32x2_t vqmovun_s64(int64x2_t a); // VQMOVUN.S64 d0,q0
_NEON2SSE_INLINE uint32x2_t vqmovun_s64(int64x2_t a)
{
    uint32x2_t res64;
    __m128i res_hi,res_lo, zero, cmp;
    zero = _mm_setzero_si128();
    res_hi = _mm_srli_epi64(a, 32);
    cmp = _mm_cmpgt_epi32(zero, res_hi); //if cmp<0 the result should be zero
    res_lo = _mm_andnot_si128(cmp,a); //if cmp zero - do nothing, otherwise cmp <0 and the
result is 0 13085 cmp = _mm_cmpgt_epi32(res_hi,zero); //if cmp positive 13086 res_lo = _mm_or_si128(res_lo, cmp); //if cmp positive we are out of 32bits need to saturaate to 0xffffffff 13087 res_lo = _mm_shuffle_epi32(res_lo, 0 | (2 << 2) | (1 << 4) | (3 << 6)); //shuffle the data to get 2 32-bits 13088 return64(res_lo); 13089 } 13090 13091 // ******************************************************** 13092 // **************** Table look up ************************** 13093 // ******************************************************** 13094 //VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values 13095 //in a table and generate a new vector. Indexes out of range return 0. 13096 //for Intel SIMD we need to set the MSB to 1 for zero return 13097 _NEON2SSESTORAGE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 13098 _NEON2SSE_INLINE uint8x8_t vtbl1_u8(uint8x8_t a, uint8x8_t b) 13099 { 13100 uint8x8_t res64; 13101 __m128i c7, maskgt, bmask, b128; 13102 c7 = _mm_set1_epi8 (7); 13103 b128 = _pM128i(b); 13104 maskgt = _mm_cmpgt_epi8(b128,c7); 13105 bmask = _mm_or_si128(b128,maskgt); 13106 bmask = _mm_shuffle_epi8(_pM128i(a),bmask); 13107 return64(bmask); 13108 } 13109 13110 _NEON2SSESTORAGE int8x8_t vtbl1_s8(int8x8_t a, int8x8_t b); // VTBL.8 d0, {d0}, d0 13111 #define vtbl1_s8 vtbl1_u8 13112 13113 _NEON2SSESTORAGE poly8x8_t vtbl1_p8(poly8x8_t a, uint8x8_t b); // VTBL.8 d0, {d0}, d0 13114 #define vtbl1_p8 vtbl1_u8 13115 13116 _NEON2SSESTORAGE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 13117 _NEON2SSE_INLINE uint8x8_t vtbl2_u8(uint8x8x2_t a, uint8x8_t b) 13118 { 13119 uint8x8_t res64; 13120 __m128i c15, a01, maskgt15, bmask, b128; 13121 c15 = _mm_set1_epi8 (15); 13122 b128 = _pM128i(b); 13123 maskgt15 = _mm_cmpgt_epi8(b128,c15); 13124 bmask = _mm_or_si128(b128, maskgt15); 13125 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]), _pM128i(a.val[1])); 13126 a01 = _mm_shuffle_epi8(a01, bmask); 13127 return64(a01); 13128 } 13129 13130 //int8x8_t vtbl2_s8(int8x8x2_t a, int8x8_t b); // VTBL.8 d0, {d0, d1}, d0 13131 #define vtbl2_s8 vtbl2_u8 13132 13133 //poly8x8_t vtbl2_p8(poly8x8x2_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1}, d0 13134 #define vtbl2_p8 vtbl2_u8 13135 13136 _NEON2SSESTORAGE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 13137 _NEON2SSE_INLINE uint8x8_t vtbl3_u8(uint8x8x3_t a, uint8x8_t b) 13138 { 13139 //solution may be not optimal 13140 uint8x8_t res64; 13141 __m128i c15, c23, maskgt23, bmask, maskgt15, sh0, sh1, a01, b128; 13142 c15 = _mm_set1_epi8 (15); 13143 c23 = _mm_set1_epi8 (23); 13144 b128 = _pM128i(b); 13145 maskgt23 = _mm_cmpgt_epi8(b128,c23); 13146 bmask = _mm_or_si128(b128, maskgt23); 13147 maskgt15 = _mm_cmpgt_epi8(b128,c15); 13148 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1])); 13149 sh0 = _mm_shuffle_epi8(a01, bmask); 13150 sh1 = _mm_shuffle_epi8(_pM128i(a.val[2]), bmask); //for bi>15 bi is wrapped (bi-=15) 13151 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); //SSE4.1 13152 return64(sh0); 13153 } 13154 13155 _NEON2SSESTORAGE int8x8_t vtbl3_s8(int8x8x3_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 13156 #define vtbl3_s8 vtbl3_u8 13157 13158 _NEON2SSESTORAGE poly8x8_t vtbl3_p8(poly8x8x3_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2}, d0 13159 #define vtbl3_p8 vtbl3_u8 13160 13161 _NEON2SSESTORAGE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 13162 _NEON2SSE_INLINE uint8x8_t vtbl4_u8(uint8x8x4_t a, uint8x8_t b) 13163 { 13164 
//solution may be not optimal 13165 uint8x8_t res64; 13166 __m128i c15, c31, maskgt31, bmask, maskgt15, sh0, sh1, a01, a23, b128; 13167 c15 = _mm_set1_epi8 (15); 13168 c31 = _mm_set1_epi8 (31); 13169 b128 = _pM128i(b); 13170 maskgt31 = _mm_cmpgt_epi8(b128,c31); 13171 bmask = _mm_or_si128(b128, maskgt31); 13172 maskgt15 = _mm_cmpgt_epi8(b128,c15); 13173 a01 = _mm_unpacklo_epi64(_pM128i(a.val[0]),_pM128i(a.val[1])); 13174 a23 = _mm_unpacklo_epi64(_pM128i(a.val[2]),_pM128i(a.val[3])); 13175 sh0 = _mm_shuffle_epi8(a01, bmask); 13176 sh1 = _mm_shuffle_epi8(a23, bmask); //for bi>15 bi is wrapped (bi-=15) 13177 sh0 = _MM_BLENDV_EPI8 (sh0, sh1, maskgt15); //SSE4.1 13178 return64(sh0); 13179 } 13180 13181 _NEON2SSESTORAGE int8x8_t vtbl4_s8(int8x8x4_t a, int8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 13182 #define vtbl4_s8 vtbl4_u8 13183 13184 _NEON2SSESTORAGE poly8x8_t vtbl4_p8(poly8x8x4_t a, uint8x8_t b); // VTBL.8 d0, {d0, d1, d2, d3}, d0 13185 #define vtbl4_p8 vtbl4_u8 13186 13187 //****************** Extended table look up intrinsics *************************** 13188 //********************************************************************************** 13189 //VTBX (Vector Table Extension) works in the same way as VTBL do, 13190 // except that indexes out of range leave the destination element unchanged. 13191 13192 _NEON2SSESTORAGE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 13193 _NEON2SSE_INLINE uint8x8_t vtbx1_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c) 13194 { 13195 uint8x8_t res64; 13196 __m128i c7, maskgt, sh, c128; 13197 c7 = _mm_set1_epi8 (7); 13198 c128 = _pM128i(c); 13199 maskgt = _mm_cmpgt_epi8(c128,c7); 13200 c7 = _mm_and_si128(maskgt,_pM128i(a)); 13201 sh = _mm_shuffle_epi8(_pM128i(b),c128); 13202 sh = _mm_andnot_si128(maskgt,sh); 13203 sh = _mm_or_si128(sh,c7); 13204 return64(sh); 13205 } 13206 13207 _NEON2SSESTORAGE int8x8_t vtbx1_s8(int8x8_t a, int8x8_t b, int8x8_t c); // VTBX.8 d0, {d0}, d0 13208 #define vtbx1_s8 vtbx1_u8 13209 13210 _NEON2SSESTORAGE poly8x8_t vtbx1_p8(poly8x8_t a, poly8x8_t b, uint8x8_t c); // VTBX.8 d0, {d0}, d0 13211 #define vtbx1_p8 vtbx1_u8 13212 13213 _NEON2SSESTORAGE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 13214 _NEON2SSE_INLINE uint8x8_t vtbx2_u8(uint8x8_t a, uint8x8x2_t b, uint8x8_t c) 13215 { 13216 uint8x8_t res64; 13217 __m128i c15, b01, maskgt15, sh, c128; 13218 c15 = _mm_set1_epi8 (15); 13219 c128 = _pM128i(c); 13220 maskgt15 = _mm_cmpgt_epi8(c128, c15); 13221 c15 = _mm_and_si128(maskgt15, _pM128i(a)); 13222 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]), _pM128i(b.val[1])); 13223 sh = _mm_shuffle_epi8(b01, c128); 13224 sh = _mm_andnot_si128(maskgt15, sh); 13225 sh = _mm_or_si128(sh,c15); 13226 return64(sh); 13227 } 13228 13229 //int8x8_t vtbx2_s8(int8x8_t a, int8x8x2_t b, int8x8_t c); // VTBX.8 d0, {d0, d1}, d0 13230 #define vtbx2_s8 vtbx2_u8 13231 13232 //poly8x8_t vtbx2_p8(poly8x8_t a, poly8x8x2_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1}, d0 13233 #define vtbx2_p8 vtbx2_u8 13234 13235 _NEON2SSESTORAGE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 13236 _NEON2SSE_INLINE uint8x8_t vtbx3_u8(uint8x8_t a, uint8x8x3_t b, uint8x8_t c) 13237 { 13238 //solution may be not optimal 13239 uint8x8_t res64; 13240 __m128i c15, c23, maskgt15, maskgt23, sh0, sh1, b01, c128; 13241 c15 = _mm_set1_epi8 (15); 13242 c23 = _mm_set1_epi8 (23); 13243 c128 = _pM128i(c); 13244 maskgt15 = _mm_cmpgt_epi8(c128,c15); 13245 maskgt23 = 
_mm_cmpgt_epi8(c128,c23); 13246 c23 = _mm_and_si128(maskgt23, _pM128i(a)); 13247 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1])); 13248 sh0 = _mm_shuffle_epi8(b01, c128); 13249 sh1 = _mm_shuffle_epi8(_pM128i(b.val[2]), c128); //for bi>15 bi is wrapped (bi-=15) 13250 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); 13251 sh0 = _mm_andnot_si128(maskgt23,sh0); 13252 sh0 = _mm_or_si128(sh0,c23); 13253 return64(sh0); 13254 } 13255 13256 _NEON2SSESTORAGE int8x8_t vtbx3_s8(int8x8_t a, int8x8x3_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 13257 #define vtbx3_s8 vtbx3_u8 13258 13259 _NEON2SSESTORAGE poly8x8_t vtbx3_p8(poly8x8_t a, poly8x8x3_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2}, d0 13260 #define vtbx3_p8 vtbx3_u8 13261 13262 _NEON2SSESTORAGE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 13263 _NEON2SSE_INLINE uint8x8_t vtbx4_u8(uint8x8_t a, uint8x8x4_t b, uint8x8_t c) 13264 { 13265 //solution may be not optimal 13266 uint8x8_t res64; 13267 __m128i c15, c31, maskgt15, maskgt31, sh0, sh1, b01, b23, c128; 13268 c15 = _mm_set1_epi8 (15); 13269 c31 = _mm_set1_epi8 (31); 13270 c128 = _pM128i(c); 13271 maskgt15 = _mm_cmpgt_epi8(c128,c15); 13272 maskgt31 = _mm_cmpgt_epi8(c128,c31); 13273 c31 = _mm_and_si128(maskgt31, _pM128i(a)); 13274 13275 b01 = _mm_unpacklo_epi64(_pM128i(b.val[0]),_pM128i(b.val[1])); 13276 b23 = _mm_unpacklo_epi64(_pM128i(b.val[2]),_pM128i(b.val[3])); 13277 sh0 = _mm_shuffle_epi8(b01, c128); 13278 sh1 = _mm_shuffle_epi8(b23, c128); //for bi>15 bi is wrapped (bi-=15) 13279 sh0 = _MM_BLENDV_EPI8(sh0, sh1, maskgt15); 13280 sh0 = _mm_andnot_si128(maskgt31,sh0); 13281 sh0 = _mm_or_si128(sh0,c31); 13282 return64(sh0); 13283 } 13284 13285 _NEON2SSESTORAGE int8x8_t vtbx4_s8(int8x8_t a, int8x8x4_t b, int8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 13286 #define vtbx4_s8 vtbx4_u8 13287 13288 _NEON2SSESTORAGE poly8x8_t vtbx4_p8(poly8x8_t a, poly8x8x4_t b, uint8x8_t c); // VTBX.8 d0, {d0, d1, d2, d3}, d0 13289 #define vtbx4_p8 vtbx4_u8 13290 13291 //************************************************************************************************* 13292 // *************************** Operations with a scalar value ********************************* 13293 //************************************************************************************************* 13294 13295 //******* Vector multiply accumulate by scalar ************************************************* 13296 //********************************************************************************************** 13297 _NEON2SSESTORAGE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 d0, d0, d0[0] 13298 _NEON2SSE_INLINE int16x4_t vmla_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 d0, d0, d0[0] 13299 { 13300 int16_t c; 13301 int16x4_t scalar; 13302 c = vget_lane_s16(v, l); 13303 scalar = vdup_n_s16(c); 13304 return vmla_s16(a, b, scalar); 13305 } 13306 13307 _NEON2SSESTORAGE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] 13308 _NEON2SSE_INLINE int32x2_t vmla_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 d0, d0, d0[0] 13309 { 13310 int32_t c; 13311 int32x2_t scalar; 13312 c = vget_lane_s32(v, l); 13313 scalar = vdup_n_s32(c); 13314 return vmla_s32(a, b, scalar); 13315 } 13316 13317 _NEON2SSESTORAGE uint16x4_t vmla_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // 
VMLA.I16 d0, d0, d0[0] 13318 #define vmla_lane_u16 vmla_lane_s16 13319 13320 13321 _NEON2SSESTORAGE uint32x2_t vmla_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 d0, d0, d0[0] 13322 #define vmla_lane_u32 vmla_lane_s32 13323 13324 _NEON2SSESTORAGE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 d0, d0, d0[0] 13325 _NEON2SSE_INLINE float32x2_t vmla_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) 13326 { 13327 float32_t vlane; 13328 float32x2_t c; 13329 vlane = vget_lane_f32(v, l); 13330 c = vdup_n_f32(vlane); 13331 return vmla_f32(a,b,c); 13332 } 13333 13334 _NEON2SSESTORAGE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] 13335 _NEON2SSE_INLINE int16x8_t vmlaq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] 13336 { 13337 int16_t vlane; 13338 int16x8_t c; 13339 vlane = vget_lane_s16(v, l); 13340 c = vdupq_n_s16(vlane); 13341 return vmlaq_s16(a,b,c); 13342 } 13343 13344 _NEON2SSESTORAGE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] 13345 _NEON2SSE_INLINE int32x4_t vmlaq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] 13346 { 13347 int32_t vlane; 13348 int32x4_t c; 13349 vlane = vget_lane_s32(v, l); 13350 c = vdupq_n_s32(vlane); 13351 return vmlaq_s32(a,b,c); 13352 } 13353 13354 _NEON2SSESTORAGE uint16x8_t vmlaq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] 13355 #define vmlaq_lane_u16 vmlaq_lane_s16 13356 13357 _NEON2SSESTORAGE uint32x4_t vmlaq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] 13358 #define vmlaq_lane_u32 vmlaq_lane_s32 13359 13360 _NEON2SSESTORAGE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] 13361 _NEON2SSE_INLINE float32x4_t vmlaq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] 13362 { 13363 float32_t vlane; 13364 float32x4_t c; 13365 vlane = vget_lane_f32(v, l); 13366 c = vdupq_n_f32(vlane); 13367 return vmlaq_f32(a,b,c); 13368 } 13369 13370 //***************** Vector widening multiply accumulate by scalar ********************** 13371 //*************************************************************************************** 13372 _NEON2SSESTORAGE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] 13373 _NEON2SSE_INLINE int32x4_t vmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] 13374 { 13375 int16_t vlane; 13376 int16x4_t c; 13377 vlane = vget_lane_s16(v, l); 13378 c = vdup_n_s16(vlane); 13379 return vmlal_s16(a, b, c); 13380 } 13381 13382 _NEON2SSESTORAGE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLAL.S32 q0, d0, d0[0] 13383 _NEON2SSE_INLINE int64x2_t vmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] 13384 { 13385 int32_t vlane; 13386 int32x2_t c; 13387 vlane = vget_lane_s32(v, l); 13388 c = vdup_n_s32(vlane); 13389 return vmlal_s32(a, b, c); 13390 } 13391 13392 _NEON2SSESTORAGE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, 
uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] 13393 _NEON2SSE_INLINE uint32x4_t vmlal_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] 13394 { 13395 uint16_t vlane; 13396 uint16x4_t c; 13397 vlane = vget_lane_u16(v, l); 13398 c = vdup_n_u16(vlane); 13399 return vmlal_u16(a, b, c); 13400 } 13401 13402 _NEON2SSESTORAGE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] 13403 _NEON2SSE_INLINE uint64x2_t vmlal_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] 13404 { 13405 uint32_t vlane; 13406 uint32x2_t c; 13407 vlane = vget_lane_u32(v, l); 13408 c = vdup_n_u32(vlane); 13409 return vmlal_u32(a, b, c); 13410 } 13411 13412 // ******** Vector widening saturating doubling multiply accumulate by scalar ******************************* 13413 // ************************************************************************************************ 13414 _NEON2SSESTORAGE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLAL.S16 q0, d0, d0[0] 13415 _NEON2SSE_INLINE int32x4_t vqdmlal_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) 13416 { 13417 int16_t vlane; 13418 int16x4_t c; 13419 vlane = vget_lane_s16(v, l); 13420 c = vdup_n_s16(vlane); 13421 return vqdmlal_s16(a, b, c); 13422 } 13423 13424 _NEON2SSESTORAGE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLAL.S32 q0, d0, d0[0] 13425 _NEON2SSE_INLINE int64x2_t vqdmlal_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) 13426 { 13427 int32_t vlane; 13428 uint32x2_t c; 13429 vlane = vget_lane_s32(v, l); 13430 c = vdup_n_s32(vlane); 13431 return vqdmlal_s32(a, b, c); 13432 } 13433 13434 // ****** Vector multiply subtract by scalar ***************** 13435 // ************************************************************* 13436 _NEON2SSESTORAGE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] 13437 _NEON2SSE_INLINE int16x4_t vmls_lane_s16(int16x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] 13438 { 13439 int16_t vlane; 13440 int16x4_t c; 13441 vlane = vget_lane_s16(v, l); 13442 c = vdup_n_s16(vlane); 13443 return vmls_s16(a, b, c); 13444 } 13445 13446 _NEON2SSESTORAGE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] 13447 _NEON2SSE_INLINE int32x2_t vmls_lane_s32(int32x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] 13448 { 13449 int32_t vlane; 13450 int32x2_t c; 13451 vlane = vget_lane_s32(v, l); 13452 c = vdup_n_s32(vlane); 13453 return vmls_s32(a, b, c); 13454 } 13455 13456 _NEON2SSESTORAGE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLS.I16 d0, d0, d0[0] 13457 _NEON2SSE_INLINE uint16x4_t vmls_lane_u16(uint16x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLS.I16 d0, d0, d0[0] 13458 { 13459 uint16_t vlane; 13460 uint16x4_t c; 13461 vlane = vget_lane_s16(v, l); 13462 c = vdup_n_s16(vlane); 13463 return vmls_s16(a, b, c); 13464 } 13465 13466 _NEON2SSESTORAGE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLS.I32 d0, d0, d0[0] 13467 _NEON2SSE_INLINE uint32x2_t vmls_lane_u32(uint32x2_t a, uint32x2_t b, 
uint32x2_t v, __constrange(0,1) int l) // VMLS.I32 d0, d0, d0[0] 13468 { 13469 uint32_t vlane; 13470 uint32x2_t c; 13471 vlane = vget_lane_u32(v, l); 13472 c = vdup_n_u32(vlane); 13473 return vmls_u32(a, b, c); 13474 } 13475 13476 _NEON2SSESTORAGE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l); // VMLS.F32 d0, d0, d0[0] 13477 _NEON2SSE_INLINE float32x2_t vmls_lane_f32(float32x2_t a, float32x2_t b, float32x2_t v, __constrange(0,1) int l) 13478 { 13479 float32_t vlane; 13480 float32x2_t c; 13481 vlane = (float) vget_lane_f32(v, l); 13482 c = vdup_n_f32(vlane); 13483 return vmls_f32(a,b,c); 13484 } 13485 13486 _NEON2SSESTORAGE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l); // VMLS.I16 q0, q0, d0[0] 13487 _NEON2SSE_INLINE int16x8_t vmlsq_lane_s16(int16x8_t a, int16x8_t b, int16x4_t v, __constrange(0,3) int l) // VMLS.I16 q0, q0, d0[0] 13488 { 13489 int16_t vlane; 13490 int16x8_t c; 13491 vlane = vget_lane_s16(v, l); 13492 c = vdupq_n_s16(vlane); 13493 return vmlsq_s16(a, b,c); 13494 } 13495 13496 _NEON2SSESTORAGE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l); // VMLS.I32 q0, q0, d0[0] 13497 _NEON2SSE_INLINE int32x4_t vmlsq_lane_s32(int32x4_t a, int32x4_t b, int32x2_t v, __constrange(0,1) int l) // VMLS.I32 q0, q0, d0[0] 13498 { 13499 int32_t vlane; 13500 int32x4_t c; 13501 vlane = vget_lane_s32(v, l); 13502 c = vdupq_n_s32(vlane); 13503 return vmlsq_s32(a,b,c); 13504 } 13505 13506 _NEON2SSESTORAGE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l); // VMLA.I16 q0, q0, d0[0] 13507 _NEON2SSE_INLINE uint16x8_t vmlsq_lane_u16(uint16x8_t a, uint16x8_t b, uint16x4_t v, __constrange(0,3) int l) // VMLA.I16 q0, q0, d0[0] 13508 { 13509 uint16_t vlane; 13510 uint16x8_t c; 13511 vlane = vget_lane_u16(v, l); 13512 c = vdupq_n_u16(vlane); 13513 return vmlsq_u16(a,b,c); 13514 } 13515 13516 _NEON2SSESTORAGE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l); // VMLA.I32 q0, q0, d0[0] 13517 _NEON2SSE_INLINE uint32x4_t vmlsq_lane_u32(uint32x4_t a, uint32x4_t b, uint32x2_t v, __constrange(0,1) int l) // VMLA.I32 q0, q0, d0[0] 13518 { 13519 uint32_t vlane; 13520 uint32x4_t c; 13521 vlane = vget_lane_u32(v, l); 13522 c = vdupq_n_u32(vlane); 13523 return vmlsq_u32(a,b,c); 13524 } 13525 13526 _NEON2SSESTORAGE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l); // VMLA.F32 q0, q0, d0[0] 13527 _NEON2SSE_INLINE float32x4_t vmlsq_lane_f32(float32x4_t a, float32x4_t b, float32x2_t v, __constrange(0,1) int l) // VMLA.F32 q0, q0, d0[0] 13528 { 13529 float32_t vlane; 13530 float32x4_t c; 13531 vlane = (float) vget_lane_f32(v, l); 13532 c = vdupq_n_f32(vlane); 13533 return vmlsq_f32(a,b,c); 13534 } 13535 13536 // **** Vector widening multiply subtract by scalar **** 13537 // **************************************************** 13538 _NEON2SSESTORAGE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VMLAL.S16 q0, d0, d0[0] 13539 _NEON2SSE_INLINE int32x4_t vmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) // VMLAL.S16 q0, d0, d0[0] 13540 { 13541 int16_t vlane; 13542 int16x4_t c; 13543 vlane = vget_lane_s16(v, l); 13544 c = vdup_n_s16(vlane); 13545 return vmlsl_s16(a, b, c); 13546 } 13547 13548 _NEON2SSESTORAGE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int 
l); // VMLAL.S32 q0, d0, d0[0] 13549 _NEON2SSE_INLINE int64x2_t vmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l) // VMLAL.S32 q0, d0, d0[0] 13550 { 13551 int32_t vlane; 13552 int32x2_t c; 13553 vlane = vget_lane_s32(v, l); 13554 c = vdup_n_s32(vlane); 13555 return vmlsl_s32(a, b, c); 13556 } 13557 13558 _NEON2SSESTORAGE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l); // VMLAL.s16 q0, d0, d0[0] 13559 _NEON2SSE_INLINE uint32x4_t vmlsl_lane_u16(uint32x4_t a, uint16x4_t b, uint16x4_t v, __constrange(0,3) int l) // VMLAL.s16 q0, d0, d0[0] 13560 { 13561 uint16_t vlane; 13562 uint16x4_t c; 13563 vlane = vget_lane_s16(v, l); 13564 c = vdup_n_s16(vlane); 13565 return vmlsl_s16(a, b, c); 13566 } 13567 13568 _NEON2SSESTORAGE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l); // VMLAL.U32 q0, d0, d0[0] 13569 _NEON2SSE_INLINE uint64x2_t vmlsl_lane_u32(uint64x2_t a, uint32x2_t b, uint32x2_t v, __constrange(0,1) int l) // VMLAL.U32 q0, d0, d0[0] 13570 { 13571 uint32_t vlane; 13572 uint32x2_t c; 13573 vlane = vget_lane_u32(v, l); 13574 c = vdup_n_u32(vlane); 13575 return vmlsl_u32(a, b, c); 13576 } 13577 13578 //********* Vector widening saturating doubling multiply subtract by scalar ************************** 13579 //****************************************************************************************************** 13580 _NEON2SSESTORAGE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l); // VQDMLSL.S16 q0, d0, d0[0] 13581 _NEON2SSE_INLINE int32x4_t vqdmlsl_lane_s16(int32x4_t a, int16x4_t b, int16x4_t v, __constrange(0,3) int l) 13582 { 13583 int16_t vlane; 13584 int16x4_t c; 13585 vlane = vget_lane_s16(v, l); 13586 c = vdup_n_s16(vlane); 13587 return vqdmlsl_s16(a, b, c); 13588 } 13589 13590 _NEON2SSESTORAGE int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l); // VQDMLSL.S32 q0, d0, d0[0] 13591 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_lane_s32(int64x2_t a, int32x2_t b, int32x2_t v, __constrange(0,1) int l), _NEON2SSE_REASON_SLOW_SERIAL) 13592 { 13593 int32_t vlane; 13594 int32x2_t c; 13595 vlane = vget_lane_s32(v, l); 13596 c = vdup_n_s32(vlane); 13597 return vqdmlsl_s32(a, b, c); 13598 } 13599 //********** Vector multiply with scalar ***************************** 13600 _NEON2SSESTORAGE int16x4_t vmul_n_s16(int16x4_t a, int16_t b); // VMUL.I16 d0,d0,d0[0] 13601 _NEON2SSE_INLINE int16x4_t vmul_n_s16(int16x4_t a, int16_t b) // VMUL.I16 d0,d0,d0[0] 13602 { 13603 int16x4_t b16x4; 13604 b16x4 = vdup_n_s16(b); 13605 return vmul_s16(a, b16x4); 13606 } 13607 13608 _NEON2SSESTORAGE int32x2_t vmul_n_s32(int32x2_t a, int32_t b); // VMUL.I32 d0,d0,d0[0] 13609 _NEON2SSE_INLINE int32x2_t vmul_n_s32(int32x2_t a, int32_t b) // VMUL.I32 d0,d0,d0[0] 13610 { 13611 //serial solution looks faster 13612 int32x2_t b32x2; 13613 b32x2 = vdup_n_s32(b); 13614 return vmul_s32(a, b32x2); 13615 } 13616 13617 _NEON2SSESTORAGE float32x2_t vmul_n_f32(float32x2_t a, float32_t b); // VMUL.F32 d0,d0,d0[0] 13618 _NEON2SSE_INLINE float32x2_t vmul_n_f32(float32x2_t a, float32_t b) // VMUL.F32 d0,d0,d0[0] 13619 { 13620 float32x2_t b32x2; 13621 b32x2 = vdup_n_f32(b); 13622 return vmul_f32(a, b32x2); 13623 } 13624 13625 _NEON2SSESTORAGE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b); // VMUL.I16 d0,d0,d0[0] 13626 _NEON2SSE_INLINE uint16x4_t vmul_n_u16(uint16x4_t a, uint16_t b) // VMUL.I16 d0,d0,d0[0] 13627 { 13628 
uint16x4_t b16x4; 13629 b16x4 = vdup_n_s16(b); 13630 return vmul_s16(a, b16x4); 13631 } 13632 13633 _NEON2SSESTORAGE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b); // VMUL.I32 d0,d0,d0[0] 13634 _NEON2SSE_INLINE uint32x2_t vmul_n_u32(uint32x2_t a, uint32_t b) // VMUL.I32 d0,d0,d0[0] 13635 { 13636 //serial solution looks faster 13637 uint32x2_t b32x2; 13638 b32x2 = vdup_n_u32(b); 13639 return vmul_u32(a, b32x2); 13640 } 13641 13642 _NEON2SSESTORAGE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] 13643 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0] 13644 { 13645 int16x8_t b16x8; 13646 b16x8 = vdupq_n_s16(b); 13647 return vmulq_s16(a, b16x8); 13648 } 13649 13650 _NEON2SSESTORAGE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] 13651 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0] 13652 { 13653 int32x4_t b32x4; 13654 b32x4 = vdupq_n_s32(b); 13655 return vmulq_s32(a, b32x4); 13656 } 13657 13658 _NEON2SSESTORAGE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] 13659 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0] 13660 { 13661 float32x4_t b32x4; 13662 b32x4 = vdupq_n_f32(b); 13663 return vmulq_f32(a, b32x4); 13664 } 13665 13666 _NEON2SSESTORAGE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] 13667 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0] 13668 { 13669 uint16x8_t b16x8; 13670 b16x8 = vdupq_n_s16(b); 13671 return vmulq_s16(a, b16x8); 13672 } 13673 13674 _NEON2SSESTORAGE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] 13675 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0] 13676 { 13677 uint32x4_t b32x4; 13678 b32x4 = vdupq_n_u32(b); 13679 return vmulq_u32(a, b32x4); 13680 } 13681 13682 //********** Vector multiply lane ***************************** 13683 _NEON2SSESTORAGE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c); 13684 _NEON2SSE_INLINE int16x4_t vmul_lane_s16 (int16x4_t a, int16x4_t b, __constrange(0,3) int c) 13685 { 13686 int16x4_t b16x4; 13687 int16_t vlane; 13688 vlane = vget_lane_s16(b, c); 13689 b16x4 = vdup_n_s16(vlane); 13690 return vmul_s16(a, b16x4); 13691 } 13692 13693 _NEON2SSESTORAGE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c); 13694 _NEON2SSE_INLINE int32x2_t vmul_lane_s32 (int32x2_t a, int32x2_t b, __constrange(0,1) int c) 13695 { 13696 int32x2_t b32x2; 13697 int32_t vlane; 13698 vlane = vget_lane_s32(b, c); 13699 b32x2 = vdup_n_s32(vlane); 13700 return vmul_s32(a, b32x2); 13701 } 13702 13703 _NEON2SSESTORAGE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c); 13704 _NEON2SSE_INLINE float32x2_t vmul_lane_f32 (float32x2_t a, float32x2_t b, __constrange(0,1) int c) 13705 { 13706 float32x2_t b32x2; 13707 float32_t vlane; 13708 vlane = vget_lane_f32(b, c); 13709 b32x2 = vdup_n_f32(vlane); 13710 return vmul_f32(a, b32x2); 13711 } 13712 13713 _NEON2SSESTORAGE uint16x4_t vmul_lane_u16 (uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); 13714 #define vmul_lane_u16 vmul_lane_s16 13715 13716 _NEON2SSESTORAGE uint32x2_t vmul_lane_u32 (uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); 13717 #define vmul_lane_u32 vmul_lane_s32 13718 13719 _NEON2SSESTORAGE int16x8_t vmulq_lane_s16(int16x8_t a, int16x4_t b, __constrange(0,3) int c); 13720 _NEON2SSE_INLINE 
int16x8_t vmulq_lane_s16 (int16x8_t a, int16x4_t b, __constrange(0,3) int c) 13721 { 13722 int16x8_t b16x8; 13723 int16_t vlane; 13724 vlane = vget_lane_s16(b, c); 13725 b16x8 = vdupq_n_s16(vlane); 13726 return vmulq_s16(a, b16x8); 13727 } 13728 13729 _NEON2SSESTORAGE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c); 13730 _NEON2SSE_INLINE int32x4_t vmulq_lane_s32 (int32x4_t a, int32x2_t b, __constrange(0,1) int c) 13731 { 13732 int32x4_t b32x4; 13733 int32_t vlane; 13734 vlane = vget_lane_s32(b, c); 13735 b32x4 = vdupq_n_s32(vlane); 13736 return vmulq_s32(a, b32x4); 13737 } 13738 13739 _NEON2SSESTORAGE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c); 13740 _NEON2SSE_INLINE float32x4_t vmulq_lane_f32 (float32x4_t a, float32x2_t b, __constrange(0,1) int c) 13741 { 13742 float32x4_t b32x4; 13743 float32_t vlane; 13744 vlane = vget_lane_f32(b, c); 13745 b32x4 = vdupq_n_f32(vlane); 13746 return vmulq_f32(a, b32x4); 13747 } 13748 13749 _NEON2SSESTORAGE uint16x8_t vmulq_lane_u16 (uint16x8_t a, uint16x4_t b, __constrange(0,3) int c); 13750 #define vmulq_lane_u16 vmulq_lane_s16 13751 13752 _NEON2SSESTORAGE uint32x4_t vmulq_lane_u32 (uint32x4_t a, uint32x2_t b, __constrange(0,1) int c); 13753 #define vmulq_lane_u32 vmulq_lane_s32 13754 13755 //**** Vector long multiply with scalar ************ 13756 _NEON2SSESTORAGE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2); // VMULL.S16 q0,d0,d0[0] 13757 _NEON2SSE_INLINE int32x4_t vmull_n_s16(int16x4_t vec1, int16_t val2) // VMULL.S16 q0,d0,d0[0] 13758 { 13759 int16x4_t b16x4; 13760 b16x4 = vdup_n_s16(val2); 13761 return vmull_s16(vec1, b16x4); 13762 } 13763 13764 _NEON2SSESTORAGE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2); // VMULL.S32 q0,d0,d0[0] 13765 _NEON2SSE_INLINE int64x2_t vmull_n_s32(int32x2_t vec1, int32_t val2) // VMULL.S32 q0,d0,d0[0] 13766 { 13767 int32x2_t b32x2; 13768 b32x2 = vdup_n_s32(val2); 13769 return vmull_s32(vec1, b32x2); 13770 } 13771 13772 _NEON2SSESTORAGE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2); // VMULL.s16 q0,d0,d0[0] 13773 _NEON2SSE_INLINE uint32x4_t vmull_n_u16(uint16x4_t vec1, uint16_t val2) // VMULL.s16 q0,d0,d0[0] 13774 { 13775 uint16x4_t b16x4; 13776 b16x4 = vdup_n_s16(val2); 13777 return vmull_s16(vec1, b16x4); 13778 } 13779 13780 _NEON2SSESTORAGE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2); // VMULL.U32 q0,d0,d0[0] 13781 _NEON2SSE_INLINE uint64x2_t vmull_n_u32(uint32x2_t vec1, uint32_t val2) // VMULL.U32 q0,d0,d0[0] 13782 { 13783 uint32x2_t b32x2; 13784 b32x2 = vdup_n_u32(val2); 13785 return vmull_u32(vec1, b32x2); 13786 } 13787 13788 //**** Vector long multiply by scalar **** 13789 _NEON2SSESTORAGE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VMULL.S16 q0,d0,d0[0] 13790 _NEON2SSE_INLINE int32x4_t vmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VMULL.S16 q0,d0,d0[0] 13791 { 13792 int16_t vlane; 13793 int16x4_t b; 13794 vlane = vget_lane_s16(val2, val3); 13795 b = vdup_n_s16(vlane); 13796 return vmull_s16(vec1, b); 13797 } 13798 13799 _NEON2SSESTORAGE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VMULL.S32 q0,d0,d0[0] 13800 _NEON2SSE_INLINE int64x2_t vmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3) // VMULL.S32 q0,d0,d0[0] 13801 { 13802 int32_t vlane; 13803 int32x2_t b; 13804 vlane = vget_lane_s32(val2, val3); 13805 b = vdup_n_s32(vlane); 13806 return vmull_s32(vec1, b); 13807 } 
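//A minimal usage sketch for the lane/scalar multiply wrappers above (illustrative only, not part of
//the NEON API): the helper name and the choice of coefficient lane 0 are hypothetical.
//On ARM the call compiles to VMULL.S16 q0,d0,d0[0]; with this header it expands to the
//vdup_n_s16 + vmull_s16 SSE sequence defined above.
#if 0 //example code, excluded from compilation
_NEON2SSE_INLINE int32x4_t scale_by_coeff0_example(int16x4_t samples, int16x4_t coeffs)
{
    return vmull_lane_s16(samples, coeffs, 0); //widen 4 signed 16-bit samples while multiplying by coeffs[0]
}
#endif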
13808 13809 _NEON2SSESTORAGE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3); // VMULL.s16 q0,d0,d0[0] 13810 _NEON2SSE_INLINE uint32x4_t vmull_lane_u16(uint16x4_t vec1, uint16x4_t val2, __constrange(0, 3) int val3) // VMULL.s16 q0,d0,d0[0] 13811 { 13812 uint16_t vlane; 13813 uint16x4_t b; 13814 vlane = vget_lane_s16(val2, val3); 13815 b = vdup_n_s16(vlane); 13816 return vmull_s16(vec1, b); 13817 } 13818 13819 _NEON2SSESTORAGE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3); // VMULL.U32 q0,d0,d0[0] 13820 _NEON2SSE_INLINE uint64x2_t vmull_lane_u32(uint32x2_t vec1, uint32x2_t val2, __constrange(0, 1) int val3) // VMULL.U32 q0,d0,d0[0] 13821 { 13822 uint32_t vlane; 13823 uint32x2_t b; 13824 vlane = vget_lane_u32(val2, val3); 13825 b = vdup_n_u32(vlane); 13826 return vmull_u32(vec1, b); 13827 } 13828 13829 //********* Vector saturating doubling long multiply with scalar ******************* 13830 _NEON2SSESTORAGE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2); // VQDMULL.S16 q0,d0,d0[0] 13831 _NEON2SSE_INLINE int32x4_t vqdmull_n_s16(int16x4_t vec1, int16_t val2) 13832 { 13833 //the serial soulution may be faster due to saturation 13834 int16x4_t b; 13835 b = vdup_n_s16(val2); 13836 return vqdmull_s16(vec1, b); 13837 } 13838 13839 _NEON2SSESTORAGE int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2); // VQDMULL.S32 q0,d0,d0[0] 13840 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_SERIAL) 13841 { 13842 int32x2_t b; 13843 b = vdup_n_s32(val2); 13844 return vqdmull_s32(vec1,b); //slow serial function!!!! 13845 } 13846 13847 //************* Vector saturating doubling long multiply by scalar *********************************************** 13848 _NEON2SSESTORAGE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULL.S16 q0,d0,d0[0] 13849 _NEON2SSE_INLINE int32x4_t vqdmull_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) 13850 { 13851 int16_t c; 13852 int16x4_t scalar; 13853 c = vget_lane_s16(val2, val3); 13854 scalar = vdup_n_s16(c); 13855 return vqdmull_s16(vec1, scalar); 13856 } 13857 13858 13859 _NEON2SSESTORAGE int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULL.S32 q0,d0,d0[0] 13860 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmull_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_SERIAL) 13861 { 13862 int32_t c; 13863 int32x2_t scalar; 13864 c = vget_lane_s32(val2, val3); 13865 scalar = vdup_n_s32(c); 13866 return vqdmull_s32(vec1,scalar); //slow serial function!!!! 
13867 } 13868 13869 // *****Vector saturating doubling multiply high with scalar ***** 13870 _NEON2SSESTORAGE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQDMULH.S16 d0,d0,d0[0] 13871 _NEON2SSE_INLINE int16x4_t vqdmulh_n_s16(int16x4_t vec1, int16_t val2) 13872 { 13873 int16x4_t res64; 13874 return64(vqdmulhq_n_s16(_pM128i(vec1), val2)); 13875 } 13876 13877 _NEON2SSESTORAGE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQDMULH.S32 d0,d0,d0[0] 13878 _NEON2SSE_INLINE int32x2_t vqdmulh_n_s32(int32x2_t vec1, int32_t val2) 13879 { 13880 int32x2_t res64; 13881 return64(vqdmulhq_n_s32(_pM128i(vec1), val2)); 13882 } 13883 13884 _NEON2SSESTORAGE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] 13885 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQDMULH.S16 q0,q0,d0[0] 13886 { 13887 //solution may be not optimal 13888 int16x8_t scalar; 13889 scalar = vdupq_n_s16(val2); 13890 return vqdmulhq_s16(vec1, scalar); 13891 } 13892 13893 _NEON2SSESTORAGE int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] 13894 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 13895 { 13896 int32x4_t scalar; 13897 scalar = vdupq_n_s32(val2); 13898 return vqdmulhq_s32(vec1, scalar); 13899 } 13900 13901 //***** Vector saturating doubling multiply high by scalar **************** 13902 _NEON2SSESTORAGE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 d0,d0,d0[0] 13903 _NEON2SSE_INLINE int16x4_t vqdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 d0,d0,d0[0] 13904 { 13905 //solution may be not optimal 13906 int16_t vlane; 13907 int16x4_t scalar; 13908 vlane = vget_lane_s16(val2, val3); 13909 scalar = vdup_n_s16(vlane); 13910 return vqdmulh_s16(vec1, scalar); 13911 } 13912 13913 _NEON2SSESTORAGE int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 d0,d0,d0[0] 13914 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 13915 { 13916 int32_t vlane; 13917 int32x2_t scalar; 13918 vlane = vget_lane_s32(val2, val3); 13919 scalar = vdup_n_s32(vlane); 13920 return vqdmulh_s32(vec1, scalar); 13921 } 13922 13923 _NEON2SSESTORAGE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQDMULH.S16 q0,q0,d0[0] 13924 _NEON2SSE_INLINE int16x8_t vqdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQDMULH.S16 q0,q0,d0[0] 13925 { 13926 //solution may be not optimal 13927 int16_t vlane; 13928 int16x8_t scalar; 13929 vlane = vget_lane_s16(val2, val3); 13930 scalar = vdupq_n_s16(vlane ); 13931 return vqdmulhq_s16(vec1, scalar); 13932 } 13933 13934 _NEON2SSESTORAGE int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQDMULH.S32 q0,q0,d0[0] 13935 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 13936 { 13937 //solution may be not optimal 13938 int32_t vlane; 13939 int32x4_t scalar; 13940 vlane = vgetq_lane_s32(_pM128i(val2), val3); 13941 scalar = vdupq_n_s32(vlane ); 13942 return vqdmulhq_s32(vec1, scalar); 13943 } 13944 13945 //******** Vector saturating rounding 
doubling multiply high with scalar *** 13946 _NEON2SSESTORAGE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2); // VQRDMULH.S16 d0,d0,d0[0] 13947 _NEON2SSE_INLINE int16x4_t vqrdmulh_n_s16(int16x4_t vec1, int16_t val2) // VQRDMULH.S16 d0,d0,d0[0] 13948 { 13949 //solution may be not optimal 13950 int16x4_t scalar; 13951 scalar = vdup_n_s16(val2); 13952 return vqrdmulh_s16(vec1, scalar); 13953 } 13954 13955 _NEON2SSESTORAGE int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2); // VQRDMULH.S32 d0,d0,d0[0] 13956 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_n_s32(int32x2_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 13957 { 13958 int32x2_t scalar; 13959 scalar = vdup_n_s32(val2); 13960 return vqrdmulh_s32(vec1, scalar); 13961 } 13962 13963 _NEON2SSESTORAGE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] 13964 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0] 13965 { 13966 //solution may be not optimal 13967 int16x8_t scalar; 13968 scalar = vdupq_n_s16(val2); 13969 return vqrdmulhq_s16(vec1, scalar); 13970 } 13971 13972 _NEON2SSESTORAGE int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] 13973 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 13974 { 13975 int32x4_t scalar; 13976 scalar = vdupq_n_s32(val2); 13977 return vqrdmulhq_s32(vec1, scalar); 13978 } 13979 13980 //********* Vector rounding saturating doubling multiply high by scalar **** 13981 _NEON2SSESTORAGE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 d0,d0,d0[0] 13982 _NEON2SSE_INLINE int16x4_t vqrdmulh_lane_s16(int16x4_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 d0,d0,d0[0] 13983 { 13984 //solution may be not optimal 13985 int16_t vlane; 13986 int16x4_t scalar; 13987 vlane = vget_lane_s16(val2, val3); 13988 scalar = vdup_n_s16(vlane); 13989 return vqrdmulh_s16(vec1, scalar); 13990 } 13991 13992 _NEON2SSESTORAGE int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 d0,d0,d0[0] 13993 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vqrdmulh_lane_s32(int32x2_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 13994 { 13995 int32_t vlane; 13996 int32x2_t scalar; 13997 vlane = vget_lane_s32(val2, val3); 13998 scalar = vdup_n_s32(vlane); 13999 return vqrdmulh_s32(vec1, scalar); 14000 } 14001 14002 _NEON2SSESTORAGE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3); // VQRDMULH.S16 q0,q0,d0[0] 14003 _NEON2SSE_INLINE int16x8_t vqrdmulhq_lane_s16(int16x8_t vec1, int16x4_t val2, __constrange(0, 3) int val3) // VQRDMULH.S16 q0,q0,d0[0] 14004 { 14005 //solution may be not optimal 14006 int16_t vlane; 14007 int16x8_t scalar; 14008 vlane = vget_lane_s16(val2, val3); 14009 scalar = vdupq_n_s16(vlane); 14010 return vqrdmulhq_s16(vec1, scalar); 14011 } 14012 14013 _NEON2SSESTORAGE int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3); // VQRDMULH.S32 q0,q0,d0[0] 14014 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_lane_s32(int32x4_t vec1, int32x2_t val2, __constrange(0, 1) int val3), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 14015 { 14016 //solution may be not optimal 14017 int32_t vlane; 14018 int32x4_t scalar; 14019 vlane = 
vgetq_lane_s32(_pM128i(val2), val3); 14020 scalar = vdupq_n_s32(vlane ); 14021 return vqrdmulhq_s32(vec1, scalar); 14022 } 14023 14024 //**************Vector multiply accumulate with scalar ******************* 14025 _NEON2SSESTORAGE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLA.I16 d0, d0, d0[0] 14026 _NEON2SSE_INLINE int16x4_t vmla_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLA.I16 d0, d0, d0[0] 14027 { 14028 int16x4_t scalar; 14029 scalar = vdup_n_s16(c); 14030 return vmla_s16(a, b, scalar); 14031 } 14032 14033 _NEON2SSESTORAGE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLA.I32 d0, d0, d0[0] 14034 _NEON2SSE_INLINE int32x2_t vmla_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLA.I32 d0, d0, d0[0] 14035 { 14036 int32x2_t scalar; 14037 scalar = vdup_n_s32(c); 14038 return vmla_s32(a, b, scalar); 14039 } 14040 14041 _NEON2SSESTORAGE uint16x4_t vmla_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLA.I16 d0, d0, d0[0] 14042 #define vmla_n_u16 vmla_n_s16 14043 14044 14045 _NEON2SSESTORAGE uint32x2_t vmla_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLA.I32 d0, d0, d0[0] 14046 #define vmla_n_u32 vmla_n_s32 14047 14048 14049 _NEON2SSESTORAGE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLA.F32 d0, d0, d0[0] 14050 _NEON2SSE_INLINE float32x2_t vmla_n_f32(float32x2_t a, float32x2_t b, float32_t c) // VMLA.F32 d0, d0, d0[0] 14051 { 14052 float32x2_t scalar; 14053 scalar = vdup_n_f32(c); 14054 return vmla_f32(a, b, scalar); 14055 } 14056 14057 _NEON2SSESTORAGE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] 14058 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0] 14059 { 14060 int16x8_t scalar; 14061 scalar = vdupq_n_s16(c); 14062 return vmlaq_s16(a,b,scalar); 14063 } 14064 14065 _NEON2SSESTORAGE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] 14066 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0] 14067 { 14068 int32x4_t scalar; 14069 scalar = vdupq_n_s32(c); 14070 return vmlaq_s32(a,b,scalar); 14071 } 14072 14073 _NEON2SSESTORAGE uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] 14074 #define vmlaq_n_u16 vmlaq_n_s16 14075 14076 _NEON2SSESTORAGE uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] 14077 #define vmlaq_n_u32 vmlaq_n_s32 14078 14079 _NEON2SSESTORAGE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] 14080 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0] 14081 { 14082 float32x4_t scalar; 14083 scalar = vdupq_n_f32(c); 14084 return vmlaq_f32(a,b,scalar); 14085 } 14086 14087 //************Vector widening multiply accumulate with scalar**************************** 14088 _NEON2SSESTORAGE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLAL.S16 q0, d0, d0[0] 14089 _NEON2SSE_INLINE int32x4_t vmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLAL.S16 q0, d0, d0[0] 14090 { 14091 int16x4_t vc; 14092 vc = vdup_n_s16(c); 14093 return vmlal_s16(a, b, vc); 14094 } 14095 14096 _NEON2SSESTORAGE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLAL.S32 q0, d0, d0[0] 14097 _NEON2SSE_INLINE int64x2_t vmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLAL.S32 q0, d0, d0[0] 14098 { 14099 int32x2_t vc; 14100 vc = vdup_n_s32(c); 14101 
return vmlal_s32(a, b, vc); 14102 } 14103 14104 _NEON2SSESTORAGE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLAL.s16 q0, d0, d0[0] 14105 _NEON2SSE_INLINE uint32x4_t vmlal_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLAL.s16 q0, d0, d0[0] 14106 { 14107 uint16x4_t vc; 14108 vc = vdup_n_u16(c); 14109 return vmlal_u16(a, b, vc); 14110 } 14111 14112 _NEON2SSESTORAGE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLAL.U32 q0, d0, d0[0] 14113 _NEON2SSE_INLINE uint64x2_t vmlal_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLAL.U32 q0, d0, d0[0] 14114 { 14115 uint32x2_t vc; 14116 vc = vdup_n_u32(c); 14117 return vmlal_u32(a, b, vc); 14118 } 14119 14120 //************ Vector widening saturating doubling multiply accumulate with scalar ************** 14121 _NEON2SSESTORAGE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLAL.S16 q0, d0, d0[0] 14122 _NEON2SSE_INLINE int32x4_t vqdmlal_n_s16(int32x4_t a, int16x4_t b, int16_t c) 14123 { 14124 //not optimal SIMD soulution, serial may be faster 14125 int16x4_t vc; 14126 vc = vdup_n_s16(c); 14127 return vqdmlal_s16(a, b, vc); 14128 } 14129 14130 _NEON2SSESTORAGE int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLAL.S32 q0, d0, d0[0] 14131 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlal_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) 14132 { 14133 int32x2_t vc; 14134 vc = vdup_n_s32(c); 14135 return vqdmlal_s32(a, b, vc); 14136 } 14137 14138 //******** Vector multiply subtract with scalar ************** 14139 _NEON2SSESTORAGE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c); // VMLS.I16 d0, d0, d0[0] 14140 _NEON2SSE_INLINE int16x4_t vmls_n_s16(int16x4_t a, int16x4_t b, int16_t c) // VMLS.I16 d0, d0, d0[0] 14141 { 14142 int16x4_t vc; 14143 vc = vdup_n_s16(c); 14144 return vmls_s16(a, b, vc); 14145 } 14146 14147 _NEON2SSESTORAGE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c); // VMLS.I32 d0, d0, d0[0] 14148 _NEON2SSE_INLINE int32x2_t vmls_n_s32(int32x2_t a, int32x2_t b, int32_t c) // VMLS.I32 d0, d0, d0[0] 14149 { 14150 int32x2_t vc; 14151 vc = vdup_n_s32(c); 14152 return vmls_s32(a, b, vc); 14153 } 14154 14155 _NEON2SSESTORAGE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c); // VMLS.I16 d0, d0, d0[0] 14156 _NEON2SSE_INLINE uint16x4_t vmls_n_u16(uint16x4_t a, uint16x4_t b, uint16_t c) // VMLS.I16 d0, d0, d0[0] 14157 { 14158 uint16x4_t vc; 14159 vc = vdup_n_s16(c); 14160 return vmls_s16(a, b, vc); 14161 } 14162 14163 _NEON2SSESTORAGE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c); // VMLS.I32 d0, d0, d0[0] 14164 _NEON2SSE_INLINE uint32x2_t vmls_n_u32(uint32x2_t a, uint32x2_t b, uint32_t c) // VMLS.I32 d0, d0, d0[0] 14165 { 14166 uint32x2_t vc; 14167 vc = vdup_n_u32(c); 14168 return vmls_u32(a, b, vc); 14169 } 14170 14171 _NEON2SSESTORAGE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c); // VMLS.F32 d0, d0, d0[0] 14172 _NEON2SSE_INLINE float32x2_t vmls_n_f32(float32x2_t a, float32x2_t b, float32_t c) 14173 { 14174 float32x2_t res; 14175 res.m64_f32[0] = a.m64_f32[0] - b.m64_f32[0] * c; 14176 res.m64_f32[1] = a.m64_f32[1] - b.m64_f32[1] * c; 14177 return res; 14178 } 14179 14180 _NEON2SSESTORAGE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] 14181 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0] 14182 { 14183 int16x8_t vc; 14184 vc = vdupq_n_s16(c); 14185 return 
vmlsq_s16(a, b,vc); 14186 } 14187 14188 _NEON2SSESTORAGE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] 14189 _NEON2SSE_INLINE int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0] 14190 { 14191 int32x4_t vc; 14192 vc = vdupq_n_s32(c); 14193 return vmlsq_s32(a,b,vc); 14194 } 14195 14196 _NEON2SSESTORAGE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] 14197 _NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0] 14198 { 14199 uint16x8_t vc; 14200 vc = vdupq_n_u16(c); 14201 return vmlsq_u16(a,b,vc); 14202 } 14203 14204 _NEON2SSESTORAGE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] 14205 _NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0] 14206 { 14207 uint32x4_t vc; 14208 vc = vdupq_n_u32(c); 14209 return vmlsq_u32(a,b,vc); 14210 } 14211 14212 _NEON2SSESTORAGE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] 14213 _NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c) 14214 { 14215 float32x4_t vc; 14216 vc = vdupq_n_f32(c); 14217 return vmlsq_f32(a,b,vc); 14218 } 14219 14220 //**** Vector widening multiply subtract with scalar ****** 14221 _NEON2SSESTORAGE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VMLSL.S16 q0, d0, d0[0] 14222 _NEON2SSE_INLINE int32x4_t vmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) // VMLSL.S16 q0, d0, d0[0] 14223 { 14224 int16x4_t vc; 14225 vc = vdup_n_s16(c); 14226 return vmlsl_s16(a, b, vc); 14227 } 14228 14229 _NEON2SSESTORAGE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VMLSL.S32 q0, d0, d0[0] 14230 _NEON2SSE_INLINE int64x2_t vmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c) // VMLSL.S32 q0, d0, d0[0] 14231 { 14232 int32x2_t vc; 14233 vc = vdup_n_s32(c); 14234 return vmlsl_s32(a, b, vc); 14235 } 14236 14237 _NEON2SSESTORAGE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c); // VMLSL.s16 q0, d0, d0[0] 14238 _NEON2SSE_INLINE uint32x4_t vmlsl_n_u16(uint32x4_t a, uint16x4_t b, uint16_t c) // VMLSL.s16 q0, d0, d0[0] 14239 { 14240 uint16x4_t vc; 14241 vc = vdup_n_u16(c); 14242 return vmlsl_u16(a, b, vc); 14243 } 14244 14245 _NEON2SSESTORAGE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c); // VMLSL.U32 q0, d0, d0[0] 14246 _NEON2SSE_INLINE uint64x2_t vmlsl_n_u32(uint64x2_t a, uint32x2_t b, uint32_t c) // VMLSL.U32 q0, d0, d0[0] 14247 { 14248 uint32x2_t vc; 14249 vc = vdup_n_u32(c); 14250 return vmlsl_u32(a, b, vc); 14251 } 14252 14253 //***** Vector widening saturating doubling multiply subtract with scalar ********* 14254 //********************************************************************************** 14255 _NEON2SSESTORAGE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c); // VQDMLSL.S16 q0, d0, d0[0] 14256 _NEON2SSE_INLINE int32x4_t vqdmlsl_n_s16(int32x4_t a, int16x4_t b, int16_t c) 14257 { 14258 int16x4_t vc; 14259 vc = vdup_n_s16(c); 14260 return vqdmlsl_s16(a, b, vc); 14261 } 14262 14263 _NEON2SSESTORAGE int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c); // VQDMLSL.S32 q0, d0, d0[0] 14264 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqdmlsl_n_s32(int64x2_t a, int32x2_t b, int32_t c), _NEON2SSE_REASON_SLOW_SERIAL) 14265 { 14266 int32x2_t vc; 14267 vc = vdup_n_s32(c); 14268 return vqdmlsl_s32(a, b, vc); 14269 } 14270 14271 
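//A minimal usage sketch for the scalar (_n_) multiply-accumulate forms above - illustrative only.
//The function name and the assumption that n is a multiple of 4 floats are made up for the example;
//vld1q_f32/vst1q_f32 are the usual NEON load/store intrinsics provided elsewhere in this header.
#if 0 //example code, excluded from compilation
static void axpy_f32_example(float32_t* acc, const float32_t* x, float32_t a, int n)
{
    int i;
    for (i = 0; i < n; i += 4) {
        float32x4_t vacc = vld1q_f32(acc + i);
        float32x4_t vx = vld1q_f32(x + i);
        vacc = vmlaq_n_f32(vacc, vx, a); //acc[i..i+3] += x[i..i+3] * a, maps to VMLA.F32 q0,q0,d0[0] on ARM
        vst1q_f32(acc + i, vacc);
    }
}
#endif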
//******************* Vector extract *********************************************** 14272 //************************************************************************************* 14273 //VEXT (Vector Extract) extracts elements from the bottom end of the second operand 14274 //vector and the top end of the first, concatenates them, and places the result in the destination vector 14275 //c elements from the bottom end of the second operand and (8-c) from the top end of the first 14276 _NEON2SSESTORAGE int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 14277 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x8_t vext_s8(int8x8_t a, int8x8_t b, __constrange(0,7) int c),_NEON2SSE_REASON_SLOW_SERIAL) 14278 { 14279 int8x8_t res; 14280 int i; 14281 for (i = 0; i<8 - c; i++) { 14282 res.m64_i8[i] = a.m64_i8[i + c]; 14283 } 14284 for(i = 0; i<c; i++) { 14285 res.m64_i8[8 - c + i] = b.m64_i8[i]; 14286 } 14287 return res; 14288 } 14289 14290 _NEON2SSESTORAGE uint8x8_t vext_u8(uint8x8_t a, uint8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 14291 #define vext_u8 vext_s8 14292 //same result tested 14293 14294 _NEON2SSESTORAGE poly8x8_t vext_p8(poly8x8_t a, poly8x8_t b, __constrange(0,7) int c); // VEXT.8 d0,d0,d0,#0 14295 #define vext_p8 vext_u8 14296 14297 _NEON2SSESTORAGE int16x4_t vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 14298 _NEON2SSE_INLINE int16x4_t _NEON2SSE_PERFORMANCE_WARNING (vext_s16(int16x4_t a, int16x4_t b, __constrange(0,3) int c), _NEON2SSE_REASON_SLOW_SERIAL) 14299 { 14300 int16x4_t res; 14301 int i; 14302 for (i = 0; i<4 - c; i++) { 14303 res.m64_i16[i] = a.m64_i16[i + c]; 14304 } 14305 for(i = 0; i<c; i++) { 14306 res.m64_i16[4 - c + i] = b.m64_i16[i]; 14307 } 14308 return res; 14309 } 14310 14311 _NEON2SSESTORAGE uint16x4_t vext_u16(uint16x4_t a, uint16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 14312 #define vext_u16 vext_s16 14313 14314 _NEON2SSESTORAGE poly16x4_t vext_p16(poly16x4_t a, poly16x4_t b, __constrange(0,3) int c); // VEXT.16 d0,d0,d0,#0 14315 #define vext_p16 vext_s16 14316 14317 _NEON2SSESTORAGE int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 14318 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x2_t vext_s32(int32x2_t a, int32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL) 14319 { 14320 int32x2_t res; 14321 if (c==0) { 14322 res.m64_i32[0] = a.m64_i32[0]; 14323 res.m64_i32[1] = a.m64_i32[1]; 14324 } else { 14325 res.m64_i32[0] = a.m64_i32[1]; 14326 res.m64_i32[1] = b.m64_i32[0]; 14327 } 14328 return res; 14329 } 14330 14331 _NEON2SSESTORAGE float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 14332 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float32x2_t vext_f32(float32x2_t a, float32x2_t b, __constrange(0,1) int c), _NEON2SSE_REASON_SLOW_SERIAL) 14333 { 14334 float32x2_t res; 14335 if (c==0) { 14336 res.m64_f32[0] = a.m64_f32[0]; 14337 res.m64_f32[1] = a.m64_f32[1]; 14338 } else { 14339 res.m64_f32[0] = a.m64_f32[1]; 14340 res.m64_f32[1] = b.m64_f32[0]; 14341 } 14342 return res; 14343 } 14344 14345 _NEON2SSESTORAGE uint32x2_t vext_u32(uint32x2_t a, uint32x2_t b, __constrange(0,1) int c); // VEXT.32 d0,d0,d0,#0 14346 #define vext_u32 vext_s32 14347 14348 14349 _NEON2SSESTORAGE int64x1_t vext_s64(int64x1_t a, int64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 14350 #define vext_s64(a,b,c) a 14351 14352 _NEON2SSESTORAGE uint64x1_t 
vext_u64(uint64x1_t a, uint64x1_t b, __constrange(0,0) int c); // VEXT.64 d0,d0,d0,#0 14353 #define vext_u64(a,b,c) a 14354 14355 _NEON2SSESTORAGE int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 14356 #define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c) 14357 14358 _NEON2SSESTORAGE uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 14359 #define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c) 14360 14361 _NEON2SSESTORAGE poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 14362 #define vextq_p8 vextq_s8 14363 14364 _NEON2SSESTORAGE int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 14365 #define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2) 14366 14367 _NEON2SSESTORAGE uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 14368 #define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2) 14369 14370 _NEON2SSESTORAGE poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 14371 #define vextq_p16 vextq_s16 14372 14373 _NEON2SSESTORAGE int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 14374 #define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4) 14375 14376 _NEON2SSESTORAGE uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 14377 #define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4) 14378 14379 _NEON2SSESTORAGE float32x4_t vextq_f32(float32x4_t a, float32x4_t b, __constrange(0,3) float c); // VEXT.32 q0,q0,q0,#0 14380 #define vextq_f32(a,b,c) _M128(vextq_s32(_M128i(a),_M128i(b),c) ) 14381 14382 _NEON2SSESTORAGE int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 14383 #define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8) 14384 14385 _NEON2SSESTORAGE uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 14386 #define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8) 14387 14388 //************ Reverse vector elements (swap endianness)***************** 14389 //************************************************************************* 14390 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 
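//To make the VREV64.8 case of the description above concrete, here is a scalar reference sketch
//(not part of the original header): within every 64-bit group the order of the 8-bit lanes is reversed.
//The guard macro NEON2SSE_EXAMPLES and the function name are illustrative assumptions only.
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE void example_rev64_8_reference(const int8_t in[8], int8_t out[8])
{
    int i;
    for (i = 0; i < 8; i++) out[i] = in[7 - i]; //lane i takes the value of lane 7-i
}
#endif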
14391 _NEON2SSESTORAGE int8x8_t vrev64_s8(int8x8_t vec); // VREV64.8 d0,d0 14392 _NEON2SSE_INLINE int8x8_t vrev64_s8(int8x8_t vec) 14393 { 14394 int8x8_t res64; 14395 __m128i res; 14396 res = vrev64q_s8(_pM128i(vec)); 14397 return64(res); 14398 } 14399 14400 _NEON2SSESTORAGE int16x4_t vrev64_s16(int16x4_t vec); // VREV64.16 d0,d0 14401 _NEON2SSE_INLINE int16x4_t vrev64_s16(int16x4_t vec) 14402 { 14403 int16x4_t res64; 14404 __m128i res; 14405 res = vrev64q_s16(_pM128i(vec)); 14406 return64(res); 14407 } 14408 14409 _NEON2SSESTORAGE int32x2_t vrev64_s32(int32x2_t vec); // VREV64.32 d0,d0 14410 _NEON2SSE_INLINE int32x2_t vrev64_s32(int32x2_t vec) 14411 { 14412 int32x2_t res; 14413 res.m64_i32[0] = vec.m64_i32[1]; 14414 res.m64_i32[1] = vec.m64_i32[0]; 14415 return res; 14416 } 14417 14418 _NEON2SSESTORAGE uint8x8_t vrev64_u8(uint8x8_t vec); // VREV64.8 d0,d0 14419 #define vrev64_u8 vrev64_s8 14420 14421 _NEON2SSESTORAGE uint16x4_t vrev64_u16(uint16x4_t vec); // VREV64.16 d0,d0 14422 #define vrev64_u16 vrev64_s16 14423 14424 _NEON2SSESTORAGE uint32x2_t vrev64_u32(uint32x2_t vec); // VREV64.32 d0,d0 14425 #define vrev64_u32 vrev64_s32 14426 14427 _NEON2SSESTORAGE poly8x8_t vrev64_p8(poly8x8_t vec); // VREV64.8 d0,d0 14428 #define vrev64_p8 vrev64_u8 14429 14430 _NEON2SSESTORAGE poly16x4_t vrev64_p16(poly16x4_t vec); // VREV64.16 d0,d0 14431 #define vrev64_p16 vrev64_u16 14432 14433 _NEON2SSESTORAGE float32x2_t vrev64_f32(float32x2_t vec); // VREV64.32 d0,d0 14434 _NEON2SSE_INLINE float32x2_t vrev64_f32(float32x2_t vec) 14435 { 14436 float32x2_t res; 14437 res.m64_f32[0] = vec.m64_f32[1]; 14438 res.m64_f32[1] = vec.m64_f32[0]; 14439 return res; 14440 } 14441 14442 _NEON2SSESTORAGE int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 14443 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0 14444 { 14445 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8}; 14446 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8); 14447 } 14448 14449 _NEON2SSESTORAGE int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 14450 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0 14451 { 14452 //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask 14453 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9}; 14454 return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16); 14455 } 14456 14457 _NEON2SSESTORAGE int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 14458 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0 14459 { 14460 return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) ); 14461 } 14462 14463 _NEON2SSESTORAGE uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 14464 #define vrev64q_u8 vrev64q_s8 14465 14466 _NEON2SSESTORAGE uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 14467 #define vrev64q_u16 vrev64q_s16 14468 14469 _NEON2SSESTORAGE uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 14470 #define vrev64q_u32 vrev64q_s32 14471 14472 _NEON2SSESTORAGE poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 14473 #define vrev64q_p8 vrev64q_u8 14474 14475 _NEON2SSESTORAGE poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 14476 #define vrev64q_p16 vrev64q_u16 14477 14478 _NEON2SSESTORAGE float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 14479 #define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1)) 14480 14481 //******************** 32 bit shuffles 
********************** 14482 //************************************************************ 14483 _NEON2SSESTORAGE int8x8_t vrev32_s8(int8x8_t vec); // VREV32.8 d0,d0 14484 _NEON2SSE_INLINE int8x8_t vrev32_s8(int8x8_t vec) 14485 { 14486 int8x8_t res64; 14487 __m128i res; 14488 res = vrev32q_s8(_pM128i(vec)); 14489 return64(res); 14490 } 14491 14492 _NEON2SSESTORAGE int16x4_t vrev32_s16(int16x4_t vec); // VREV32.16 d0,d0 14493 _NEON2SSE_INLINE int16x4_t vrev32_s16(int16x4_t vec) 14494 { 14495 int16x4_t res64; 14496 __m128i res; 14497 res = vrev32q_s16(_pM128i(vec)); 14498 return64(res); 14499 } 14500 14501 _NEON2SSESTORAGE uint8x8_t vrev32_u8(uint8x8_t vec); // VREV32.8 d0,d0 14502 #define vrev32_u8 vrev32_s8 14503 14504 _NEON2SSESTORAGE uint16x4_t vrev32_u16(uint16x4_t vec); // VREV32.16 d0,d0 14505 #define vrev32_u16 vrev32_s16 14506 14507 _NEON2SSESTORAGE poly8x8_t vrev32_p8(poly8x8_t vec); // VREV32.8 d0,d0 14508 #define vrev32_p8 vrev32_u8 14509 14510 _NEON2SSESTORAGE poly16x4_t vrev32_p16(poly16x4_t vec); // VREV32.16 d0,d0 14511 #define vrev32_p16 vrev32_u16 14512 14513 _NEON2SSESTORAGE int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 14514 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0 14515 { 14516 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; 14517 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8); 14518 } 14519 14520 _NEON2SSESTORAGE int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 14521 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0 14522 { 14523 _NEON2SSE_ALIGN_16 static const int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13}; 14524 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8); 14525 } 14526 14527 _NEON2SSESTORAGE uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 14528 #define vrev32q_u8 vrev32q_s8 14529 14530 _NEON2SSESTORAGE uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 14531 #define vrev32q_u16 vrev32q_s16 14532 14533 _NEON2SSESTORAGE poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 14534 #define vrev32q_p8 vrev32q_u8 14535 14536 _NEON2SSESTORAGE poly16x8_t vrev32q_p16(poly16x8_t vec); // VREV32.16 q0,q0 14537 #define vrev32q_p16 vrev32q_u16 14538 14539 //************* 16 bit shuffles ********************** 14540 //****************************************************** 14541 _NEON2SSESTORAGE int8x8_t vrev16_s8(int8x8_t vec); // VREV16.8 d0,d0 14542 _NEON2SSE_INLINE int8x8_t vrev16_s8(int8x8_t vec) 14543 { 14544 int8x8_t res64; 14545 __m128i res; 14546 res = vrev16q_s8(_pM128i(vec)); 14547 return64(res); 14548 } 14549 14550 _NEON2SSESTORAGE uint8x8_t vrev16_u8(uint8x8_t vec); // VREV16.8 d0,d0 14551 #define vrev16_u8 vrev16_s8 14552 14553 _NEON2SSESTORAGE poly8x8_t vrev16_p8(poly8x8_t vec); // VREV16.8 d0,d0 14554 #define vrev16_p8 vrev16_u8 14555 14556 _NEON2SSESTORAGE int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 14557 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0 14558 { 14559 _NEON2SSE_ALIGN_16 static const int8_t mask_rev8[16] = {1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14}; 14560 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8); 14561 } 14562 14563 _NEON2SSESTORAGE uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 14564 #define vrev16q_u8 vrev16q_s8 14565 14566 _NEON2SSESTORAGE poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 14567 #define vrev16q_p8 vrev16q_u8 14568 14569 
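//A minimal usage sketch for the 32-bit shuffles above (not part of the original header): reversing the
//byte order inside each 32-bit group, e.g. to convert big-endian words that were loaded as raw bytes.
//The guard macro NEON2SSE_EXAMPLES and the function name are hypothetical.
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE uint8x16_t example_swap_bytes_in_words(uint8x16_t v)
{
    return vrev32q_u8(v); //bytes {0,1,2,3} of every 32-bit group come out as {3,2,1,0}
}
#endif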
//********************************************************************* 14570 //**************** Other single operand arithmetic ******************* 14571 //********************************************************************* 14572 14573 //*********** Absolute: Vd[i] = |Va[i]| ********************************** 14574 //************************************************************************ 14575 _NEON2SSESTORAGE int8x8_t vabs_s8(int8x8_t a); // VABS.S8 d0,d0 14576 _NEON2SSE_INLINE int8x8_t vabs_s8(int8x8_t a) 14577 { 14578 int8x8_t res64; 14579 __m128i res; 14580 res = _mm_abs_epi8(_pM128i(a)); 14581 return64(res); 14582 } 14583 14584 14585 _NEON2SSESTORAGE int16x4_t vabs_s16(int16x4_t a); // VABS.S16 d0,d0 14586 _NEON2SSE_INLINE int16x4_t vabs_s16(int16x4_t a) 14587 { 14588 int16x4_t res64; 14589 __m128i res; 14590 res = _mm_abs_epi16(_pM128i(a)); 14591 return64(res); 14592 } 14593 14594 _NEON2SSESTORAGE int32x2_t vabs_s32(int32x2_t a); // VABS.S32 d0,d0 14595 _NEON2SSE_INLINE int32x2_t vabs_s32(int32x2_t a) 14596 { 14597 int32x2_t res64; 14598 __m128i res; 14599 res = _mm_abs_epi32(_pM128i(a)); 14600 return64(res); 14601 } 14602 14603 _NEON2SSESTORAGE float32x2_t vabs_f32(float32x2_t a); // VABS.F32 d0,d0 14604 _NEON2SSE_INLINE float32x2_t vabs_f32(float32x2_t a) // VABS.F32 d0,d0 14605 { 14606 float32x4_t res; 14607 __m64_128 res64; 14608 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; 14609 res = _mm_and_ps (_pM128(a), *(__m128*)c7fffffff); //use 64 low bits only 14610 _M64f(res64, res); 14611 return res64; 14612 } 14613 14614 _NEON2SSESTORAGE int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 14615 #define vabsq_s8 _mm_abs_epi8 14616 14617 _NEON2SSESTORAGE int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 14618 #define vabsq_s16 _mm_abs_epi16 14619 14620 _NEON2SSESTORAGE int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 14621 #define vabsq_s32 _mm_abs_epi32 14622 14623 _NEON2SSESTORAGE float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 14624 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0 14625 { 14626 _NEON2SSE_ALIGN_16 static const int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; 14627 return _mm_and_ps (a, *(__m128*)c7fffffff); 14628 } 14629 14630 #ifdef _NEON2SSE_64BIT 14631 _NEON2SSESTORAGE int64x2_t vabsq_s64(int64x2_t a); // VABS.S64 q0,q0 14632 _NEON2SSE_INLINE int64x2_t vabsq_s64(int64x2_t a) // VABS.S64 q0,q0 14633 { 14634 __m128i sign = _mm_srai_epi32 (_mm_shuffle_epi32 (a, 0xf5), 31); 14635 return _mm_sub_epi64 (_mm_xor_si128 (a, sign), sign); 14636 } 14637 14638 _NEON2SSESTORAGE float64x2_t vabsq_f64(float64x2_t a); // VABS.F64 q0,q0 14639 _NEON2SSE_INLINE float64x2_t vabsq_f64(float64x2_t a) // VABS.F64 q0,q0 14640 { 14641 _NEON2SSE_ALIGN_16 static const int64_t mask[2] = {0x7fffffffffffffffLL, 0x7fffffffffffffffLL}; 14642 return _mm_and_pd (a, *(__m128d*)mask); 14643 } 14644 #endif 14645 14646 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) ********************* 14647 //********************************************************************** 14648 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place 14649 _NEON2SSESTORAGE int8x8_t vqabs_s8(int8x8_t a); // VQABS.S8 d0,d0 14650 _NEON2SSE_INLINE int8x8_t vqabs_s8(int8x8_t a) 14651 { 14652 int8x8_t res64; 14653 __m128i res; 14654 res = vqabsq_s8(_pM128i(a)); 14655 return64(res); 14656 } 14657 14658 _NEON2SSESTORAGE 
int16x4_t vqabs_s16(int16x4_t a); // VQABS.S16 d0,d0 14659 _NEON2SSE_INLINE int16x4_t vqabs_s16(int16x4_t a) 14660 { 14661 int16x4_t res64; 14662 __m128i res; 14663 res = vqabsq_s16(_pM128i(a)); 14664 return64(res); 14665 } 14666 14667 _NEON2SSESTORAGE int32x2_t vqabs_s32(int32x2_t a); // VQABS.S32 d0,d0 14668 _NEON2SSE_INLINE int32x2_t vqabs_s32(int32x2_t a) 14669 { 14670 int32x2_t res64; 14671 __m128i res; 14672 res = vqabsq_s32(_pM128i(a)); 14673 return64(res); 14674 } 14675 14676 _NEON2SSESTORAGE int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 14677 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0 14678 { 14679 __m128i c_128, abs, abs_cmp; 14680 c_128 = _mm_set1_epi8 ((int8_t)0x80); //-128 14681 abs = _mm_abs_epi8 (a); 14682 abs_cmp = _mm_cmpeq_epi8 (abs, c_128); 14683 return _mm_xor_si128 (abs, abs_cmp); 14684 } 14685 14686 _NEON2SSESTORAGE int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 14687 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0 14688 { 14689 __m128i c_32768, abs, abs_cmp; 14690 c_32768 = _mm_set1_epi16 ((int16_t)0x8000); //-32768 14691 abs = _mm_abs_epi16 (a); 14692 abs_cmp = _mm_cmpeq_epi16 (abs, c_32768); 14693 return _mm_xor_si128 (abs, abs_cmp); 14694 } 14695 14696 _NEON2SSESTORAGE int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 14697 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0 14698 { 14699 __m128i c80000000, abs, abs_cmp; 14700 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value 14701 abs = _mm_abs_epi32 (a); 14702 abs_cmp = _mm_cmpeq_epi32 (abs, c80000000); 14703 return _mm_xor_si128 (abs, abs_cmp); 14704 } 14705 14706 //*************** Negate: Vd[i] = - Va[i] ************************************* 14707 //***************************************************************************** 14708 //several Negate implementations possible for SIMD. 
14709 //e.g. the _mm_sign function (a, negative numbers vector) could be used, but the following one gives good performance:
14710 _NEON2SSESTORAGE int8x8_t vneg_s8(int8x8_t a); // VNEG.S8 d0,d0
14711 _NEON2SSE_INLINE int8x8_t vneg_s8(int8x8_t a)
14712 {
14713 int8x8_t res64;
14714 __m128i res;
14715 res = vnegq_s8(_pM128i(a));
14716 return64(res);
14717 }
14718
14719 _NEON2SSESTORAGE int16x4_t vneg_s16(int16x4_t a); // VNEG.S16 d0,d0
14720 _NEON2SSE_INLINE int16x4_t vneg_s16(int16x4_t a)
14721 {
14722 int16x4_t res64;
14723 __m128i res;
14724 res = vnegq_s16(_pM128i(a));
14725 return64(res);
14726 }
14727
14728 _NEON2SSESTORAGE int32x2_t vneg_s32(int32x2_t a); // VNEG.S32 d0,d0
14729 _NEON2SSE_INLINE int32x2_t vneg_s32(int32x2_t a)
14730 {
14731 int32x2_t res64;
14732 __m128i res;
14733 res = vnegq_s32(_pM128i(a));
14734 return64(res);
14735 }
14736
14737 _NEON2SSESTORAGE float32x2_t vneg_f32(float32x2_t a); // VNEG.F32 d0,d0
14738 _NEON2SSE_INLINE float32x2_t vneg_f32(float32x2_t a) // VNEG.F32 d0,d0
14739 {
14740 float32x4_t res;
14741 __m64_128 res64;
14742 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14743 res = _mm_xor_ps (_pM128(a), *(__m128*) c80000000); //use low 64 bits
14744 _M64f(res64, res);
14745 return res64;
14746 }
14747
14748 _NEON2SSESTORAGE int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0
14749 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNEG.S8 q0,q0
14750 {
14751 __m128i zero;
14752 zero = _mm_setzero_si128 ();
14753 return _mm_sub_epi8 (zero, a);
14754 } //or _mm_sign_epi8 (a, negative numbers vector)
14755
14756 _NEON2SSESTORAGE int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0
14757 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNEG.S16 q0,q0
14758 {
14759 __m128i zero;
14760 zero = _mm_setzero_si128 ();
14761 return _mm_sub_epi16 (zero, a);
14762 } //or _mm_sign_epi16 (a, negative numbers vector)
14763
14764 _NEON2SSESTORAGE int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0
14765 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNEG.S32 q0,q0
14766 {
14767 __m128i zero;
14768 zero = _mm_setzero_si128 ();
14769 return _mm_sub_epi32 (zero, a);
14770 } //or _mm_sign_epi32 (a, negative numbers vector)
14771
14772 _NEON2SSESTORAGE float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0
14773 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNEG.F32 q0,q0
14774 {
14775 _NEON2SSE_ALIGN_16 static const uint32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
14776 return _mm_xor_ps (a, *(__m128*) c80000000);
14777 }
14778
14779 //************** Saturating Negate: sat(Vd[i] = - Va[i]) **************************
14780 //***************************************************************************************
14781 //For signed-integer data types, the negation of the most negative value can't be produced without saturation; with saturation the result is the maximum positive value
14782 _NEON2SSESTORAGE int8x8_t vqneg_s8(int8x8_t a); // VQNEG.S8 d0,d0
14783 _NEON2SSE_INLINE int8x8_t vqneg_s8(int8x8_t a)
14784 {
14785 int8x8_t res64;
14786 __m128i res;
14787 res = vqnegq_s8(_pM128i(a));
14788 return64(res);
14789 }
14790
14791 _NEON2SSESTORAGE int16x4_t vqneg_s16(int16x4_t a); // VQNEG.S16 d0,d0
14792 _NEON2SSE_INLINE int16x4_t vqneg_s16(int16x4_t a)
14793 {
14794 int16x4_t res64;
14795 __m128i res;
14796 res = vqnegq_s16(_pM128i(a));
14797 return64(res);
14798 }
14799
14800 _NEON2SSESTORAGE int32x2_t vqneg_s32(int32x2_t a); // VQNEG.S32 d0,d0
14801 _NEON2SSE_INLINE int32x2_t vqneg_s32(int32x2_t a)
14802 {
14803 int32x2_t res64;
14804 __m128i res;
14805 res = vqnegq_s32(_pM128i(a));
14806 return64(res);
14807 }
14808
14809 _NEON2SSESTORAGE int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0
14810 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNEG.S8 q0,q0
14811 {
14812 __m128i zero;
14813 zero = _mm_setzero_si128 ();
14814 return _mm_subs_epi8 (zero, a); //saturating subtraction
14815 }
14816
14817 _NEON2SSESTORAGE int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0
14818 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNEG.S16 q0,q0
14819 {
14820 __m128i zero;
14821 zero = _mm_setzero_si128 ();
14822 return _mm_subs_epi16 (zero, a); //saturating subtraction
14823 }
14824
14825 _NEON2SSESTORAGE int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0
14826 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNEG.S32 q0,q0
14827 {
14828 //this solution may not be optimal compared with a serial one
14829 __m128i c80000000, zero, sub, cmp;
14830 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value
14831 zero = _mm_setzero_si128 ();
14832 sub = _mm_sub_epi32 (zero, a); //subtraction
14833 cmp = _mm_cmpeq_epi32 (a, c80000000);
14834 return _mm_xor_si128 (sub, cmp);
14835 }
14836
14837 //****************** Count leading zeros ********************************
14838 //**************************************************************************
14839 //no corresponding vector intrinsics in IA32, so it needs to be implemented. While the implementation is effective for 8 bits, it may not be for 16 and 32 bits
14840 _NEON2SSESTORAGE int8x8_t vclz_s8(int8x8_t a); // VCLZ.I8 d0,d0
14841 _NEON2SSE_INLINE int8x8_t vclz_s8(int8x8_t a)
14842 {
14843 int8x8_t res64;
14844 __m128i res;
14845 res = vclzq_s8(_pM128i(a));
14846 return64(res);
14847 }
14848
14849 _NEON2SSESTORAGE int16x4_t vclz_s16(int16x4_t a); // VCLZ.I16 d0,d0
14850 _NEON2SSE_INLINE int16x4_t vclz_s16(int16x4_t a)
14851 {
14852 int16x4_t res64;
14853 __m128i res;
14854 res = vclzq_s16(_pM128i(a));
14855 return64(res);
14856 }
14857
14858 _NEON2SSESTORAGE int32x2_t vclz_s32(int32x2_t a); // VCLZ.I32 d0,d0
14859 _NEON2SSE_INLINE int32x2_t vclz_s32(int32x2_t a)
14860 {
14861 int32x2_t res64;
14862 __m128i res;
14863 res = vclzq_s32(_pM128i(a));
14864 return64(res);
14865 }
14866
14867
14868 _NEON2SSESTORAGE uint8x8_t vclz_u8(uint8x8_t a); // VCLZ.I8 d0,d0
14869 #define vclz_u8 vclz_s8
14870
14871 _NEON2SSESTORAGE uint16x4_t vclz_u16(uint16x4_t a); // VCLZ.I16 d0,d0
14872 #define vclz_u16 vclz_s16
14873
14874 _NEON2SSESTORAGE uint32x2_t vclz_u32(uint32x2_t a); // VCLZ.I32 d0,d0
14875 #define vclz_u32 vclz_s32
14876
14877 _NEON2SSESTORAGE int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0
14878 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a)
14879 {
14880 _NEON2SSE_ALIGN_16 static const int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2,
14881 /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1,
14882 /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0,
14883 /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0 };
14884 __m128i maskLOW, c4, lowclz, mask, hiclz;
14885 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically
14886 c4 = _mm_set1_epi8(4);
14887 lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway
14888 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits
14889 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set
14890 hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway
14891 mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros
14892 lowclz =
_mm_and_si128(lowclz,mask); 14893 return _mm_add_epi8(lowclz, hiclz); 14894 } 14895 14896 _NEON2SSESTORAGE int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 14897 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a) 14898 { 14899 __m128i c7, res8x16, res8x16_swap; 14900 _NEON2SSE_ALIGN_16 static const int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 14901 _NEON2SSE_ALIGN_16 static const uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff}; 14902 c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7 14903 res8x16 = vclzq_s8(a); 14904 res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horisontal pairs swap 14905 res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz 14906 res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz 14907 c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros 14908 res8x16 = _mm_and_si128(res8x16, c7); //lowclz 14909 return _mm_add_epi16(res8x16_swap, res8x16); 14910 } 14911 14912 _NEON2SSESTORAGE int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 14913 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a) 14914 { 14915 __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res; 14916 c55555555 = _mm_set1_epi32(0x55555555); 14917 c33333333 = _mm_set1_epi32(0x33333333); 14918 c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f); 14919 c3f = _mm_set1_epi32(0x3f); 14920 c32 = _mm_set1_epi32(32); 14921 tmp = _mm_srli_epi32(a, 1); 14922 res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1); 14923 tmp = _mm_srli_epi32(res, 2); 14924 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2); 14925 tmp = _mm_srli_epi32(res, 4); 14926 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4); 14927 tmp = _mm_srli_epi32(res, 8); 14928 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8); 14929 tmp = _mm_srli_epi32(res, 16); 14930 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16); 14931 14932 tmp = _mm_srli_epi32(res, 1); 14933 tmp = _mm_and_si128(tmp, c55555555); 14934 res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555); 14935 14936 tmp = _mm_srli_epi32(res, 2); 14937 tmp = _mm_and_si128(tmp, c33333333); 14938 tmp1 = _mm_and_si128(res, c33333333); 14939 res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333)); 14940 14941 tmp = _mm_srli_epi32(res, 4); 14942 tmp = _mm_add_epi32(tmp, res); 14943 res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 0x0f0f0f0f); 14944 14945 tmp = _mm_srli_epi32(res, 8); 14946 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8); 14947 14948 tmp = _mm_srli_epi32(res, 16); 14949 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16); 14950 14951 res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f; 14952 14953 return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i]; 14954 } 14955 14956 _NEON2SSESTORAGE uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 14957 #define vclzq_u8 vclzq_s8 14958 14959 _NEON2SSESTORAGE uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 14960 #define vclzq_u16 vclzq_s16 14961 14962 _NEON2SSESTORAGE uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 14963 #define vclzq_u32 vclzq_s32 14964 14965 //************** Count leading sign bits ************************** 14966 //******************************************************************** 14967 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following 14968 // the topmost bit, that are the same as 
the topmost bit, in each element in a vector 14969 //No corresponding vector intrinsics in IA32, need to implement it. 14970 //While the implementation is effective for 8 bits, it may be not for 16 and 32 bits 14971 _NEON2SSESTORAGE int8x8_t vcls_s8(int8x8_t a); // VCLS.S8 d0,d0 14972 _NEON2SSE_INLINE int8x8_t vcls_s8(int8x8_t a) 14973 { 14974 int8x8_t res64; 14975 __m128i res; 14976 res = vclsq_s8(_pM128i(a)); 14977 return64(res); 14978 } 14979 14980 _NEON2SSESTORAGE int16x4_t vcls_s16(int16x4_t a); // VCLS.S16 d0,d0 14981 _NEON2SSE_INLINE int16x4_t vcls_s16(int16x4_t a) 14982 { 14983 int16x4_t res64; 14984 __m128i res; 14985 res = vclsq_s16(_pM128i(a)); 14986 return64(res); 14987 } 14988 14989 _NEON2SSESTORAGE int32x2_t vcls_s32(int32x2_t a); // VCLS.S32 d0,d0 14990 _NEON2SSE_INLINE int32x2_t vcls_s32(int32x2_t a) 14991 { 14992 int32x2_t res64; 14993 __m128i res; 14994 res = vclsq_s32(_pM128i(a)); 14995 return64(res); 14996 } 14997 14998 _NEON2SSESTORAGE int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 14999 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a) 15000 { 15001 __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; 15002 cff = _mm_cmpeq_epi8 (a,a); //0xff 15003 c80 = _mm_set1_epi8((int8_t)0x80); 15004 c1 = _mm_set1_epi8(1); 15005 a_mask = _mm_and_si128(a, c80); 15006 a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive 15007 a_neg = _mm_xor_si128(a, cff); 15008 a_neg = _mm_and_si128(a_mask, a_neg); 15009 a_pos = _mm_andnot_si128(a_mask, a); 15010 a_comb = _mm_or_si128(a_pos, a_neg); 15011 a_comb = vclzq_s8(a_comb); 15012 return _mm_sub_epi8(a_comb, c1); 15013 } 15014 15015 _NEON2SSESTORAGE int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 15016 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a) 15017 { 15018 __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb; 15019 cffff = _mm_cmpeq_epi16(a,a); 15020 c8000 = _mm_slli_epi16(cffff, 15); //0x8000 15021 c1 = _mm_srli_epi16(cffff,15); //0x1 15022 a_mask = _mm_and_si128(a, c8000); 15023 a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive 15024 a_neg = _mm_xor_si128(a, cffff); 15025 a_neg = _mm_and_si128(a_mask, a_neg); 15026 a_pos = _mm_andnot_si128(a_mask, a); 15027 a_comb = _mm_or_si128(a_pos, a_neg); 15028 a_comb = vclzq_s16(a_comb); 15029 return _mm_sub_epi16(a_comb, c1); 15030 } 15031 15032 _NEON2SSESTORAGE int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 15033 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a) 15034 { 15035 __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb; 15036 cffffffff = _mm_cmpeq_epi32(a,a); 15037 c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000 15038 c1 = _mm_srli_epi32(cffffffff,31); //0x1 15039 a_mask = _mm_and_si128(a, c80000000); 15040 a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive 15041 a_neg = _mm_xor_si128(a, cffffffff); 15042 a_neg = _mm_and_si128(a_mask, a_neg); 15043 a_pos = _mm_andnot_si128(a_mask, a); 15044 a_comb = _mm_or_si128(a_pos, a_neg); 15045 a_comb = vclzq_s32(a_comb); 15046 return _mm_sub_epi32(a_comb, c1); 15047 } 15048 15049 //************************* Count number of set bits ******************************** 15050 //************************************************************************************* 15051 //No corresponding SIMD solution. 
One option is to get a elements, convert it to 32 bits and then use SSE4.2 _mm_popcnt__u32 (unsigned int v) for each element 15052 //another option is to do the following algorithm: 15053 15054 _NEON2SSESTORAGE uint8x8_t vcnt_u8(uint8x8_t a); // VCNT.8 d0,d0 15055 _NEON2SSE_INLINE uint8x8_t vcnt_u8(uint8x8_t a) 15056 { 15057 uint8x8_t res64; 15058 __m128i res; 15059 res = vcntq_u8(_pM128i(a)); 15060 return64(res); 15061 } 15062 15063 _NEON2SSESTORAGE int8x8_t vcnt_s8(int8x8_t a); // VCNT.8 d0,d0 15064 #define vcnt_s8 vcnt_u8 15065 15066 _NEON2SSESTORAGE poly8x8_t vcnt_p8(poly8x8_t a); // VCNT.8 d0,d0 15067 #define vcnt_p8 vcnt_u8 15068 15069 _NEON2SSESTORAGE uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 15070 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) 15071 { 15072 _NEON2SSE_ALIGN_16 static const int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2, 15073 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3, 15074 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3, 15075 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4}; 15076 __m128i maskLOW, mask, lowpopcnt, hipopcnt; 15077 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set 15078 mask = _mm_and_si128(a, maskLOW); 15079 lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway 15080 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits 15081 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set 15082 hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway 15083 return _mm_add_epi8(lowpopcnt, hipopcnt); 15084 } 15085 15086 _NEON2SSESTORAGE int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 15087 #define vcntq_s8 vcntq_u8 15088 15089 _NEON2SSESTORAGE poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 15090 #define vcntq_p8 vcntq_u8 15091 15092 //************************************************************************************** 15093 //*********************** Logical operations **************************************** 15094 //************************************************************************************** 15095 //************************** Bitwise not *********************************** 15096 //several Bitwise not implementations possible for SIMD. 
Eg "xor" with all ones, but the following one gives good performance 15097 _NEON2SSESTORAGE int8x8_t vmvn_s8(int8x8_t a); // VMVN d0,d0 15098 _NEON2SSE_INLINE int8x8_t vmvn_s8(int8x8_t a) 15099 { 15100 int8x8_t res64; 15101 __m128i res; 15102 res = vmvnq_s8(_pM128i(a)); 15103 return64(res); 15104 } 15105 15106 _NEON2SSESTORAGE int16x4_t vmvn_s16(int16x4_t a); // VMVN d0,d0 15107 _NEON2SSE_INLINE int16x4_t vmvn_s16(int16x4_t a) 15108 { 15109 int16x4_t res64; 15110 __m128i res; 15111 res = vmvnq_s16(_pM128i(a)); 15112 return64(res); 15113 } 15114 15115 _NEON2SSESTORAGE int32x2_t vmvn_s32(int32x2_t a); // VMVN d0,d0 15116 _NEON2SSE_INLINE int32x2_t vmvn_s32(int32x2_t a) 15117 { 15118 int32x2_t res64; 15119 __m128i res; 15120 res = vmvnq_s32(_pM128i(a)); 15121 return64(res); 15122 } 15123 15124 _NEON2SSESTORAGE uint8x8_t vmvn_u8(uint8x8_t a); // VMVN d0,d0 15125 #define vmvn_u8 vmvn_s8 15126 15127 _NEON2SSESTORAGE uint16x4_t vmvn_u16(uint16x4_t a); // VMVN d0,d0 15128 #define vmvn_u16 vmvn_s16 15129 15130 _NEON2SSESTORAGE uint32x2_t vmvn_u32(uint32x2_t a); // VMVN d0,d0 15131 #define vmvn_u32 vmvn_s32 15132 15133 _NEON2SSESTORAGE poly8x8_t vmvn_p8(poly8x8_t a); // VMVN d0,d0 15134 #define vmvn_p8 vmvn_u8 15135 15136 _NEON2SSESTORAGE int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 15137 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0 15138 { 15139 __m128i c1; 15140 c1 = _mm_cmpeq_epi8 (a,a); //0xff 15141 return _mm_andnot_si128 (a, c1); 15142 } 15143 15144 _NEON2SSESTORAGE int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 15145 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0 15146 { 15147 __m128i c1; 15148 c1 = _mm_cmpeq_epi16 (a,a); //0xffff 15149 return _mm_andnot_si128 (a, c1); 15150 } 15151 15152 _NEON2SSESTORAGE int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 15153 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0 15154 { 15155 __m128i c1; 15156 c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff 15157 return _mm_andnot_si128 (a, c1); 15158 } 15159 15160 _NEON2SSESTORAGE uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 15161 #define vmvnq_u8 vmvnq_s8 15162 15163 _NEON2SSESTORAGE uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 15164 #define vmvnq_u16 vmvnq_s16 15165 15166 _NEON2SSESTORAGE uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 15167 #define vmvnq_u32 vmvnq_s32 15168 15169 _NEON2SSESTORAGE poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 15170 #define vmvnq_p8 vmvnq_u8 15171 15172 //****************** Bitwise and *********************** 15173 //****************************************************** 15174 _NEON2SSESTORAGE int8x8_t vand_s8(int8x8_t a, int8x8_t b); // VAND d0,d0,d0 15175 _NEON2SSE_INLINE int8x8_t vand_s8(int8x8_t a, int8x8_t b) 15176 { 15177 int8x8_t res64; 15178 return64(_mm_and_si128(_pM128i(a),_pM128i(b))); 15179 } 15180 15181 _NEON2SSESTORAGE int16x4_t vand_s16(int16x4_t a, int16x4_t b); // VAND d0,d0,d0 15182 _NEON2SSE_INLINE int16x4_t vand_s16(int16x4_t a, int16x4_t b) 15183 { 15184 int16x4_t res64; 15185 return64(_mm_and_si128(_pM128i(a),_pM128i(b))); 15186 } 15187 15188 _NEON2SSESTORAGE int32x2_t vand_s32(int32x2_t a, int32x2_t b); // VAND d0,d0,d0 15189 _NEON2SSE_INLINE int32x2_t vand_s32(int32x2_t a, int32x2_t b) 15190 { 15191 int32x2_t res64; 15192 return64(_mm_and_si128(_pM128i(a),_pM128i(b))); 15193 } 15194 15195 15196 _NEON2SSESTORAGE int64x1_t vand_s64(int64x1_t a, int64x1_t b); // VAND d0,d0,d0 15197 _NEON2SSE_INLINE int64x1_t vand_s64(int64x1_t a, int64x1_t b) 15198 { 15199 int64x1_t res; 15200 res.m64_i64[0] = 
a.m64_i64[0] & b.m64_i64[0]; 15201 return res; 15202 } 15203 15204 _NEON2SSESTORAGE uint8x8_t vand_u8(uint8x8_t a, uint8x8_t b); // VAND d0,d0,d0 15205 #define vand_u8 vand_s8 15206 15207 _NEON2SSESTORAGE uint16x4_t vand_u16(uint16x4_t a, uint16x4_t b); // VAND d0,d0,d0 15208 #define vand_u16 vand_s16 15209 15210 _NEON2SSESTORAGE uint32x2_t vand_u32(uint32x2_t a, uint32x2_t b); // VAND d0,d0,d0 15211 #define vand_u32 vand_s32 15212 15213 _NEON2SSESTORAGE uint64x1_t vand_u64(uint64x1_t a, uint64x1_t b); // VAND d0,d0,d0 15214 #define vand_u64 vand_s64 15215 15216 15217 _NEON2SSESTORAGE int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 15218 #define vandq_s8 _mm_and_si128 15219 15220 _NEON2SSESTORAGE int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 15221 #define vandq_s16 _mm_and_si128 15222 15223 _NEON2SSESTORAGE int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0 15224 #define vandq_s32 _mm_and_si128 15225 15226 _NEON2SSESTORAGE int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 15227 #define vandq_s64 _mm_and_si128 15228 15229 _NEON2SSESTORAGE uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 15230 #define vandq_u8 _mm_and_si128 15231 15232 _NEON2SSESTORAGE uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 15233 #define vandq_u16 _mm_and_si128 15234 15235 _NEON2SSESTORAGE uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 15236 #define vandq_u32 _mm_and_si128 15237 15238 _NEON2SSESTORAGE uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 15239 #define vandq_u64 _mm_and_si128 15240 15241 //******************** Bitwise or ********************************* 15242 //****************************************************************** 15243 _NEON2SSESTORAGE int8x8_t vorr_s8(int8x8_t a, int8x8_t b); // VORR d0,d0,d0 15244 _NEON2SSE_INLINE int8x8_t vorr_s8(int8x8_t a, int8x8_t b) 15245 { 15246 int8x8_t res64; 15247 return64(_mm_or_si128(_pM128i(a),_pM128i(b))); 15248 } 15249 15250 15251 _NEON2SSESTORAGE int16x4_t vorr_s16(int16x4_t a, int16x4_t b); // VORR d0,d0,d0 15252 _NEON2SSE_INLINE int16x4_t vorr_s16(int16x4_t a, int16x4_t b) 15253 { 15254 int16x4_t res64; 15255 return64(_mm_or_si128(_pM128i(a),_pM128i(b))); 15256 } 15257 15258 15259 _NEON2SSESTORAGE int32x2_t vorr_s32(int32x2_t a, int32x2_t b); // VORR d0,d0,d0 15260 _NEON2SSE_INLINE int32x2_t vorr_s32(int32x2_t a, int32x2_t b) 15261 { 15262 int32x2_t res64; 15263 return64(_mm_or_si128(_pM128i(a),_pM128i(b))); 15264 } 15265 15266 15267 _NEON2SSESTORAGE int64x1_t vorr_s64(int64x1_t a, int64x1_t b); // VORR d0,d0,d0 15268 _NEON2SSE_INLINE int64x1_t vorr_s64(int64x1_t a, int64x1_t b) 15269 { 15270 int64x1_t res; 15271 res.m64_i64[0] = a.m64_i64[0] | b.m64_i64[0]; 15272 return res; 15273 } 15274 15275 _NEON2SSESTORAGE uint8x8_t vorr_u8(uint8x8_t a, uint8x8_t b); // VORR d0,d0,d0 15276 #define vorr_u8 vorr_s8 15277 15278 _NEON2SSESTORAGE uint16x4_t vorr_u16(uint16x4_t a, uint16x4_t b); // VORR d0,d0,d0 15279 #define vorr_u16 vorr_s16 15280 15281 _NEON2SSESTORAGE uint32x2_t vorr_u32(uint32x2_t a, uint32x2_t b); // VORR d0,d0,d0 15282 #define vorr_u32 vorr_s32 15283 15284 _NEON2SSESTORAGE uint64x1_t vorr_u64(uint64x1_t a, uint64x1_t b); // VORR d0,d0,d0 15285 #define vorr_u64 vorr_s64 15286 15287 _NEON2SSESTORAGE int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 15288 #define vorrq_s8 _mm_or_si128 15289 15290 _NEON2SSESTORAGE int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 15291 #define vorrq_s16 
_mm_or_si128 15292 15293 _NEON2SSESTORAGE int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 15294 #define vorrq_s32 _mm_or_si128 15295 15296 _NEON2SSESTORAGE int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 15297 #define vorrq_s64 _mm_or_si128 15298 15299 _NEON2SSESTORAGE uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 15300 #define vorrq_u8 _mm_or_si128 15301 15302 _NEON2SSESTORAGE uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 15303 #define vorrq_u16 _mm_or_si128 15304 15305 _NEON2SSESTORAGE uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 15306 #define vorrq_u32 _mm_or_si128 15307 15308 _NEON2SSESTORAGE uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 15309 #define vorrq_u64 _mm_or_si128 15310 15311 //************* Bitwise exclusive or (EOR or XOR) ****************** 15312 //******************************************************************* 15313 _NEON2SSESTORAGE int8x8_t veor_s8(int8x8_t a, int8x8_t b); // VEOR d0,d0,d0 15314 _NEON2SSE_INLINE int8x8_t veor_s8(int8x8_t a, int8x8_t b) 15315 { 15316 int8x8_t res64; 15317 return64(_mm_xor_si128(_pM128i(a),_pM128i(b))); 15318 } 15319 15320 _NEON2SSESTORAGE int16x4_t veor_s16(int16x4_t a, int16x4_t b); // VEOR d0,d0,d0 15321 #define veor_s16 veor_s8 15322 15323 _NEON2SSESTORAGE int32x2_t veor_s32(int32x2_t a, int32x2_t b); // VEOR d0,d0,d0 15324 #define veor_s32 veor_s8 15325 15326 _NEON2SSESTORAGE int64x1_t veor_s64(int64x1_t a, int64x1_t b); // VEOR d0,d0,d0 15327 _NEON2SSE_INLINE int64x1_t veor_s64(int64x1_t a, int64x1_t b) 15328 { 15329 int64x1_t res; 15330 res.m64_i64[0] = a.m64_i64[0] ^ b.m64_i64[0]; 15331 return res; 15332 } 15333 15334 _NEON2SSESTORAGE uint8x8_t veor_u8(uint8x8_t a, uint8x8_t b); // VEOR d0,d0,d0 15335 #define veor_u8 veor_s8 15336 15337 _NEON2SSESTORAGE uint16x4_t veor_u16(uint16x4_t a, uint16x4_t b); // VEOR d0,d0,d0 15338 #define veor_u16 veor_s16 15339 15340 _NEON2SSESTORAGE uint32x2_t veor_u32(uint32x2_t a, uint32x2_t b); // VEOR d0,d0,d0 15341 #define veor_u32 veor_s32 15342 15343 _NEON2SSESTORAGE uint64x1_t veor_u64(uint64x1_t a, uint64x1_t b); // VEOR d0,d0,d0 15344 #define veor_u64 veor_s64 15345 15346 _NEON2SSESTORAGE int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 15347 #define veorq_s8 _mm_xor_si128 15348 15349 _NEON2SSESTORAGE int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 15350 #define veorq_s16 _mm_xor_si128 15351 15352 _NEON2SSESTORAGE int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 15353 #define veorq_s32 _mm_xor_si128 15354 15355 _NEON2SSESTORAGE int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 15356 #define veorq_s64 _mm_xor_si128 15357 15358 _NEON2SSESTORAGE uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 15359 #define veorq_u8 _mm_xor_si128 15360 15361 _NEON2SSESTORAGE uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 15362 #define veorq_u16 _mm_xor_si128 15363 15364 _NEON2SSESTORAGE uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 15365 #define veorq_u32 _mm_xor_si128 15366 15367 _NEON2SSESTORAGE uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 15368 #define veorq_u64 _mm_xor_si128 15369 15370 //********************** Bit Clear ********************************** 15371 //******************************************************************* 15372 //Logical AND complement (AND negation or AND NOT) 15373 _NEON2SSESTORAGE int8x8_t vbic_s8(int8x8_t a, int8x8_t b); // 
VBIC d0,d0,d0 15374 _NEON2SSE_INLINE int8x8_t vbic_s8(int8x8_t a, int8x8_t b) 15375 { 15376 int8x8_t res64; 15377 return64(_mm_andnot_si128(_pM128i(b),_pM128i(a))); //notice the arguments "swap" 15378 } 15379 15380 _NEON2SSESTORAGE int16x4_t vbic_s16(int16x4_t a, int16x4_t b); // VBIC d0,d0,d0 15381 #define vbic_s16 vbic_s8 15382 15383 _NEON2SSESTORAGE int32x2_t vbic_s32(int32x2_t a, int32x2_t b); // VBIC d0,d0,d0 15384 #define vbic_s32 vbic_s8 15385 15386 _NEON2SSESTORAGE int64x1_t vbic_s64(int64x1_t a, int64x1_t b); // VBIC d0,d0,d0 15387 _NEON2SSE_INLINE int64x1_t vbic_s64(int64x1_t a, int64x1_t b) 15388 { 15389 int64x1_t res; 15390 res.m64_i64[0] = a.m64_i64[0] & (~b.m64_i64[0]); 15391 return res; 15392 } 15393 15394 _NEON2SSESTORAGE uint8x8_t vbic_u8(uint8x8_t a, uint8x8_t b); // VBIC d0,d0,d0 15395 #define vbic_u8 vbic_s8 15396 15397 _NEON2SSESTORAGE uint16x4_t vbic_u16(uint16x4_t a, uint16x4_t b); // VBIC d0,d0,d0 15398 #define vbic_u16 vbic_s16 15399 15400 _NEON2SSESTORAGE uint32x2_t vbic_u32(uint32x2_t a, uint32x2_t b); // VBIC d0,d0,d0 15401 #define vbic_u32 vbic_s32 15402 15403 _NEON2SSESTORAGE uint64x1_t vbic_u64(uint64x1_t a, uint64x1_t b); // VBIC d0,d0,d0 15404 #define vbic_u64 vbic_s64 15405 15406 _NEON2SSESTORAGE int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 15407 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15408 15409 _NEON2SSESTORAGE int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 15410 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15411 15412 _NEON2SSESTORAGE int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 15413 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15414 15415 _NEON2SSESTORAGE int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 15416 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15417 15418 _NEON2SSESTORAGE uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 15419 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15420 15421 _NEON2SSESTORAGE uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 15422 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15423 15424 _NEON2SSESTORAGE uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 15425 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15426 15427 _NEON2SSESTORAGE uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 15428 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 15429 15430 //**************** Bitwise OR complement ******************************** 15431 //**************************************** ******************************** 15432 //no exact IA 32 match, need to implement it as following 15433 _NEON2SSESTORAGE int8x8_t vorn_s8(int8x8_t a, int8x8_t b); // VORN d0,d0,d0 15434 _NEON2SSE_INLINE int8x8_t vorn_s8(int8x8_t a, int8x8_t b) 15435 { 15436 int8x8_t res64; 15437 return64(vornq_s8(_pM128i(a), _pM128i(b))); 15438 } 15439 15440 15441 _NEON2SSESTORAGE int16x4_t vorn_s16(int16x4_t a, int16x4_t b); // VORN d0,d0,d0 15442 _NEON2SSE_INLINE int16x4_t vorn_s16(int16x4_t a, int16x4_t b) 15443 { 15444 int16x4_t res64; 15445 return64(vornq_s16(_pM128i(a), _pM128i(b))); 15446 } 15447 15448 15449 _NEON2SSESTORAGE int32x2_t vorn_s32(int32x2_t a, int32x2_t b); // VORN d0,d0,d0 15450 _NEON2SSE_INLINE int32x2_t vorn_s32(int32x2_t a, int32x2_t b) 15451 { 15452 int32x2_t res64; 15453 return64(vornq_s32(_pM128i(a), 
_pM128i(b))); 15454 } 15455 15456 15457 _NEON2SSESTORAGE int64x1_t vorn_s64(int64x1_t a, int64x1_t b); // VORN d0,d0,d0 15458 _NEON2SSE_INLINE int64x1_t vorn_s64(int64x1_t a, int64x1_t b) 15459 { 15460 int64x1_t res; 15461 res.m64_i64[0] = a.m64_i64[0] | (~b.m64_i64[0]); 15462 return res; 15463 } 15464 15465 _NEON2SSESTORAGE uint8x8_t vorn_u8(uint8x8_t a, uint8x8_t b); // VORN d0,d0,d0 15466 #define vorn_u8 vorn_s8 15467 15468 15469 _NEON2SSESTORAGE uint16x4_t vorn_u16(uint16x4_t a, uint16x4_t b); // VORN d0,d0,d0 15470 #define vorn_u16 vorn_s16 15471 15472 _NEON2SSESTORAGE uint32x2_t vorn_u32(uint32x2_t a, uint32x2_t b); // VORN d0,d0,d0 15473 #define vorn_u32 vorn_s32 15474 15475 _NEON2SSESTORAGE uint64x1_t vorn_u64(uint64x1_t a, uint64x1_t b); // VORN d0,d0,d0 15476 #define vorn_u64 vorn_s64 15477 15478 15479 _NEON2SSESTORAGE int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 15480 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0 15481 { 15482 __m128i b1; 15483 b1 = vmvnq_s8( b); //bitwise not for b 15484 return _mm_or_si128 (a, b1); 15485 } 15486 15487 _NEON2SSESTORAGE int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 15488 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0 15489 { 15490 __m128i b1; 15491 b1 = vmvnq_s16( b); //bitwise not for b 15492 return _mm_or_si128 (a, b1); 15493 } 15494 15495 _NEON2SSESTORAGE int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 15496 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0 15497 { 15498 __m128i b1; 15499 b1 = vmvnq_s32( b); //bitwise not for b 15500 return _mm_or_si128 (a, b1); 15501 } 15502 15503 _NEON2SSESTORAGE int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 15504 _NEON2SSE_INLINE int64x2_t vornq_s64(int64x2_t a, int64x2_t b) 15505 { 15506 __m128i c1, b1; 15507 c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff 15508 b1 = _mm_andnot_si128 (b, c1); 15509 return _mm_or_si128 (a, b1); 15510 } 15511 15512 _NEON2SSESTORAGE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 15513 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0 15514 { 15515 __m128i b1; 15516 b1 = vmvnq_u8( b); //bitwise not for b 15517 return _mm_or_si128 (a, b1); 15518 } 15519 15520 _NEON2SSESTORAGE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 15521 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0 15522 { 15523 __m128i b1; 15524 b1 = vmvnq_s16( b); //bitwise not for b 15525 return _mm_or_si128 (a, b1); 15526 } 15527 15528 _NEON2SSESTORAGE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 15529 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0 15530 { 15531 __m128i b1; 15532 b1 = vmvnq_u32( b); //bitwise not for b 15533 return _mm_or_si128 (a, b1); 15534 } 15535 _NEON2SSESTORAGE uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 15536 #define vornq_u64 vornq_s64 15537 15538 //********************* Bitwise Select ***************************** 15539 //****************************************************************** 15540 //Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????) 15541 15542 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the 15543 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0. 
15544 15545 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination 15546 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged 15547 15548 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination 15549 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged. 15550 15551 //VBSL only is implemented for SIMD 15552 _NEON2SSESTORAGE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c); // VBSL d0,d0,d0 15553 _NEON2SSE_INLINE int8x8_t vbsl_s8(uint8x8_t a, int8x8_t b, int8x8_t c) 15554 { 15555 int8x8_t res64; 15556 __m128i res; 15557 res = vbslq_s8(_pM128i(a), _pM128i(b), _pM128i(c)); 15558 return64(res); 15559 } 15560 15561 _NEON2SSESTORAGE int16x4_t vbsl_s16(uint16x4_t a, int16x4_t b, int16x4_t c); // VBSL d0,d0,d0 15562 #define vbsl_s16 vbsl_s8 15563 15564 _NEON2SSESTORAGE int32x2_t vbsl_s32(uint32x2_t a, int32x2_t b, int32x2_t c); // VBSL d0,d0,d0 15565 #define vbsl_s32 vbsl_s8 15566 15567 _NEON2SSESTORAGE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c); // VBSL d0,d0,d0 15568 _NEON2SSE_INLINE int64x1_t vbsl_s64(uint64x1_t a, int64x1_t b, int64x1_t c) 15569 { 15570 int64x1_t res; 15571 res.m64_i64[0] = (a.m64_i64[0] & b.m64_i64[0]) | ( (~a.m64_i64[0]) & c.m64_i64[0]); 15572 return res; 15573 } 15574 15575 _NEON2SSESTORAGE uint8x8_t vbsl_u8(uint8x8_t a, uint8x8_t b, uint8x8_t c); // VBSL d0,d0,d0 15576 #define vbsl_u8 vbsl_s8 15577 15578 _NEON2SSESTORAGE uint16x4_t vbsl_u16(uint16x4_t a, uint16x4_t b, uint16x4_t c); // VBSL d0,d0,d0 15579 #define vbsl_u16 vbsl_s8 15580 15581 _NEON2SSESTORAGE uint32x2_t vbsl_u32(uint32x2_t a, uint32x2_t b, uint32x2_t c); // VBSL d0,d0,d0 15582 #define vbsl_u32 vbsl_s8 15583 15584 _NEON2SSESTORAGE uint64x1_t vbsl_u64(uint64x1_t a, uint64x1_t b, uint64x1_t c); // VBSL d0,d0,d0 15585 #define vbsl_u64 vbsl_s64 15586 15587 _NEON2SSESTORAGE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c); // VBSL d0,d0,d0 15588 _NEON2SSE_INLINE float32x2_t vbsl_f32(uint32x2_t a, float32x2_t b, float32x2_t c) 15589 { 15590 __m128 sel1, sel2; 15591 __m64_128 res64; 15592 sel1 = _mm_and_ps (_pM128(a), _pM128(b)); 15593 sel2 = _mm_andnot_ps (_pM128(a), _pM128(c)); 15594 sel1 = _mm_or_ps (sel1, sel2); 15595 _M64f(res64, sel1); 15596 return res64; 15597 } 15598 15599 _NEON2SSESTORAGE poly8x8_t vbsl_p8(uint8x8_t a, poly8x8_t b, poly8x8_t c); // VBSL d0,d0,d0 15600 #define vbsl_p8 vbsl_s8 15601 15602 _NEON2SSESTORAGE poly16x4_t vbsl_p16(uint16x4_t a, poly16x4_t b, poly16x4_t c); // VBSL d0,d0,d0 15603 #define vbsl_p16 vbsl_s8 15604 15605 _NEON2SSESTORAGE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 15606 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0 15607 { 15608 __m128i sel1, sel2; 15609 sel1 = _mm_and_si128 (a, b); 15610 sel2 = _mm_andnot_si128 (a, c); 15611 return _mm_or_si128 (sel1, sel2); 15612 } 15613 15614 _NEON2SSESTORAGE int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 15615 #define vbslq_s16 vbslq_s8 15616 15617 _NEON2SSESTORAGE int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 15618 #define vbslq_s32 vbslq_s8 15619 15620 _NEON2SSESTORAGE int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 15621 #define vbslq_s64 vbslq_s8 15622 15623 _NEON2SSESTORAGE uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 
15624 #define vbslq_u8 vbslq_s8 15625 15626 _NEON2SSESTORAGE uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 15627 #define vbslq_u16 vbslq_s8 15628 15629 _NEON2SSESTORAGE uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 15630 #define vbslq_u32 vbslq_s8 15631 15632 _NEON2SSESTORAGE uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 15633 #define vbslq_u64 vbslq_s8 15634 15635 _NEON2SSESTORAGE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 15636 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0 15637 { 15638 __m128 sel1, sel2; 15639 sel1 = _mm_and_ps (*(__m128*)&a, b); 15640 sel2 = _mm_andnot_ps (*(__m128*)&a, c); 15641 return _mm_or_ps (sel1, sel2); 15642 } 15643 15644 _NEON2SSESTORAGE poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 15645 #define vbslq_p8 vbslq_u8 15646 15647 _NEON2SSESTORAGE poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 15648 #define vbslq_p16 vbslq_s8 15649 15650 //************************************************************************************ 15651 //**************** Transposition operations **************************************** 15652 //************************************************************************************ 15653 //***************** Vector Transpose ************************************************ 15654 //************************************************************************************ 15655 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices. 15656 // making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....) 
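//A scalar reference sketch of the 2 x 2 transposition described above (not part of the original header):
//for every even index i the pair handling produces outputs (a0,b0,a2,b2,...) and (a1,b1,a3,b3,...).
//The guard macro NEON2SSE_EXAMPLES and the function name are illustrative assumptions only.
#ifdef NEON2SSE_EXAMPLES
_NEON2SSE_INLINE void example_vtrn_reference(const int16_t a[4], const int16_t b[4], int16_t out0[4], int16_t out1[4])
{
    int i;
    for (i = 0; i < 4; i += 2) {
        out0[i] = a[i];     out0[i + 1] = b[i];     //(a0, b0, a2, b2)
        out1[i] = a[i + 1]; out1[i + 1] = b[i + 1]; //(a1, b1, a3, b3)
    }
}
#endif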
15657 _NEON2SSESTORAGE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b); // VTRN.8 d0,d0 15658 _NEON2SSE_INLINE int8x8x2_t vtrn_s8(int8x8_t a, int8x8_t b) // VTRN.8 d0,d0 15659 { 15660 int8x8x2_t val; 15661 __m128i tmp, val0; 15662 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 15663 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)mask8_32_even_odd); //(a0, b0, a2, b2, a4, b4, a6, b6), (a1,b1, a3,b3, a5,b5, a7,b7) 15664 vst1q_s8 (val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3, a5,b5, a7,b7),(a0, b0, a2, b2, a4, b4, a6, b6), 15665 return val; 15666 } 15667 15668 _NEON2SSESTORAGE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b); // VTRN.16 d0,d0 15669 _NEON2SSE_INLINE int16x4x2_t vtrn_s16(int16x4_t a, int16x4_t b) // VTRN.16 d0,d0 15670 { 15671 int16x4x2_t val; 15672 __m128i tmp, val0; 15673 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 2,3, 8,9, 10,11, 4,5, 6,7, 12,13, 14, 15}; 15674 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 15675 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0, b0, a2, b2, a1,b1, a3, b3 15676 vst1q_s16(val.val, val0); // _mm_shuffle_epi32 (val.val[0], _SWAP_HI_LOW32); //(a1,b1, a3,b3),(a0, b0, a2, b2), 15677 return val; 15678 } 15679 15680 _NEON2SSESTORAGE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b); // VTRN.32 d0,d0 15681 _NEON2SSE_INLINE int32x2x2_t vtrn_s32(int32x2_t a, int32x2_t b) 15682 { 15683 int32x2x2_t val; 15684 __m128i val0; 15685 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1 15686 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); //a1,b1, a0,b0, 15687 return val; 15688 } 15689 15690 _NEON2SSESTORAGE uint8x8x2_t vtrn_u8(uint8x8_t a, uint8x8_t b); // VTRN.8 d0,d0 15691 #define vtrn_u8 vtrn_s8 15692 15693 _NEON2SSESTORAGE uint16x4x2_t vtrn_u16(uint16x4_t a, uint16x4_t b); // VTRN.16 d0,d0 15694 #define vtrn_u16 vtrn_s16 15695 15696 _NEON2SSESTORAGE uint32x2x2_t vtrn_u32(uint32x2_t a, uint32x2_t b); // VTRN.32 d0,d0 15697 #define vtrn_u32 vtrn_s32 15698 15699 _NEON2SSESTORAGE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b); // VTRN.32 d0,d0 15700 _NEON2SSE_INLINE float32x2x2_t vtrn_f32(float32x2_t a, float32x2_t b) 15701 { 15702 float32x2x2_t val; 15703 val.val[0].m64_f32[0] = a.m64_f32[0]; 15704 val.val[0].m64_f32[1] = b.m64_f32[0]; 15705 val.val[1].m64_f32[0] = a.m64_f32[1]; 15706 val.val[1].m64_f32[1] = b.m64_f32[1]; 15707 return val; //a0,b0,a1,b1 15708 } 15709 15710 _NEON2SSESTORAGE poly8x8x2_t vtrn_p8(poly8x8_t a, poly8x8_t b); // VTRN.8 d0,d0 15711 #define vtrn_p8 vtrn_u8 15712 15713 _NEON2SSESTORAGE poly16x4x2_t vtrn_p16(poly16x4_t a, poly16x4_t b); // VTRN.16 d0,d0 15714 #define vtrn_p16 vtrn_s16 15715 15716 //int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 15717 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0 15718 { 15719 int8x16x2_t r8x16; 15720 __m128i a_sh, b_sh; 15721 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 15722 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 15723 15724 r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14) 15725 r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15) 15726 return r8x16; 15727 } 15728 15729 _NEON2SSESTORAGE 
int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 15730 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0 15731 { 15732 int16x8x2_t v16x8; 15733 __m128i a_sh, b_sh; 15734 a_sh = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 15735 b_sh = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 15736 v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6 15737 v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7 15738 return v16x8; 15739 } 15740 15741 _NEON2SSESTORAGE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 15742 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0 15743 { 15744 //may not be the optimal solution compared with a serial implementation 15745 int32x4x2_t v32x4; 15746 __m128i a_sh, b_sh; 15747 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 15748 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 15749 15750 v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2 15751 v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3 15752 return v32x4; 15753 } 15754 15755 _NEON2SSESTORAGE uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 15756 #define vtrnq_u8 vtrnq_s8 15757 15758 _NEON2SSESTORAGE uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 15759 #define vtrnq_u16 vtrnq_s16 15760 15761 _NEON2SSESTORAGE uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 15762 #define vtrnq_u32 vtrnq_s32 15763 15764 _NEON2SSESTORAGE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 15765 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0 15766 { 15767 //may not be the optimal solution compared with a serial implementation 15768 float32x4x2_t f32x4; 15769 __m128 a_sh, b_sh; 15770 a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endianness 15771 b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endianness 15772 15773 f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2 15774 f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3 15775 return f32x4; 15776 } 15777 15778 _NEON2SSESTORAGE poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 15779 #define vtrnq_p8 vtrnq_s8 15780 15781 _NEON2SSESTORAGE poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 15782 #define vtrnq_p16 vtrnq_s16 15783 15784 //***************** Interleave elements *************************** 15785 //***************************************************************** 15786 //output has (a0,b0,a1,b1, a2,b2,.....)
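//Illustrative usage sketch (not part of the original header): vzipq_s16 interleaves two vectors lane by lane,
//which maps directly to the _mm_unpacklo/_mm_unpackhi pair used below. The names (da, db, va, vb, z) are hypothetical.
//
//    _NEON2SSE_ALIGN_16 int16_t da[8] = {0, 1, 2, 3, 4, 5, 6, 7};
//    _NEON2SSE_ALIGN_16 int16_t db[8] = {10, 11, 12, 13, 14, 15, 16, 17};
//    int16x8_t va = vld1q_s16(da);
//    int16x8_t vb = vld1q_s16(db);
//    int16x8x2_t z = vzipq_s16(va, vb);
//    //z.val[0] = {0,10, 1,11, 2,12, 3,13}, z.val[1] = {4,14, 5,15, 6,16, 7,17}
//    //vuzpq_s16(z.val[0], z.val[1]) de-interleaves back to the original va and vb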
15787 _NEON2SSESTORAGE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b); // VZIP.8 d0,d0 15788 _NEON2SSE_INLINE int8x8x2_t vzip_s8(int8x8_t a, int8x8_t b) // VZIP.8 d0,d0 15789 { 15790 int8x8x2_t val; 15791 __m128i val0; 15792 val0 = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); 15793 vst1q_s8(val.val, val0); //_mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); 15794 return val; 15795 } 15796 15797 _NEON2SSESTORAGE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b); // VZIP.16 d0,d0 15798 _NEON2SSE_INLINE int16x4x2_t vzip_s16(int16x4_t a, int16x4_t b) // VZIP.16 d0,d0 15799 { 15800 int16x4x2_t val; 15801 __m128i val0; 15802 val0 = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); 15803 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); 15804 return val; 15805 } 15806 15807 _NEON2SSESTORAGE int32x2x2_t vzip_s32(int32x2_t a, int32x2_t b); // VZIP.32 d0,d0 15808 #define vzip_s32 vtrn_s32 15809 15810 _NEON2SSESTORAGE uint8x8x2_t vzip_u8(uint8x8_t a, uint8x8_t b); // VZIP.8 d0,d0 15811 #define vzip_u8 vzip_s8 15812 15813 _NEON2SSESTORAGE uint16x4x2_t vzip_u16(uint16x4_t a, uint16x4_t b); // VZIP.16 d0,d0 15814 #define vzip_u16 vzip_s16 15815 15816 _NEON2SSESTORAGE uint32x2x2_t vzip_u32(uint32x2_t a, uint32x2_t b); // VZIP.32 d0,d0 15817 #define vzip_u32 vzip_s32 15818 15819 _NEON2SSESTORAGE float32x2x2_t vzip_f32(float32x2_t a, float32x2_t b); // VZIP.32 d0,d0 15820 #define vzip_f32 vtrn_f32 15821 15822 _NEON2SSESTORAGE poly8x8x2_t vzip_p8(poly8x8_t a, poly8x8_t b); // VZIP.8 d0,d0 15823 #define vzip_p8 vzip_u8 15824 15825 _NEON2SSESTORAGE poly16x4x2_t vzip_p16(poly16x4_t a, poly16x4_t b); // VZIP.16 d0,d0 15826 #define vzip_p16 vzip_u16 15827 15828 _NEON2SSESTORAGE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 15829 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0 15830 { 15831 int8x16x2_t r8x16; 15832 r8x16.val[0] = _mm_unpacklo_epi8(a, b); 15833 r8x16.val[1] = _mm_unpackhi_epi8(a, b); 15834 return r8x16; 15835 } 15836 15837 _NEON2SSESTORAGE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 15838 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0 15839 { 15840 int16x8x2_t r16x8; 15841 r16x8.val[0] = _mm_unpacklo_epi16(a, b); 15842 r16x8.val[1] = _mm_unpackhi_epi16(a, b); 15843 return r16x8; 15844 } 15845 15846 _NEON2SSESTORAGE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 15847 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0 15848 { 15849 int32x4x2_t r32x4; 15850 r32x4.val[0] = _mm_unpacklo_epi32(a, b); 15851 r32x4.val[1] = _mm_unpackhi_epi32(a, b); 15852 return r32x4; 15853 } 15854 15855 _NEON2SSESTORAGE uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 15856 #define vzipq_u8 vzipq_s8 15857 15858 _NEON2SSESTORAGE uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 15859 #define vzipq_u16 vzipq_s16 15860 15861 _NEON2SSESTORAGE uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 15862 #define vzipq_u32 vzipq_s32 15863 15864 _NEON2SSESTORAGE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 15865 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0 15866 { 15867 float32x4x2_t f32x4; 15868 f32x4.val[0] = _mm_unpacklo_ps ( a, b); 15869 f32x4.val[1] = _mm_unpackhi_ps ( a, b); 15870 return f32x4; 15871 } 15872 15873 _NEON2SSESTORAGE poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 15874 #define vzipq_p8 vzipq_u8 15875 15876 
_NEON2SSESTORAGE poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 15877 #define vzipq_p16 vzipq_u16 15878 15879 //*********************** De-Interleave elements ************************* 15880 //************************************************************************* 15881 //As a result of these functions, the first val contains (a0,a2,a4,....,b0,b2,b4,...) and the second val contains (a1,a3,a5,....,b1,b3,b5,...) 15882 //there are no such functions in IA32 SIMD, so a shuffle is required 15883 _NEON2SSESTORAGE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b); // VUZP.8 d0,d0 15884 _NEON2SSE_INLINE int8x8x2_t vuzp_s8(int8x8_t a, int8x8_t b) // VUZP.8 d0,d0 15885 { 15886 int8x8x2_t val; 15887 __m128i tmp, val0; 15888 _NEON2SSE_ALIGN_16 static const int8_t maskdlv8[16] = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11,15}; 15889 tmp = _mm_unpacklo_epi8(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3,...,a7,b7 15890 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv8); //(a0, a2, a4, a6, b0, b2, b4, b6), (a1, a3, a5, a7, b1,b3, b5, b7) 15891 vst1q_s8(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); 15892 return val; 15893 } 15894 15895 _NEON2SSESTORAGE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b); // VUZP.16 d0,d0 15896 _NEON2SSE_INLINE int16x4x2_t vuzp_s16(int16x4_t a, int16x4_t b) // VUZP.16 d0,d0 15897 { 15898 int16x4x2_t val; 15899 __m128i tmp, val0; 15900 _NEON2SSE_ALIGN_16 static const int8_t maskdlv16[16] = {0,1, 8,9, 2,3, 10,11, 4,5, 12,13, 6,7, 14,15}; 15901 tmp = _mm_unpacklo_epi16(_pM128i(a), _pM128i(b)); //a0,b0,a1,b1,a2,b2,a3,b3 15902 val0 = _mm_shuffle_epi8 (tmp, *(__m128i*)maskdlv16); //a0,a2, b0, b2, a1,a3, b1,b3 15903 vst1q_s16(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); 15904 return val; 15905 } 15906 15907 _NEON2SSESTORAGE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b); // VUZP.32 d0,d0 15908 _NEON2SSE_INLINE int32x2x2_t vuzp_s32(int32x2_t a, int32x2_t b) // VUZP.32 d0,d0 15909 { 15910 int32x2x2_t val; 15911 __m128i val0; 15912 val0 = _mm_unpacklo_epi32(_pM128i(a), _pM128i(b)); //a0,b0, a1,b1 15913 vst1q_s32(val.val, val0); // _mm_shuffle_epi32(val.val[0], _SWAP_HI_LOW32); 15914 return val; 15915 } 15916 15917 _NEON2SSESTORAGE uint8x8x2_t vuzp_u8(uint8x8_t a, uint8x8_t b); // VUZP.8 d0,d0 15918 #define vuzp_u8 vuzp_s8 15919 15920 _NEON2SSESTORAGE uint16x4x2_t vuzp_u16(uint16x4_t a, uint16x4_t b); // VUZP.16 d0,d0 15921 #define vuzp_u16 vuzp_s16 15922 15923 _NEON2SSESTORAGE uint32x2x2_t vuzp_u32(uint32x2_t a, uint32x2_t b); // VUZP.32 d0,d0 15924 #define vuzp_u32 vuzp_s32 15925 15926 _NEON2SSESTORAGE float32x2x2_t vuzp_f32(float32x2_t a, float32x2_t b); // VUZP.32 d0,d0 15927 #define vuzp_f32 vzip_f32 15928 15929 _NEON2SSESTORAGE poly8x8x2_t vuzp_p8(poly8x8_t a, poly8x8_t b); // VUZP.8 d0,d0 15930 #define vuzp_p8 vuzp_u8 15931 15932 _NEON2SSESTORAGE poly16x4x2_t vuzp_p16(poly16x4_t a, poly16x4_t b); // VUZP.16 d0,d0 15933 #define vuzp_p16 vuzp_u16 15934 15935 _NEON2SSESTORAGE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 15936 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0 15937 { 15938 int8x16x2_t v8x16; 15939 __m128i a_sh, b_sh; 15940 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_16_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 15941 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_16_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 15942 //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b
15943 v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14, 15944 v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15 15945 return v8x16; 15946 } 15947 15948 _NEON2SSESTORAGE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 15949 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0 15950 { 15951 int16x8x2_t v16x8; 15952 __m128i a_sh, b_sh; 15953 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 15954 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_32_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 15955 v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6 15956 v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7 15957 return v16x8; 15958 } 15959 15960 _NEON2SSESTORAGE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 15961 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0 15962 { 15963 //may not be the optimal solution compared with a serial implementation 15964 int32x4x2_t v32x4; 15965 __m128i a_sh, b_sh; 15966 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 15967 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 15968 15969 v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2 15970 v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3 15971 return v32x4; 15972 } 15973 15974 _NEON2SSESTORAGE uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 15975 #define vuzpq_u8 vuzpq_s8 15976 15977 _NEON2SSESTORAGE uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 15978 #define vuzpq_u16 vuzpq_s16 15979 15980 _NEON2SSESTORAGE uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 15981 #define vuzpq_u32 vuzpq_s32 15982 15983 _NEON2SSESTORAGE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 15984 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0 15985 { 15986 float32x4x2_t v32x4; 15987 v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2, need to check endianness however 15988 v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianness however 15989 return v32x4; 15990 } 15991 15992 _NEON2SSESTORAGE poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 15993 #define vuzpq_p8 vuzpq_u8 15994 15995 _NEON2SSESTORAGE poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 15996 #define vuzpq_p16 vuzpq_u16 15997 15998 //############################################################################################## 15999 //*********************** Reinterpret cast intrinsics.****************************************** 16000 //############################################################################################## 16001 // Not a part of the official NEON instruction set but available in the gcc compiler ********************* 16002 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u32 (uint32x2_t t); 16003 #define vreinterpret_p8_u32 16004 16005 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u16 (uint16x4_t t); 16006 #define vreinterpret_p8_u16 16007 16008 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u8 (uint8x8_t t); 16009 #define vreinterpret_p8_u8 16010 16011 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s32 (int32x2_t t); 16012 #define vreinterpret_p8_s32 16013 16014 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s16
(int16x4_t t); 16015 #define vreinterpret_p8_s16 16016 16017 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s8 (int8x8_t t); 16018 #define vreinterpret_p8_s8 16019 16020 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_u64 (uint64x1_t t); 16021 #define vreinterpret_p8_u64 16022 16023 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_s64 (int64x1_t t); 16024 #define vreinterpret_p8_s64 16025 16026 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_f32 (float32x2_t t); 16027 #define vreinterpret_p8_f32 16028 16029 _NEON2SSESTORAGE poly8x8_t vreinterpret_p8_p16 (poly16x4_t t); 16030 #define vreinterpret_p8_p16 16031 16032 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t); 16033 #define vreinterpretq_p8_u32 16034 16035 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t); 16036 #define vreinterpretq_p8_u16 16037 16038 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t); 16039 #define vreinterpretq_p8_u8 16040 16041 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s32 (int32x4_t t); 16042 #define vreinterpretq_p8_s32 16043 16044 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s16 (int16x8_t t); 16045 #define vreinterpretq_p8_s16 16046 16047 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s8 (int8x16_t t); 16048 #define vreinterpretq_p8_s8 16049 16050 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t); 16051 #define vreinterpretq_p8_u64 16052 16053 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_s64 (int64x2_t t); 16054 #define vreinterpretq_p8_s64 16055 16056 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_f32 (float32x4_t t); 16057 #define vreinterpretq_p8_f32(t) _M128i(t) 16058 16059 _NEON2SSESTORAGE poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t); 16060 #define vreinterpretq_p8_p16 16061 16062 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u32 (uint32x2_t t); 16063 #define vreinterpret_p16_u32 16064 16065 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u16 (uint16x4_t t); 16066 #define vreinterpret_p16_u16 16067 16068 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u8 (uint8x8_t t); 16069 #define vreinterpret_p16_u8 16070 16071 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s32 (int32x2_t t); 16072 #define vreinterpret_p16_s32 16073 16074 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s16 (int16x4_t t); 16075 #define vreinterpret_p16_s16 16076 16077 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s8 (int8x8_t t); 16078 #define vreinterpret_p16_s8 16079 16080 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_u64 (uint64x1_t t); 16081 #define vreinterpret_p16_u64 16082 16083 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_s64 (int64x1_t t); 16084 #define vreinterpret_p16_s64 16085 16086 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_f32 (float32x2_t t); 16087 #define vreinterpret_p16_f32 16088 16089 _NEON2SSESTORAGE poly16x4_t vreinterpret_p16_p8 (poly8x8_t t); 16090 #define vreinterpret_p16_p8 16091 16092 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t); 16093 #define vreinterpretq_p16_u32 16094 16095 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t); 16096 #define vreinterpretq_p16_u16 16097 16098 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s32 (int32x4_t t); 16099 #define vreinterpretq_p16_s32 16100 16101 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s16 (int16x8_t t); 16102 #define vreinterpretq_p16_s16 16103 16104 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_s8 (int8x16_t t); 16105 #define vreinterpretq_p16_s8 16106 16107 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t); 16108 #define vreinterpretq_p16_u64 16109 16110 _NEON2SSESTORAGE poly16x8_t 
vreinterpretq_p16_s64 (int64x2_t t); 16111 #define vreinterpretq_p16_s64 16112 16113 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_f32 (float32x4_t t); 16114 #define vreinterpretq_p16_f32(t) _M128i(t) 16115 16116 _NEON2SSESTORAGE poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t); 16117 #define vreinterpretq_p16_p8 vreinterpretq_s16_p8 16118 16119 //**** Integer to float ****** 16120 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u32 (uint32x2_t t); 16121 _NEON2SSE_INLINE float32x2_t vreinterpret_f32_u32 (uint32x2_t t) 16122 { 16123 return (*(__m64_128*)&(t)); 16124 } 16125 16126 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u16 (uint16x4_t t); 16127 #define vreinterpret_f32_u16 vreinterpret_f32_u32 16128 16129 16130 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u8 (uint8x8_t t); 16131 #define vreinterpret_f32_u8 vreinterpret_f32_u32 16132 16133 16134 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s32 (int32x2_t t); 16135 #define vreinterpret_f32_s32 vreinterpret_f32_u32 16136 16137 16138 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s16 (int16x4_t t); 16139 #define vreinterpret_f32_s16 vreinterpret_f32_u32 16140 16141 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s8 (int8x8_t t); 16142 #define vreinterpret_f32_s8 vreinterpret_f32_u32 16143 16144 16145 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_u64(uint64x1_t t); 16146 #define vreinterpret_f32_u64 vreinterpret_f32_u32 16147 16148 16149 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_s64 (int64x1_t t); 16150 #define vreinterpret_f32_s64 vreinterpret_f32_u32 16151 16152 16153 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p16 (poly16x4_t t); 16154 #define vreinterpret_f32_p16 vreinterpret_f32_u32 16155 16156 _NEON2SSESTORAGE float32x2_t vreinterpret_f32_p8 (poly8x8_t t); 16157 #define vreinterpret_f32_p8 vreinterpret_f32_u32 16158 16159 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u32 (uint32x4_t t); 16160 #define vreinterpretq_f32_u32(t) _M128(t) 16161 16162 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u16 (uint16x8_t t); 16163 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32 16164 16165 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u8 (uint8x16_t t); 16166 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32 16167 16168 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s32 (int32x4_t t); 16169 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32 16170 16171 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s16 (int16x8_t t); 16172 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32 16173 16174 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s8 (int8x16_t t); 16175 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32 16176 16177 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_u64 (uint64x2_t t); 16178 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32 16179 16180 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_s64 (int64x2_t t); 16181 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32 16182 16183 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p16 (poly16x8_t t); 16184 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32 16185 16186 _NEON2SSESTORAGE float32x4_t vreinterpretq_f32_p8 (poly8x16_t t); 16187 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32 16188 16189 //*** Integer type conversions ****************** 16190 //no conversion necessary for the following functions because it is same data type 16191 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u32 (uint32x2_t t); 16192 #define vreinterpret_s64_u32 16193 16194 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u16 (uint16x4_t t); 16195 #define vreinterpret_s64_u16 16196 16197 _NEON2SSESTORAGE int64x1_t 
vreinterpret_s64_u8 (uint8x8_t t); 16198 #define vreinterpret_s64_u8 16199 16200 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s32 (int32x2_t t); 16201 #define vreinterpret_s64_s32 16202 16203 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s16 (int16x4_t t); 16204 #define vreinterpret_s64_s16 16205 16206 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_s8 (int8x8_t t); 16207 #define vreinterpret_s64_s8 16208 16209 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_u64 (uint64x1_t t); 16210 #define vreinterpret_s64_u64 16211 16212 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_f32 (float32x2_t t); 16213 #define vreinterpret_s64_f32 16214 16215 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p16 (poly16x4_t t); 16216 #define vreinterpret_s64_p16 16217 16218 _NEON2SSESTORAGE int64x1_t vreinterpret_s64_p8 (poly8x8_t t); 16219 #define vreinterpret_s64_p8 16220 16221 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u32 (uint32x4_t t); 16222 #define vreinterpretq_s64_u32 16223 16224 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u16 (uint16x8_t t); 16225 #define vreinterpretq_s64_u16 16226 16227 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u8 (uint8x16_t t); 16228 #define vreinterpretq_s64_u8 16229 16230 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s32 (int32x4_t t); 16231 #define vreinterpretq_s64_s32 16232 16233 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s16 (int16x8_t t); 16234 #define vreinterpretq_s64_s16 16235 16236 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_s8 (int8x16_t t); 16237 #define vreinterpretq_s64_s8 16238 16239 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_u64 (uint64x2_t t); 16240 #define vreinterpretq_s64_u64 16241 16242 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_f32 (float32x4_t t); 16243 #define vreinterpretq_s64_f32(t) _M128i(t) 16244 16245 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p16 (poly16x8_t t); 16246 #define vreinterpretq_s64_p16 16247 16248 _NEON2SSESTORAGE int64x2_t vreinterpretq_s64_p8 (poly8x16_t t); 16249 #define vreinterpretq_s64_p8 16250 16251 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u32 (uint32x2_t t); 16252 #define vreinterpret_u64_u32 16253 16254 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u16 (uint16x4_t t); 16255 #define vreinterpret_u64_u16 16256 16257 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_u8 (uint8x8_t t); 16258 #define vreinterpret_u64_u8 16259 16260 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s32 (int32x2_t t); 16261 #define vreinterpret_u64_s32 16262 16263 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s16 (int16x4_t t); 16264 #define vreinterpret_u64_s16 16265 16266 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s8 (int8x8_t t); 16267 #define vreinterpret_u64_s8 16268 16269 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_s64 (int64x1_t t); 16270 #define vreinterpret_u64_s64 16271 16272 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_f32 (float32x2_t t); 16273 #define vreinterpret_u64_f32 16274 16275 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p16 (poly16x4_t t); 16276 #define vreinterpret_u64_p16 16277 16278 _NEON2SSESTORAGE uint64x1_t vreinterpret_u64_p8 (poly8x8_t t); 16279 #define vreinterpret_u64_p8 16280 16281 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t); 16282 #define vreinterpretq_u64_u32 16283 16284 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t); 16285 #define vreinterpretq_u64_u16 16286 16287 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t); 16288 #define vreinterpretq_u64_u8 16289 16290 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s32 (int32x4_t t); 16291 #define vreinterpretq_u64_s32 16292 16293
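//Illustrative usage sketch (not part of the original header): the vreinterpret* macros in this section change
//only the element type, never the underlying 64 or 128 bits. Integer <-> integer casts expand to nothing, while
//the float <-> integer q-forms expand to the _M128 / _M128i bit-pattern casts. Names (bits, f, back) are hypothetical.
//
//    uint32x4_t bits = vdupq_n_u32(0x3F800000); //the bit pattern of 1.0f in every lane
//    float32x4_t f = vreinterpretq_f32_u32(bits); //same 128 bits, now read as {1.0f, 1.0f, 1.0f, 1.0f}
//    uint32x4_t back = vreinterpretq_u32_f32(f); //the round trip restores the original bit pattern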
_NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s16 (int16x8_t t); 16294 #define vreinterpretq_u64_s16 16295 16296 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s8 (int8x16_t t); 16297 #define vreinterpretq_u64_s8 16298 16299 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_s64 (int64x2_t t); 16300 #define vreinterpretq_u64_s64 16301 16302 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_f32 (float32x4_t t); 16303 #define vreinterpretq_u64_f32(t) _M128i(t) 16304 16305 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t); 16306 #define vreinterpretq_u64_p16 16307 16308 _NEON2SSESTORAGE uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t); 16309 #define vreinterpretq_u64_p8 16310 16311 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u32 (uint32x2_t t); 16312 #define vreinterpret_s8_u32 16313 16314 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u16 (uint16x4_t t); 16315 #define vreinterpret_s8_u16 16316 16317 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u8 (uint8x8_t t); 16318 #define vreinterpret_s8_u8 16319 16320 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s32 (int32x2_t t); 16321 #define vreinterpret_s8_s32 16322 16323 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s16 (int16x4_t t); 16324 #define vreinterpret_s8_s16 16325 16326 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_u64 (uint64x1_t t); 16327 #define vreinterpret_s8_u64 16328 16329 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_s64 (int64x1_t t); 16330 #define vreinterpret_s8_s64 16331 16332 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_f32 (float32x2_t t); 16333 #define vreinterpret_s8_f32 16334 16335 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p16 (poly16x4_t t); 16336 #define vreinterpret_s8_p16 16337 16338 _NEON2SSESTORAGE int8x8_t vreinterpret_s8_p8 (poly8x8_t t); 16339 #define vreinterpret_s8_p8 16340 16341 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u32 (uint32x4_t t); 16342 #define vreinterpretq_s8_u32 16343 16344 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u16 (uint16x8_t t); 16345 #define vreinterpretq_s8_u16 16346 16347 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u8 (uint8x16_t t); 16348 #define vreinterpretq_s8_u8 16349 16350 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s32 (int32x4_t t); 16351 #define vreinterpretq_s8_s32 16352 16353 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s16 (int16x8_t t); 16354 #define vreinterpretq_s8_s16 16355 16356 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_u64 (uint64x2_t t); 16357 #define vreinterpretq_s8_u64 16358 16359 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_s64 (int64x2_t t); 16360 #define vreinterpretq_s8_s64 16361 16362 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_f32 (float32x4_t t); 16363 #define vreinterpretq_s8_f32(t) _M128i(t) 16364 16365 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p16 (poly16x8_t t); 16366 #define vreinterpretq_s8_p16 16367 16368 _NEON2SSESTORAGE int8x16_t vreinterpretq_s8_p8 (poly8x16_t t); 16369 #define vreinterpretq_s8_p8 16370 16371 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u32 (uint32x2_t t); 16372 #define vreinterpret_s16_u32 16373 16374 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u16 (uint16x4_t t); 16375 #define vreinterpret_s16_u16 16376 16377 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u8 (uint8x8_t t); 16378 #define vreinterpret_s16_u8 16379 16380 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s32 (int32x2_t t); 16381 #define vreinterpret_s16_s32 16382 16383 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_s8 (int8x8_t t); 16384 #define vreinterpret_s16_s8 16385 16386 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_u64 (uint64x1_t t); 16387 #define vreinterpret_s16_u64 16388 16389 _NEON2SSESTORAGE 
int16x4_t vreinterpret_s16_s64 (int64x1_t t); 16390 #define vreinterpret_s16_s64 16391 16392 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_f32 (float32x2_t t); 16393 #define vreinterpret_s16_f32 16394 16395 16396 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p16 (poly16x4_t t); 16397 #define vreinterpret_s16_p16 16398 16399 _NEON2SSESTORAGE int16x4_t vreinterpret_s16_p8 (poly8x8_t t); 16400 #define vreinterpret_s16_p8 16401 16402 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u32 (uint32x4_t t); 16403 #define vreinterpretq_s16_u32 16404 16405 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u16 (uint16x8_t t); 16406 #define vreinterpretq_s16_u16 16407 16408 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u8 (uint8x16_t t); 16409 #define vreinterpretq_s16_u8 16410 16411 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s32 (int32x4_t t); 16412 #define vreinterpretq_s16_s32 16413 16414 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s8 (int8x16_t t); 16415 #define vreinterpretq_s16_s8 16416 16417 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_u64 (uint64x2_t t); 16418 #define vreinterpretq_s16_u64 16419 16420 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_s64 (int64x2_t t); 16421 #define vreinterpretq_s16_s64 16422 16423 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_f32 (float32x4_t t); 16424 #define vreinterpretq_s16_f32(t) _M128i(t) 16425 16426 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p16 (poly16x8_t t); 16427 #define vreinterpretq_s16_p16 16428 16429 _NEON2SSESTORAGE int16x8_t vreinterpretq_s16_p8 (poly8x16_t t); 16430 #define vreinterpretq_s16_p8 16431 16432 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u32 (uint32x2_t t); 16433 #define vreinterpret_s32_u32 16434 16435 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u16 (uint16x4_t t); 16436 #define vreinterpret_s32_u16 16437 16438 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u8 (uint8x8_t t); 16439 #define vreinterpret_s32_u8 16440 16441 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s16 (int16x4_t t); 16442 #define vreinterpret_s32_s16 16443 16444 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s8 (int8x8_t t); 16445 #define vreinterpret_s32_s8 16446 16447 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_u64 (uint64x1_t t); 16448 #define vreinterpret_s32_u64 16449 16450 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_s64 (int64x1_t t); 16451 #define vreinterpret_s32_s64 16452 16453 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_f32 (float32x2_t t); 16454 #define vreinterpret_s32_f32 16455 16456 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p16 (poly16x4_t t); 16457 #define vreinterpret_s32_p16 16458 16459 _NEON2SSESTORAGE int32x2_t vreinterpret_s32_p8 (poly8x8_t t); 16460 #define vreinterpret_s32_p8 16461 16462 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u32 (uint32x4_t t); 16463 #define vreinterpretq_s32_u32 16464 16465 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u16 (uint16x8_t t); 16466 #define vreinterpretq_s32_u16 16467 16468 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u8 (uint8x16_t t); 16469 #define vreinterpretq_s32_u8 16470 16471 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s16 (int16x8_t t); 16472 #define vreinterpretq_s32_s16 16473 16474 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s8 (int8x16_t t); 16475 #define vreinterpretq_s32_s8 16476 16477 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_u64 (uint64x2_t t); 16478 #define vreinterpretq_s32_u64 16479 16480 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_s64 (int64x2_t t); 16481 #define vreinterpretq_s32_s64 16482 16483 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_f32 (float32x4_t t); 16484 #define vreinterpretq_s32_f32(t) 
_M128i(t) 16485 16486 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p16 (poly16x8_t t); 16487 #define vreinterpretq_s32_p16 16488 16489 _NEON2SSESTORAGE int32x4_t vreinterpretq_s32_p8 (poly8x16_t t); 16490 #define vreinterpretq_s32_p8 16491 16492 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u32 (uint32x2_t t); 16493 #define vreinterpret_u8_u32 16494 16495 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u16 (uint16x4_t t); 16496 #define vreinterpret_u8_u16 16497 16498 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s32 (int32x2_t t); 16499 #define vreinterpret_u8_s32 16500 16501 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s16 (int16x4_t t); 16502 #define vreinterpret_u8_s16 16503 16504 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s8 (int8x8_t t); 16505 #define vreinterpret_u8_s8 16506 16507 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_u64 (uint64x1_t t); 16508 #define vreinterpret_u8_u64 16509 16510 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_s64 (int64x1_t t); 16511 #define vreinterpret_u8_s64 16512 16513 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_f32 (float32x2_t t); 16514 #define vreinterpret_u8_f32 16515 16516 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p16 (poly16x4_t t); 16517 #define vreinterpret_u8_p16 16518 16519 _NEON2SSESTORAGE uint8x8_t vreinterpret_u8_p8 (poly8x8_t t); 16520 #define vreinterpret_u8_p8 16521 16522 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t); 16523 #define vreinterpretq_u8_u32 16524 16525 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t); 16526 #define vreinterpretq_u8_u16 16527 16528 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s32 (int32x4_t t); 16529 #define vreinterpretq_u8_s32 16530 16531 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s16 (int16x8_t t); 16532 #define vreinterpretq_u8_s16 16533 16534 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s8 (int8x16_t t); 16535 #define vreinterpretq_u8_s8 16536 16537 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t); 16538 #define vreinterpretq_u8_u64 16539 16540 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_s64 (int64x2_t t); 16541 #define vreinterpretq_u8_s64 16542 16543 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_f32 (float32x4_t t); 16544 #define vreinterpretq_u8_f32(t) _M128i(t) 16545 16546 16547 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t); 16548 #define vreinterpretq_u8_p16 16549 16550 _NEON2SSESTORAGE uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t); 16551 #define vreinterpretq_u8_p8 16552 16553 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u32 (uint32x2_t t); 16554 #define vreinterpret_u16_u32 16555 16556 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u8 (uint8x8_t t); 16557 #define vreinterpret_u16_u8 16558 16559 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s32 (int32x2_t t); 16560 #define vreinterpret_u16_s32 16561 16562 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s16 (int16x4_t t); 16563 #define vreinterpret_u16_s16 16564 16565 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s8 (int8x8_t t); 16566 #define vreinterpret_u16_s8 16567 16568 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_u64 (uint64x1_t t); 16569 #define vreinterpret_u16_u64 16570 16571 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_s64 (int64x1_t t); 16572 #define vreinterpret_u16_s64 16573 16574 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_f32 (float32x2_t t); 16575 #define vreinterpret_u16_f32 16576 16577 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p16 (poly16x4_t t); 16578 #define vreinterpret_u16_p16 16579 16580 _NEON2SSESTORAGE uint16x4_t vreinterpret_u16_p8 (poly8x8_t t); 16581 #define vreinterpret_u16_p8 
16582 16583 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t); 16584 #define vreinterpretq_u16_u32 16585 16586 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t); 16587 #define vreinterpretq_u16_u8 16588 16589 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s32 (int32x4_t t); 16590 #define vreinterpretq_u16_s32 16591 16592 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s16 (int16x8_t t); 16593 #define vreinterpretq_u16_s16 16594 16595 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s8 (int8x16_t t); 16596 #define vreinterpretq_u16_s8 16597 16598 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t); 16599 #define vreinterpretq_u16_u64 16600 16601 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_s64 (int64x2_t t); 16602 #define vreinterpretq_u16_s64 16603 16604 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_f32 (float32x4_t t); 16605 #define vreinterpretq_u16_f32(t) _M128i(t) 16606 16607 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t); 16608 #define vreinterpretq_u16_p16 16609 16610 _NEON2SSESTORAGE uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t); 16611 #define vreinterpretq_u16_p8 16612 16613 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u16 (uint16x4_t t); 16614 #define vreinterpret_u32_u16 16615 16616 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u8 (uint8x8_t t); 16617 #define vreinterpret_u32_u8 16618 16619 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s32 (int32x2_t t); 16620 #define vreinterpret_u32_s32 16621 16622 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s16 (int16x4_t t); 16623 #define vreinterpret_u32_s16 16624 16625 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s8 (int8x8_t t); 16626 #define vreinterpret_u32_s8 16627 16628 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_u64 (uint64x1_t t); 16629 #define vreinterpret_u32_u64 16630 16631 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_s64 (int64x1_t t); 16632 #define vreinterpret_u32_s64 16633 16634 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_f32 (float32x2_t t); 16635 #define vreinterpret_u32_f32 16636 16637 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p16 (poly16x4_t t); 16638 #define vreinterpret_u32_p16 16639 16640 _NEON2SSESTORAGE uint32x2_t vreinterpret_u32_p8 (poly8x8_t t); 16641 #define vreinterpret_u32_p8 16642 16643 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t); 16644 #define vreinterpretq_u32_u16 16645 16646 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t); 16647 #define vreinterpretq_u32_u8 16648 16649 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s32 (int32x4_t t); 16650 #define vreinterpretq_u32_s32 16651 16652 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s16 (int16x8_t t); 16653 #define vreinterpretq_u32_s16 16654 16655 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s8 (int8x16_t t); 16656 #define vreinterpretq_u32_s8 16657 16658 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t); 16659 #define vreinterpretq_u32_u64 16660 16661 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_s64 (int64x2_t t); 16662 #define vreinterpretq_u32_s64 16663 16664 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_f32 (float32x4_t t); 16665 #define vreinterpretq_u32_f32(t) _M128i(t) 16666 16667 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); 16668 #define vreinterpretq_u32_p16 16669 16670 _NEON2SSESTORAGE uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); 16671 #define vreinterpretq_u32_p8 16672 16673 //************* Round ****************** 16674 _NEON2SSESTORAGE float32x4_t vrndnq_f32(float32x4_t a); 16675 #ifdef USE_SSE4 16676 # define 
vrndnq_f32(a) _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) 16677 #else 16678 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( float32x4_t vrndnq_f32(float32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) 16679 { 16680 int i; 16681 _NEON2SSE_ALIGN_16 float32_t res[4]; 16682 _mm_store_ps(res, a); 16683 for(i = 0; i<4; i++) { 16684 res[i] = nearbyintf(res[i]); 16685 } 16686 return _mm_load_ps(res); 16687 } 16688 #endif 16689 16690 16691 _NEON2SSESTORAGE float64x2_t vrndnq_f64(float64x2_t a); 16692 #ifdef USE_SSE4 16693 # define vrndnq_f64(a) _mm_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) 16694 #else 16695 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(float64x2_t vrndnq_f64(float64x2_t a), _NEON2SSE_REASON_SLOW_SERIAL) 16696 { 16697 _NEON2SSE_ALIGN_16 float64_t res[2]; 16698 _mm_store_pd(res, a); 16699 res[0] = nearbyint(res[0]); 16700 res[1] = nearbyint(res[1]); 16701 return _mm_load_pd(res); 16702 } 16703 #endif 16704 16705 16706 16707 //************* Sqrt ****************** 16708 _NEON2SSESTORAGE float32x4_t vsqrtq_f32(float32x4_t a); 16709 #define vsqrtq_f32 _mm_sqrt_ps 16710 16711 _NEON2SSESTORAGE float64x2_t vsqrtq_f64(float64x2_t a); 16712 #define vsqrtq_f64 _mm_sqrt_pd 16713 16714 16715 #endif /* NEON2SSE_H */ 16716
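//Illustrative usage note (not part of the original header): vrndnq_f32 rounds to nearest with ties to even,
//matching both the SSE4 _MM_FROUND_TO_NEAREST_INT path and nearbyintf() in the default rounding mode.
//The names (v, s) are hypothetical.
//
//    float32x4_t v = vrndnq_f32(vdupq_n_f32(2.5f)); //every lane becomes 2.0f - ties round to the even neighbour
//    float32x4_t s = vsqrtq_f32(vdupq_n_f32(9.0f)); //every lane becomes 3.0f via _mm_sqrt_ps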