1 //created by Victoria Zhislina, the Senior Application Engineer, Intel Corporation, victoria.zhislina (at) intel.com 2 3 //*** Copyright (C) 2012-2014 Intel Corporation. All rights reserved. 4 5 //IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 6 7 //By downloading, copying, installing or using the software you agree to this license. 8 //If you do not agree to this license, do not download, install, copy or use the software. 9 10 // License Agreement 11 12 //Permission to use, copy, modify, and/or distribute this software for any 13 //purpose with or without fee is hereby granted, provided that the above 14 //copyright notice and this permission notice appear in all copies. 15 16 //THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH 17 //REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY 18 //AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, 19 //INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM 20 //LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR 21 //OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 22 //PERFORMANCE OF THIS SOFTWARE. 23 24 //***************************************************************************************** 25 // This file is intended to simplify ARM->IA32 porting 26 // It makes the correspondence between ARM NEON intrinsics (as defined in "arm_neon.h") 27 // and x86 SSE(up to SSE4.2) intrinsic functions as defined in headers files below 28 // MMX instruction set is not used due to performance overhead and the necessity to use the 29 // EMMS instruction (_mm_empty())for mmx-x87 floating point switching 30 //***************************************************************************************** 31 32 //!!!!!!! To use this file in your project that uses ARM NEON intinsics just keep arm_neon.h included and complile it as usual. 33 //!!!!!!! Please pay attention at #define USE_SSSE3 and USE_SSE4 below - you need to define them for newest Intel platforms for 34 //!!!!!!! greater performance. It can be done by -mssse3 or -msse4.2 (which also implies -mssse3) compiler switch. 35 36 #ifndef NEON2SSE_H 37 #define NEON2SSE_H 38 39 #ifndef USE_SSE4 40 #if defined(__SSE4_2__) 41 #define USE_SSE4 42 #define USE_SSSE3 43 #endif 44 #endif 45 46 #ifndef USE_SSSE3 47 #if defined(__SSSE3__) 48 #define USE_SSSE3 49 #endif 50 #endif 51 52 #include <xmmintrin.h> //SSE 53 #include <emmintrin.h> //SSE2 54 #include <pmmintrin.h> //SSE3 55 56 #ifdef USE_SSSE3 57 #include <tmmintrin.h> //SSSE3 58 #else 59 # warning "Some functions require SSSE3 or higher." 
60 #endif 61 62 #ifdef USE_SSE4 63 #include <smmintrin.h> //SSE4.1 64 #include <nmmintrin.h> //SSE4.2 65 #endif 66 67 /*********************************************************************************************************************/ 68 // data types conversion 69 /*********************************************************************************************************************/ 70 71 typedef __m128 float32x4_t; 72 73 typedef __m128 float16x8_t; //not supported by IA, for compartibility 74 75 typedef __m128i int8x16_t; 76 typedef __m128i int16x8_t; 77 typedef __m128i int32x4_t; 78 typedef __m128i int64x2_t; 79 typedef __m128i uint8x16_t; 80 typedef __m128i uint16x8_t; 81 typedef __m128i uint32x4_t; 82 typedef __m128i uint64x2_t; 83 typedef __m128i poly8x16_t; 84 typedef __m128i poly16x8_t; 85 86 #if defined(_MSC_VER) && (_MSC_VER < 1300) 87 typedef signed char int8_t; 88 typedef unsigned char uint8_t; 89 typedef signed short int16_t; 90 typedef unsigned short uint16_t; 91 typedef signed int int32_t; 92 typedef unsigned int uint32_t; 93 typedef signed long long int64_t; 94 typedef unsigned long long uint64_t; 95 #elif defined(_MSC_VER) 96 typedef signed __int8 int8_t; 97 typedef unsigned __int8 uint8_t; 98 typedef signed __int16 int16_t; 99 typedef unsigned __int16 uint16_t; 100 typedef signed __int32 int32_t; 101 typedef unsigned __int32 uint32_t; 102 103 typedef signed long long int64_t; 104 typedef unsigned long long uint64_t; 105 #else 106 #include <stdint.h> 107 #include <limits.h> 108 #endif 109 #if defined(_MSC_VER) 110 #define SINT_MIN (-2147483647 - 1) /* min signed int value */ 111 #define SINT_MAX 2147483647 /* max signed int value */ 112 #else 113 #define SINT_MIN INT_MIN /* min signed int value */ 114 #define SINT_MAX INT_MAX /* max signed int value */ 115 #endif 116 117 typedef float float32_t; 118 #if !defined(__clang__) 119 typedef float __fp16; 120 #endif 121 122 typedef uint8_t poly8_t; 123 typedef uint16_t poly16_t; 124 125 //MSVC compilers (tested up to 2012 VS version) doesn't allow using structures or arrays of __m128x type as functions arguments resulting in 126 //error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned. To avoid it we need the special trick for functions that use these types 127 128 //Unfortunately we are unable to merge two 64-bits in on 128 bit register because user should be able to access val[n] members explicitly!!! 
129 struct int8x16x2_t { 130 int8x16_t val[2]; 131 }; 132 struct int16x8x2_t { 133 int16x8_t val[2]; 134 }; 135 struct int32x4x2_t { 136 int32x4_t val[2]; 137 }; 138 struct int64x2x2_t { 139 int64x2_t val[2]; 140 }; 141 142 typedef struct int8x16x2_t int8x16x2_t; //for C compilers to make them happy 143 typedef struct int16x8x2_t int16x8x2_t; //for C compilers to make them happy 144 typedef struct int32x4x2_t int32x4x2_t; //for C compilers to make them happy 145 typedef struct int64x2x2_t int64x2x2_t; //for C compilers to make them happy 146 //to avoid pointers conversion 147 typedef int8x16x2_t int8x8x2_t; 148 typedef int16x8x2_t int16x4x2_t; 149 typedef int32x4x2_t int32x2x2_t; 150 typedef int64x2x2_t int64x1x2_t; 151 152 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers structures above */ 153 typedef struct int8x16x2_t uint8x16x2_t; 154 typedef struct int16x8x2_t uint16x8x2_t; 155 typedef struct int32x4x2_t uint32x4x2_t; 156 typedef struct int64x2x2_t uint64x2x2_t; 157 typedef struct int8x16x2_t poly8x16x2_t; 158 typedef struct int16x8x2_t poly16x8x2_t; 159 160 typedef int8x8x2_t uint8x8x2_t; 161 typedef int16x4x2_t uint16x4x2_t; 162 typedef int32x2x2_t uint32x2x2_t; 163 typedef int64x1x2_t uint64x1x2_t; 164 typedef int8x8x2_t poly8x8x2_t; 165 typedef int16x4x2_t poly16x4x2_t; 166 167 //float 168 struct float32x4x2_t { 169 float32x4_t val[2]; 170 }; 171 struct float16x8x2_t { 172 float16x8_t val[2]; 173 }; 174 typedef struct float32x4x2_t float32x4x2_t; //for C compilers to make them happy 175 typedef struct float16x8x2_t float16x8x2_t; //for C compilers to make them happy 176 typedef float32x4x2_t float32x2x2_t; 177 typedef float16x8x2_t float16x4x2_t; 178 179 //4 180 struct int8x16x4_t { 181 int8x16_t val[4]; 182 }; 183 struct int16x8x4_t { 184 int16x8_t val[4]; 185 }; 186 struct int32x4x4_t { 187 int32x4_t val[4]; 188 }; 189 struct int64x2x4_t { 190 int64x2_t val[4]; 191 }; 192 193 typedef struct int8x16x4_t int8x16x4_t; //for C compilers to make them happy 194 typedef struct int16x8x4_t int16x8x4_t; //for C compilers to make them happy 195 typedef struct int32x4x4_t int32x4x4_t; //for C compilers to make them happy 196 typedef struct int64x2x4_t int64x2x4_t; //for C compilers to make them happy 197 typedef int8x16x4_t int8x8x4_t; 198 typedef int16x8x4_t int16x4x4_t; 199 typedef int32x4x4_t int32x2x4_t; 200 typedef int64x2x4_t int64x1x4_t; 201 202 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ 203 typedef int8x8x4_t uint8x8x4_t; 204 typedef int16x4x4_t uint16x4x4_t; 205 typedef int32x2x4_t uint32x2x4_t; 206 typedef int64x1x4_t uint64x1x4_t; 207 typedef uint8x8x4_t poly8x8x4_t; 208 typedef uint16x4x4_t poly16x4x4_t; 209 210 typedef struct int8x16x4_t uint8x16x4_t; 211 typedef struct int16x8x4_t uint16x8x4_t; 212 typedef struct int32x4x4_t uint32x4x4_t; 213 typedef struct int64x2x4_t uint64x2x4_t; 214 typedef struct int8x16x4_t poly8x16x4_t; 215 typedef struct int16x8x4_t poly16x8x4_t; 216 217 struct float32x4x4_t { 218 float32x4_t val[4]; 219 }; 220 struct float16x8x4_t { 221 float16x8_t val[4]; 222 }; 223 224 typedef struct float32x4x4_t float32x4x4_t; //for C compilers to make them happy 225 typedef struct float16x8x4_t float16x8x4_t; //for C compilers to make them happy 226 typedef float32x4x4_t float32x2x4_t; 227 typedef float16x8x4_t float16x4x4_t; 228 229 //3 230 struct int16x8x3_t { 231 int16x8_t 
val[3]; 232 }; 233 struct int32x4x3_t { 234 int32x4_t val[3]; 235 }; 236 struct int64x2x3_t { 237 int64x2_t val[3]; 238 }; 239 struct int8x16x3_t { 240 int8x16_t val[3]; 241 }; 242 243 typedef struct int16x8x3_t int16x8x3_t; //for C compilers to make them happy 244 typedef struct int32x4x3_t int32x4x3_t; //for C compilers to make them happy 245 typedef struct int64x2x3_t int64x2x3_t; //for C compilers to make them happy 246 typedef struct int8x16x3_t int8x16x3_t; //for C compilers to make them happy 247 typedef int16x8x3_t int16x4x3_t; 248 typedef int32x4x3_t int32x2x3_t; 249 typedef int64x2x3_t int64x1x3_t; 250 typedef int8x16x3_t int8x8x3_t; 251 252 /* to avoid pointer conversions the following unsigned integers structures are defined via the corresponding signed integers dealing structures above:*/ 253 typedef struct int8x16x3_t uint8x16x3_t; 254 typedef struct int16x8x3_t uint16x8x3_t; 255 typedef struct int32x4x3_t uint32x4x3_t; 256 typedef struct int64x2x3_t uint64x2x3_t; 257 typedef struct int8x16x3_t poly8x16x3_t; 258 typedef struct int16x8x3_t poly16x8x3_t; 259 typedef int8x8x3_t uint8x8x3_t; 260 typedef int16x4x3_t uint16x4x3_t; 261 typedef int32x2x3_t uint32x2x3_t; 262 typedef int64x1x3_t uint64x1x3_t; 263 typedef int8x8x3_t poly8x8x3_t; 264 typedef int16x4x3_t poly16x4x3_t; 265 266 //float 267 struct float32x4x3_t { 268 float32x4_t val[3]; 269 }; 270 struct float16x8x3_t { 271 float16x8_t val[3]; 272 }; 273 274 typedef struct float32x4x3_t float32x4x3_t; //for C compilers to make them happy 275 typedef struct float16x8x3_t float16x8x3_t; //for C compilers to make them happy 276 typedef float32x4x3_t float32x2x3_t; 277 typedef float16x8x3_t float16x4x3_t; 278 279 //**************************************************************************** 280 //****** Porting auxiliary macros ******************************************** 281 #define _M128i(a) (*(__m128i*)&(a)) 282 #define _M128d(a) (*(__m128d*)&(a)) 283 #define _M128(a) (*(__m128*)&(a)) 284 #define _Ui64(a) (*(uint64_t*)&(a)) 285 #define _UNSIGNED_T(a) u##a 286 287 #define _SIGNBIT64 ((uint64_t)1 << 63) 288 #define _SWAP_HI_LOW32 (2 | (3 << 2) | (0 << 4) | (1 << 6)) 289 #define _INSERTPS_NDX(srcField, dstField) (((srcField) << 6) | ((dstField) << 4) ) 290 291 #define _NEON2SSE_REASON_SLOW_SERIAL "The function may be very slow due to the serial implementation, please try to avoid it" 292 #define _NEON2SSE_REASON_SLOW_UNEFFECTIVE "The function may be slow due to inefficient x86 SIMD implementation, please try to avoid it" 293 294 //*************** functions attributes ******************************************** 295 //*********************************************************************************** 296 #ifdef __GNUC__ 297 #define _GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) 298 #define _NEON2SSE_ALIGN_16 __attribute__((aligned(16))) 299 #define _NEON2SSE_INLINE extern inline __attribute__((__gnu_inline__, __always_inline__, __artificial__)) 300 #if _GCC_VERSION < 40500 301 #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated)) function 302 #else 303 #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) __attribute__((deprecated(explanation))) function 304 #endif 305 #elif defined(_MSC_VER)|| defined (__INTEL_COMPILER) 306 #define _NEON2SSE_ALIGN_16 __declspec(align(16)) 307 #define _NEON2SSE_INLINE __inline 308 #define _NEON2SSE_PERFORMANCE_WARNING(function, EXPLANATION) __declspec(deprecated(EXPLANATION)) function 309 #else 310 #define _NEON2SSE_ALIGN_16 
__declspec(align(16)) 311 #define _NEON2SSE_INLINE inline 312 #define _NEON2SSE_PERFORMANCE_WARNING(function, explanation) function 313 #endif 314 315 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 316 #define __constrange(min,max) const 317 #define __transfersize(size) 318 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 319 320 //************************************************************************* 321 //************************************************************************* 322 //********* Functions declarations as declared in original arm_neon.h ***** 323 //************************************************************************* 324 //Vector add: vadd -> Vr[i]:=Va[i]+Vb[i], Vr, Va, Vb have equal lane sizes. 325 326 int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 327 int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 328 int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 329 int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 330 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 331 uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 332 uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 333 uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 334 uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 335 //Vector long add: vaddl -> Vr[i]:=Va[i]+Vb[i], Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 336 337 //Vector wide addw: vadd -> Vr[i]:=Va[i]+Vb[i] 338 339 //Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 340 341 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 342 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S16 q0,q0,q0 343 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 344 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 345 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.U16 q0,q0,q0 346 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 347 //Vector rounding halving add: vrhadd -> Vr[i]:=(Va[i]+Vb[i]+1)>>1 348 349 int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 350 int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 351 int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 352 uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 353 uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.U16 q0,q0,q0 354 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 355 //Vector saturating add: vqadd -> Vr[i]:=sat<size>(Va[i]+Vb[i]) 356 357 int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 358 int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 359 int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 360 int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 361 uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 362 uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.U16 q0,q0,q0 363 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 364 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 365 //Vector add high half: vaddhn-> Vr[i]:=Va[i]+Vb[i] 366 367 //Vector rounding add high half: vraddhn 368 369 //Multiplication 370 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] 371 372 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 373 
int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 374 int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 375 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 376 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 377 uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 378 uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 379 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 380 //Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] 381 382 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 383 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 384 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 385 float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 386 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 387 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 388 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 389 //Vector multiply accumulate long: vmlal -> Vr[i] := Va[i] + Vb[i] * Vc[i] 390 391 //Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] 392 393 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 394 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 395 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 396 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 397 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 398 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 399 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 400 //Vector multiply subtract long 401 402 //Vector saturating doubling multiply high 403 404 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 405 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 406 //Vector saturating rounding doubling multiply high 407 408 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 409 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 410 //Vector saturating doubling multiply accumulate long 411 412 //Vector saturating doubling multiply subtract long 413 414 //Vector long multiply 415 416 //Vector saturating doubling long multiply 417 418 //Subtraction 419 //Vector subtract 420 421 int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 422 int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 423 int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 424 int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 425 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 426 uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 427 uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 428 uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 429 uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 430 //Vector long subtract: vsub -> Vr[i]:=Va[i]+Vb[i] 431 432 //Vector wide subtract: vsub -> Vr[i]:=Va[i]+Vb[i] 433 434 //Vector saturating subtract 435 436 int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); 
// VQSUB.S8 q0,q0,q0 437 int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 438 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 439 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 440 uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 441 uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.U16 q0,q0,q0 442 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 443 uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b); // VQSUB.U64 q0,q0,q0 444 //Vector halving subtract 445 446 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 447 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 448 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 449 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 450 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.U16 q0,q0,q0 451 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 452 //Vector subtract high half 453 454 //Vector rounding subtract high half 455 456 //Comparison 457 //Vector compare equal 458 459 uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 460 uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 461 uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 462 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 463 uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 464 uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 465 uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 466 uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 467 //Vector compare greater-than or equal 468 469 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 470 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 471 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 472 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 473 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 474 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 475 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 476 //Vector compare less-than or equal 477 478 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 479 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 480 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 481 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 482 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 483 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.U16 q0, q0, q0 484 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 485 //Vector compare greater-than 486 487 uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 488 uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 489 uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 490 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 491 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 492 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 493 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 494 //Vector compare less-than 495 496 uint8x16_t 
vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 497 uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 498 uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 499 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 500 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 501 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.U16 q0, q0, q0 502 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 503 //Vector compare absolute greater-than or equal 504 505 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 506 //Vector compare absolute less-than or equal 507 508 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 509 //Vector compare absolute greater-than 510 511 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 512 //Vector compare absolute less-than 513 514 uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 515 //Vector test bits 516 517 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 518 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 519 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 520 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 521 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 522 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 523 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 524 //Absolute difference 525 //Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] | 526 527 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 528 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 529 int32x4_t vabdq_s32(int32x4_t a, int32x4_t b); // VABD.S32 q0,q0,q0 530 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 531 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.U16 q0,q0,q0 532 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 533 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 534 //Absolute difference - long 535 536 //Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | 537 538 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 539 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 540 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 541 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 542 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.U16 q0,q0,q0 543 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 544 //Absolute difference and accumulate - long 545 546 //Max/Min 547 //vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] 548 549 int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 550 int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 551 int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 552 uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 553 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.U16 q0,q0,q0 554 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 555 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 556 //vmin -> Vr[i] := (Va[i] >= Vb[i]) ? 
Vb[i] : Va[i] 557 558 int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 559 int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 560 int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 561 uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 562 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.U16 q0,q0,q0 563 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 564 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 565 //Pairwise addition 566 //Pairwise add 567 568 //Long pairwise add 569 570 int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 571 int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 572 int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 573 uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 574 uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.U16 q0,q0 575 uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 576 //Long pairwise add and accumulate 577 578 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 579 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 580 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 581 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 582 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.U16 q0,q0 583 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 584 //Folding maximum vpmax -> takes maximum of adjacent pairs 585 586 //Folding minimum vpmin -> takes minimum of adjacent pairs 587 588 //Reciprocal/Sqrt 589 590 float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0 591 592 float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0 593 //Shifts by signed variable 594 //Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) 595 596 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 597 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 598 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 599 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 600 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 601 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.U16 q0,q0,q0 602 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 603 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 604 //Vector saturating shift left: (negative values shift right) 605 606 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 607 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 608 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 609 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 610 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 611 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.U16 q0,q0,q0 612 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 613 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 614 //Vector rounding shift left: (negative values shift right) 615 616 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 617 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 618 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 619 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 620 uint8x16_t vrshlq_u8(uint8x16_t a, 
int8x16_t b); // VRSHL.U8 q0,q0,q0 621 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.U16 q0,q0,q0 622 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 623 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 624 //Vector saturating rounding shift left: (negative values shift right) 625 626 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 627 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 628 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 629 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 630 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 631 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.U16 q0,q0,q0 632 uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 633 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 634 //Shifts by a constant 635 //Vector shift right by constant 636 637 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 638 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 639 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 640 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 641 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 642 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.U16 q0,q0,#16 643 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 644 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 645 //Vector shift left by constant 646 647 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 648 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 649 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 650 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 651 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 652 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 653 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 654 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 655 //Vector rounding shift right by constant 656 657 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 658 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 659 int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 660 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 661 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 662 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.U16 q0,q0,#16 663 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 664 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 665 //Vector shift right by constant and accumulate 666 667 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 668 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRA.S16 q0,q0,#16 669 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, 
__constrange(1,32) int c); // VSRA.S32 q0,q0,#32 670 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 671 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 672 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.U16 q0,q0,#16 673 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 674 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 675 //Vector rounding shift right by constant and accumulate 676 677 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 678 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 679 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 680 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 681 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 682 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.U16 q0,q0,#16 683 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 684 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 685 //Vector saturating shift left by constant 686 687 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 688 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 689 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 690 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 691 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 692 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.U16 q0,q0,#0 693 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 694 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 695 //Vector signed->unsigned saturating shift left by constant 696 697 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 698 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 699 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 700 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHLU.S64 q0,q0,#0 701 //Vector narrowing shift right by constant 702 703 //Vector signed->unsigned narrowing saturating shift right by constant 704 705 //Vector signed->unsigned rounding narrowing saturating shift right by constant 706 707 //Vector narrowing saturating shift right by constant 708 709 //Vector rounding narrowing shift right by constant 710 711 //Vector rounding narrowing saturating shift right by constant 712 713 //Vector widening shift left by constant 714 715 //Shifts with insert 716 //Vector shift right and insert 717 718 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 719 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 720 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 721 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); 
// VSRI.64 q0,q0,#64 722 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 723 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 724 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 725 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 726 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 727 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 728 //Vector shift left and insert 729 730 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 731 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 732 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 733 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 734 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 735 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 736 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 737 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 738 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 739 poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 740 //Loads of a single vector or lane. Perform loads and stores of a single vector of some type. 741 //Load a single vector from memory 742 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 743 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 744 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 745 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 746 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 747 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 748 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 749 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 750 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] 751 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 752 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 753 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 754 755 //Load a single lane from memory 756 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] 757 uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 758 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 759 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] 760 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * 
ptr, int8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] 761 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); //VLD1.16 {d0[0]}, [r0] 762 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); //VLD1.32 {d0[0]}, [r0] 763 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 764 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); //VLD1.64 {d0}, [r0] 765 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); //VLD1.8 {d0[0]}, [r0] 766 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 767 768 //Load all lanes of vector with same value from memory 769 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] 770 uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0] 771 uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0] 772 uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0] 773 int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0] 774 int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0] 775 int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0] 776 int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0] 777 float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0] 778 float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0] 779 poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0] 780 poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0] 781 782 //Store a single vector or lane. Stores all lanes or a single lane of a vector. 
783 //Store a single vector into memory 784 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] 785 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] 786 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] 787 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] 788 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] 789 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] 790 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] 791 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] 792 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] 793 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] 794 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] 795 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] 796 797 //Store a lane of a vector into memory 798 //Loads of an N-element structure 799 //Load N-element structure from memory 800 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 801 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 802 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 803 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 804 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 805 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 806 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] 807 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 808 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 809 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 810 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 811 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 812 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 813 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 814 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 815 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 816 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 817 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 818 //float16x4x2_t vld2_f16(__transfersize(8) __fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] 819 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 820 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 821 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 822 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 823 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 824 uint32x4x3_t 
vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 825 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 826 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 827 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 828 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 829 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 830 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 831 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 832 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 833 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 834 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 835 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 836 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 837 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 838 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 839 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 840 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 841 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 842 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 843 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 844 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 845 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 846 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 847 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 848 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 849 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 850 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 851 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 852 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 853 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 854 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 855 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 856 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 857 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 858 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 859 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 860 int32x2x4_t 
vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 861 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 862 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 863 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 864 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 865 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 866 //Load all lanes of N-element structure with same value from memory 867 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 868 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 869 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 870 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 871 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 872 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 873 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 874 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 875 //float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 876 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 877 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 878 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 879 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 880 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 881 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 882 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 883 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 884 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 885 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 886 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 887 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 888 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 889 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 890 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 891 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 892 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 893 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 894 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 895 int8x8x4_t 
vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 896 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 897 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 898 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 899 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 900 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 901 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 902 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 903 //Load a single lane of N-element structure from memory 904 //the functions below are modified to deal with the error C2719: 'src': formal parameter with __declspec(align('16')) won't be aligned 905 uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 906 uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] 907 int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 908 int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] 909 float16x8x2_t vld2q_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 910 float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t * src, __constrange(0,3) int lane); // VLD2.32 {d0[0], d2[0]}, [r0] 911 poly16x8x2_t vld2q_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x8x2_t * src, __constrange(0,7) int lane); // VLD2.16 {d0[0], d2[0]}, [r0] 912 uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] 913 uint16x4x2_t vld2_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 914 uint32x2x2_t vld2_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] 915 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] 916 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); //VLD2.16 {d0[0], d1[0]}, [r0] 917 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); //VLD2.32 {d0[0], d1[0]}, [r0] 918 //float16x4x2_t vld2_lane_f16_ptr(__transfersize(2) __fp16 const * ptr, float16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 919 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] 920 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); //VLD2.8 {d0[0], d1[0]}, [r0] 921 
poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 922 uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 923 uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 924 int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 925 int32x4x3_t vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 926 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 927 float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t * src, __constrange(0,3) int lane); // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 928 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 929 uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 930 uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 931 uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 932 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 933 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); //VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 934 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); //VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 935 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 936 float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 937 poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); //VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 938 poly16x4x3_t vld3_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 939 uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 940 uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 941 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 942 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], 
d2[0], d4[0], d6[0]}, [r0] 943 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 944 float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 945 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 946 uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 947 uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 948 uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 949 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 950 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane); //VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 951 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane); //VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 952 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 953 float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t * src, __constrange(0,1) int lane); // VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 954 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane); //VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 955 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane); // VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 956 //Store N-element structure to memory 957 void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t * val); // VST2.8 {d0, d2}, [r0] 958 void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 959 void vst2q_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x4x2_t * val); // VST2.32 {d0, d2}, [r0] 960 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); // VST2.8 {d0, d2}, [r0] 961 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 962 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); // VST2.32 {d0, d2}, [r0] 963 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 964 void vst2q_f32_ptr(__transfersize(8) float32_t * ptr, float32x4x2_t * val); // VST2.32 {d0, d2}, [r0] 965 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); // VST2.8 {d0, d2}, [r0] 966 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); // VST2.16 {d0, d2}, [r0] 967 void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t * val); // VST2.8 {d0, d1}, [r0] 968 void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t * val); // VST2.16 {d0, d1}, [r0] 969 void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, 
uint32x2x2_t * val); // VST2.32 {d0, d1}, [r0] 970 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); // VST1.64 {d0, d1}, [r0] 971 void vst2_s8_ptr(__transfersize(16) int8_t * ptr, int8x8x2_t * val); // VST2.8 {d0, d1}, [r0] 972 void vst2_s16_ptr(__transfersize(8) int16_t * ptr, int16x4x2_t * val); // VST2.16 {d0, d1}, [r0] 973 void vst2_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x2_t * val); // VST2.32 {d0, d1}, [r0] 974 void vst2_s64_ptr(__transfersize(2) int64_t * ptr, int64x1x2_t * val); // VST1.64 {d0, d1}, [r0] 975 //void vst2_f16_ptr(__transfersize(8) __fp16 * ptr, float16x4x2_t * val); // VST2.16 {d0, d1}, [r0] 976 void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0] 977 void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] 978 void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] 979 void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] 980 void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 981 void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] 982 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] 983 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 984 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] 985 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 986 void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t * val); // VST3.32 {d0, d2, d4}, [r0] 987 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); // VST3.8 {d0, d2, d4}, [r0] 988 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); // VST3.16 {d0, d2, d4}, [r0] 989 void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] 990 void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 991 void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] 992 void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] 993 void vst3_s8_ptr(__transfersize(24) int8_t * ptr, int8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] 994 void vst3_s16_ptr(__transfersize(12) int16_t * ptr, int16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 995 void vst3_s32_ptr(__transfersize(6) int32_t * ptr, int32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] 996 void vst3_s64_ptr(__transfersize(3) int64_t * ptr, int64x1x3_t * val); // VST1.64 {d0, d1, d2}, [r0] 997 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 998 void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t * val); // VST3.32 {d0, d1, d2}, [r0] 999 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); // VST3.8 {d0, d1, d2}, [r0] 1000 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 1001 void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] 1002 void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1003 void 
vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] 1004 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] 1005 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1006 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] 1007 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1008 void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t * val); // VST4.32 {d0, d2, d4, d6}, [r0] 1009 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); // VST4.8 {d0, d2, d4, d6}, [r0] 1010 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); // VST4.16 {d0, d2, d4, d6}, [r0] 1011 void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] 1012 void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] 1013 void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] 1014 void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] 1015 void vst4_s8_ptr(__transfersize(32) int8_t * ptr, int8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] 1016 void vst4_s16_ptr(__transfersize(16) int16_t * ptr, int16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] 1017 void vst4_s32_ptr(__transfersize(8) int32_t * ptr, int32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] 1018 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); // VST1.64 {d0, d1, d2, d3}, [r0] 1019 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] 1020 void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t * val); // VST4.32 {d0, d1, d2, d3}, [r0] 1021 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); // VST4.8 {d0, d1, d2, d3}, [r0] 1022 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); // VST4.16 {d0, d1, d2, d3}, [r0] 1023 //Store a single lane of N-element structure to memory 1024 void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1025 void vst2q_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] 1026 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1027 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); // VST2.32{d0[0], d2[0]}, [r0] 1028 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1029 void vst2q_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x4x2_t * val, __constrange(0,3) int lane); //VST2.32 {d0[0], d2[0]}, [r0] 1030 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); // VST2.16{d0[0], d2[0]}, [r0] 1031 void vst2_lane_u8_ptr(__transfersize(2) uint8_t * ptr, uint8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] 1032 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], 
d1[0]}, [r0] 1033 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] 1034 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); // VST2.8 {d0[0],d1[0]}, [r0] 1035 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1036 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] 1037 void vst2_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1038 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32{d0[0], d1[0]}, [r0] 1039 void vst2_lane_p8_ptr(__transfersize(2) poly8_t * ptr, poly8x8x2_t * val, __constrange(0,7) int lane); // VST2.8{d0[0], d1[0]}, [r0] 1040 void vst2_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x4x2_t * val, __constrange(0,3) int lane); // VST2.16{d0[0], d1[0]}, [r0] 1041 void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1042 void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] 1043 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1044 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); // VST3.32{d0[0], d2[0], d4[0]}, [r0] 1045 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1046 void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t * val, __constrange(0,3) int lane); //VST3.32 {d0[0], d2[0], d4[0]}, [r0] 1047 void vst3q_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x8x3_t * val, __constrange(0,7) int lane); // VST3.16{d0[0], d2[0], d4[0]}, [r0] 1048 void vst3_lane_u8_ptr(__transfersize(3) uint8_t * ptr, uint8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] 1049 void vst3_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1050 void vst3_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] 1051 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); // VST3.8 {d0[0],d1[0], d2[0]}, [r0] 1052 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1053 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] 1054 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1055 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, __constrange(0,1) int lane); // VST3.32{d0[0], d1[0], d2[0]}, [r0] 1056 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); // VST3.8{d0[0], d1[0], d2[0]}, [r0] 1057 void vst3_lane_p16_ptr(__transfersize(3) 
poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); // VST3.16{d0[0], d1[0], d2[0]}, [r0] 1058 void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1059 void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] 1060 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1061 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); // VST4.32{d0[0], d2[0], d4[0], d6[0]}, [r0] 1062 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1063 void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t * val, __constrange(0,3) int lane); //VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 1064 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); // VST4.16{d0[0], d2[0], d4[0], d6[0]}, [r0] 1065 void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] 1066 void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1067 void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] 1068 void vst4_lane_s8_ptr(__transfersize(4) int8_t * ptr, int8x8x4_t * val, __constrange(0,7) int lane); // VST4.8 {d0[0],d1[0], d2[0], d3[0]}, [r0] 1069 void vst4_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1070 void vst4_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] 1071 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1072 void vst4_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x4_t * val, __constrange(0,1) int lane); // VST4.32{d0[0], d1[0], d2[0], d3[0]}, [r0] 1073 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); // VST4.8{d0[0], d1[0], d2[0], d3[0]}, [r0] 1074 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); // VST4.16{d0[0], d1[0], d2[0], d3[0]}, [r0] 1075 //Extract lanes from a vector and put into a register. These intrinsics extract a single lane (element) from a vector. 
1076 1077 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 1078 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] 1079 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 1080 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] 1081 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] 1082 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 1083 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 1084 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.U16 r0, d0[0] 1085 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 1086 1087 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 1088 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 1089 //Load a single lane of a vector from a literal. These intrinsics set a single lane (element) within a vector. 1090 1091 uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 1092 uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 1093 uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 1094 int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 1095 int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 1096 int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 1097 poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0 1098 poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0 1099 float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0 1100 1101 int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 1102 uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0 1103 //Initialize a vector from a literal bit pattern. 
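/* Usage sketch for the lane read/write intrinsics declared above (illustrative only, the function name is hypothetical):

    float32x4_t double_lane0_into_lane3(float32x4_t v)
    {
        float32_t x = vgetq_lane_f32(v, 0); //read lane 0 into a scalar
        return vsetq_lane_f32(x * 2.0f, v, 3); //write the doubled value into lane 3, other lanes unchanged
    }

   The lane argument must be a compile-time constant within the __constrange bounds shown in the declarations. */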
1104 1105 //Set all lanes to same value 1106 //Load all lanes of vector to the same literal value 1107 1108 uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 1109 uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 1110 uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 1111 int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 1112 int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 1113 int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 1114 poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 1115 poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 1116 float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 1117 1118 int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 1119 uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 1120 1121 uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 1122 uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 1123 uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 1124 int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 1125 int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 1126 int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 1127 poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 1128 poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 1129 float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 1130 1131 int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 1132 uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 1133 //Load all lanes of the vector to the value of a lane of a vector 1134 1135 //Combining vectors. These intrinsics join two 64 bit vectors into a single 128bit vector. 1136 1137 //Splitting vectors. These intrinsics split a 128 bit vector into 2 component 64 bit vectors 1138 1139 //Converting vectors. These intrinsics are used to convert vectors. 
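/* Conversion sketch for the vcvtq intrinsics declared just below (illustrative only; vf is a hypothetical float32x4_t input).
   Float->integer conversions truncate toward zero, and the _n_ variants convert to/from fixed-point with b fractional bits:

    int32x4_t   i   = vcvtq_s32_f32(vf);        //e.g. {1.7f, -1.7f, ...} -> {1, -1, ...}
    int32x4_t   q16 = vcvtq_n_s32_f32(vf, 16);  //float -> Q15.16 fixed point: value * 65536, truncated
    float32x4_t f   = vcvtq_n_f32_s32(q16, 16); //Q15.16 fixed point -> float: value / 65536
*/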
1140 //Convert from float 1141 1142 int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 1143 uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 1144 1145 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 1146 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 1147 //Convert to float 1148 1149 float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 1150 float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 1151 1152 float32x4_t vcvtq_n_f32_s32(int32x4_t a, __constrange(1,32) int b); // VCVT.F32.S32 q0, q0, #32 1153 float32x4_t vcvtq_n_f32_u32(uint32x4_t a, __constrange(1,32) int b); // VCVT.F32.U32 q0, q0, #32 1154 //Convert between floats 1155 1156 //Vector narrow integer 1157 1158 //Vector long move 1159 1160 //Vector saturating narrow integer 1161 1162 //Vector saturating narrow integer signed->unsigned 1163 1164 //Table look up 1165 1166 //Extended table look up intrinsics 1167 1168 //Operations with a scalar value 1169 //Vector multiply accumulate with scalar 1170 1171 //Vector widening multiply accumulate with scalar 1172 1173 //Vector widening saturating doubling multiply accumulate with scalar 1174 1175 //Vector multiply subtract with scalar 1176 1177 //Vector widening multiply subtract with scalar 1178 1179 //Vector widening saturating doubling multiply subtract with scalar 1180 1181 //Vector multiply by scalar 1182 1183 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] 1184 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] 1185 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] 1186 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] 1187 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] 1188 //Vector long multiply with scalar 1189 1190 //Vector long multiply by scalar 1191 1192 //Vector saturating doubling long multiply with scalar 1193 1194 //Vector saturating doubling long multiply by scalar 1195 1196 //Vector saturating doubling multiply high with scalar 1197 1198 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] 1199 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] 1200 //Vector saturating doubling multiply high by scalar 1201 1202 //Vector saturating rounding doubling multiply high with scalar 1203 1204 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] 1205 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] 1206 //Vector rounding saturating doubling multiply high by scalar 1207 1208 //Vector multiply accumulate with scalar 1209 1210 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] 1211 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] 1212 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] 1213 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] 1214 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] 1215 //Vector widening multiply accumulate with scalar 1216 1217 //Vector widening saturating doubling multiply accumulate with scalar 1218 1219 //Vector multiply subtract with scalar 1220 1221 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] 1222 int32x4_t 
vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] 1223 uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0] 1224 uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0] 1225 float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0] 1226 //Vector widening multiply subtract with scalar 1227 1228 //Vector widening saturating doubling multiply subtract with scalar 1229 1230 //Vector extract 1231 1232 int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 1233 uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 1234 poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0 1235 int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 1236 uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 1237 poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0 1238 int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 1239 uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0 1240 int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 1241 uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0 1242 //Reverse vector elements (swap endianness). VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 1243 1244 int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 1245 int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 1246 int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 1247 uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 1248 uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 1249 uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 1250 poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 1251 poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 1252 float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 1253 1254 int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 1255 int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 1256 uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 1257 uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 1258 poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 1259 1260 int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 1261 uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 1262 poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 1263 //Other single operand arithmetic 1264 //Absolute: Vd[i] = |Va[i]| 1265 1266 int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 1267 int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 1268 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 1269 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 1270 //Saturating absolute: Vd[i] = sat(|Va[i]|) 1271 1272 int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 1273 int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 1274 int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 1275 //Negate: Vd[i] = - Va[i] 1276 1277 int8x16_t vnegq_s8(int8x16_t a); // VNEG.S8 q0,q0 1278 int16x8_t vnegq_s16(int16x8_t a); // VNEG.S16 q0,q0 1279 int32x4_t vnegq_s32(int32x4_t a); // VNEG.S32 q0,q0 1280 float32x4_t vnegq_f32(float32x4_t a); // VNEG.F32 q0,q0 1281 //Saturating
Negate: sat(Vd[i] = - Va[i]) 1282 1283 int8x16_t vqnegq_s8(int8x16_t a); // VQNEG.S8 q0,q0 1284 int16x8_t vqnegq_s16(int16x8_t a); // VQNEG.S16 q0,q0 1285 int32x4_t vqnegq_s32(int32x4_t a); // VQNEG.S32 q0,q0 1286 //Count leading sign bits 1287 1288 int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 1289 int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 1290 int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 1291 //Count leading zeros 1292 1293 int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 1294 int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 1295 int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 1296 uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 1297 uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 1298 uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 1299 //Count number of set bits 1300 1301 uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 1302 int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 1303 poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 1304 //Reciprocal estimate 1305 1306 float32x4_t vrecpeq_f32(float32x4_t a); // VRECPE.F32 q0,q0 1307 uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0 1308 //Reciprocal square root estimate 1309 1310 float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0 1311 uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0 1312 //Logical operations 1313 //Bitwise not 1314 1315 int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 1316 int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 1317 int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 1318 uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 1319 uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 1320 uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 1321 poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 1322 //Bitwise and 1323 1324 int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 1325 int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 1326 int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0 1327 int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 1328 uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 1329 uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 1330 uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 1331 uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 1332 //Bitwise or 1333 1334 int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 1335 int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 1336 int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 1337 int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 1338 uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 1339 uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 1340 uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 1341 uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 1342 //Bitwise exclusive or (EOR or XOR) 1343 1344 int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 1345 int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 1346 int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 1347 int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 1348 uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 1349 uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 1350 uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 1351 uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 1352 //Bit Clear 1353 1354
int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 1355 int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 1356 int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 1357 int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 1358 uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 1359 uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 1360 uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 1361 uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 1362 //Bitwise OR complement 1363 1364 int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 1365 int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 1366 int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 1367 int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 1368 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 1369 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 1370 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 1371 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 1372 //Bitwise Select 1373 1374 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 1375 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 1376 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 1377 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 1378 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 1379 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 1380 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 1381 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 1382 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 1383 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 1384 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 1385 //Transposition operations 1386 //Transpose elements 1387 1388 int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 1389 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 1390 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 1391 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 1392 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 1393 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 1394 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 1395 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 1396 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 1397 //Interleave elements 1398 1399 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 1400 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 1401 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 1402 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 1403 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 1404 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 1405 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 1406 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 1407 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 1408 //De-Interleave elements 1409 
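/* Element-order sketch for the transpose/interleave intrinsics above and the de-interleave intrinsics below,
   shown for 32-bit lanes with a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3} (illustrative only):

    uint32x4x2_t z = vzipq_u32(a, b); //z.val[0] = {a0,b0,a1,b1}, z.val[1] = {a2,b2,a3,b3}
    uint32x4x2_t u = vuzpq_u32(a, b); //u.val[0] = {a0,a2,b0,b2}, u.val[1] = {a1,a3,b1,b3}
    uint32x4x2_t t = vtrnq_u32(a, b); //t.val[0] = {a0,b0,a2,b2}, t.val[1] = {a1,b1,a3,b3}
*/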
1410 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 1411 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 1412 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 1413 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 1414 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 1415 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 1416 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 1417 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 1418 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 1419 1420 //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 1421 // the following macros solve the problem of the "immediate parameters requirement" for some x86 intrinsics. While for release build it is not a must, 1422 //for debug build we need it to compile the code unless the "Intrinsic parameter must be an immediate value" error is our goal 1423 // 1424 #if ( ((defined _MSC_VER) && (_MSC_VER > 1600)) || defined __INTEL_COMPILER )&& defined NDEBUG //if it is a release build, we also need it to fix the issue for VS2010 and earlier compilers. 1425 1426 #if defined(USE_SSSE3) 1427 #define _MM_ALIGNR_EPI8 _mm_alignr_epi8 1428 #endif 1429 1430 #define _MM_EXTRACT_EPI16 _mm_extract_epi16 1431 #define _MM_INSERT_EPI16 _mm_insert_epi16 1432 #ifdef USE_SSE4 1433 #define _MM_EXTRACT_EPI8 _mm_extract_epi8 1434 #define _MM_EXTRACT_EPI32 _mm_extract_epi32 1435 #define _MM_EXTRACT_PS _mm_extract_ps 1436 1437 #define _MM_INSERT_EPI8 _mm_insert_epi8 1438 #define _MM_INSERT_EPI32 _mm_insert_epi32 1439 #define _MM_INSERT_PS _mm_insert_ps 1440 #ifdef _M_X64 1441 #define _MM_INSERT_EPI64 _mm_insert_epi64 1442 #define _MM_EXTRACT_EPI64 _mm_extract_epi64 1443 #endif 1444 #endif //SSE4 1445 #else 1446 #define _NEON2SSE_COMMA , 1447 #define _NEON2SSE_SWITCH16(NAME, a, b, LANE) \ 1448 switch(LANE) \ 1449 { \ 1450 case 0: return NAME(a b, 0); \ 1451 case 1: return NAME(a b, 1); \ 1452 case 2: return NAME(a b, 2); \ 1453 case 3: return NAME(a b, 3); \ 1454 case 4: return NAME(a b, 4); \ 1455 case 5: return NAME(a b, 5); \ 1456 case 6: return NAME(a b, 6); \ 1457 case 7: return NAME(a b, 7); \ 1458 case 8: return NAME(a b, 8); \ 1459 case 9: return NAME(a b, 9); \ 1460 case 10: return NAME(a b, 10); \ 1461 case 11: return NAME(a b, 11); \ 1462 case 12: return NAME(a b, 12); \ 1463 case 13: return NAME(a b, 13); \ 1464 case 14: return NAME(a b, 14); \ 1465 case 15: return NAME(a b, 15); \ 1466 default: return NAME(a b, 0); \ 1467 } 1468 1469 #define _NEON2SSE_SWITCH8(NAME, vec, LANE, p) \ 1470 switch(LANE) \ 1471 { \ 1472 case 0: return NAME(vec p,0); \ 1473 case 1: return NAME(vec p,1); \ 1474 case 2: return NAME(vec p,2); \ 1475 case 3: return NAME(vec p,3); \ 1476 case 4: return NAME(vec p,4); \ 1477 case 5: return NAME(vec p,5); \ 1478 case 6: return NAME(vec p,6); \ 1479 case 7: return NAME(vec p,7); \ 1480 default: return NAME(vec p,0); \ 1481 } 1482 1483 #define _NEON2SSE_SWITCH4(NAME, case0, case1, case2, case3, vec, LANE, p) \ 1484 switch(LANE) \ 1485 { \ 1486 case case0: return NAME(vec p,case0); \ 1487 case case1: return NAME(vec p,case1); \ 1488 case case2: return NAME(vec p,case2); \ 1489 case case3: return NAME(vec p,case3); \ 1490 default: return NAME(vec p,case0); \ 1491 } 1492 1493 #if defined(USE_SSSE3) 1494 _NEON2SSE_INLINE __m128i _MM_ALIGNR_EPI8(__m128i a, __m128i b, int LANE) 
1495 { 1496 _NEON2SSE_SWITCH16(_mm_alignr_epi8, a, _NEON2SSE_COMMA b, LANE) 1497 } 1498 #endif 1499 1500 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI16(__m128i vec, int p, const int LANE) 1501 { 1502 _NEON2SSE_SWITCH8(_mm_insert_epi16, vec, LANE, _NEON2SSE_COMMA p) 1503 } 1504 1505 _NEON2SSE_INLINE int _MM_EXTRACT_EPI16(__m128i vec, const int LANE) 1506 { 1507 _NEON2SSE_SWITCH8(_mm_extract_epi16, vec, LANE,) 1508 } 1509 1510 #ifdef USE_SSE4 1511 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) 1512 { 1513 _NEON2SSE_SWITCH4(_mm_extract_epi32, 0,1,2,3, vec, LANE,) 1514 } 1515 1516 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) 1517 { 1518 _NEON2SSE_SWITCH4(_mm_extract_ps, 0,1,2,3, vec, LANE,) 1519 } 1520 1521 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) 1522 { 1523 _NEON2SSE_SWITCH16(_mm_extract_epi8, vec, , LANE) 1524 } 1525 1526 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) 1527 { 1528 _NEON2SSE_SWITCH4(_mm_insert_epi32, 0, 1, 2, 3, vec, LANE, _NEON2SSE_COMMA p) 1529 } 1530 1531 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) 1532 { 1533 _NEON2SSE_SWITCH16(_mm_insert_epi8, vec, _NEON2SSE_COMMA p, LANE) 1534 } 1535 #ifdef _M_X64 1536 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) 1537 { 1538 switch(LANE) 1539 { 1540 case 0: 1541 return _mm_insert_epi64(vec, p, 0); 1542 case 1: 1543 return _mm_insert_epi64(vec, p, 1); 1544 default: 1545 return _mm_insert_epi64(vec, p, 0); 1546 } 1547 } 1548 1549 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) 1550 { 1551 if (LANE ==0) return _mm_extract_epi64(val, 0); 1552 else return _mm_extract_epi64(val, 1); 1553 } 1554 #endif 1555 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) 1556 { 1557 _NEON2SSE_SWITCH4(_mm_insert_ps, 0, 16, 32, 48, vec, LANE, _NEON2SSE_COMMA p) 1558 } 1559 1560 #endif //USE_SSE4 1561 1562 #endif //#ifdef NDEBUG 1563 1564 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1565 // Below are some helper functions used either for SSE4 intrinsics "emulation" for SSSE3 limited devices 1566 // or for some specific commonly used operations implementation missing in SSE 1567 #ifdef USE_SSE4 1568 #define _MM_CVTEPU8_EPI16 _mm_cvtepu8_epi16 1569 #define _MM_CVTEPU16_EPI32 _mm_cvtepu16_epi32 1570 #define _MM_CVTEPU32_EPI64 _mm_cvtepu32_epi64 1571 1572 #define _MM_CVTEPI8_EPI16 _mm_cvtepi8_epi16 1573 #define _MM_CVTEPI16_EPI32 _mm_cvtepi16_epi32 1574 #define _MM_CVTEPI32_EPI64 _mm_cvtepi32_epi64 1575 1576 #define _MM_MAX_EPI8 _mm_max_epi8 1577 #define _MM_MAX_EPI32 _mm_max_epi32 1578 #define _MM_MAX_EPU16 _mm_max_epu16 1579 #define _MM_MAX_EPU32 _mm_max_epu32 1580 1581 #define _MM_MIN_EPI8 _mm_min_epi8 1582 #define _MM_MIN_EPI32 _mm_min_epi32 1583 #define _MM_MIN_EPU16 _mm_min_epu16 1584 #define _MM_MIN_EPU32 _mm_min_epu32 1585 1586 #define _MM_BLENDV_EPI8 _mm_blendv_epi8 1587 #define _MM_PACKUS_EPI32 _mm_packus_epi32 1588 #define _MM_PACKUS1_EPI32(a) _mm_packus_epi32(a, a) 1589 1590 #define _MM_MULLO_EPI32 _mm_mullo_epi32 1591 #define _MM_MUL_EPI32 _mm_mul_epi32 1592 #else //no SSE4 !!!!!! 
1593 _NEON2SSE_INLINE __m128i _MM_CVTEPU8_EPI16(__m128i a) 1594 { 1595 __m128i zero = _mm_setzero_si128(); 1596 return _mm_unpacklo_epi8(a, zero); 1597 } 1598 1599 _NEON2SSE_INLINE __m128i _MM_CVTEPU16_EPI32(__m128i a) 1600 { 1601 __m128i zero = _mm_setzero_si128(); 1602 return _mm_unpacklo_epi16(a, zero); 1603 } 1604 1605 _NEON2SSE_INLINE __m128i _MM_CVTEPU32_EPI64(__m128i a) 1606 { 1607 __m128i zero = _mm_setzero_si128(); 1608 return _mm_unpacklo_epi32(a, zero); 1609 } 1610 1611 _NEON2SSE_INLINE __m128i _MM_CVTEPI8_EPI16(__m128i a) 1612 { 1613 __m128i zero = _mm_setzero_si128(); 1614 __m128i sign = _mm_cmpgt_epi8(zero, a); 1615 return _mm_unpacklo_epi8(a, sign); 1616 } 1617 1618 _NEON2SSE_INLINE __m128i _MM_CVTEPI16_EPI32(__m128i a) 1619 { 1620 __m128i zero = _mm_setzero_si128(); 1621 __m128i sign = _mm_cmpgt_epi16(zero, a); 1622 return _mm_unpacklo_epi16(a, sign); 1623 } 1624 1625 _NEON2SSE_INLINE __m128i _MM_CVTEPI32_EPI64(__m128i a) 1626 { 1627 __m128i zero = _mm_setzero_si128(); 1628 __m128i sign = _mm_cmpgt_epi32(zero, a); 1629 return _mm_unpacklo_epi32(a, sign); 1630 } 1631 1632 _NEON2SSE_INLINE int _MM_EXTRACT_EPI32(__m128i vec, const int LANE) 1633 { 1634 _NEON2SSE_ALIGN_16 int32_t tmp[4]; 1635 _mm_store_si128((__m128i*)tmp, vec); 1636 return tmp[LANE]; 1637 } 1638 1639 _NEON2SSE_INLINE int _MM_EXTRACT_EPI8(__m128i vec, const int LANE) 1640 { 1641 _NEON2SSE_ALIGN_16 int8_t tmp[16]; 1642 _mm_store_si128((__m128i*)tmp, vec); 1643 return (int)tmp[LANE]; 1644 } 1645 1646 _NEON2SSE_INLINE int _MM_EXTRACT_PS(__m128 vec, const int LANE) 1647 { 1648 _NEON2SSE_ALIGN_16 int32_t tmp[4]; 1649 _mm_store_si128((__m128i*)tmp, _M128i(vec)); 1650 return tmp[LANE]; 1651 } 1652 1653 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI32(__m128i vec, int p, const int LANE) 1654 { 1655 _NEON2SSE_ALIGN_16 int32_t pvec[4] = {0,0,0,0}; 1656 _NEON2SSE_ALIGN_16 uint32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; 1657 __m128i vec_masked, p_masked; 1658 pvec[LANE] = p; 1659 mask[LANE] = 0x0; 1660 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p 1661 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec 1662 return _mm_or_si128(vec_masked, p_masked); 1663 } 1664 1665 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI8(__m128i vec, int p, const int LANE) 1666 { 1667 _NEON2SSE_ALIGN_16 int8_t pvec[16] = {0,0,0,0, 0,0,0,0, 0,0,0,0, 0,0,0,0}; 1668 _NEON2SSE_ALIGN_16 uint8_t mask[16] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff}; 1669 __m128i vec_masked, p_masked; 1670 pvec[LANE] = (int8_t)p; 1671 mask[LANE] = 0x0; 1672 vec_masked = _mm_and_si128 (*(__m128i*)mask,vec); //ready for p 1673 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec 1674 return _mm_or_si128(vec_masked, p_masked); 1675 } 1676 1677 _NEON2SSE_INLINE __m128 _MM_INSERT_PS(__m128 vec, __m128 p, const int LANE) 1678 { 1679 _NEON2SSE_ALIGN_16 int32_t mask[4] = {0xffffffff,0xffffffff,0xffffffff,0xffffffff}; 1680 __m128 tmp, vec_masked, p_masked; 1681 mask[LANE >> 4] = 0x0; //here LANE is the _mm_insert_ps immediate (destination lane << 4), not the lane index itself, hence the shift 1682 vec_masked = _mm_and_ps (*(__m128*)mask,vec); //ready for p 1683 p_masked = _mm_andnot_ps (*(__m128*)mask, p); //ready for vec 1684 tmp = _mm_or_ps(vec_masked, p_masked); 1685 return tmp; 1686 } 1687 1688 _NEON2SSE_INLINE __m128i _MM_MAX_EPI8(__m128i a, __m128i b) 1689 { 1690 __m128i cmp, resa, resb; 1691 cmp = _mm_cmpgt_epi8 (a, b); 1692 resa = _mm_and_si128 (cmp, a); 1693 resb = _mm_andnot_si128 (cmp,b); 1694 return
_mm_or_si128(resa, resb); 1695 } 1696 1697 _NEON2SSE_INLINE __m128i _MM_MAX_EPI32(__m128i a, __m128i b) 1698 { 1699 __m128i cmp, resa, resb; 1700 cmp = _mm_cmpgt_epi32(a, b); 1701 resa = _mm_and_si128 (cmp, a); 1702 resb = _mm_andnot_si128 (cmp,b); 1703 return _mm_or_si128(resa, resb); 1704 } 1705 1706 _NEON2SSE_INLINE __m128i _MM_MAX_EPU16(__m128i a, __m128i b) 1707 { 1708 __m128i c8000, b_s, a_s, cmp; 1709 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff 1710 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 1711 b_s = _mm_sub_epi16 (b, c8000); 1712 a_s = _mm_sub_epi16 (a, c8000); 1713 cmp = _mm_cmpgt_epi16 (a_s, b_s); //no unsigned comparison, need to go to signed 1714 a_s = _mm_and_si128 (cmp,a); 1715 b_s = _mm_andnot_si128 (cmp,b); 1716 return _mm_or_si128(a_s, b_s); 1717 } 1718 1719 _NEON2SSE_INLINE __m128i _MM_MAX_EPU32(__m128i a, __m128i b) 1720 { 1721 __m128i c80000000, b_s, a_s, cmp; 1722 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff 1723 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 1724 b_s = _mm_sub_epi32 (b, c80000000); 1725 a_s = _mm_sub_epi32 (a, c80000000); 1726 cmp = _mm_cmpgt_epi32 (a_s, b_s); //no unsigned comparison, need to go to signed 1727 a_s = _mm_and_si128 (cmp,a); 1728 b_s = _mm_andnot_si128 (cmp,b); 1729 return _mm_or_si128(a_s, b_s); 1730 } 1731 1732 _NEON2SSE_INLINE __m128i _MM_MIN_EPI8(__m128i a, __m128i b) 1733 { 1734 __m128i cmp, resa, resb; 1735 cmp = _mm_cmpgt_epi8 (b, a); 1736 resa = _mm_and_si128 (cmp, a); 1737 resb = _mm_andnot_si128 (cmp,b); 1738 return _mm_or_si128(resa, resb); 1739 } 1740 1741 _NEON2SSE_INLINE __m128i _MM_MIN_EPI32(__m128i a, __m128i b) 1742 { 1743 __m128i cmp, resa, resb; 1744 cmp = _mm_cmpgt_epi32(b, a); 1745 resa = _mm_and_si128 (cmp, a); 1746 resb = _mm_andnot_si128 (cmp,b); 1747 return _mm_or_si128(resa, resb); 1748 } 1749 1750 _NEON2SSE_INLINE __m128i _MM_MIN_EPU16(__m128i a, __m128i b) 1751 { 1752 __m128i c8000, b_s, a_s, cmp; 1753 c8000 = _mm_cmpeq_epi16 (a,a); //0xffff 1754 c8000 = _mm_slli_epi16 (c8000, 15); //0x8000 1755 b_s = _mm_sub_epi16 (b, c8000); 1756 a_s = _mm_sub_epi16 (a, c8000); 1757 cmp = _mm_cmpgt_epi16 (b_s, a_s); //no unsigned comparison, need to go to signed 1758 a_s = _mm_and_si128 (cmp,a); 1759 b_s = _mm_andnot_si128 (cmp,b); 1760 return _mm_or_si128(a_s, b_s); 1761 } 1762 1763 _NEON2SSE_INLINE __m128i _MM_MIN_EPU32(__m128i a, __m128i b) 1764 { 1765 __m128i c80000000, b_s, a_s, cmp; 1766 c80000000 = _mm_cmpeq_epi32 (a,a); //0xffffffff 1767 c80000000 = _mm_slli_epi32 (c80000000, 31); //0x80000000 1768 b_s = _mm_sub_epi32 (b, c80000000); 1769 a_s = _mm_sub_epi32 (a, c80000000); 1770 cmp = _mm_cmpgt_epi32 (b_s, a_s); //no unsigned comparison, need to go to signed 1771 a_s = _mm_and_si128 (cmp,a); 1772 b_s = _mm_andnot_si128 (cmp,b); 1773 return _mm_or_si128(a_s, b_s); 1774 } 1775 1776 _NEON2SSE_INLINE __m128i _MM_BLENDV_EPI8(__m128i a, __m128i b, __m128i mask) //this is NOT exact implementation of _mm_blendv_epi8 !!!!! - please see below 1777 { //it assumes mask is either 0xff or 0 always (like in all usecases below) while for the original _mm_blendv_epi8 only MSB mask byte matters. 
1778 __m128i a_masked, b_masked; 1779 b_masked = _mm_and_si128 (mask,b); //use b if mask 0xff 1780 a_masked = _mm_andnot_si128 (mask,a); 1781 return _mm_or_si128(a_masked, b_masked); 1782 } 1783 1784 #if defined(USE_SSSE3) 1785 _NEON2SSE_INLINE __m128i _MM_PACKUS_EPI32(__m128i a, __m128i b) 1786 { 1787 _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; 1788 __m128i a16, b16, res, reshi,cmp, zero; 1789 zero = _mm_setzero_si128(); 1790 a16 = _mm_shuffle_epi8 (a, *(__m128i*) mask8_32_even_odd); 1791 b16 = _mm_shuffle_epi8 (b, *(__m128i*) mask8_32_even_odd); 1792 res = _mm_unpacklo_epi64(a16, b16); //result without saturation 1793 reshi = _mm_unpackhi_epi64(a16, b16); //hi part of result used for saturation 1794 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero 1795 res = _mm_andnot_si128(cmp,res); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 1796 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive 1797 return _mm_or_si128(res, cmp); //if cmp positive we are out of 16 bits and need to saturate to 0xffff 1798 } 1799 #endif 1800 1801 #if defined(USE_SSSE3) 1802 _NEON2SSE_INLINE __m128i _MM_PACKUS1_EPI32(__m128i a) 1803 { 1804 _NEON2SSE_ALIGN_16 int8_t mask8_32_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7,10,11,14,15}; 1805 __m128i a16, res, reshi,cmp, zero; 1806 zero = _mm_setzero_si128(); 1807 a16 = _mm_shuffle_epi8 (a, *(__m128i*)mask8_32_even_odd); 1808 reshi = _mm_unpackhi_epi64(a16, a16); //hi part of result used for saturation 1809 cmp = _mm_cmpgt_epi16(zero, reshi); //if cmp<0 the result should be zero 1810 res = _mm_andnot_si128(cmp, a16); //if cmp zero - do nothing, otherwise cmp <0 and the result is 0 1811 cmp = _mm_cmpgt_epi16(reshi,zero); //if cmp positive 1812 return _mm_or_si128(res, cmp); //if cmp positive we are out of 16 bits and need to saturate to 0xffff 1813 } 1814 #endif 1815 1816 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(__m128i _MM_MULLO_EPI32(__m128i a, __m128i b), _NEON2SSE_REASON_SLOW_SERIAL) 1817 { 1818 _NEON2SSE_ALIGN_16 int32_t atmp[4], btmp[4], res[4]; 1819 int64_t res64; 1820 int i; 1821 _mm_store_si128((__m128i*)atmp, a); 1822 _mm_store_si128((__m128i*)btmp, b); 1823 for (i = 0; i<4; i++) { 1824 res64 = (int64_t)atmp[i] * btmp[i]; //widen before multiplying to avoid signed overflow 1825 res[i] = (int)(res64 & 0xffffffff); 1826 } 1827 return _mm_load_si128((__m128i*)res); 1828 } 1829 1830 #if defined(USE_SSSE3) 1831 _NEON2SSE_INLINE __m128i _MM_MUL_EPI32(__m128i a, __m128i b) 1832 { 1833 __m128i sign, zero, mul_us, a_neg, b_neg, mul_us_neg; 1834 sign = _mm_xor_si128 (a, b); 1835 sign = _mm_srai_epi32 (sign, 31); //promote sign bit to all fields, all ones if negative and all 0 if positive 1836 zero = _mm_setzero_si128(); 1837 a_neg = _mm_abs_epi32 (a); //absolute value of a 1838 b_neg = _mm_abs_epi32 (b); //absolute value of b 1839 mul_us = _mm_mul_epu32 (a_neg, b_neg); //uses 0 and 2nd data lanes, (abs), the multiplication gives 64 bit result 1840 mul_us_neg = _mm_sub_epi64(zero, mul_us); 1841 mul_us_neg = _mm_and_si128(sign, mul_us_neg); 1842 mul_us = _mm_andnot_si128(sign, mul_us); 1843 return _mm_or_si128 (mul_us, mul_us_neg); 1844 } 1845 #endif 1846 #endif //SSE4 1847 1848 #ifndef _MM_INSERT_EPI64 //special case of SSE4 and _M_X64 1849 _NEON2SSE_INLINE __m128i _MM_INSERT_EPI64(__m128i vec, int p, const int LANE) 1850 { 1851 _NEON2SSE_ALIGN_16 uint64_t pvec[2] = {0,0}; 1852 _NEON2SSE_ALIGN_16 uint64_t mask[2] = {0xffffffffffffffff,0xffffffffffffffff}; 1853 __m128i vec_masked, p_masked; 1854 pvec[LANE] = p; 1855 mask[LANE] = 0x0; 1856 vec_masked
= _mm_and_si128 (*(__m128i*)mask,vec); //ready for p 1857 p_masked = _mm_andnot_si128 (*(__m128i*)mask,*(__m128i*)pvec); //ready for vec 1858 return _mm_or_si128(vec_masked, p_masked); 1859 } 1860 #endif 1861 #ifndef _MM_EXTRACT_EPI64 //special case of SSE4 and _M_X64 1862 _NEON2SSE_INLINE int64_t _MM_EXTRACT_EPI64(__m128i val, const int LANE) 1863 { 1864 _NEON2SSE_ALIGN_16 int64_t tmp[2]; 1865 _mm_store_si128((__m128i*)tmp, val); 1866 return tmp[LANE]; 1867 } 1868 #endif 1869 1870 int32x4_t vqd_s32(int32x4_t a); //Doubling saturation for signed ints 1871 _NEON2SSE_INLINE int32x4_t vqd_s32(int32x4_t a) 1872 { //Overflow happens only if a and the doubled result have opposite signs 1873 __m128i c7fffffff, res, res_sat, res_xor_a; 1874 c7fffffff = _mm_set1_epi32(0x7fffffff); 1875 res = _mm_slli_epi32 (a, 1); // res = a*2 1876 res_sat = _mm_srli_epi32(a, 31); 1877 res_sat = _mm_add_epi32(res_sat, c7fffffff); 1878 res_xor_a = _mm_xor_si128(res, a); 1879 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sign bit: all ones if <0, all zeros otherwise 1880 res_sat = _mm_and_si128(res_xor_a, res_sat); 1881 res = _mm_andnot_si128(res_xor_a, res); 1882 return _mm_or_si128(res, res_sat); 1883 } 1884 1885 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1886 //************************************************************************* 1887 //************************************************************************* 1888 //***************** Functions redefinition/implementation starts here ***** 1889 //************************************************************************* 1890 //************************************************************************* 1891 //!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 1892 1893 /*If a unified intrinsics solution is necessary, please define your SSE intrinsics wrappers here, as in the following sample: 1894 #ifdef ARM 1895 #define vector_addq_s32 _mm_add_epi32 1896 #else //if we have IA 1897 #endif 1898 1899 ******************************************************************************************** 1900 Functions below are organised in the following way: 1901 1902 Each NEON intrinsic function has one of the following options: 1903 1. its full x86 equivalent SSE intrinsic - in this case the x86 version just follows the NEON one under the corresponding #define statement 1904 2. an x86 implementation using more than one x86 intrinsic. In this case it is shaped as an inlined C function with a return statement 1905 3. a reference to another NEON function returning the same result and implemented in x86 as above. In this case it is shaped as a matching NEON function definition 1906 4. for about 5% of functions, due to the corresponding x86 SIMD unavailability or inefficiency in terms of performance, 1907 the serial implementation is provided along with the corresponding compiler warning. If these functions are on your app critical path 1908 - please consider removing them from your code.
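For example, a user function written with NEON intrinsics needs no source changes to build for x86 with this header.
A minimal sketch (the function name and data layout are hypothetical, n is assumed to be a multiple of 4;
vld1q_f32/vst1q_f32 are the plain NEON loads/stores covered earlier in this file, vaddq_f32 is defined right below):

    void add_float_arrays(float32_t * dst, float32_t const * a, float32_t const * b, int n)
    {
        int i;
        for (i = 0; i < n; i += 4) {
            float32x4_t va = vld1q_f32(a + i);
            float32x4_t vb = vld1q_f32(b + i);
            vst1q_f32(dst + i, vaddq_f32(va, vb)); //option 1 above: vaddq_f32 is just a #define to _mm_add_ps
        }
    }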
1909 */ 1910 1911 //*********************************************************************** 1912 //************************ Vector add ***************************** 1913 //*********************************************************************** 1914 1915 int8x16_t vaddq_s8(int8x16_t a, int8x16_t b); // VADD.I8 q0,q0,q0 1916 #define vaddq_s8 _mm_add_epi8 1917 1918 int16x8_t vaddq_s16(int16x8_t a, int16x8_t b); // VADD.I16 q0,q0,q0 1919 #define vaddq_s16 _mm_add_epi16 1920 1921 int32x4_t vaddq_s32(int32x4_t a, int32x4_t b); // VADD.I32 q0,q0,q0 1922 #define vaddq_s32 _mm_add_epi32 1923 1924 int64x2_t vaddq_s64(int64x2_t a, int64x2_t b); // VADD.I64 q0,q0,q0 1925 #define vaddq_s64 _mm_add_epi64 1926 1927 float32x4_t vaddq_f32(float32x4_t a, float32x4_t b); // VADD.F32 q0,q0,q0 1928 #define vaddq_f32 _mm_add_ps 1929 1930 uint8x16_t vaddq_u8(uint8x16_t a, uint8x16_t b); // VADD.I8 q0,q0,q0 1931 #define vaddq_u8 _mm_add_epi8 1932 1933 uint16x8_t vaddq_u16(uint16x8_t a, uint16x8_t b); // VADD.I16 q0,q0,q0 1934 #define vaddq_u16 _mm_add_epi16 1935 1936 uint32x4_t vaddq_u32(uint32x4_t a, uint32x4_t b); // VADD.I32 q0,q0,q0 1937 #define vaddq_u32 _mm_add_epi32 1938 1939 uint64x2_t vaddq_u64(uint64x2_t a, uint64x2_t b); // VADD.I64 q0,q0,q0 1940 #define vaddq_u64 _mm_add_epi64 1941 1942 //**************************** Vector long add *****************************: 1943 //*********************************************************************** 1944 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 1945 1946 //*************** Vector wide add: vaddw_<type>. Vr[i]:=Va[i]+Vb[i] ****************** 1947 //*************** ********************************************************************* 1948 1949 //******************************Vector halving add: vhadd -> Vr[i]:=(Va[i]+Vb[i])>>1 , result truncated ******************************* 1950 //************************************************************************************************************************* 1951 1952 int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b); // VHADD.S8 q0,q0,q0 1953 _NEON2SSE_INLINE int8x16_t vhaddq_s8(int8x16_t a, int8x16_t b) 1954 { //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 1955 __m128i tmp1, tmp2; 1956 tmp1 = _mm_and_si128(a,b); 1957 tmp2 = _mm_xor_si128(a,b); 1958 tmp2 = vshrq_n_s8(tmp2,1); 1959 return _mm_add_epi8(tmp1,tmp2); 1960 } 1961 1962 int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b); // VHADD.S1 6 q0,q0,q0 1963 _NEON2SSE_INLINE int16x8_t vhaddq_s16(int16x8_t a, int16x8_t b) 1964 { //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 1965 __m128i tmp1, tmp2; 1966 tmp1 = _mm_and_si128(a,b); 1967 tmp2 = _mm_xor_si128(a,b); 1968 tmp2 = _mm_srai_epi16(tmp2,1); 1969 return _mm_add_epi16(tmp1,tmp2); 1970 } 1971 1972 int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b); // VHADD.S32 q0,q0,q0 1973 _NEON2SSE_INLINE int32x4_t vhaddq_s32(int32x4_t a, int32x4_t b) // VHADD.S32 q0,q0,q0 1974 { //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 
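// Note added for clarity on why the trick above is exact: for any integers x + y == (x ^ y) + 2*(x & y)
// (xor is the carry-less sum, (x & y) holds the carry bits), hence (x + y) >> 1 == (x & y) + ((x ^ y) >> 1)
// and the overflowing sum x + y is never formed. E.g. x = 5, y = 7: (5 & 7) + ((5 ^ 7) >> 1) = 5 + 1 = 6;
// with an arithmetic shift it also holds for signed values, e.g. x = -3, y = 2 gives 0 + (-1 >> 1) = -1.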
1975 __m128i tmp1, tmp2; 1976 tmp1 = _mm_and_si128(a,b); 1977 tmp2 = _mm_xor_si128(a,b); 1978 tmp2 = _mm_srai_epi32(tmp2,1); 1979 return _mm_add_epi32(tmp1,tmp2); 1980 } 1981 1982 uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b); // VHADD.U8 q0,q0,q0 1983 _NEON2SSE_INLINE uint8x16_t vhaddq_u8(uint8x16_t a, uint8x16_t b) // VHADD.U8 q0,q0,q0 1984 { 1985 __m128i c1, sum, res; 1986 c1 = _mm_set1_epi8(1); 1987 sum = _mm_avg_epu8(a, b); //result is rounded, need to compensate it 1988 res = _mm_xor_si128(a, b); //for rounding compensation 1989 res = _mm_and_si128(res,c1); //for rounding compensation 1990 return _mm_sub_epi8 (sum, res); //actual rounding compensation 1991 } 1992 1993 uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b); // VHADD.s16 q0,q0,q0 1994 _NEON2SSE_INLINE uint16x8_t vhaddq_u16(uint16x8_t a, uint16x8_t b) // VHADD.s16 q0,q0,q0 1995 { 1996 __m128i sum, res; 1997 sum = _mm_avg_epu16(a, b); //result is rounded, need to compensate it 1998 res = _mm_xor_si128(a, b); //for rounding compensation 1999 res = _mm_slli_epi16 (res,15); //shift left then back right to 2000 res = _mm_srli_epi16 (res,15); //get 1 or zero 2001 return _mm_sub_epi16 (sum, res); //actual rounding compensation 2002 } 2003 2004 uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b); // VHADD.U32 q0,q0,q0 2005 _NEON2SSE_INLINE uint32x4_t vhaddq_u32(uint32x4_t a, uint32x4_t b) // VHADD.U32 q0,q0,q0 2006 { //need to avoid internal overflow, will use the (x&y)+((x^y)>>1). 2007 __m128i tmp1, tmp2; 2008 tmp1 = _mm_and_si128(a,b); 2009 tmp2 = _mm_xor_si128(a,b); 2010 tmp2 = _mm_srli_epi32(tmp2,1); 2011 return _mm_add_epi32(tmp1,tmp2); 2012 } 2013 2014 //************************Vector rounding halving add: vrhadd{q}_<type>. Vr[i]:=(Va[i]+Vb[i]+1)>>1 *************************** 2015 //***************************************************************************************************************************** 2016 2017 //SSE, result rounding!!! 2018 2019 //SSE, result rounding!!! 
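// Note added for clarity: _mm_avg_epu8/_mm_avg_epu16 already compute the rounded unsigned average (a + b + 1) >> 1,
// which is exactly the NEON VRHADD rounding. The signed variants below therefore bias the inputs into the unsigned
// range (add 0x80 / 0x8000), average, and remove the bias: e.g. a = -1, b = 0 becomes 127 and 128, avg = 128,
// and 128 - 128 = 0 == (-1 + 0 + 1) >> 1.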
2020 2021 int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b); // VRHADD.S8 q0,q0,q0 2022 _NEON2SSE_INLINE int8x16_t vrhaddq_s8(int8x16_t a, int8x16_t b) // VRHADD.S8 q0,q0,q0 2023 { //no signed average in x86 SIMD, go to unsigned 2024 __m128i c128, au, bu, sum; 2025 c128 = _mm_set1_epi8(128); 2026 au = _mm_add_epi8(a, c128); 2027 bu = _mm_add_epi8(b, c128); 2028 sum = _mm_avg_epu8(au, bu); 2029 return _mm_sub_epi8 (sum, c128); 2030 } 2031 2032 int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b); // VRHADD.S16 q0,q0,q0 2033 _NEON2SSE_INLINE int16x8_t vrhaddq_s16(int16x8_t a, int16x8_t b) // VRHADD.S16 q0,q0,q0 2034 { //no signed average in x86 SIMD, go to unsigned 2035 __m128i cx8000, au, bu, sum; 2036 cx8000 = _mm_set1_epi16(0x8000); 2037 au = _mm_add_epi16(a, cx8000); 2038 bu = _mm_add_epi16(b, cx8000); 2039 sum = _mm_avg_epu16(au, bu); 2040 return _mm_sub_epi16 (sum, cx8000); 2041 } 2042 2043 int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b); // VRHADD.S32 q0,q0,q0 2044 _NEON2SSE_INLINE int32x4_t vrhaddq_s32(int32x4_t a, int32x4_t b) 2045 { //need to avoid overflow 2046 __m128i a2, b2, res, sum; 2047 a2 = _mm_srai_epi32(a,1); //a2=a/2; 2048 b2 = _mm_srai_epi32(b,1); // b2=b/2; 2049 res = _mm_or_si128(a,b); //for rounding 2050 res = _mm_slli_epi32 (res,31); //shift left then back right to 2051 res = _mm_srli_epi32 (res,31); //get 1 or zero 2052 sum = _mm_add_epi32(a2,b2); 2053 return _mm_add_epi32(sum,res); 2054 } 2055 2056 uint8x16_t vrhaddq_u8(uint8x16_t a, uint8x16_t b); // VRHADD.U8 q0,q0,q0 2057 #define vrhaddq_u8 _mm_avg_epu8 //SSE2, results rounded 2058 2059 uint16x8_t vrhaddq_u16(uint16x8_t a, uint16x8_t b); // VRHADD.s16 q0,q0,q0 2060 #define vrhaddq_u16 _mm_avg_epu16 //SSE2, results rounded 2061 2062 uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b); // VRHADD.U32 q0,q0,q0 2063 _NEON2SSE_INLINE uint32x4_t vrhaddq_u32(uint32x4_t a, uint32x4_t b) // VRHADD.U32 q0,q0,q0 2064 { //need to avoid overflow 2065 __m128i a2, b2, res, sum; 2066 a2 = _mm_srli_epi32(a,1); //a2=a/2; 2067 b2 = _mm_srli_epi32(b,1); // b2=b/2; 2068 res = _mm_or_si128(a,b); //for rounding 2069 res = _mm_slli_epi32 (res,31); //shift left then back right to 2070 res = _mm_srli_epi32 (res,31); //get 1 or zero 2071 sum = _mm_add_epi32(a2,b2); 2072 return _mm_add_epi32(sum,res); 2073 } 2074 2075 //****************** VQADD: Vector saturating add ************************ 2076 //************************************************************************ 2077 2078 int8x16_t vqaddq_s8(int8x16_t a, int8x16_t b); // VQADD.S8 q0,q0,q0 2079 #define vqaddq_s8 _mm_adds_epi8 2080 2081 int16x8_t vqaddq_s16(int16x8_t a, int16x8_t b); // VQADD.S16 q0,q0,q0 2082 #define vqaddq_s16 _mm_adds_epi16 2083 2084 int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b); // VQADD.S32 q0,q0,q0 2085 _NEON2SSE_INLINE int32x4_t vqaddq_s32(int32x4_t a, int32x4_t b) 2086 { //no corresponding x86 SIMD soulution, special tricks are necessary. 
Overflow happens only if a and b have the same sign and sum has the opposite sign 2087 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a_; 2088 c7fffffff = _mm_set1_epi32(0x7fffffff); 2089 res = _mm_add_epi32(a, b); 2090 res_sat = _mm_srli_epi32(a, 31); 2091 res_sat = _mm_add_epi32(res_sat, c7fffffff); 2092 res_xor_a = _mm_xor_si128(res, a); 2093 b_xor_a_ = _mm_xor_si128(b, a); 2094 res_xor_a = _mm_andnot_si128(b_xor_a_, res_xor_a); 2095 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise 2096 res_sat = _mm_and_si128(res_xor_a, res_sat); 2097 res = _mm_andnot_si128(res_xor_a, res); 2098 return _mm_or_si128(res, res_sat); 2099 } 2100 2101 int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b); // VQADD.S64 q0,q0,q0 2102 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqaddq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 2103 { 2104 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; 2105 _mm_store_si128((__m128i*)atmp, a); 2106 _mm_store_si128((__m128i*)btmp, b); 2107 res[0] = atmp[0] + btmp[0]; 2108 res[1] = atmp[1] + btmp[1]; 2109 2110 atmp[0] = (atmp[0] >> 63) + (~_SIGNBIT64); 2111 atmp[1] = (atmp[1] >> 63) + (~_SIGNBIT64); 2112 2113 if ((int64_t)((btmp[0] ^ atmp[0]) | ~(res[0] ^ btmp[0]))>=0) { 2114 res[0] = atmp[0]; 2115 } 2116 if ((int64_t)((btmp[1] ^ atmp[1]) | ~(res[1] ^ btmp[1]))>=0) { 2117 res[1] = atmp[1]; 2118 } 2119 return _mm_load_si128((__m128i*)res); 2120 } 2121 2122 uint8x16_t vqaddq_u8(uint8x16_t a, uint8x16_t b); // VQADD.U8 q0,q0,q0 2123 #define vqaddq_u8 _mm_adds_epu8 2124 2125 uint16x8_t vqaddq_u16(uint16x8_t a, uint16x8_t b); // VQADD.s16 q0,q0,q0 2126 #define vqaddq_u16 _mm_adds_epu16 2127 2128 uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b); // VQADD.U32 q0,q0,q0 2129 _NEON2SSE_INLINE uint32x4_t vqaddq_u32(uint32x4_t a, uint32x4_t b) 2130 { 2131 __m128i c80000000, cmp, subsum, suba, sum; 2132 c80000000 = _mm_set1_epi32 (0x80000000); 2133 sum = _mm_add_epi32 (a, b); 2134 subsum = _mm_sub_epi32 (sum, c80000000); 2135 suba = _mm_sub_epi32 (a, c80000000); 2136 cmp = _mm_cmpgt_epi32 ( suba, subsum); //no unsigned comparison, need to go to signed 2137 return _mm_or_si128 (sum, cmp); //saturation 2138 } 2139 2140 uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b); // VQADD.U64 q0,q0,q0 2141 #ifdef USE_SSE4 2142 _NEON2SSE_INLINE uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b) 2143 { 2144 __m128i c80000000, sum, cmp, suba, subsum; 2145 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); 2146 sum = _mm_add_epi64 (a, b); 2147 subsum = _mm_sub_epi64 (sum, c80000000); 2148 suba = _mm_sub_epi64 (a, c80000000); 2149 cmp = _mm_cmpgt_epi64 ( suba, subsum); //no unsigned comparison, need to go to signed, SSE4.2!!! 
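// Note added for clarity: there is no unsigned 64-bit compare, so both a and the (possibly wrapped) sum are biased
// by 2^63 (c80000000 holds 0x8000000000000000 in each 64-bit lane), which maps unsigned order onto signed order for
// _mm_cmpgt_epi64. cmp is all ones exactly in the lanes where a > sum, i.e. where the addition wrapped around, and
// or-ing it into sum forces those lanes to 0xffffffffffffffff, the saturated result.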
2150 return _mm_or_si128 (sum, cmp); //saturation 2151 } 2152 #else 2153 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqaddq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 2154 { 2155 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; 2156 _mm_store_si128((__m128i*)atmp, a); 2157 _mm_store_si128((__m128i*)btmp, b); 2158 res[0] = atmp[0] + btmp[0]; 2159 res[1] = atmp[1] + btmp[1]; 2160 if (res[0] < atmp[0]) res[0] = ~(uint64_t)0; 2161 if (res[1] < atmp[1]) res[1] = ~(uint64_t)0; 2162 return _mm_load_si128((__m128i*)(res)); 2163 } 2164 #endif 2165 2166 //******************* Vector add high half (truncated) ****************** 2167 //************************************************************************ 2168 2169 //*********** Vector rounding add high half: vraddhn_<type> ******************. 2170 //*************************************************************************** 2171 2172 //********************************************************************************** 2173 //********* Multiplication ************************************* 2174 //************************************************************************************** 2175 2176 //Vector multiply: vmul -> Vr[i] := Va[i] * Vb[i] 2177 //As we don't go to wider result functions are equal to "multiply low" in x86 2178 2179 #if defined(USE_SSSE3) 2180 int8x16_t vmulq_s8(int8x16_t a, int8x16_t b); // VMUL.I8 q0,q0,q0 2181 _NEON2SSE_INLINE int8x16_t vmulq_s8(int8x16_t a, int8x16_t b) // VMUL.I8 q0,q0,q0 2182 { // no 8 bit simd multiply, need to go to 16 bits 2183 //solution may be not optimal 2184 __m128i a16, b16, r16_1, r16_2; 2185 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; 2186 a16 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 2187 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 2188 r16_1 = _mm_mullo_epi16 (a16, b16); 2189 //swap hi and low part of a and b to process the remaining data 2190 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 2191 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 2192 a16 = _MM_CVTEPI8_EPI16 (a16); // SSE 4.1 2193 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 __m128i r16_2 2194 2195 r16_2 = _mm_mullo_epi16 (a16, b16); 2196 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*)mask8_16_even_odd); //return to 8 bit 2197 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*)mask8_16_even_odd); //return to 8 bit 2198 2199 return _mm_unpacklo_epi64(r16_1, r16_2); 2200 } 2201 #endif 2202 2203 int16x8_t vmulq_s16(int16x8_t a, int16x8_t b); // VMUL.I16 q0,q0,q0 2204 #define vmulq_s16 _mm_mullo_epi16 2205 2206 int32x4_t vmulq_s32(int32x4_t a, int32x4_t b); // VMUL.I32 q0,q0,q0 2207 #define vmulq_s32 _MM_MULLO_EPI32 //SSE4.1 2208 2209 float32x4_t vmulq_f32(float32x4_t a, float32x4_t b); // VMUL.F32 q0,q0,q0 2210 #define vmulq_f32 _mm_mul_ps 2211 2212 uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b); // VMUL.I8 q0,q0,q0 2213 _NEON2SSE_INLINE uint8x16_t vmulq_u8(uint8x16_t a, uint8x16_t b) // VMUL.I8 q0,q0,q0 2214 { // no 8 bit simd multiply, need to go to 16 bits 2215 //solution may be not optimal 2216 __m128i maskff, a16, b16, r16_1, r16_2; 2217 maskff = _mm_set1_epi16(0xff); 2218 a16 = _MM_CVTEPU8_EPI16 (a); // SSE 4.1 2219 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 2220 r16_1 = _mm_mullo_epi16 (a16, b16); 2221 r16_1 = _mm_and_si128(r16_1, maskff); //to avoid saturation 2222 //swap hi and low part of a and b to process the remaining data 2223 a16 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 2224 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 2225 a16 = _MM_CVTEPI8_EPI16 (a16); // 
SSE 4.1 2226 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 2227 2228 r16_2 = _mm_mullo_epi16 (a16, b16); 2229 r16_2 = _mm_and_si128(r16_2, maskff); //to avoid saturation 2230 return _mm_packus_epi16 (r16_1, r16_2); 2231 } 2232 2233 uint16x8_t vmulq_u16(uint16x8_t a, uint16x8_t b); // VMUL.I16 q0,q0,q0 2234 #define vmulq_u16 _mm_mullo_epi16 2235 2236 uint32x4_t vmulq_u32(uint32x4_t a, uint32x4_t b); // VMUL.I32 q0,q0,q0 2237 #define vmulq_u32 _MM_MULLO_EPI32 //SSE4.1 2238 2239 poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b); // VMUL.P8 q0,q0,q0 2240 _NEON2SSE_INLINE poly8x16_t vmulq_p8(poly8x16_t a, poly8x16_t b) 2241 { //may be optimized 2242 __m128i c1, res, tmp, bmasked; 2243 int i; 2244 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 2245 c1 = vshrq_n_u8(c1,7); //0x1 2246 bmasked = _mm_and_si128(b, c1); //0x1 2247 res = vmulq_u8(a, bmasked); 2248 for(i = 1; i<8; i++) { 2249 c1 = _mm_slli_epi16(c1,1); //shift mask left by 1, 16 bit shift is OK here 2250 bmasked = _mm_and_si128(b, c1); //0x1 2251 tmp = vmulq_u8(a, bmasked); 2252 res = _mm_xor_si128(res, tmp); 2253 } 2254 return res; 2255 } 2256 2257 //************************* Vector long multiply *********************************** 2258 //**************************************************************************** 2259 2260 //****************Vector saturating doubling long multiply ************************** 2261 //***************************************************************** 2262 2263 //********************* Vector multiply accumulate: vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************************ 2264 //****************************************************************************************** 2265 2266 #if defined(USE_SSSE3) 2267 int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLA.I8 q0,q0,q0 2268 _NEON2SSE_INLINE int8x16_t vmlaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLA.I8 q0,q0,q0 2269 { //solution may be not optimal 2270 // no 8 bit simd multiply, need to go to 16 bits 2271 __m128i b16, c16, r16_1, a_2,r16_2; 2272 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; 2273 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 2274 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 2275 r16_1 = _mm_mullo_epi16 (b16, c16); 2276 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits 2277 r16_1 = _mm_add_epi8 (r16_1, a); 2278 //swap hi and low part of a, b and c to process the remaining data 2279 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 2280 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 2281 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 2282 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 2283 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 2284 2285 r16_2 = _mm_mullo_epi16 (b16, c16); 2286 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 2287 r16_2 = _mm_add_epi8(r16_2, a_2); 2288 return _mm_unpacklo_epi64(r16_1,r16_2); 2289 } 2290 #endif 2291 2292 int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLA.I16 q0,q0,q0 2293 _NEON2SSE_INLINE int16x8_t vmlaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLA.I16 q0,q0,q0 2294 { 2295 __m128i res; 2296 res = _mm_mullo_epi16 (c, b); 2297 return _mm_add_epi16 (res, a); 2298 } 2299 2300 int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLA.I32 q0,q0,q0 2301 _NEON2SSE_INLINE int32x4_t vmlaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLA.I32 q0,q0,q0 2302 { 2303 __m128i res; 2304 res = _MM_MULLO_EPI32 (c, b); //SSE4.1 2305 return _mm_add_epi32 (res, a); 2306 } 2307 2308 float32x4_t 
vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLA.F32 q0,q0,q0 2309 _NEON2SSE_INLINE float32x4_t vmlaq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLA.F32 q0,q0,q0 2310 { //fma is coming soon, but right now: 2311 __m128 res; 2312 res = _mm_mul_ps (c, b); 2313 return _mm_add_ps (a, res); 2314 } 2315 2316 #if defined(USE_SSSE3) 2317 uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLA.I8 q0,q0,q0 2318 _NEON2SSE_INLINE uint8x16_t vmlaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLA.I8 q0,q0,q0 2319 { //solution may be not optimal 2320 // no 8 bit simd multiply, need to go to 16 bits 2321 __m128i b16, c16, r16_1, a_2, r16_2; 2322 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; 2323 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 2324 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 2325 r16_1 = _mm_mullo_epi16 (b16, c16); 2326 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits 2327 r16_1 = _mm_add_epi8 (r16_1, a); 2328 //swap hi and low part of a, b and c to process the remaining data 2329 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 2330 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 2331 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 2332 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 2333 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 2334 2335 r16_2 = _mm_mullo_epi16 (b16, c16); 2336 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 2337 r16_2 = _mm_add_epi8(r16_2, a_2); 2338 return _mm_unpacklo_epi64(r16_1,r16_2); 2339 } 2340 #endif 2341 2342 uint16x8_t vmlaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLA.I16 q0,q0,q0 2343 #define vmlaq_u16 vmlaq_s16 2344 2345 uint32x4_t vmlaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLA.I32 q0,q0,q0 2346 #define vmlaq_u32 vmlaq_s32 2347 2348 //********************** Vector widening multiply accumulate (long multiply accumulate): 2349 // vmla -> Vr[i] := Va[i] + Vb[i] * Vc[i] ************** 2350 //******************************************************************************************** 2351 2352 //******************** Vector multiply subtract: vmls -> Vr[i] := Va[i] - Vb[i] * Vc[i] *************************************** 2353 //******************************************************************************************** 2354 2355 #if defined(USE_SSSE3) 2356 int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VMLS.I8 q0,q0,q0 2357 _NEON2SSE_INLINE int8x16_t vmlsq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VMLS.I8 q0,q0,q0 2358 { //solution may be not optimal 2359 // no 8 bit simd multiply, need to go to 16 bits 2360 __m128i b16, c16, r16_1, a_2, r16_2; 2361 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; 2362 b16 = _MM_CVTEPI8_EPI16 (b); // SSE 4.1 2363 c16 = _MM_CVTEPI8_EPI16 (c); // SSE 4.1 2364 r16_1 = _mm_mullo_epi16 (b16, c16); 2365 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); 2366 r16_1 = _mm_sub_epi8 (a, r16_1); 2367 //swap hi and low part of a, b, c to process the remaining data 2368 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 2369 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 2370 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 2371 b16 = _MM_CVTEPI8_EPI16 (b16); // SSE 4.1 2372 c16 = _MM_CVTEPI8_EPI16 (c16); // SSE 4.1 2373 2374 r16_2 = _mm_mullo_epi16 (b16, c16); 2375 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 2376 r16_2 = _mm_sub_epi8 (a_2, r16_2); 2377 return _mm_unpacklo_epi64(r16_1,r16_2); 
2378 } 2379 #endif 2380 2381 int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VMLS.I16 q0,q0,q0 2382 _NEON2SSE_INLINE int16x8_t vmlsq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VMLS.I16 q0,q0,q0 2383 { 2384 __m128i res; 2385 res = _mm_mullo_epi16 (c, b); 2386 return _mm_sub_epi16 (a, res); 2387 } 2388 2389 int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VMLS.I32 q0,q0,q0 2390 _NEON2SSE_INLINE int32x4_t vmlsq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VMLS.I32 q0,q0,q0 2391 { 2392 __m128i res; 2393 res = _MM_MULLO_EPI32 (c, b); //SSE4.1 2394 return _mm_sub_epi32 (a, res); 2395 } 2396 2397 float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c); // VMLS.F32 q0,q0,q0 2398 _NEON2SSE_INLINE float32x4_t vmlsq_f32(float32x4_t a, float32x4_t b, float32x4_t c) // VMLS.F32 q0,q0,q0 2399 { 2400 __m128 res; 2401 res = _mm_mul_ps (c, b); 2402 return _mm_sub_ps (a, res); 2403 } 2404 2405 #if defined(USE_SSSE3) 2406 uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VMLS.I8 q0,q0,q0 2407 _NEON2SSE_INLINE uint8x16_t vmlsq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) // VMLS.I8 q0,q0,q0 2408 { //solution may be not optimal 2409 // no 8 bit simd multiply, need to go to 16 bits 2410 __m128i b16, c16, r16_1, a_2, r16_2; 2411 _NEON2SSE_ALIGN_16 int8_t mask8_16_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }; 2412 b16 = _MM_CVTEPU8_EPI16 (b); // SSE 4.1 2413 c16 = _MM_CVTEPU8_EPI16 (c); // SSE 4.1 2414 r16_1 = _mm_mullo_epi16 (b16, c16); 2415 r16_1 = _mm_shuffle_epi8 (r16_1, *(__m128i*) mask8_16_even_odd); //return to 8 bits 2416 r16_1 = _mm_sub_epi8 (a, r16_1); 2417 //swap hi and low part of a, b and c to process the remaining data 2418 a_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 2419 b16 = _mm_shuffle_epi32 (b, _SWAP_HI_LOW32); 2420 c16 = _mm_shuffle_epi32 (c, _SWAP_HI_LOW32); 2421 b16 = _MM_CVTEPU8_EPI16 (b16); // SSE 4.1 2422 c16 = _MM_CVTEPU8_EPI16 (c16); // SSE 4.1 2423 2424 r16_2 = _mm_mullo_epi16 (b16, c16); 2425 r16_2 = _mm_shuffle_epi8 (r16_2, *(__m128i*) mask8_16_even_odd); 2426 r16_2 = _mm_sub_epi8(a_2, r16_2); 2427 return _mm_unpacklo_epi64(r16_1,r16_2); 2428 } 2429 #endif 2430 2431 uint16x8_t vmlsq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VMLS.I16 q0,q0,q0 2432 #define vmlsq_u16 vmlsq_s16 2433 2434 uint32x4_t vmlsq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VMLS.I32 q0,q0,q0 2435 #define vmlsq_u32 vmlsq_s32 2436 2437 //******************** Vector multiply subtract long (widening multiply subtract) ************************************ 2438 //************************************************************************************************************* 2439 2440 //****** Vector saturating doubling multiply high ********************** 2441 //************************************************************************* 2442 //For some ARM implementations if the multiply high result is all 0xffffffff then it is not doubled. 
We do the same here 2443 2444 int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b); // VQDMULH.S16 q0,q0,q0 2445 _NEON2SSE_INLINE int16x8_t vqdmulhq_s16(int16x8_t a, int16x8_t b) // VQDMULH.S16 q0,q0,q0 2446 { 2447 __m128i res_sat, cffff, mask, res; 2448 res = _mm_mulhi_epi16 (a, b); 2449 cffff = _mm_cmpeq_epi16(res,res); //0xffff 2450 mask = _mm_cmpeq_epi16(res, cffff); //if ffff need to saturate 2451 res_sat = _mm_adds_epi16(res, res); //res *= 2 and saturate 2452 return _mm_or_si128(mask, res_sat); 2453 } 2454 2455 #if defined(USE_SSSE3) 2456 int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b); // VQDMULH.S32 q0,q0,q0 2457 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 2458 { // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target 2459 __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1; 2460 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 2461 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 2462 mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 2463 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 2464 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 2465 mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 2466 mul = _mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 2467 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 2468 mul = _mm_unpacklo_epi64(mul, mul1); 2469 cffffffff = _mm_cmpeq_epi32(mul,mul); //0xffffffff 2470 mask = _mm_cmpeq_epi32(mul, cffffffff); //if ffffffff need to saturate 2471 res_sat = vqd_s32(mul); 2472 return _mm_or_si128(mask, res_sat); 2473 } 2474 #endif 2475 2476 //********* Vector saturating rounding doubling multiply high **************** 2477 //**************************************************************************** 2478 //If use _mm_mulhrs_xx functions the result may differ from NEON one a little due to different rounding rules and order 2479 2480 #if defined(USE_SSSE3) 2481 int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b); // VQRDMULH.S16 q0,q0,q0 2482 _NEON2SSE_INLINE int16x8_t vqrdmulhq_s16(int16x8_t a, int16x8_t b) // VQRDMULH.S16 q0,q0,q0 2483 { 2484 __m128i res_sat, cffff, mask, res; 2485 res = _mm_mulhrs_epi16 (a, b); 2486 cffff = _mm_cmpeq_epi16(res,res); //0xffff 2487 mask = _mm_cmpeq_epi16(res, cffff); //if ffff need to saturate 2488 res_sat = _mm_adds_epi16(res, res); //res *= 2 and saturate 2489 return _mm_or_si128(mask, res_sat); 2490 } 2491 #endif 2492 2493 #if defined(USE_SSSE3) 2494 int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b); // VQRDMULH.S32 q0,q0,q0 2495 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 2496 { // no multiply high 32 bit SIMD in IA32, may be not optimal compared with a serial solution for the SSSE3 target 2497 __m128i ab, ba, res_sat, cffffffff, mask, mul, mul1, mask1; 2498 ab = _mm_unpacklo_epi32 (a, b); //a0, b0, a1,b1 2499 ba = _mm_unpacklo_epi32 (b, a); //b0, a0, b1,a1 2500 mul = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 2501 ab = _mm_unpackhi_epi32 (a, b); //a2, b2, a3,b3 2502 ba = _mm_unpackhi_epi32 (b, a); //b2, a2, b3,a3 2503 mul1 = _MM_MUL_EPI32(ab, ba); //uses 1rst and 3rd data lanes, the multiplication gives 64 bit result 2504 mul = 
_mm_shuffle_epi32 (mul, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 2505 mul1 = _mm_shuffle_epi32 (mul1, 1 | (3 << 2) | (0 << 4) | (2 << 6)); //shuffle the data to get 2 32-bits 2506 mul = _mm_unpacklo_epi64(mul, mul1); 2507 cffffffff = _mm_cmpeq_epi32(mul,mul); //0xffffffff 2508 mask1 = _mm_slli_epi32(mul, 17); //shift left then back right to 2509 mask1 = _mm_srli_epi32(mul,31); //get 15-th bit 1 or zero 2510 mul = _mm_add_epi32 (mul, mask1); //actual rounding 2511 mask = _mm_cmpeq_epi32(mul, cffffffff); //if ffffffff need to saturate 2512 res_sat = vqd_s32(mul); 2513 return _mm_or_si128(mask, res_sat); 2514 } 2515 #endif 2516 2517 //*************Vector widening saturating doubling multiply accumulate (long saturating doubling multiply accumulate) ***** 2518 //************************************************************************************************************************* 2519 2520 //************************************************************************************ 2521 //****************** Vector subtract *********************************************** 2522 //************************************************************************************ 2523 2524 int8x16_t vsubq_s8(int8x16_t a, int8x16_t b); // VSUB.I8 q0,q0,q0 2525 #define vsubq_s8 _mm_sub_epi8 2526 2527 int16x8_t vsubq_s16(int16x8_t a, int16x8_t b); // VSUB.I16 q0,q0,q0 2528 #define vsubq_s16 _mm_sub_epi16 2529 2530 int32x4_t vsubq_s32(int32x4_t a, int32x4_t b); // VSUB.I32 q0,q0,q0 2531 #define vsubq_s32 _mm_sub_epi32 2532 2533 int64x2_t vsubq_s64(int64x2_t a, int64x2_t b); // VSUB.I64 q0,q0,q0 2534 #define vsubq_s64 _mm_sub_epi64 2535 2536 float32x4_t vsubq_f32(float32x4_t a, float32x4_t b); // VSUB.F32 q0,q0,q0 2537 #define vsubq_f32 _mm_sub_ps 2538 2539 uint8x16_t vsubq_u8(uint8x16_t a, uint8x16_t b); // VSUB.I8 q0,q0,q0 2540 #define vsubq_u8 _mm_sub_epi8 2541 2542 uint16x8_t vsubq_u16(uint16x8_t a, uint16x8_t b); // VSUB.I16 q0,q0,q0 2543 #define vsubq_u16 _mm_sub_epi16 2544 2545 uint32x4_t vsubq_u32(uint32x4_t a, uint32x4_t b); // VSUB.I32 q0,q0,q0 2546 #define vsubq_u32 _mm_sub_epi32 2547 2548 uint64x2_t vsubq_u64(uint64x2_t a, uint64x2_t b); // VSUB.I64 q0,q0,q0 2549 #define vsubq_u64 _mm_sub_epi64 2550 2551 //***************Vector long subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ****************** 2552 //*********************************************************************************** 2553 //Va, Vb have equal lane sizes, result is a 128 bit vector of lanes that are twice the width. 2554 2555 //***************** Vector wide subtract: vsub -> Vr[i]:=Va[i]-Vb[i] ********************************** 2556 //***************************************************************************************************** 2557 2558 //************************Vector saturating subtract ********************************* 2559 //************************************************************************************* 2560 2561 int8x16_t vqsubq_s8(int8x16_t a, int8x16_t b); // VQSUB.S8 q0,q0,q0 2562 #define vqsubq_s8 _mm_subs_epi8 2563 2564 int16x8_t vqsubq_s16(int16x8_t a, int16x8_t b); // VQSUB.S16 q0,q0,q0 2565 #define vqsubq_s16 _mm_subs_epi16 2566 2567 int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b); // VQSUB.S32 q0,q0,q0 2568 _NEON2SSE_INLINE int32x4_t vqsubq_s32(int32x4_t a, int32x4_t b) 2569 { //no corresponding x86 SIMD soulution, special tricks are necessary. 
The overflow is possible only if a and b have opposite signs and sub has opposite sign to a 2570 __m128i c7fffffff, res, res_sat, res_xor_a, b_xor_a; 2571 c7fffffff = _mm_set1_epi32(0x7fffffff); 2572 res = _mm_sub_epi32(a, b); 2573 res_sat = _mm_srli_epi32(a, 31); 2574 res_sat = _mm_add_epi32(res_sat, c7fffffff); 2575 res_xor_a = _mm_xor_si128(res, a); 2576 b_xor_a = _mm_xor_si128(b, a); 2577 res_xor_a = _mm_and_si128(b_xor_a, res_xor_a); 2578 res_xor_a = _mm_srai_epi32(res_xor_a,31); //propagate the sigh bit, all ffff if <0 all ones otherwise 2579 res_sat = _mm_and_si128(res_xor_a, res_sat); 2580 res = _mm_andnot_si128(res_xor_a, res); 2581 return _mm_or_si128(res, res_sat); 2582 } 2583 2584 int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b); // VQSUB.S64 q0,q0,q0 2585 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqsubq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) //no optimal SIMD soulution 2586 { 2587 _NEON2SSE_ALIGN_16 int64_t atmp[2], btmp[2]; 2588 _NEON2SSE_ALIGN_16 uint64_t res[2]; 2589 _mm_store_si128((__m128i*)atmp, a); 2590 _mm_store_si128((__m128i*)btmp, b); 2591 res[0] = atmp[0] - btmp[0]; 2592 res[1] = atmp[1] - btmp[1]; 2593 if (((res[0] ^ atmp[0]) & _SIGNBIT64) && ((atmp[0] ^ btmp[0]) & _SIGNBIT64)) { 2594 res[0] = (atmp[0] >> 63) ^ ~_SIGNBIT64; 2595 } 2596 if (((res[1] ^ atmp[1]) & _SIGNBIT64) && ((atmp[1] ^ btmp[1]) & _SIGNBIT64)) { 2597 res[1] = (atmp[1] >> 63) ^ ~_SIGNBIT64; 2598 } 2599 return _mm_load_si128((__m128i*)res); 2600 } 2601 2602 uint8x16_t vqsubq_u8(uint8x16_t a, uint8x16_t b); // VQSUB.U8 q0,q0,q0 2603 #define vqsubq_u8 _mm_subs_epu8 2604 2605 uint16x8_t vqsubq_u16(uint16x8_t a, uint16x8_t b); // VQSUB.s16 q0,q0,q0 2606 #define vqsubq_u16 _mm_subs_epu16 2607 2608 uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b); // VQSUB.U32 q0,q0,q0 2609 _NEON2SSE_INLINE uint32x4_t vqsubq_u32(uint32x4_t a, uint32x4_t b) // VQSUB.U32 q0,q0,q0 2610 { 2611 __m128i min, mask, sub; 2612 min = _MM_MIN_EPU32(a, b); //SSE4.1 2613 mask = _mm_cmpeq_epi32 (min, b); 2614 sub = _mm_sub_epi32 (a, b); 2615 return _mm_and_si128 ( sub, mask); 2616 } 2617 2618 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL); // VQSUB.U64 q0,q0,q0 2619 #ifdef USE_SSE4 2620 _NEON2SSE_INLINE uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b) 2621 { 2622 __m128i c80000000, subb, suba, cmp, sub; 2623 c80000000 = _mm_set_epi32 (0x80000000, 0x0, 0x80000000, 0x0); 2624 sub = _mm_sub_epi64 (a, b); 2625 suba = _mm_sub_epi64 (a, c80000000); 2626 subb = _mm_sub_epi64 (b, c80000000); 2627 cmp = _mm_cmpgt_epi64 ( suba, subb); //no unsigned comparison, need to go to signed, SSE4.2!!! 2628 return _mm_and_si128 (sub, cmp); //saturation 2629 } 2630 #else 2631 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqsubq_u64(uint64x2_t a, uint64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 2632 { 2633 _NEON2SSE_ALIGN_16 uint64_t atmp[2], btmp[2], res[2]; 2634 _mm_store_si128((__m128i*)atmp, a); 2635 _mm_store_si128((__m128i*)btmp, b); 2636 res[0] = (atmp[0] > btmp[0]) ? atmp[0] - btmp[0] : 0; 2637 res[1] = (atmp[1] > btmp[1]) ? 
atmp[1] - btmp[1] : 0; 2638 return _mm_load_si128((__m128i*)(res)); 2639 } 2640 #endif 2641 2642 //**********Vector halving subtract Vr[i]:=(Va[i]-Vb[i])>>1 ****************************************************** 2643 //**************************************************************** 2644 2645 int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b); // VHSUB.S8 q0,q0,q0 2646 _NEON2SSE_INLINE int8x16_t vhsubq_s8(int8x16_t a, int8x16_t b) // VHSUB.S8 q0,q0,q0 2647 { // //need to deal with the possibility of internal overflow 2648 __m128i c128, au,bu; 2649 c128 = _mm_set1_epi8 (128); 2650 au = _mm_add_epi8( a, c128); 2651 bu = _mm_add_epi8( b, c128); 2652 return vhsubq_u8(au,bu); 2653 } 2654 2655 int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b); // VHSUB.S16 q0,q0,q0 2656 _NEON2SSE_INLINE int16x8_t vhsubq_s16(int16x8_t a, int16x8_t b) // VHSUB.S16 q0,q0,q0 2657 { //need to deal with the possibility of internal overflow 2658 __m128i c8000, au,bu; 2659 c8000 = _mm_set1_epi16(0x8000); 2660 au = _mm_add_epi16( a, c8000); 2661 bu = _mm_add_epi16( b, c8000); 2662 return vhsubq_u16(au,bu); 2663 } 2664 2665 int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b); // VHSUB.S32 q0,q0,q0 2666 _NEON2SSE_INLINE int32x4_t vhsubq_s32(int32x4_t a, int32x4_t b) // VHSUB.S32 q0,q0,q0 2667 {//need to deal with the possibility of internal overflow 2668 __m128i a2, b2,r, b_1; 2669 a2 = _mm_srai_epi32 (a,1); 2670 b2 = _mm_srai_epi32 (b,1); 2671 r = _mm_sub_epi32 (a2, b2); 2672 b_1 = _mm_andnot_si128(a, b); //!a and b 2673 b_1 = _mm_slli_epi32 (b_1,31); 2674 b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit 2675 return _mm_sub_epi32(r,b_1); 2676 } 2677 2678 uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b); // VHSUB.U8 q0,q0,q0 2679 _NEON2SSE_INLINE uint8x16_t vhsubq_u8(uint8x16_t a, uint8x16_t b) // VHSUB.U8 q0,q0,q0 2680 { 2681 __m128i avg; 2682 avg = _mm_avg_epu8 (a, b); 2683 return _mm_sub_epi8(a, avg); 2684 } 2685 2686 uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b); // VHSUB.s16 q0,q0,q0 2687 _NEON2SSE_INLINE uint16x8_t vhsubq_u16(uint16x8_t a, uint16x8_t b) // VHSUB.s16 q0,q0,q0 2688 { 2689 __m128i avg; 2690 avg = _mm_avg_epu16 (a, b); 2691 return _mm_sub_epi16(a, avg); 2692 } 2693 2694 uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b); // VHSUB.U32 q0,q0,q0 2695 _NEON2SSE_INLINE uint32x4_t vhsubq_u32(uint32x4_t a, uint32x4_t b) // VHSUB.U32 q0,q0,q0 2696 {//need to deal with the possibility of internal overflow 2697 __m128i a2, b2,r, b_1; 2698 a2 = _mm_srli_epi32 (a,1); 2699 b2 = _mm_srli_epi32 (b,1); 2700 r = _mm_sub_epi32 (a2, b2); 2701 b_1 = _mm_andnot_si128(a, b); //!a and b 2702 b_1 = _mm_slli_epi32 (b_1,31); 2703 b_1 = _mm_srli_epi32 (b_1,31); //0 or 1, last b bit 2704 return _mm_sub_epi32(r,b_1); 2705 } 2706 2707 //******* Vector subtract high half (truncated) ** ************ 2708 //************************************************************ 2709 2710 //************ Vector rounding subtract high half ********************* 2711 //********************************************************************* 2712 2713 //*********** Vector saturating doubling multiply subtract long ******************** 2714 //************************************************************************************ 2715 2716 //****************** COMPARISON *************************************** 2717 //******************* Vector compare equal ************************************* 2718 //**************************************************************************** 2719 2720 uint8x16_t vceqq_s8(int8x16_t a, int8x16_t b); // VCEQ.I8 q0, q0, q0 2721 #define 
vceqq_s8 _mm_cmpeq_epi8 2722 2723 uint16x8_t vceqq_s16(int16x8_t a, int16x8_t b); // VCEQ.I16 q0, q0, q0 2724 #define vceqq_s16 _mm_cmpeq_epi16 2725 2726 uint32x4_t vceqq_s32(int32x4_t a, int32x4_t b); // VCEQ.I32 q0, q0, q0 2727 #define vceqq_s32 _mm_cmpeq_epi32 2728 2729 uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b); // VCEQ.F32 q0, q0, q0 2730 _NEON2SSE_INLINE uint32x4_t vceqq_f32(float32x4_t a, float32x4_t b) 2731 { 2732 __m128 res; 2733 res = _mm_cmpeq_ps(a,b); 2734 return *(__m128i*)&res; 2735 } 2736 2737 uint8x16_t vceqq_u8(uint8x16_t a, uint8x16_t b); // VCEQ.I8 q0, q0, q0 2738 #define vceqq_u8 _mm_cmpeq_epi8 2739 2740 uint16x8_t vceqq_u16(uint16x8_t a, uint16x8_t b); // VCEQ.I16 q0, q0, q0 2741 #define vceqq_u16 _mm_cmpeq_epi16 2742 2743 uint32x4_t vceqq_u32(uint32x4_t a, uint32x4_t b); // VCEQ.I32 q0, q0, q0 2744 #define vceqq_u32 _mm_cmpeq_epi32 2745 2746 uint8x16_t vceqq_p8(poly8x16_t a, poly8x16_t b); // VCEQ.I8 q0, q0, q0 2747 #define vceqq_p8 _mm_cmpeq_epi8 2748 2749 //******************Vector compare greater-than or equal************************* 2750 //******************************************************************************* 2751 //in IA SIMD no greater-than-or-equal comparison for integers, 2752 // there is greater-than available only, so we need the following tricks 2753 2754 uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 2755 _NEON2SSE_INLINE uint8x16_t vcgeq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 2756 { 2757 __m128i m1, m2; 2758 m1 = _mm_cmpgt_epi8 ( a, b); 2759 m2 = _mm_cmpeq_epi8 ( a, b); 2760 return _mm_or_si128 ( m1, m2); 2761 } 2762 2763 uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 2764 _NEON2SSE_INLINE uint16x8_t vcgeq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 2765 { 2766 __m128i m1, m2; 2767 m1 = _mm_cmpgt_epi16 ( a, b); 2768 m2 = _mm_cmpeq_epi16 ( a, b); 2769 return _mm_or_si128 ( m1,m2); 2770 } 2771 2772 uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 2773 _NEON2SSE_INLINE uint32x4_t vcgeq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 2774 { 2775 __m128i m1, m2; 2776 m1 = _mm_cmpgt_epi32 (a, b); 2777 m2 = _mm_cmpeq_epi32 (a, b); 2778 return _mm_or_si128 (m1, m2); 2779 } 2780 2781 uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 2782 _NEON2SSE_INLINE uint32x4_t vcgeq_f32(float32x4_t a, float32x4_t b) 2783 { 2784 __m128 res; 2785 res = _mm_cmpge_ps(a,b); //use only 2 first entries 2786 return *(__m128i*)&res; 2787 } 2788 2789 uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 2790 _NEON2SSE_INLINE uint8x16_t vcgeq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 2791 { //no unsigned chars comparison, only signed available,so need the trick 2792 #ifdef USE_SSE4 2793 __m128i cmp; 2794 cmp = _mm_max_epu8(a, b); 2795 return _mm_cmpeq_epi8(cmp, a); //a>=b 2796 #else 2797 __m128i c128, as, bs, m1, m2; 2798 c128 = _mm_set1_epi8 (128); 2799 as = _mm_sub_epi8( a, c128); 2800 bs = _mm_sub_epi8( b, c128); 2801 m1 = _mm_cmpgt_epi8( as, bs); 2802 m2 = _mm_cmpeq_epi8 (as, bs); 2803 return _mm_or_si128 ( m1, m2); 2804 #endif 2805 } 2806 2807 uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 2808 _NEON2SSE_INLINE uint16x8_t vcgeq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 2809 { //no unsigned shorts comparison, only signed available,so need the trick 2810 #ifdef USE_SSE4 2811 __m128i cmp; 2812 cmp = _mm_max_epu16(a, b); 2813 return _mm_cmpeq_epi16(cmp, a); //a>=b 2814 #else 2815 
__m128i c8000, as, bs, m1, m2; 2816 c8000 = _mm_set1_epi16 (0x8000); 2817 as = _mm_sub_epi16(a,c8000); 2818 bs = _mm_sub_epi16(b,c8000); 2819 m1 = _mm_cmpgt_epi16(as, bs); 2820 m2 = _mm_cmpeq_epi16 (as, bs); 2821 return _mm_or_si128 ( m1, m2); 2822 #endif 2823 } 2824 2825 uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 2826 _NEON2SSE_INLINE uint32x4_t vcgeq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 2827 { //no unsigned ints comparison, only signed available,so need the trick 2828 #ifdef USE_SSE4 2829 __m128i cmp; 2830 cmp = _mm_max_epu32(a, b); 2831 return _mm_cmpeq_epi32(cmp, a); //a>=b 2832 #else 2833 //serial solution may be faster 2834 __m128i c80000000, as, bs, m1, m2; 2835 c80000000 = _mm_set1_epi32 (0x80000000); 2836 as = _mm_sub_epi32(a,c80000000); 2837 bs = _mm_sub_epi32(b,c80000000); 2838 m1 = _mm_cmpgt_epi32 (as, bs); 2839 m2 = _mm_cmpeq_epi32 (as, bs); 2840 return _mm_or_si128 ( m1, m2); 2841 #endif 2842 } 2843 2844 //**********************Vector compare less-than or equal****************************** 2845 //*************************************************************************************** 2846 //in IA SIMD no less-than-or-equal comparison for integers present, so we need the tricks 2847 2848 uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b); // VCGE.S8 q0, q0, q0 2849 _NEON2SSE_INLINE uint8x16_t vcleq_s8(int8x16_t a, int8x16_t b) // VCGE.S8 q0, q0, q0 2850 { 2851 __m128i c1, res; 2852 c1 = _mm_cmpeq_epi8 (a,a); //all ones 0xff.... 2853 res = _mm_cmpgt_epi8 ( a, b); 2854 return _mm_andnot_si128 (res, c1); //inverse the cmpgt result, get less-than-or-equal 2855 } 2856 2857 uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b); // VCGE.S16 q0, q0, q0 2858 _NEON2SSE_INLINE uint16x8_t vcleq_s16(int16x8_t a, int16x8_t b) // VCGE.S16 q0, q0, q0 2859 { 2860 __m128i c1, res; 2861 c1 = _mm_cmpeq_epi16 (a,a); //all ones 0xff.... 2862 res = _mm_cmpgt_epi16 ( a, b); 2863 return _mm_andnot_si128 (res, c1); 2864 } 2865 2866 uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b); // VCGE.S32 q0, q0, q0 2867 _NEON2SSE_INLINE uint32x4_t vcleq_s32(int32x4_t a, int32x4_t b) // VCGE.S32 q0, q0, q0 2868 { 2869 __m128i c1, res; 2870 c1 = _mm_cmpeq_epi32 (a,a); //all ones 0xff.... 
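// Note added for clarity: a <= b is computed as NOT(a > b). _mm_cmpgt_epi32 produces the a > b mask and
// _mm_andnot_si128(res, c1) == (~res) & 0xff..ff inverts it, giving all ones exactly where a <= b,
// which matches the NEON all-ones / all-zeros mask convention.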
2871 res = _mm_cmpgt_epi32 ( a, b); 2872 return _mm_andnot_si128 (res, c1); 2873 } 2874 2875 uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b); // VCGE.F32 q0, q0, q0 2876 _NEON2SSE_INLINE uint32x4_t vcleq_f32(float32x4_t a, float32x4_t b) 2877 { 2878 __m128 res; 2879 res = _mm_cmple_ps(a,b); 2880 return *(__m128i*)&res; 2881 } 2882 2883 uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b); // VCGE.U8 q0, q0, q0 2884 #ifdef USE_SSE4 2885 _NEON2SSE_INLINE uint8x16_t vcleq_u8(uint8x16_t a, uint8x16_t b) // VCGE.U8 q0, q0, q0 2886 { //no unsigned chars comparison in SSE, only signed available,so need the trick 2887 2888 __m128i cmp; 2889 cmp = _mm_min_epu8(a, b); 2890 return _mm_cmpeq_epi8(cmp, a); //a<=b 2891 } 2892 #else 2893 #define vcleq_u8(a,b) vcgeq_u8(b,a) 2894 #endif 2895 2896 uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b); // VCGE.s16 q0, q0, q0 2897 #ifdef USE_SSE4 2898 _NEON2SSE_INLINE uint16x8_t vcleq_u16(uint16x8_t a, uint16x8_t b) // VCGE.s16 q0, q0, q0 2899 { //no unsigned shorts comparison in SSE, only signed available,so need the trick 2900 __m128i cmp; 2901 cmp = _mm_min_epu16(a, b); 2902 return _mm_cmpeq_epi16(cmp, a); //a<=b 2903 } 2904 #else 2905 #define vcleq_u16(a,b) vcgeq_u16(b,a) 2906 #endif 2907 2908 uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b); // VCGE.U32 q0, q0, q0 2909 #ifdef USE_SSE4 2910 _NEON2SSE_INLINE uint32x4_t vcleq_u32(uint32x4_t a, uint32x4_t b) // VCGE.U32 q0, q0, q0 2911 { //no unsigned chars comparison in SSE, only signed available,so need the trick 2912 __m128i cmp; 2913 cmp = _mm_min_epu32(a, b); 2914 return _mm_cmpeq_epi32(cmp, a); //a<=b 2915 } 2916 #else 2917 //solution may be not optimal compared with the serial one 2918 #define vcleq_u32(a,b) vcgeq_u32(b,a) 2919 #endif 2920 2921 //****** Vector compare greater-than ****************************************** 2922 //************************************************************************** 2923 2924 uint8x16_t vcgtq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 2925 #define vcgtq_s8 _mm_cmpgt_epi8 2926 2927 uint16x8_t vcgtq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 2928 #define vcgtq_s16 _mm_cmpgt_epi16 2929 2930 uint32x4_t vcgtq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 2931 #define vcgtq_s32 _mm_cmpgt_epi32 2932 2933 uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 2934 _NEON2SSE_INLINE uint32x4_t vcgtq_f32(float32x4_t a, float32x4_t b) 2935 { 2936 __m128 res; 2937 res = _mm_cmpgt_ps(a,b); //use only 2 first entries 2938 return *(__m128i*)&res; 2939 } 2940 2941 uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 2942 _NEON2SSE_INLINE uint8x16_t vcgtq_u8(uint8x16_t a, uint8x16_t b) // VCGT.U8 q0, q0, q0 2943 { //no unsigned chars comparison, only signed available,so need the trick 2944 __m128i c128, as, bs; 2945 c128 = _mm_set1_epi8 (128); 2946 as = _mm_sub_epi8(a,c128); 2947 bs = _mm_sub_epi8(b,c128); 2948 return _mm_cmpgt_epi8 (as, bs); 2949 } 2950 2951 uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 2952 _NEON2SSE_INLINE uint16x8_t vcgtq_u16(uint16x8_t a, uint16x8_t b) // VCGT.s16 q0, q0, q0 2953 { //no unsigned short comparison, only signed available,so need the trick 2954 __m128i c8000, as, bs; 2955 c8000 = _mm_set1_epi16 (0x8000); 2956 as = _mm_sub_epi16(a,c8000); 2957 bs = _mm_sub_epi16(b,c8000); 2958 return _mm_cmpgt_epi16 ( as, bs); 2959 } 2960 2961 uint32x4_t vcgtq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 2962 _NEON2SSE_INLINE uint32x4_t vcgtq_u32(uint32x4_t a, 
uint32x4_t b) // VCGT.U32 q0, q0, q0 2963 { //no unsigned int comparison, only signed available,so need the trick 2964 __m128i c80000000, as, bs; 2965 c80000000 = _mm_set1_epi32 (0x80000000); 2966 as = _mm_sub_epi32(a,c80000000); 2967 bs = _mm_sub_epi32(b,c80000000); 2968 return _mm_cmpgt_epi32 ( as, bs); 2969 } 2970 2971 //********************* Vector compare less-than ************************** 2972 //************************************************************************* 2973 2974 uint8x16_t vcltq_s8(int8x16_t a, int8x16_t b); // VCGT.S8 q0, q0, q0 2975 #define vcltq_s8(a,b) vcgtq_s8(b, a) //swap the arguments!! 2976 2977 uint16x8_t vcltq_s16(int16x8_t a, int16x8_t b); // VCGT.S16 q0, q0, q0 2978 #define vcltq_s16(a,b) vcgtq_s16(b, a) //swap the arguments!! 2979 2980 uint32x4_t vcltq_s32(int32x4_t a, int32x4_t b); // VCGT.S32 q0, q0, q0 2981 #define vcltq_s32(a,b) vcgtq_s32(b, a) //swap the arguments!! 2982 2983 uint32x4_t vcltq_f32(float32x4_t a, float32x4_t b); // VCGT.F32 q0, q0, q0 2984 #define vcltq_f32(a,b) vcgtq_f32(b, a) //swap the arguments!! 2985 2986 uint8x16_t vcltq_u8(uint8x16_t a, uint8x16_t b); // VCGT.U8 q0, q0, q0 2987 #define vcltq_u8(a,b) vcgtq_u8(b, a) //swap the arguments!! 2988 2989 uint16x8_t vcltq_u16(uint16x8_t a, uint16x8_t b); // VCGT.s16 q0, q0, q0 2990 #define vcltq_u16(a,b) vcgtq_u16(b, a) //swap the arguments!! 2991 2992 uint32x4_t vcltq_u32(uint32x4_t a, uint32x4_t b); // VCGT.U32 q0, q0, q0 2993 #define vcltq_u32(a,b) vcgtq_u32(b, a) //swap the arguments!! 2994 2995 //*****************Vector compare absolute greater-than or equal ************ 2996 //*************************************************************************** 2997 2998 uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 2999 _NEON2SSE_INLINE uint32x4_t vcageq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 3000 { 3001 __m128i c7fffffff; 3002 __m128 a0, b0; 3003 c7fffffff = _mm_set1_epi32 (0x7fffffff); 3004 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 3005 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 3006 a0 = _mm_cmpge_ps ( a0, b0); 3007 return (*(__m128i*)&a0); 3008 } 3009 3010 //********Vector compare absolute less-than or equal ****************** 3011 //******************************************************************** 3012 3013 uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b); // VACGE.F32 q0, q0, q0 3014 _NEON2SSE_INLINE uint32x4_t vcaleq_f32(float32x4_t a, float32x4_t b) // VACGE.F32 q0, q0, q0 3015 { 3016 __m128i c7fffffff; 3017 __m128 a0, b0; 3018 c7fffffff = _mm_set1_epi32 (0x7fffffff); 3019 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 3020 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 3021 a0 = _mm_cmple_ps (a0, b0); 3022 return (*(__m128i*)&a0); 3023 } 3024 3025 //******** Vector compare absolute greater-than ****************** 3026 //****************************************************************** 3027 3028 uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b); // VACGT.F32 q0, q0, q0 3029 _NEON2SSE_INLINE uint32x4_t vcagtq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 3030 { 3031 __m128i c7fffffff; 3032 __m128 a0, b0; 3033 c7fffffff = _mm_set1_epi32 (0x7fffffff); 3034 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 3035 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 3036 a0 = _mm_cmpgt_ps (a0, b0); 3037 return (*(__m128i*)&a0); 3038 } 3039 3040 //***************Vector compare absolute less-than *********************** 3041 //************************************************************************* 3042 3043 uint32x4_t vcaltq_f32(float32x4_t 
a, float32x4_t b); // VACGT.F32 q0, q0, q0 3044 _NEON2SSE_INLINE uint32x4_t vcaltq_f32(float32x4_t a, float32x4_t b) // VACGT.F32 q0, q0, q0 3045 { 3046 __m128i c7fffffff; 3047 __m128 a0, b0; 3048 c7fffffff = _mm_set1_epi32 (0x7fffffff); 3049 a0 = _mm_and_ps (a, *(__m128*)&c7fffffff); 3050 b0 = _mm_and_ps (b, *(__m128*)&c7fffffff); 3051 a0 = _mm_cmplt_ps (a0, b0); 3052 return (*(__m128i*)&a0); 3053 3054 } 3055 3056 //*************************Vector test bits************************************ 3057 //***************************************************************************** 3058 /*VTST (Vector Test Bits) takes each element in a vector, and bitwise logical ANDs them 3059 with the corresponding element of a second vector. If the result is not zero, the 3060 corresponding element in the destination vector is set to all ones. Otherwise, it is set to 3061 all zeros. */ 3062 3063 uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b); // VTST.8 q0, q0, q0 3064 _NEON2SSE_INLINE uint8x16_t vtstq_s8(int8x16_t a, int8x16_t b) // VTST.8 q0, q0, q0 3065 { 3066 __m128i zero, one, res; 3067 zero = _mm_setzero_si128 (); 3068 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff 3069 res = _mm_and_si128 (a, b); 3070 res = _mm_cmpeq_epi8 (res, zero); 3071 return _mm_xor_si128(res, one); //invert result 3072 } 3073 3074 uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b); // VTST.16 q0, q0, q0 3075 _NEON2SSE_INLINE uint16x8_t vtstq_s16(int16x8_t a, int16x8_t b) // VTST.16 q0, q0, q0 3076 { 3077 __m128i zero, one, res; 3078 zero = _mm_setzero_si128 (); 3079 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff 3080 res = _mm_and_si128 (a, b); 3081 res = _mm_cmpeq_epi16 (res, zero); 3082 return _mm_xor_si128(res, one); //invert result 3083 } 3084 3085 uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b); // VTST.32 q0, q0, q0 3086 _NEON2SSE_INLINE uint32x4_t vtstq_s32(int32x4_t a, int32x4_t b) // VTST.32 q0, q0, q0 3087 { 3088 __m128i zero, one, res; 3089 zero = _mm_setzero_si128 (); 3090 one = _mm_cmpeq_epi8(zero,zero); //0xfff..ffff 3091 res = _mm_and_si128 (a, b); 3092 res = _mm_cmpeq_epi32 (res, zero); 3093 return _mm_xor_si128(res, one); //invert result 3094 } 3095 3096 uint8x16_t vtstq_u8(uint8x16_t a, uint8x16_t b); // VTST.8 q0, q0, q0 3097 #define vtstq_u8 vtstq_s8 3098 3099 uint16x8_t vtstq_u16(uint16x8_t a, uint16x8_t b); // VTST.16 q0, q0, q0 3100 #define vtstq_u16 vtstq_s16 3101 3102 uint32x4_t vtstq_u32(uint32x4_t a, uint32x4_t b); // VTST.32 q0, q0, q0 3103 #define vtstq_u32 vtstq_s32 3104 3105 uint8x16_t vtstq_p8(poly8x16_t a, poly8x16_t b); // VTST.8 q0, q0, q0 3106 #define vtstq_p8 vtstq_u8 3107 3108 //****************** Absolute difference ******************** 3109 //*** Absolute difference between the arguments: Vr[i] = | Va[i] - Vb[i] |***** 3110 //************************************************************ 3111 #if defined(USE_SSSE3) 3112 3113 #endif 3114 3115 #if defined(USE_SSSE3) 3116 int8x16_t vabdq_s8(int8x16_t a, int8x16_t b); // VABD.S8 q0,q0,q0 3117 _NEON2SSE_INLINE int8x16_t vabdq_s8(int8x16_t a, int8x16_t b) // VABD.S8 q0,q0,q0 3118 { 3119 __m128i res; 3120 res = _mm_sub_epi8 (a, b); 3121 return _mm_abs_epi8 (res); 3122 } 3123 #endif 3124 3125 #if defined(USE_SSSE3) 3126 int16x8_t vabdq_s16(int16x8_t a, int16x8_t b); // VABD.S16 q0,q0,q0 3127 _NEON2SSE_INLINE int16x8_t vabdq_s16(int16x8_t a, int16x8_t b) // VABD.S16 q0,q0,q0 3128 { 3129 __m128i res; 3130 res = _mm_sub_epi16 (a,b); 3131 return _mm_abs_epi16 (res); 3132 } 3133 #endif 3134 3135 #if defined(USE_SSSE3) 3136 int32x4_t vabdq_s32(int32x4_t a, 
int32x4_t b); // VABD.S32 q0,q0,q0 3137 _NEON2SSE_INLINE int32x4_t vabdq_s32(int32x4_t a, int32x4_t b) // VABD.S32 q0,q0,q0 3138 { 3139 __m128i res; 3140 res = _mm_sub_epi32 (a,b); 3141 return _mm_abs_epi32 (res); 3142 } 3143 #endif 3144 3145 uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b); // VABD.U8 q0,q0,q0 3146 _NEON2SSE_INLINE uint8x16_t vabdq_u8(uint8x16_t a, uint8x16_t b) //no abs for unsigned 3147 { 3148 __m128i cmp, difab, difba; 3149 cmp = vcgtq_u8(a,b); 3150 difab = _mm_sub_epi8(a,b); 3151 difba = _mm_sub_epi8 (b,a); 3152 difab = _mm_and_si128(cmp, difab); 3153 difba = _mm_andnot_si128(cmp, difba); 3154 return _mm_or_si128(difab, difba); 3155 } 3156 3157 uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b); // VABD.s16 q0,q0,q0 3158 _NEON2SSE_INLINE uint16x8_t vabdq_u16(uint16x8_t a, uint16x8_t b) 3159 { 3160 __m128i cmp, difab, difba; 3161 cmp = vcgtq_u16(a,b); 3162 difab = _mm_sub_epi16(a,b); 3163 difba = _mm_sub_epi16 (b,a); 3164 difab = _mm_and_si128(cmp, difab); 3165 difba = _mm_andnot_si128(cmp, difba); 3166 return _mm_or_si128(difab, difba); 3167 } 3168 3169 uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b); // VABD.U32 q0,q0,q0 3170 _NEON2SSE_INLINE uint32x4_t vabdq_u32(uint32x4_t a, uint32x4_t b) 3171 { 3172 __m128i cmp, difab, difba; 3173 cmp = vcgtq_u32(a,b); 3174 difab = _mm_sub_epi32(a,b); 3175 difba = _mm_sub_epi32 (b,a); 3176 difab = _mm_and_si128(cmp, difab); 3177 difba = _mm_andnot_si128(cmp, difba); 3178 return _mm_or_si128(difab, difba); 3179 } 3180 3181 float32x4_t vabdq_f32(float32x4_t a, float32x4_t b); // VABD.F32 q0,q0,q0 3182 _NEON2SSE_INLINE float32x4_t vabdq_f32(float32x4_t a, float32x4_t b) // VABD.F32 q0,q0,q0 3183 { 3184 __m128i c1; 3185 __m128 res; 3186 c1 = _mm_set1_epi32(0x7fffffff); 3187 res = _mm_sub_ps (a, b); 3188 return _mm_and_ps (res, *(__m128*)&c1); 3189 } 3190 3191 //************ Absolute difference - long ************************** 3192 //******************************************************************** 3193 3194 //**********Absolute difference and accumulate: Vr[i] = Va[i] + | Vb[i] - Vc[i] | ************* 3195 //********************************************************************************************* 3196 3197 #if defined(USE_SSSE3) 3198 int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c); // VABA.S8 q0,q0,q0 3199 _NEON2SSE_INLINE int8x16_t vabaq_s8(int8x16_t a, int8x16_t b, int8x16_t c) // VABA.S8 q0,q0,q0 3200 { 3201 int8x16_t sub; 3202 sub = vabdq_s8(b, c); 3203 return vaddq_s8( a, sub); 3204 } 3205 #endif 3206 3207 #if defined(USE_SSSE3) 3208 int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c); // VABA.S16 q0,q0,q0 3209 _NEON2SSE_INLINE int16x8_t vabaq_s16(int16x8_t a, int16x8_t b, int16x8_t c) // VABA.S16 q0,q0,q0 3210 { 3211 int16x8_t sub; 3212 sub = vabdq_s16(b, c); 3213 return vaddq_s16( a, sub); 3214 } 3215 #endif 3216 3217 #if defined(USE_SSSE3) 3218 int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c); // VABA.S32 q0,q0,q0 3219 _NEON2SSE_INLINE int32x4_t vabaq_s32(int32x4_t a, int32x4_t b, int32x4_t c) // VABA.S32 q0,q0,q0 3220 { 3221 int32x4_t sub; 3222 sub = vabdq_s32(b, c); 3223 return vaddq_s32( a, sub); 3224 } 3225 #endif 3226 3227 uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VABA.U8 q0,q0,q0 3228 _NEON2SSE_INLINE uint8x16_t vabaq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) 3229 { 3230 uint8x16_t sub; 3231 sub = vabdq_u8(b, c); 3232 return vaddq_u8( a, sub); 3233 } 3234 3235 uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VABA.s16 q0,q0,q0 3236 
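// Note added for clarity: NEON VABA accumulates with ordinary modulo (non-saturating) addition, so composing
// vabdq_uXX with vaddq_uXX as done below matches the NEON semantics; no extra saturation step is needed.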
_NEON2SSE_INLINE uint16x8_t vabaq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) 3237 { 3238 uint16x8_t sub; 3239 sub = vabdq_u16(b, c); 3240 return vaddq_u16( a, sub); 3241 } 3242 3243 uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VABA.U32 q0,q0,q0 3244 _NEON2SSE_INLINE uint32x4_t vabaq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) 3245 { 3246 uint32x4_t sub; 3247 sub = vabdq_u32(b, c); 3248 return vaddq_u32( a, sub); 3249 } 3250 3251 //************** Absolute difference and accumulate - long ******************************** 3252 //************************************************************************************* 3253 3254 //*********************************************************************************** 3255 //**************** Maximum and minimum operations ********************************** 3256 //*********************************************************************************** 3257 //************* Maximum: vmax -> Vr[i] := (Va[i] >= Vb[i]) ? Va[i] : Vb[i] ******* 3258 //*********************************************************************************** 3259 3260 int8x16_t vmaxq_s8(int8x16_t a, int8x16_t b); // VMAX.S8 q0,q0,q0 3261 #define vmaxq_s8 _MM_MAX_EPI8 //SSE4.1 3262 3263 int16x8_t vmaxq_s16(int16x8_t a, int16x8_t b); // VMAX.S16 q0,q0,q0 3264 #define vmaxq_s16 _mm_max_epi16 3265 3266 int32x4_t vmaxq_s32(int32x4_t a, int32x4_t b); // VMAX.S32 q0,q0,q0 3267 #define vmaxq_s32 _MM_MAX_EPI32 //SSE4.1 3268 3269 uint8x16_t vmaxq_u8(uint8x16_t a, uint8x16_t b); // VMAX.U8 q0,q0,q0 3270 #define vmaxq_u8 _mm_max_epu8 3271 3272 uint16x8_t vmaxq_u16(uint16x8_t a, uint16x8_t b); // VMAX.s16 q0,q0,q0 3273 #define vmaxq_u16 _MM_MAX_EPU16 //SSE4.1 3274 3275 uint32x4_t vmaxq_u32(uint32x4_t a, uint32x4_t b); // VMAX.U32 q0,q0,q0 3276 #define vmaxq_u32 _MM_MAX_EPU32 //SSE4.1 3277 3278 float32x4_t vmaxq_f32(float32x4_t a, float32x4_t b); // VMAX.F32 q0,q0,q0 3279 #define vmaxq_f32 _mm_max_ps 3280 3281 //*************** Minimum: vmin -> Vr[i] := (Va[i] >= Vb[i]) ? Vb[i] : Va[i] ******************************** 3282 //*********************************************************************************************************** 3283 3284 int8x16_t vminq_s8(int8x16_t a, int8x16_t b); // VMIN.S8 q0,q0,q0 3285 #define vminq_s8 _MM_MIN_EPI8 //SSE4.1 3286 3287 int16x8_t vminq_s16(int16x8_t a, int16x8_t b); // VMIN.S16 q0,q0,q0 3288 #define vminq_s16 _mm_min_epi16 3289 3290 int32x4_t vminq_s32(int32x4_t a, int32x4_t b); // VMIN.S32 q0,q0,q0 3291 #define vminq_s32 _MM_MIN_EPI32 //SSE4.1 3292 3293 uint8x16_t vminq_u8(uint8x16_t a, uint8x16_t b); // VMIN.U8 q0,q0,q0 3294 #define vminq_u8 _mm_min_epu8 3295 3296 uint16x8_t vminq_u16(uint16x8_t a, uint16x8_t b); // VMIN.s16 q0,q0,q0 3297 #define vminq_u16 _MM_MIN_EPU16 //SSE4.1 3298 3299 uint32x4_t vminq_u32(uint32x4_t a, uint32x4_t b); // VMIN.U32 q0,q0,q0 3300 #define vminq_u32 _MM_MIN_EPU32 //SSE4.1 3301 3302 float32x4_t vminq_f32(float32x4_t a, float32x4_t b); // VMIN.F32 q0,q0,q0 3303 #define vminq_f32 _mm_min_ps 3304 3305 //************* Pairwise addition operations. 
************************************** 3306 //************************************************************************************ 3307 //Pairwise add - adds adjacent pairs of elements of two vectors, and places the results in the destination vector 3308 3309 //************************** Long pairwise add ********************************** 3310 //********************************************************************************* 3311 //Adds adjacent pairs of elements of a vector,sign or zero extends the results to twice their original width, 3312 // and places the final results in the destination vector. 3313 3314 #if defined(USE_SSSE3) 3315 int16x8_t vpaddlq_s8(int8x16_t a); // VPADDL.S8 q0,q0 3316 _NEON2SSE_INLINE int16x8_t vpaddlq_s8(int8x16_t a) // VPADDL.S8 q0,q0 3317 { //no 8 bit hadd in IA32, need to go to 16 bit 3318 __m128i r16_1, r16_2; 3319 r16_1 = _MM_CVTEPI8_EPI16 (a); // SSE 4.1 3320 //swap hi and low part of r to process the remaining data 3321 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3322 r16_2 = _MM_CVTEPI8_EPI16 (r16_2); 3323 return _mm_hadd_epi16 (r16_1, r16_2); 3324 } 3325 #endif 3326 3327 #if defined(USE_SSSE3) 3328 int32x4_t vpaddlq_s16(int16x8_t a); // VPADDL.S16 q0,q0 3329 _NEON2SSE_INLINE int32x4_t vpaddlq_s16(int16x8_t a) // VPADDL.S16 q0,q0 3330 { //no 8 bit hadd in IA32, need to go to 16 bit 3331 __m128i r32_1, r32_2; 3332 r32_1 = _MM_CVTEPI16_EPI32(a); 3333 //swap hi and low part of r to process the remaining data 3334 r32_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3335 r32_2 = _MM_CVTEPI16_EPI32 (r32_2); 3336 return _mm_hadd_epi32 (r32_1, r32_2); 3337 } 3338 #endif 3339 3340 int64x2_t vpaddlq_s32(int32x4_t a); // VPADDL.S32 q0,q0 3341 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vpaddlq_s32(int32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) // VPADDL.S32 q0,q0 3342 { 3343 _NEON2SSE_ALIGN_16 int32_t atmp[4]; 3344 _NEON2SSE_ALIGN_16 int64_t res[2]; 3345 _mm_store_si128((__m128i*)atmp, a); 3346 res[0] = (int64_t)atmp[0] + (int64_t)atmp[1]; 3347 res[1] = (int64_t)atmp[2] + (int64_t)atmp[3]; 3348 return _mm_load_si128((__m128i*)res); 3349 } 3350 3351 #if defined(USE_SSSE3) 3352 uint16x8_t vpaddlq_u8(uint8x16_t a); // VPADDL.U8 q0,q0 3353 _NEON2SSE_INLINE uint16x8_t vpaddlq_u8(uint8x16_t a) // VPADDL.U8 q0,q0 3354 { //no 8 bit hadd in IA32, need to go to 16 bit 3355 __m128i r16_1, r16_2; 3356 r16_1 = _MM_CVTEPU8_EPI16(a); 3357 //swap hi and low part of r to process the remaining data 3358 r16_2 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 3359 r16_2 = _MM_CVTEPU8_EPI16 (r16_2); 3360 return _mm_hadd_epi16 (r16_1, r16_2); 3361 } 3362 #endif 3363 3364 uint32x4_t vpaddlq_u16(uint16x8_t a); // VPADDL.s16 q0,q0 3365 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpaddlq_u16(uint16x8_t a), _NEON2SSE_REASON_SLOW_SERIAL) 3366 { //serial solution looks faster than a SIMD one 3367 _NEON2SSE_ALIGN_16 uint16_t atmp[8]; 3368 _NEON2SSE_ALIGN_16 uint32_t res[4]; 3369 _mm_store_si128((__m128i*)atmp, a); 3370 res[0] = (uint32_t)atmp[0] + (uint32_t)atmp[1]; 3371 res[1] = (uint32_t)atmp[2] + (uint32_t)atmp[3]; 3372 res[2] = (uint32_t)atmp[4] + (uint32_t)atmp[5]; 3373 res[3] = (uint32_t)atmp[6] + (uint32_t)atmp[7]; 3374 return _mm_load_si128((__m128i*)res); 3375 } 3376 3377 uint64x2_t vpaddlq_u32(uint32x4_t a); // VPADDL.U32 q0,q0 3378 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpaddlq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL) 3379 { 3380 _NEON2SSE_ALIGN_16 uint32_t atmp[4]; 3381 _NEON2SSE_ALIGN_16 uint64_t res[2]; 3382 
_mm_store_si128((__m128i*)atmp, a); 3383 res[0] = (uint64_t)atmp[0] + (uint64_t)atmp[1]; 3384 res[1] = (uint64_t)atmp[2] + (uint64_t)atmp[3]; 3385 return _mm_load_si128((__m128i*)res); 3386 } 3387 3388 //************************ Long pairwise add and accumulate ************************** 3389 //**************************************************************************************** 3390 //VPADAL (Vector Pairwise Add and Accumulate Long) adds adjacent pairs of elements of a vector, 3391 // and accumulates the values of the results into the elements of the destination (wide) vector 3392 3393 #if defined(USE_SSSE3) 3394 int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b); // VPADAL.S8 q0,q0 3395 _NEON2SSE_INLINE int16x8_t vpadalq_s8(int16x8_t a, int8x16_t b) // VPADAL.S8 q0,q0 3396 { 3397 int16x8_t pad; 3398 pad = vpaddlq_s8(b); 3399 return _mm_add_epi16 (a, pad); 3400 } 3401 #endif 3402 3403 #if defined(USE_SSSE3) 3404 int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b); // VPADAL.S16 q0,q0 3405 _NEON2SSE_INLINE int32x4_t vpadalq_s16(int32x4_t a, int16x8_t b) // VPADAL.S16 q0,q0 3406 { 3407 int32x4_t pad; 3408 pad = vpaddlq_s16(b); 3409 return _mm_add_epi32(a, pad); 3410 } 3411 #endif 3412 3413 int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b); // VPADAL.S32 q0,q0 3414 _NEON2SSE_INLINE int64x2_t vpadalq_s32(int64x2_t a, int32x4_t b) 3415 { 3416 int64x2_t pad; 3417 pad = vpaddlq_s32(b); 3418 return _mm_add_epi64 (a, pad); 3419 } 3420 3421 #if defined(USE_SSSE3) 3422 uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b); // VPADAL.U8 q0,q0 3423 _NEON2SSE_INLINE uint16x8_t vpadalq_u8(uint16x8_t a, uint8x16_t b) // VPADAL.U8 q0,q0 3424 { 3425 uint16x8_t pad; 3426 pad = vpaddlq_u8(b); 3427 return _mm_add_epi16 (a, pad); 3428 } 3429 #endif 3430 3431 uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b); // VPADAL.s16 q0,q0 3432 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vpadalq_u16(uint32x4_t a, uint16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3433 { 3434 uint32x4_t pad; 3435 pad = vpaddlq_u16(b); 3436 return _mm_add_epi32(a, pad); 3437 } //no optimal SIMD solution, serial is faster 3438 3439 uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b); // VPADAL.U32 q0,q0 3440 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vpadalq_u32(uint64x2_t a, uint32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3441 { //no optimal SIMD solution, serial is faster 3442 uint64x2_t pad; 3443 pad = vpaddlq_u32(b); 3444 return _mm_add_epi64(a, pad); 3445 } //no optimal SIMD solution, serial is faster 3446 3447 //********** Folding maximum ************************************* 3448 //******************************************************************* 3449 //VPMAX (Vector Pairwise Maximum) compares adjacent pairs of elements in two vectors, 3450 //and copies the larger of each pair into the corresponding element in the destination 3451 // no corresponding functionality in IA32 SIMD, so we need to do the vertical comparison 3452 3453 // ***************** Folding minimum **************************** 3454 // ************************************************************** 3455 //vpmin -> takes minimum of adjacent pairs 3456 3457 //*************************************************************** 3458 //*********** Reciprocal/Sqrt ************************************ 3459 //*************************************************************** 3460 //****************** Reciprocal estimate ******************************* 3461 3462 //the ARM NEON and x86 SIMD results may be slightly different 3463 3464 float32x4_t vrecpeq_f32(float32x4_t a); // 
VRECPE.F32 q0,q0
//the ARM NEON and x86 SIMD results may be slightly different
#define vrecpeq_f32 _mm_rcp_ps

uint32x4_t vrecpeq_u32(uint32x4_t a); // VRECPE.U32 q0,q0
_NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrecpeq_u32(uint32x4_t a), _NEON2SSE_REASON_SLOW_SERIAL)
{ //no reciprocal for ints in IA32 available, neither for unsigned int to float 4 lanes conversion, so the serial solution looks faster
    _NEON2SSE_ALIGN_16 uint32_t atmp[4], res[4];
    _mm_store_si128((__m128i*)atmp, a);
    res[0] = (atmp[0]) ? 1 / atmp[0] : 0xffffffff;
    res[1] = (atmp[1]) ? 1 / atmp[1] : 0xffffffff;
    res[2] = (atmp[2]) ? 1 / atmp[2] : 0xffffffff;
    res[3] = (atmp[3]) ? 1 / atmp[3] : 0xffffffff;
    return _mm_load_si128((__m128i*)res);
}

//**********Reciprocal square root estimate ****************
//**********************************************************
//no reciprocal square root for ints in IA32 available, neither for unsigned int to float 4 lanes conversion, so a serial solution looks faster

float32x4_t vrsqrteq_f32(float32x4_t a); // VRSQRTE.F32 q0,q0
//the ARM NEON and x86 SIMD results may be slightly different
#define vrsqrteq_f32 _mm_rsqrt_ps

uint32x4_t vrsqrteq_u32(uint32x4_t a); // VRSQRTE.U32 q0,q0
#define vrsqrteq_u32(a) _mm_castps_si128(_mm_rsqrt_ps(_M128(a)) )

//************ Reciprocal estimate/step and 1/sqrt estimate/step ***************************
//******************************************************************************************
//******VRECPS (Vector Reciprocal Step) ***************************************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 2, and places the final results into the elements of the destination vector.

float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b); // VRECPS.F32 q0, q0, q0
_NEON2SSE_INLINE float32x4_t vrecpsq_f32(float32x4_t a, float32x4_t b) // VRECPS.F32 q0, q0, q0
{
    __m128 f2, mul;
    f2 = _mm_set1_ps(2.);
    mul = _mm_mul_ps(a,b);
    return _mm_sub_ps(f2,mul);
}

//*****************VRSQRTS (Vector Reciprocal Square Root Step) *****************************
//multiplies the elements of one vector by the corresponding elements of another vector,
//subtracts each of the results from 3, divides these results by two, and places the final results into the elements of the destination vector.

float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b); // VRSQRTS.F32 q0, q0, q0
_NEON2SSE_INLINE float32x4_t vrsqrtsq_f32(float32x4_t a, float32x4_t b) // VRSQRTS.F32 q0, q0, q0
{
    __m128 f3, f05, mul;
    f3 = _mm_set1_ps(3.);
    f05 = _mm_set1_ps(0.5);
    mul = _mm_mul_ps(a,b);
    f3 = _mm_sub_ps(f3,mul);
    return _mm_mul_ps (f3, f05);
}
//********************************************************************************************
//***************************** Shifts by signed variable ***********************************
//********************************************************************************************
//***** Vector shift left: Vr[i] := Va[i] << Vb[i] (negative values shift right) ***********************
//********************************************************************************************
//No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution
//helper macro.
It matches ARM implementation for big shifts 3525 #define SERIAL_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ 3526 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; int i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ 3527 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 3528 for (i = 0; i<LEN; i++) { \ 3529 if( (btmp[i] >= lanesize)||(btmp[i] <= -lanesize) ) res[i] = 0; \ 3530 else res[i] = (btmp[i] >=0) ? atmp[i] << btmp[i] : atmp[i] >> (-btmp[i]); } \ 3531 return _mm_load_si128((__m128i*)res); 3532 3533 int8x16_t vshlq_s8(int8x16_t a, int8x16_t b); // VSHL.S8 q0,q0,q0 3534 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3535 { 3536 SERIAL_SHIFT(int8_t, int8_t, 16, 16) 3537 } 3538 3539 int16x8_t vshlq_s16(int16x8_t a, int16x8_t b); // VSHL.S16 q0,q0,q0 3540 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3541 { 3542 SERIAL_SHIFT(int16_t, int16_t, 8, 8) 3543 } 3544 3545 int32x4_t vshlq_s32(int32x4_t a, int32x4_t b); // VSHL.S32 q0,q0,q0 3546 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3547 { 3548 SERIAL_SHIFT(int32_t, int32_t, 4, 4) 3549 } 3550 3551 int64x2_t vshlq_s64(int64x2_t a, int64x2_t b); // VSHL.S64 q0,q0,q0 3552 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3553 { 3554 SERIAL_SHIFT(int64_t, int64_t, 2, 2) 3555 } 3556 3557 uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b); // VSHL.U8 q0,q0,q0 3558 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3559 { 3560 SERIAL_SHIFT(uint8_t, int8_t, 16, 16) 3561 } 3562 3563 uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b); // VSHL.s16 q0,q0,q0 3564 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3565 { 3566 SERIAL_SHIFT(uint16_t, int16_t, 8, 8) 3567 } 3568 3569 uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b); // VSHL.U32 q0,q0,q0 3570 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3571 { 3572 SERIAL_SHIFT(uint32_t, int32_t, 4, 4) 3573 } 3574 3575 uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b); // VSHL.U64 q0,q0,q0 3576 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING( uint64x2_t vshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3577 { 3578 SERIAL_SHIFT(uint64_t, int64_t, 2, 2) 3579 } 3580 3581 //*********** Vector saturating shift left: (negative values shift right) ********************** 3582 //******************************************************************************************** 3583 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution 3584 #define SERIAL_SATURATING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ 3585 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ 3586 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ 3587 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 3588 for (i = 0; i<LEN; i++) { \ 3589 if (atmp[i] ==0) res[i] = 0; \ 3590 else{ \ 3591 if(btmp[i] <0) res[i] = atmp[i] >> (-btmp[i]); \ 3592 else{ \ 3593 if (btmp[i]>lanesize_1) { \ 3594 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << 
lanesize_1) - 1; \ 3595 }else{ \ 3596 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ 3597 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ 3598 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 3599 else res[i] = atmp[i] << btmp[i]; }}}} \ 3600 return _mm_load_si128((__m128i*)res); 3601 3602 #define SERIAL_SATURATING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ 3603 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ 3604 TYPE lanesize = (sizeof(TYPE) << 3); \ 3605 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 3606 for (i = 0; i<LEN; i++) { \ 3607 if (atmp[i] ==0) {res[i] = 0; \ 3608 }else{ \ 3609 if(btmp[i] < 0) res[i] = atmp[i] >> (-btmp[i]); \ 3610 else{ \ 3611 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ 3612 else{ \ 3613 limit = (TYPE) 1 << (lanesize - btmp[i]); \ 3614 res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ 3615 return _mm_load_si128((__m128i*)res); 3616 3617 int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b); // VQSHL.S8 q0,q0,q0 3618 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3619 { 3620 SERIAL_SATURATING_SHIFT_SIGNED(int8_t, 16, 16) 3621 } 3622 3623 int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b); // VQSHL.S16 q0,q0,q0 3624 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3625 { 3626 SERIAL_SATURATING_SHIFT_SIGNED(int16_t, 8, 8) 3627 } 3628 3629 int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b); // VQSHL.S32 q0,q0,q0 3630 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3631 { 3632 SERIAL_SATURATING_SHIFT_SIGNED(int32_t, 4, 4) 3633 } 3634 3635 int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b); // VQSHL.S64 q0,q0,q0 3636 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3637 { 3638 SERIAL_SATURATING_SHIFT_SIGNED(int64_t, 2, 2) 3639 } 3640 3641 uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b); // VQSHL.U8 q0,q0,q0 3642 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3643 { 3644 SERIAL_SATURATING_SHIFT_UNSIGNED(int8_t, 16, 16) 3645 } 3646 3647 uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b); // VQSHL.s16 q0,q0,q0 3648 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3649 { 3650 SERIAL_SATURATING_SHIFT_UNSIGNED(int16_t, 8, 8) 3651 } 3652 3653 uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b); // VQSHL.U32 q0,q0,q0 3654 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3655 { 3656 SERIAL_SATURATING_SHIFT_UNSIGNED(int32_t, 4, 4) 3657 } 3658 3659 uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b); // VQSHL.U64 q0,q0,q0 3660 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3661 { 3662 SERIAL_SATURATING_SHIFT_UNSIGNED(int64_t, 2, 2) 3663 } 3664 3665 //******** Vector rounding shift left: (negative values shift right) ********** 3666 //**************************************************************************** 3667 //No such operations in IA32 SIMD available yet, constant shift only available, so need to do the serial solution 
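//(worked example, illustrative: for an int8 lane a = -9 and shift b = -2, i.e. a right shift by 2,
// the macro below yields (-9 >> 2) + 1 = -2, matching round(-9/4) = -2, whereas a plain arithmetic
// shift would give -3)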
3668 //rounding makes sense for right shifts only. 3669 #define SERIAL_ROUNDING_SHIFT(TYPE, INTERNAL_TYPE, LENMAX, LEN) \ 3670 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 INTERNAL_TYPE btmp[LENMAX]; INTERNAL_TYPE i, lanesize = sizeof(INTERNAL_TYPE) << 3; \ 3671 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 3672 for (i = 0; i<LEN; i++) { \ 3673 if( btmp[i] >= 0) { \ 3674 if(btmp[i] >= lanesize) res[i] = 0; \ 3675 else res[i] = (atmp[i] << btmp[i]); \ 3676 }else{ \ 3677 res[i] = (btmp[i] < -lanesize) ? res[i] = 0 : \ 3678 (btmp[i] == -lanesize) ? (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) : \ 3679 (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((INTERNAL_TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); }} \ 3680 return _mm_load_si128((__m128i*)res); 3681 3682 int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b); // VRSHL.S8 q0,q0,q0 3683 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3684 { 3685 SERIAL_ROUNDING_SHIFT(int8_t, int8_t, 16, 16) 3686 } 3687 3688 int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b); // VRSHL.S16 q0,q0,q0 3689 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3690 { 3691 SERIAL_ROUNDING_SHIFT(int16_t, int16_t, 8, 8) 3692 } 3693 3694 int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b); // VRSHL.S32 q0,q0,q0 3695 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3696 { 3697 SERIAL_ROUNDING_SHIFT(int32_t, int32_t, 4, 4) 3698 } 3699 3700 int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b); // VRSHL.S64 q0,q0,q0 3701 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3702 { 3703 SERIAL_ROUNDING_SHIFT(int64_t, int64_t, 2, 2) 3704 } 3705 3706 uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b); // VRSHL.U8 q0,q0,q0 3707 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3708 { 3709 SERIAL_ROUNDING_SHIFT(uint8_t, int8_t, 16, 16) 3710 } 3711 3712 uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b); // VRSHL.s16 q0,q0,q0 3713 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3714 { 3715 SERIAL_ROUNDING_SHIFT(uint16_t, int16_t, 8, 8) 3716 } 3717 3718 uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b); // VRSHL.U32 q0,q0,q0 3719 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3720 { 3721 SERIAL_ROUNDING_SHIFT(uint32_t, int32_t, 4, 4) 3722 } 3723 3724 uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b); // VRSHL.U64 q0,q0,q0 3725 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3726 { 3727 SERIAL_ROUNDING_SHIFT(uint64_t, int64_t, 2, 2) 3728 } 3729 3730 //********** Vector saturating rounding shift left: (negative values shift right) **************** 3731 //************************************************************************************************* 3732 //No such operations in IA32 SIMD unfortunately, constant shift only available, so need to do the serial solution 3733 //Saturation happens for left shifts only while rounding makes sense for right shifts only. 
3734 #define SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(TYPE, LENMAX, LEN) \ 3735 _NEON2SSE_ALIGN_16 TYPE atmp[LENMAX], res[LENMAX], btmp[LENMAX]; TYPE limit; int i; \ 3736 int lanesize_1 = (sizeof(TYPE) << 3) - 1; \ 3737 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 3738 for (i = 0; i<LEN; i++) { \ 3739 if (atmp[i] ==0) res[i] = 0; \ 3740 else{ \ 3741 if(btmp[i] <0) res[i] = (btmp[i] < (-lanesize_1)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ 3742 else{ \ 3743 if (btmp[i]>lanesize_1) { \ 3744 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 3745 }else{ \ 3746 limit = (TYPE)1 << (lanesize_1 - btmp[i]); \ 3747 if((atmp[i] >= limit)||(atmp[i] <= -limit)) \ 3748 res[i] = ((_UNSIGNED_T(TYPE))atmp[i] >> lanesize_1 ) + ((TYPE)1 << lanesize_1) - 1; \ 3749 else res[i] = atmp[i] << btmp[i]; }}}} \ 3750 return _mm_load_si128((__m128i*)res); 3751 3752 #define SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(TYPE, LENMAX, LEN) \ 3753 _NEON2SSE_ALIGN_16 _UNSIGNED_T(TYPE) atmp[LENMAX], res[LENMAX]; _NEON2SSE_ALIGN_16 TYPE btmp[LENMAX]; _UNSIGNED_T(TYPE) limit; int i; \ 3754 int lanesize = (sizeof(TYPE) << 3); \ 3755 _mm_store_si128((__m128i*)atmp, a); _mm_store_si128((__m128i*)btmp, b); \ 3756 for (i = 0; i<LEN; i++) { \ 3757 if (atmp[i] ==0) {res[i] = 0; \ 3758 }else{ \ 3759 if(btmp[i] < 0) res[i] = (btmp[i] < (-lanesize)) ? 0 : (atmp[i] >> (-btmp[i])) + ( (atmp[i] & ((TYPE)1 << (-btmp[i] - 1))) >> (-btmp[i] - 1) ); \ 3760 else{ \ 3761 if (btmp[i]>lanesize) res[i] = ~((TYPE)0); \ 3762 else{ \ 3763 limit = (TYPE) 1 << (lanesize - btmp[i]); \ 3764 res[i] = ( atmp[i] >= limit) ? res[i] = ~((TYPE)0) : atmp[i] << btmp[i]; }}}} \ 3765 return _mm_load_si128((__m128i*)res); 3766 3767 int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b); // VQRSHL.S8 q0,q0,q0 3768 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int8x16_t vqrshlq_s8(int8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3769 { 3770 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int8_t, 16, 16) 3771 } 3772 3773 int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b); // VQRSHL.S16 q0,q0,q0 3774 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int16x8_t vqrshlq_s16(int16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3775 { 3776 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int16_t, 8, 8) 3777 } 3778 3779 int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b); // VQRSHL.S32 q0,q0,q0 3780 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrshlq_s32(int32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3781 { 3782 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int32_t, 4, 4) 3783 } 3784 3785 int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b); // VQRSHL.S64 q0,q0,q0 3786 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqrshlq_s64(int64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3787 { 3788 SERIAL_SATURATING_ROUNDING_SHIFT_SIGNED(int64_t, 2, 2) 3789 } 3790 3791 uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b); // VQRSHL.U8 q0,q0,q0 3792 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint8x16_t vqrshlq_u8(uint8x16_t a, int8x16_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3793 { 3794 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int8_t, 16, 16) 3795 } 3796 3797 uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b); // VQRSHL.s16 q0,q0,q0 3798 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint16x8_t vqrshlq_u16(uint16x8_t a, int16x8_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3799 { 3800 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int16_t, 8, 8) 3801 } 3802 3803 uint32x4_t 
vqrshlq_u32(uint32x4_t a, int32x4_t b); // VQRSHL.U32 q0,q0,q0 3804 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint32x4_t vqrshlq_u32(uint32x4_t a, int32x4_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3805 { 3806 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int32_t, 4, 4) 3807 } 3808 3809 uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b); // VQRSHL.U64 q0,q0,q0 3810 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqrshlq_u64(uint64x2_t a, int64x2_t b), _NEON2SSE_REASON_SLOW_SERIAL) 3811 { 3812 SERIAL_SATURATING_ROUNDING_SHIFT_UNSIGNED(int64_t, 2, 2) 3813 } 3814 3815 // ********************************************************************************* 3816 // ***************************** Shifts by a constant ***************************** 3817 // ********************************************************************************* 3818 //**************** Vector shift right by constant************************************* 3819 //************************************************************************************ 3820 3821 int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VSHR.S8 q0,q0,#8 3822 _NEON2SSE_INLINE int8x16_t vshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VSHR.S8 q0,q0,#8 3823 { //no 8 bit shift available, go to 16 bit trick 3824 __m128i zero, mask0, a_sign, r, a_sign_mask; 3825 _NEON2SSE_ALIGN_16 int16_t mask0_16[9] = {0x0000, 0x0080, 0x00c0, 0x00e0, 0x00f0, 0x00f8, 0x00fc, 0x00fe, 0x00ff}; 3826 zero = _mm_setzero_si128(); 3827 mask0 = _mm_set1_epi16(mask0_16[b]); //to mask the bits to be "spoiled" by 16 bit shift 3828 a_sign = _mm_cmpgt_epi8 (zero, a); //ff if a<0 or zero if a>0 3829 r = _mm_srai_epi16 (a, b); 3830 a_sign_mask = _mm_and_si128 (mask0, a_sign); 3831 r = _mm_andnot_si128 (mask0, r); 3832 return _mm_or_si128 (r, a_sign_mask); 3833 } 3834 3835 int16x8_t vshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VSHR.S16 q0,q0,#16 3836 #define vshrq_n_s16 _mm_srai_epi16 3837 3838 int32x4_t vshrq_n_s32(int32x4_t a, __constrange(1,32) int b); // VSHR.S32 q0,q0,#32 3839 #define vshrq_n_s32 _mm_srai_epi32 3840 3841 int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VSHR.S64 q0,q0,#64 3842 _NEON2SSE_INLINE int64x2_t vshrq_n_s64(int64x2_t a, __constrange(1,64) int b) 3843 { //SIMD implementation may be not optimal due to 64 bit arithmetic shift absense in x86 SIMD 3844 __m128i c1, signmask,a0, res64; 3845 _NEON2SSE_ALIGN_16 uint64_t mask[] = {0x8000000000000000, 0x8000000000000000}; 3846 c1 = _mm_cmpeq_epi32(a,a); //0xffffffffffffffff 3847 signmask = _mm_slli_epi64 (c1, (64 - b)); 3848 a0 = _mm_or_si128(a, *(__m128i*)mask); //get the first bit 3849 #ifdef USE_SSE4 3850 a0 = _mm_cmpeq_epi64 (a, a0); //SSE4.1 3851 #else 3852 a0 = _mm_cmpeq_epi32 (a, a0); 3853 a0 = _mm_shuffle_epi32 (a0, 1 | (1 << 2) | (3 << 4) | (3 << 6)); //copy the information from hi to low part of the 64 bit data 3854 #endif 3855 signmask = _mm_and_si128(a0, signmask); 3856 res64 = _mm_srli_epi64 (a, b); 3857 return _mm_or_si128(res64, signmask); 3858 } 3859 3860 uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VSHR.U8 q0,q0,#8 3861 _NEON2SSE_INLINE uint8x16_t vshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VSHR.U8 q0,q0,#8 3862 { //no 8 bit shift available, need the special trick 3863 __m128i mask0, r; 3864 _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xff7f, 0xff3f, 0xff1f, 0xff0f, 0xff07, 0xff03, 0xff01, 0xff00}; 3865 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift 3866 r = _mm_srli_epi16 ( a, b); 3867 return 
_mm_and_si128 (r, mask0); 3868 } 3869 3870 uint16x8_t vshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VSHR.s16 q0,q0,#16 3871 #define vshrq_n_u16 _mm_srli_epi16 3872 3873 uint32x4_t vshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VSHR.U32 q0,q0,#32 3874 #define vshrq_n_u32 _mm_srli_epi32 3875 3876 uint64x2_t vshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VSHR.U64 q0,q0,#64 3877 #define vshrq_n_u64 _mm_srli_epi64 3878 3879 //*************************** Vector shift left by constant ************************* 3880 //********************************************************************************* 3881 3882 int8x16_t vshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 3883 #define vshlq_n_s8 vshlq_n_u8 3884 3885 int16x8_t vshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 3886 #define vshlq_n_s16 _mm_slli_epi16 3887 3888 int32x4_t vshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 3889 #define vshlq_n_s32 _mm_slli_epi32 3890 3891 int64x2_t vshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 3892 #define vshlq_n_s64 _mm_slli_epi64 3893 3894 uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VSHL.I8 q0,q0,#0 3895 _NEON2SSE_INLINE uint8x16_t vshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) 3896 { //no 8 bit shift available, need the special trick 3897 __m128i mask0, r; 3898 _NEON2SSE_ALIGN_16 uint16_t mask10_16[9] = {0xffff, 0xfeff, 0xfcff, 0xf8ff, 0xf0ff, 0xe0ff, 0xc0ff, 0x80ff, 0xff}; 3899 mask0 = _mm_set1_epi16(mask10_16[b]); //to mask the bits to be "spoiled" by 16 bit shift 3900 r = _mm_slli_epi16 ( a, b); 3901 return _mm_and_si128 (r, mask0); 3902 } 3903 3904 uint16x8_t vshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VSHL.I16 q0,q0,#0 3905 #define vshlq_n_u16 vshlq_n_s16 3906 3907 uint32x4_t vshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VSHL.I32 q0,q0,#0 3908 #define vshlq_n_u32 vshlq_n_s32 3909 3910 uint64x2_t vshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VSHL.I64 q0,q0,#0 3911 #define vshlq_n_u64 vshlq_n_s64 3912 3913 //************* Vector rounding shift right by constant ****************** 3914 //************************************************************************* 3915 //No corresponding x86 intrinsics exist, need to do some tricks 3916 3917 int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b); // VRSHR.S8 q0,q0,#8 3918 _NEON2SSE_INLINE int8x16_t vrshrq_n_s8(int8x16_t a, __constrange(1,8) int b) // VRSHR.S8 q0,q0,#8 3919 { //no 8 bit shift available, go to 16 bit trick 3920 __m128i r, mask1, maskb; 3921 _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 3922 r = vshrq_n_s8 (a, b); 3923 mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding 3924 maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding 3925 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 3926 return _mm_add_epi8(r, maskb); //actual rounding 3927 } 3928 3929 int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b); // VRSHR.S16 q0,q0,#16 3930 _NEON2SSE_INLINE int16x8_t vrshrq_n_s16(int16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 3931 { 3932 __m128i maskb, r; 3933 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit 3934 maskb = _mm_srli_epi16(maskb, 15); //1 or 0 3935 r = _mm_srai_epi16 (a, b); 3936 return _mm_add_epi16 (r, maskb); //actual rounding 3937 } 3938 3939 int32x4_t vrshrq_n_s32(int32x4_t a, 
__constrange(1,32) int b); // VRSHR.S32 q0,q0,#32 3940 _NEON2SSE_INLINE int32x4_t vrshrq_n_s32(int32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 3941 { 3942 __m128i maskb, r; 3943 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit 3944 maskb = _mm_srli_epi32 (maskb,31); //1 or 0 3945 r = _mm_srai_epi32(a, b); 3946 return _mm_add_epi32 (r, maskb); //actual rounding 3947 } 3948 3949 int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b); // VRSHR.S64 q0,q0,#64 3950 _NEON2SSE_INLINE int64x2_t vrshrq_n_s64(int64x2_t a, __constrange(1,64) int b) 3951 { //solution may be not optimal compared with a serial one 3952 __m128i maskb; 3953 int64x2_t r; 3954 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit 3955 maskb = _mm_srli_epi64 (maskb,63); //1 or 0 3956 r = vshrq_n_s64(a, b); 3957 return _mm_add_epi64 (r, maskb); //actual rounding 3958 } 3959 3960 uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b); // VRSHR.U8 q0,q0,#8 3961 _NEON2SSE_INLINE uint8x16_t vrshrq_n_u8(uint8x16_t a, __constrange(1,8) int b) // VRSHR.U8 q0,q0,#8 3962 { //no 8 bit shift available, go to 16 bit trick 3963 __m128i r, mask1, maskb; 3964 _NEON2SSE_ALIGN_16 uint16_t mask2b[9] = {0x0000, 0x0101, 0x0202, 0x0404, 0x0808, 0x1010, 0x2020, 0x4040, 0x8080}; // 2^b-th bit set to 1 3965 r = vshrq_n_u8 (a, b); 3966 mask1 = _mm_set1_epi16(mask2b[b]); // 2^b-th bit set to 1 for 16bit, need it for rounding 3967 maskb = _mm_and_si128(a, mask1); //get b or 0 for rounding 3968 maskb = _mm_srli_epi16 (maskb, b - 1); // to add 1 3969 return _mm_add_epi8(r, maskb); //actual rounding 3970 } 3971 3972 uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b); // VRSHR.s16 q0,q0,#16 3973 _NEON2SSE_INLINE uint16x8_t vrshrq_n_u16(uint16x8_t a, __constrange(1,16) int b) // VRSHR.S16 q0,q0,#16 3974 { 3975 __m128i maskb, r; 3976 maskb = _mm_slli_epi16(a, (16 - b)); //to get rounding (b-1)th bit 3977 maskb = _mm_srli_epi16(maskb, 15); //1 or 0 3978 r = _mm_srli_epi16 (a, b); 3979 return _mm_add_epi16 (r, maskb); //actual rounding 3980 } 3981 3982 uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b); // VRSHR.U32 q0,q0,#32 3983 _NEON2SSE_INLINE uint32x4_t vrshrq_n_u32(uint32x4_t a, __constrange(1,32) int b) // VRSHR.S32 q0,q0,#32 3984 { 3985 __m128i maskb, r; 3986 maskb = _mm_slli_epi32 (a, (32 - b)); //to get rounding (b-1)th bit 3987 maskb = _mm_srli_epi32 (maskb,31); //1 or 0 3988 r = _mm_srli_epi32(a, b); 3989 return _mm_add_epi32 (r, maskb); //actual rounding 3990 } 3991 3992 uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b); // VRSHR.U64 q0,q0,#64 3993 _NEON2SSE_INLINE uint64x2_t vrshrq_n_u64(uint64x2_t a, __constrange(1,64) int b) 3994 { //solution may be not optimal compared with a serial one 3995 __m128i maskb, r; 3996 maskb = _mm_slli_epi64 (a, (64 - b)); //to get rounding (b-1)th bit 3997 maskb = _mm_srli_epi64 (maskb,63); //1 or 0 3998 r = _mm_srli_epi64(a, b); 3999 return _mm_add_epi64 (r, maskb); //actual rounding 4000 } 4001 4002 //************* Vector shift right by constant and accumulate ********* 4003 //********************************************************************* 4004 4005 int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRA.S8 q0,q0,#8 4006 _NEON2SSE_INLINE int8x16_t vsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRA.S8 q0,q0,#8 4007 { 4008 int8x16_t shift; 4009 shift = vshrq_n_s8(b, c); 4010 return vaddq_s8(a, shift); 4011 } 4012 4013 int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, 
__constrange(1,16) int c); // VSRA.S16 q0,q0,#16 4014 _NEON2SSE_INLINE int16x8_t vsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRA.S16 q0,q0,#16 4015 { 4016 int16x8_t shift; 4017 shift = vshrq_n_s16(b, c); 4018 return vaddq_s16(a, shift); 4019 } 4020 4021 int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRA.S32 q0,q0,#32 4022 _NEON2SSE_INLINE int32x4_t vsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRA.S32 q0,q0,#32 4023 { 4024 int32x4_t shift; 4025 shift = vshrq_n_s32(b, c); 4026 return vaddq_s32(a, shift); 4027 } 4028 4029 int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRA.S64 q0,q0,#64 4030 _NEON2SSE_INLINE int64x2_t vsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) // VSRA.S64 q0,q0,#64 4031 { 4032 int64x2_t shift; 4033 shift = vshrq_n_s64(b, c); 4034 return vaddq_s64( a, shift); 4035 } 4036 4037 uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRA.U8 q0,q0,#8 4038 _NEON2SSE_INLINE uint8x16_t vsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VSRA.U8 q0,q0,#8 4039 { 4040 uint8x16_t shift; 4041 shift = vshrq_n_u8(b, c); 4042 return vaddq_u8(a, shift); 4043 } 4044 4045 uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRA.s16 q0,q0,#16 4046 _NEON2SSE_INLINE uint16x8_t vsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VSRA.s16 q0,q0,#16 4047 { 4048 uint16x8_t shift; 4049 shift = vshrq_n_u16(b, c); 4050 return vaddq_u16(a, shift); 4051 } 4052 4053 uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRA.U32 q0,q0,#32 4054 _NEON2SSE_INLINE uint32x4_t vsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VSRA.U32 q0,q0,#32 4055 { 4056 uint32x4_t shift; 4057 shift = vshrq_n_u32(b, c); 4058 return vaddq_u32(a, shift); 4059 } 4060 4061 uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRA.U64 q0,q0,#64 4062 _NEON2SSE_INLINE uint64x2_t vsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) // VSRA.U64 q0,q0,#64 4063 { 4064 uint64x2_t shift; 4065 shift = vshrq_n_u64(b, c); 4066 return vaddq_u64(a, shift); 4067 } 4068 4069 //************* Vector rounding shift right by constant and accumulate **************************** 4070 //************************************************************************************************ 4071 4072 int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VRSRA.S8 q0,q0,#8 4073 _NEON2SSE_INLINE int8x16_t vrsraq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VRSRA.S8 q0,q0,#8 4074 { 4075 int8x16_t shift; 4076 shift = vrshrq_n_s8(b, c); 4077 return vaddq_s8(a, shift); 4078 } 4079 4080 int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VRSRA.S16 q0,q0,#16 4081 _NEON2SSE_INLINE int16x8_t vrsraq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VRSRA.S16 q0,q0,#16 4082 { 4083 int16x8_t shift; 4084 shift = vrshrq_n_s16(b, c); 4085 return vaddq_s16(a, shift); 4086 } 4087 4088 int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VRSRA.S32 q0,q0,#32 4089 _NEON2SSE_INLINE int32x4_t vrsraq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VRSRA.S32 q0,q0,#32 4090 { 4091 int32x4_t shift; 4092 shift = vrshrq_n_s32(b, c); 4093 return vaddq_s32(a, shift); 4094 } 4095 4096 int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VRSRA.S64 q0,q0,#64 4097 
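//A minimal usage sketch (illustrative only, not part of the original set of intrinsics):
//vrsraq_n_s16 above used as an "accumulate a rounded fraction" step, acc[i] += round(x[i] / 16).
/*
_NEON2SSE_INLINE int16x8_t add_rounded_sixteenth_example(int16x8_t acc, int16x8_t x)
{
    return vrsraq_n_s16(acc, x, 4);   // acc + ((x + 8) >> 4) per lane
}
*/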
_NEON2SSE_INLINE int64x2_t vrsraq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) 4098 { 4099 int64x2_t shift; 4100 shift = vrshrq_n_s64(b, c); 4101 return vaddq_s64(a, shift); 4102 } 4103 4104 uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VRSRA.U8 q0,q0,#8 4105 _NEON2SSE_INLINE uint8x16_t vrsraq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c) // VRSRA.U8 q0,q0,#8 4106 { 4107 uint8x16_t shift; 4108 shift = vrshrq_n_u8(b, c); 4109 return vaddq_u8(a, shift); 4110 } 4111 4112 uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VRSRA.s16 q0,q0,#16 4113 _NEON2SSE_INLINE uint16x8_t vrsraq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c) // VRSRA.s16 q0,q0,#16 4114 { 4115 uint16x8_t shift; 4116 shift = vrshrq_n_u16(b, c); 4117 return vaddq_u16(a, shift); 4118 } 4119 4120 uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VRSRA.U32 q0,q0,#32 4121 _NEON2SSE_INLINE uint32x4_t vrsraq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c) // VRSRA.U32 q0,q0,#32 4122 { 4123 uint32x4_t shift; 4124 shift = vrshrq_n_u32(b, c); 4125 return vaddq_u32(a, shift); 4126 } 4127 4128 uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VRSRA.U64 q0,q0,#64 4129 _NEON2SSE_INLINE uint64x2_t vrsraq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c) 4130 { 4131 uint64x2_t shift; 4132 shift = vrshrq_n_u64(b, c); 4133 return vaddq_u64(a, shift); 4134 } 4135 4136 //**********************Vector saturating shift left by constant ***************************** 4137 //******************************************************************************************** 4138 //we don't check const ranges assuming they are met 4139 4140 int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHL.S8 q0,q0,#0 4141 _NEON2SSE_INLINE int8x16_t vqshlq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHL.S8 q0,q0,#0 4142 { // go to 16 bit to get the auto saturation (in packs function) 4143 __m128i a128, r128_1, r128_2; 4144 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 4145 r128_1 = _mm_slli_epi16 (a128, b); 4146 //swap hi and low part of a128 to process the remaining data 4147 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4148 a128 = _MM_CVTEPI8_EPI16 (a128); 4149 r128_2 = _mm_slli_epi16 (a128, b); 4150 return _mm_packs_epi16 (r128_1, r128_2); //saturated s8 4151 } 4152 4153 int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHL.S16 q0,q0,#0 4154 _NEON2SSE_INLINE int16x8_t vqshlq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHL.S16 q0,q0,#0 4155 { // manual saturation solution looks LESS optimal than 32 bits conversion one 4156 // go to 32 bit to get the auto saturation (in packs function) 4157 __m128i a128, r128_1, r128_2; 4158 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 4159 r128_1 = _mm_slli_epi32 (a128, b); //shift_res 4160 //swap hi and low part of a128 to process the remaining data 4161 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4162 a128 = _MM_CVTEPI16_EPI32 (a128); 4163 r128_2 = _mm_slli_epi32 (a128, b); 4164 return _mm_packs_epi32 (r128_1, r128_2); //saturated s16 4165 } 4166 4167 int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHL.S32 q0,q0,#0 4168 _NEON2SSE_INLINE int32x4_t vqshlq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHL.S32 q0,q0,#0 4169 { // no 64 bit saturation option available, special tricks necessary 4170 __m128i c1, maskA, saturation_mask, c7ffffff_mask, shift_res, shift_res_mask; 4171 c1 = 
_mm_cmpeq_epi32(a,a); //0xff..ff 4172 maskA = _mm_srli_epi32(c1, b + 1); //mask for positive numbers (32-b+1) zeros and b-1 ones 4173 saturation_mask = _mm_cmpgt_epi32 (a, maskA); //0xff...ff if we need saturation, 0 otherwise 4174 c7ffffff_mask = _mm_srli_epi32(saturation_mask, 1); //saturated to 0x7f..ff when needed and zeros if not 4175 shift_res = _mm_slli_epi32 (a, b); 4176 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); 4177 //result with positive numbers saturated 4178 shift_res = _mm_or_si128 (c7ffffff_mask, shift_res_mask); 4179 //treat negative numbers 4180 maskA = _mm_slli_epi32(c1, 31 - b); //mask for negative numbers b-1 ones and (32-b+1) zeros 4181 saturation_mask = _mm_cmpgt_epi32 (maskA,a); //0xff...ff if we need saturation, 0 otherwise 4182 c7ffffff_mask = _mm_slli_epi32(saturation_mask, 31); //saturated to 0x80..00 when needed and zeros if not 4183 shift_res_mask = _mm_andnot_si128(saturation_mask, shift_res); 4184 return _mm_or_si128 (c7ffffff_mask, shift_res_mask); 4185 } 4186 4187 int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b); // VQSHL.S64 q0,q0,#0 4188 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int64x2_t vqshlq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 4189 { // no effective SIMD solution here 4190 _NEON2SSE_ALIGN_16 int64_t atmp[2], res[2]; 4191 int64_t bmask; 4192 int i; 4193 bmask = ( int64_t)1 << (63 - b); //positive 4194 _mm_store_si128((__m128i*)atmp, a); 4195 for (i = 0; i<2; i++) { 4196 if (atmp[i] >= bmask) { 4197 res[i] = ~(_SIGNBIT64); 4198 } else { 4199 res[i] = (atmp[i] <= -bmask) ? _SIGNBIT64 : atmp[i] << b; 4200 } 4201 } 4202 return _mm_load_si128((__m128i*)res); 4203 } 4204 4205 uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b); // VQSHL.U8 q0,q0,#0 4206 _NEON2SSE_INLINE uint8x16_t vqshlq_n_u8(uint8x16_t a, __constrange(0,7) int b) // VQSHL.U8 q0,q0,#0 4207 { // go to 16 bit to get the auto saturation (in packs function) 4208 __m128i a128, r128_1, r128_2; 4209 a128 = _MM_CVTEPU8_EPI16 (a); //SSE 4.1 4210 r128_1 = _mm_slli_epi16 (a128, b); 4211 //swap hi and low part of a128 to process the remaining data 4212 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4213 a128 = _MM_CVTEPU8_EPI16 (a128); 4214 r128_2 = _mm_slli_epi16 (a128, b); 4215 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 4216 } 4217 4218 uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b); // VQSHL.s16 q0,q0,#0 4219 _NEON2SSE_INLINE uint16x8_t vqshlq_n_u16(uint16x8_t a, __constrange(0,15) int b) // VQSHL.s16 q0,q0,#0 4220 { // manual saturation solution looks more optimal than 32 bits conversion one 4221 __m128i cb, c8000, a_signed, saturation_mask, shift_res; 4222 cb = _mm_set1_epi16((1 << (16 - b)) - 1 - 0x8000 ); 4223 c8000 = _mm_set1_epi16 (0x8000); 4224 //no unsigned shorts comparison in SSE, only signed available, so need the trick 4225 a_signed = _mm_sub_epi16(a, c8000); //go to signed 4226 saturation_mask = _mm_cmpgt_epi16 (a_signed, cb); 4227 shift_res = _mm_slli_epi16 (a, b); 4228 return _mm_or_si128 (shift_res, saturation_mask); 4229 } 4230 4231 uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b); // VQSHL.U32 q0,q0,#0 4232 _NEON2SSE_INLINE uint32x4_t vqshlq_n_u32(uint32x4_t a, __constrange(0,31) int b) // VQSHL.U32 q0,q0,#0 4233 { // manual saturation solution, no 64 bit saturation option, the serial version may be faster 4234 __m128i cb, c80000000, a_signed, saturation_mask, shift_res; 4235 cb = _mm_set1_epi32((1 << (32 - b)) - 1 - 0x80000000 ); 4236 c80000000 = 
_mm_set1_epi32 (0x80000000); 4237 //no unsigned ints comparison in SSE, only signed available, so need the trick 4238 a_signed = _mm_sub_epi32(a, c80000000); //go to signed 4239 saturation_mask = _mm_cmpgt_epi32 (a_signed, cb); 4240 shift_res = _mm_slli_epi32 (a, b); 4241 return _mm_or_si128 (shift_res, saturation_mask); 4242 } 4243 4244 uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b); // VQSHL.U64 q0,q0,#0 4245 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshlq_n_u64(uint64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 4246 { // no effective SIMD solution here 4247 _NEON2SSE_ALIGN_16 uint64_t atmp[2], res[2]; 4248 uint64_t bmask; 4249 int i; 4250 bmask = ( uint64_t)1 << (64 - b); 4251 _mm_store_si128((__m128i*)atmp, a); 4252 for (i = 0; i<2; i++) { 4253 res[i] = (atmp[i] >= bmask)&&(b>0) ? 0xffffffffffffffff : atmp[i] << b; //if b=0 we are fine with any a 4254 } 4255 return _mm_load_si128((__m128i*)res); 4256 } 4257 4258 //**************Vector signed->unsigned saturating shift left by constant ************* 4259 //************************************************************************************* 4260 4261 uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b); // VQSHLU.S8 q0,q0,#0 4262 _NEON2SSE_INLINE uint8x16_t vqshluq_n_s8(int8x16_t a, __constrange(0,7) int b) // VQSHLU.S8 q0,q0,#0 4263 { 4264 __m128i a128, r128_1, r128_2; 4265 a128 = _MM_CVTEPI8_EPI16 (a); //SSE 4.1 4266 r128_1 = _mm_slli_epi16 (a128, b); 4267 //swap hi and low part of a128 to process the remaining data 4268 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4269 a128 = _MM_CVTEPI8_EPI16 (a128); 4270 r128_2 = _mm_slli_epi16 (a128, b); 4271 return _mm_packus_epi16 (r128_1, r128_2); //saturated u8 4272 } 4273 4274 #if defined(USE_SSSE3) 4275 uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b); // VQSHLU.S16 q0,q0,#0 4276 _NEON2SSE_INLINE uint16x8_t vqshluq_n_s16(int16x8_t a, __constrange(0,15) int b) // VQSHLU.S16 q0,q0,#0 4277 { // manual saturation solution looks LESS optimal than 32 bits conversion one 4278 __m128i a128, r128_1, r128_2; 4279 a128 = _MM_CVTEPI16_EPI32 (a); //SSE 4.1 4280 r128_1 = _mm_slli_epi32 (a128, b); //shift_res 4281 //swap hi and low part of a128 to process the remaining data 4282 a128 = _mm_shuffle_epi32 (a, _SWAP_HI_LOW32); 4283 a128 = _MM_CVTEPI16_EPI32 (a128); 4284 r128_2 = _mm_slli_epi32 (a128, b); 4285 return _MM_PACKUS_EPI32 (r128_1, r128_2); //saturated s16 4286 } 4287 #endif 4288 4289 uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b); // VQSHLU.S32 q0,q0,#0 4290 _NEON2SSE_INLINE uint32x4_t vqshluq_n_s32(int32x4_t a, __constrange(0,31) int b) // VQSHLU.S32 q0,q0,#0 4291 { //solution may be not optimal compared with the serial one 4292 __m128i zero, maskA, maskGT0, a0, a_masked, a_shift; 4293 zero = _mm_setzero_si128(); 4294 maskA = _mm_cmpeq_epi32(a, a); 4295 maskA = _mm_slli_epi32(maskA,(32 - b)); // b ones and (32-b)zeros 4296 //saturate negative numbers to zero 4297 maskGT0 = _mm_cmpgt_epi32 (a, zero); // //0xffffffff if positive number and zero otherwise (negative numbers) 4298 a0 = _mm_and_si128 (a, maskGT0); //negative are zeros now 4299 //saturate positive to 0xffffffff 4300 a_masked = _mm_and_si128 (a0, maskA); 4301 a_masked = _mm_cmpgt_epi32 (a_masked, zero); //0xffffffff if saturation necessary 0 otherwise 4302 a_shift = _mm_slli_epi32 (a0, b); 4303 return _mm_or_si128 (a_shift, a_masked); //actual saturation 4304 } 4305 4306 uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b); // 
VQSHLU.S64 q0,q0,#0 4307 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(uint64x2_t vqshluq_n_s64(int64x2_t a, __constrange(0,63) int b), _NEON2SSE_REASON_SLOW_SERIAL) 4308 { // no effective SIMD solution here, serial execution looks faster 4309 _NEON2SSE_ALIGN_16 int64_t atmp[2]; 4310 _NEON2SSE_ALIGN_16 uint64_t res[2]; 4311 uint64_t limit; 4312 int i; 4313 _mm_store_si128((__m128i*)atmp, a); 4314 for (i = 0; i<2; i++) { 4315 if (atmp[i]<=0) { 4316 res[i] = 0; 4317 } else { 4318 limit = (uint64_t) 1 << (64 - b); 4319 res[i] = ( ((uint64_t)atmp[i]) >= limit) ? res[i] = ~((uint64_t)0) : atmp[i] << b; 4320 } 4321 } 4322 return _mm_load_si128((__m128i*)res); 4323 } 4324 4325 //************** Vector narrowing shift right by constant ************** 4326 //********************************************************************** 4327 4328 //************** Vector signed->unsigned narrowing saturating shift right by constant ******** 4329 //********************************************************************************************* 4330 4331 //**** Vector signed->unsigned rounding narrowing saturating shift right by constant ***** 4332 4333 //***** Vector narrowing saturating shift right by constant ****** 4334 //***************************************************************** 4335 4336 //********* Vector rounding narrowing shift right by constant ************************* 4337 //**************************************************************************************** 4338 4339 //************* Vector rounding narrowing saturating shift right by constant ************ 4340 //**************************************************************************************** 4341 4342 //************** Vector widening shift left by constant **************** 4343 //************************************************************************ 4344 4345 //************************************************************************************ 4346 //**************************** Shifts with insert ************************************ 4347 //************************************************************************************ 4348 //takes each element in a vector, shifts them by an immediate value, 4349 //and inserts the results in the destination vector. Bits shifted out of the each element are lost. 4350 4351 //**************** Vector shift right and insert ************************************ 4352 //Actually the "c" left bits from "a" are the only bits remained from "a" after the shift. 4353 //All other bits are taken from b shifted. 
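//(worked example, illustrative: for 8 bit lanes a = 0xAB, b = 0xCD and c = 4 the result is 0xAC -
// the top c = 4 bits 0xA0 are kept from "a", the remaining bits 0x0C are b >> 4)
//A minimal usage sketch (illustrative only, not part of the original set of intrinsics), based on the
//vsriq_n_u8 mapping below: combining the high nibbles of two byte vectors into one byte per lane.
/*
_NEON2SSE_INLINE uint8x16_t pack_high_nibbles_example(uint8x16_t a, uint8x16_t b)
{
    return vsriq_n_u8(a, b, 4);   // (a & 0xf0) | (b >> 4) per lane
}
*/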
4354 4355 int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 4356 _NEON2SSE_INLINE int8x16_t vsriq_n_s8(int8x16_t a, int8x16_t b, __constrange(1,8) int c) // VSRI.8 q0,q0,#8 4357 { 4358 __m128i maskA, a_masked; 4359 uint8x16_t b_shift; 4360 _NEON2SSE_ALIGN_16 uint8_t maskLeft[9] = {0x0, 0x80, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe, 0xff}; //"a" bits mask, 0 bit not used 4361 maskA = _mm_set1_epi8(maskLeft[c]); // c ones and (8-c)zeros 4362 a_masked = _mm_and_si128 (a, maskA); 4363 b_shift = vshrq_n_u8( b, c); // c zeros on the left in b due to logical shift 4364 return _mm_or_si128 (a_masked, b_shift); //combine (insert b into a) 4365 } 4366 4367 int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 4368 _NEON2SSE_INLINE int16x8_t vsriq_n_s16(int16x8_t a, int16x8_t b, __constrange(1,16) int c) // VSRI.16 q0,q0,#16 4369 { //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a 4370 uint16x8_t b_shift; 4371 uint16x8_t a_c; 4372 b_shift = vshrq_n_u16( b, c); // c zeros on the left in b due to logical shift 4373 a_c = vshrq_n_u16( a, (16 - c)); 4374 a_c = _mm_slli_epi16(a_c, (16 - c)); //logical shift provides right "c" bits zeros in a 4375 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) 4376 } 4377 4378 int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 4379 _NEON2SSE_INLINE int32x4_t vsriq_n_s32(int32x4_t a, int32x4_t b, __constrange(1,32) int c) // VSRI.32 q0,q0,#32 4380 { //to cut "c" left bits from a we do shift right and then shift back left providing c right zeros in a 4381 uint32x4_t b_shift; 4382 uint32x4_t a_c; 4383 b_shift = vshrq_n_u32( b, c); // c zeros on the left in b due to logical shift 4384 a_c = vshrq_n_u32( a, (32 - c)); 4385 a_c = _mm_slli_epi32(a_c, (32 - c)); //logical shift provides right "c" bits zeros in a 4386 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) 4387 } 4388 4389 int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 4390 _NEON2SSE_INLINE int64x2_t vsriq_n_s64(int64x2_t a, int64x2_t b, __constrange(1,64) int c) 4391 { //serial solution may be faster 4392 uint64x2_t b_shift; 4393 uint64x2_t a_c; 4394 b_shift = _mm_srli_epi64(b, c); // c zeros on the left in b due to logical shift 4395 a_c = _mm_srli_epi64(a, (64 - c)); 4396 a_c = _mm_slli_epi64(a_c, (64 - c)); //logical shift provides right "c" bits zeros in a 4397 return _mm_or_si128 (a_c, b_shift); //combine (insert b into a) 4398 } 4399 4400 uint8x16_t vsriq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 4401 #define vsriq_n_u8 vsriq_n_s8 4402 4403 uint16x8_t vsriq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 4404 #define vsriq_n_u16 vsriq_n_s16 4405 4406 uint32x4_t vsriq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(1,32) int c); // VSRI.32 q0,q0,#32 4407 #define vsriq_n_u32 vsriq_n_s32 4408 4409 uint64x2_t vsriq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(1,64) int c); // VSRI.64 q0,q0,#64 4410 #define vsriq_n_u64 vsriq_n_s64 4411 4412 poly8x16_t vsriq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(1,8) int c); // VSRI.8 q0,q0,#8 4413 #define vsriq_n_p8 vsriq_n_u8 4414 4415 poly16x8_t vsriq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(1,16) int c); // VSRI.16 q0,q0,#16 4416 #define vsriq_n_p16 vsriq_n_u16 4417 4418 //***** Vector shift left and insert 
********************************************* 4419 //********************************************************************************* 4420 //Actually the "c" right bits from "a" are the only bits remained from "a" after the shift. 4421 //All other bits are taken from b shifted. Ending zeros are inserted in b in the shift proces. We need to combine "a" and "b shifted". 4422 4423 int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 4424 _NEON2SSE_INLINE int8x16_t vsliq_n_s8(int8x16_t a, int8x16_t b, __constrange(0,7) int c) // VSLI.8 q0,q0,#0 4425 { 4426 __m128i maskA, a_masked; 4427 int8x16_t b_shift; 4428 _NEON2SSE_ALIGN_16 uint8_t maskRight[8] = {0x0, 0x1, 0x3, 0x7, 0x0f, 0x1f, 0x3f, 0x7f}; //"a" bits mask 4429 maskA = _mm_set1_epi8(maskRight[c]); // (8-c)zeros and c ones 4430 b_shift = vshlq_n_s8( b, c); 4431 a_masked = _mm_and_si128 (a, maskA); 4432 return _mm_or_si128 (b_shift, a_masked); //combine (insert b into a) 4433 } 4434 4435 int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 4436 _NEON2SSE_INLINE int16x8_t vsliq_n_s16(int16x8_t a, int16x8_t b, __constrange(0,15) int c) // VSLI.16 q0,q0,#0 4437 { //to cut "c" right bits from a we do shift left and then logical shift back right providing (16-c)zeros in a 4438 int16x8_t b_shift; 4439 int16x8_t a_c; 4440 b_shift = vshlq_n_s16( b, c); 4441 a_c = vshlq_n_s16( a, (16 - c)); 4442 a_c = _mm_srli_epi16(a_c, (16 - c)); 4443 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) 4444 } 4445 4446 int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 4447 _NEON2SSE_INLINE int32x4_t vsliq_n_s32(int32x4_t a, int32x4_t b, __constrange(0,31) int c) // VSLI.32 q0,q0,#0 4448 { //solution may be not optimal compared with the serial one 4449 //to cut "c" right bits from a we do shift left and then logical shift back right providing (32-c)zeros in a 4450 int32x4_t b_shift; 4451 int32x4_t a_c; 4452 b_shift = vshlq_n_s32( b, c); 4453 a_c = vshlq_n_s32( a, (32 - c)); 4454 a_c = _mm_srli_epi32(a_c, (32 - c)); 4455 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) 4456 } 4457 4458 int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 4459 _NEON2SSE_INLINE int64x2_t vsliq_n_s64(int64x2_t a, int64x2_t b, __constrange(0,63) int c) // VSLI.64 q0,q0,#0 4460 { //solution may be not optimal compared with the serial one 4461 //to cut "c" right bits from a we do shift left and then logical shift back right providing (64-c)zeros in a 4462 int64x2_t b_shift; 4463 int64x2_t a_c; 4464 b_shift = vshlq_n_s64( b, c); 4465 a_c = vshlq_n_s64( a, (64 - c)); 4466 a_c = _mm_srli_epi64(a_c, (64 - c)); 4467 return _mm_or_si128 (b_shift, a_c); //combine (insert b into a) 4468 } 4469 4470 uint8x16_t vsliq_n_u8(uint8x16_t a, uint8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 4471 #define vsliq_n_u8 vsliq_n_s8 4472 4473 uint16x8_t vsliq_n_u16(uint16x8_t a, uint16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 4474 #define vsliq_n_u16 vsliq_n_s16 4475 4476 uint32x4_t vsliq_n_u32(uint32x4_t a, uint32x4_t b, __constrange(0,31) int c); // VSLI.32 q0,q0,#0 4477 #define vsliq_n_u32 vsliq_n_s32 4478 4479 uint64x2_t vsliq_n_u64(uint64x2_t a, uint64x2_t b, __constrange(0,63) int c); // VSLI.64 q0,q0,#0 4480 #define vsliq_n_u64 vsliq_n_s64 4481 4482 poly8x16_t vsliq_n_p8(poly8x16_t a, poly8x16_t b, __constrange(0,7) int c); // VSLI.8 q0,q0,#0 4483 #define vsliq_n_p8 vsliq_n_u8 4484 4485 
poly16x8_t vsliq_n_p16(poly16x8_t a, poly16x8_t b, __constrange(0,15) int c); // VSLI.16 q0,q0,#0 4486 #define vsliq_n_p16 vsliq_n_u16 4487 4488 // *********************************************************************************************** 4489 // ****************** Loads and stores of a single vector *************************************** 4490 // *********************************************************************************************** 4491 //Performs loads and stores of a single vector of some type. 4492 //******************************* Loads ******************************************************** 4493 // *********************************************************************************************** 4494 //We assume ptr is NOT aligned in general case and use __m128i _mm_loadu_si128 ((__m128i*) ptr);. 4495 //also for SSE3 supporting systems the __m128i _mm_lddqu_si128 (__m128i const* p) usage for unaligned access may be advantageous. 4496 // it loads a 32-byte block aligned on a 16-byte boundary and extracts the 16 bytes corresponding to the unaligned access 4497 //If the ptr is aligned then could use __m128i _mm_load_si128 ((__m128i*) ptr) instead; 4498 #define LOAD_SI128(ptr) \ 4499 ( ((unsigned long)(ptr) & 15) == 0 ) ? _mm_load_si128((__m128i*)(ptr)) : _mm_loadu_si128((__m128i*)(ptr)); 4500 4501 uint8x16_t vld1q_u8(__transfersize(16) uint8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 4502 #define vld1q_u8 LOAD_SI128 4503 4504 uint16x8_t vld1q_u16(__transfersize(8) uint16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 4505 #define vld1q_u16 LOAD_SI128 4506 4507 uint32x4_t vld1q_u32(__transfersize(4) uint32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 4508 #define vld1q_u32 LOAD_SI128 4509 4510 uint64x2_t vld1q_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 4511 #define vld1q_u64 LOAD_SI128 4512 4513 int8x16_t vld1q_s8(__transfersize(16) int8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 4514 #define vld1q_s8 LOAD_SI128 4515 4516 int16x8_t vld1q_s16(__transfersize(8) int16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 4517 #define vld1q_s16 LOAD_SI128 4518 4519 int32x4_t vld1q_s32(__transfersize(4) int32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 4520 #define vld1q_s32 LOAD_SI128 4521 4522 int64x2_t vld1q_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 4523 #define vld1q_s64 LOAD_SI128 4524 4525 float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr); // VLD1.16 {d0, d1}, [r0] 4526 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers 4527 /* _NEON2SSE_INLINE float16x8_t vld1q_f16(__transfersize(8) __fp16 const * ptr)// VLD1.16 {d0, d1}, [r0] 4528 {__m128 f1 = _mm_set_ps (ptr[3], ptr[2], ptr[1], ptr[0]); 4529 __m128 f2; 4530 f2 = _mm_set_ps (ptr[7], ptr[6], ptr[5], ptr[4]); 4531 }*/ 4532 4533 float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); // VLD1.32 {d0, d1}, [r0] 4534 _NEON2SSE_INLINE float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr) 4535 { 4536 if( (((unsigned long)(ptr)) & 15 ) == 0 ) //16 bits aligned 4537 return _mm_load_ps(ptr); 4538 else 4539 return _mm_loadu_ps(ptr); 4540 } 4541 4542 poly8x16_t vld1q_p8(__transfersize(16) poly8_t const * ptr); // VLD1.8 {d0, d1}, [r0] 4543 #define vld1q_p8 LOAD_SI128 4544 4545 poly16x8_t vld1q_p16(__transfersize(8) poly16_t const * ptr); // VLD1.16 {d0, d1}, [r0] 4546 #define vld1q_p16 LOAD_SI128 4547 4548 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit like _mm_set_ps (ptr[3], 
ptr[2], ptr[1], ptr[0]); 4549 4550 //*********************************************************************************************************** 4551 //******* Lane load functions - insert the data at vector's given position (lane) ************************* 4552 //*********************************************************************************************************** 4553 uint8x16_t vld1q_lane_u8(__transfersize(1) uint8_t const * ptr, uint8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] 4554 #define vld1q_lane_u8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) 4555 4556 uint16x8_t vld1q_lane_u16(__transfersize(1) uint16_t const * ptr, uint16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 4557 #define vld1q_lane_u16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) 4558 4559 uint32x4_t vld1q_lane_u32(__transfersize(1) uint32_t const * ptr, uint32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 4560 #define vld1q_lane_u32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) 4561 4562 uint64x2_t vld1q_lane_u64(__transfersize(1) uint64_t const * ptr, uint64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] 4563 #define vld1q_lane_u64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane); // _p; 4564 4565 int8x16_t vld1q_lane_s8(__transfersize(1) int8_t const * ptr, int8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] 4566 #define vld1q_lane_s8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) 4567 4568 int16x8_t vld1q_lane_s16(__transfersize(1) int16_t const * ptr, int16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 4569 #define vld1q_lane_s16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) 4570 4571 int32x4_t vld1q_lane_s32(__transfersize(1) int32_t const * ptr, int32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 4572 #define vld1q_lane_s32(ptr, vec, lane) _MM_INSERT_EPI32(vec, *(ptr), lane) 4573 4574 //current IA SIMD doesn't support float16 4575 4576 float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane); // VLD1.32 {d0[0]}, [r0] 4577 _NEON2SSE_INLINE float32x4_t vld1q_lane_f32(__transfersize(1) float32_t const * ptr, float32x4_t vec, __constrange(0,3) int lane) 4578 { //we need to deal with ptr 16bit NOT aligned case 4579 __m128 p; 4580 p = _mm_set1_ps(*(ptr)); 4581 return _MM_INSERT_PS(vec, p, _INSERTPS_NDX(0, lane)); 4582 } 4583 4584 int64x2_t vld1q_lane_s64(__transfersize(1) int64_t const * ptr, int64x2_t vec, __constrange(0,1) int lane); // VLD1.64 {d0}, [r0] 4585 #define vld1q_lane_s64(ptr, vec, lane) _MM_INSERT_EPI64(vec, *(ptr), lane) 4586 4587 poly8x16_t vld1q_lane_p8(__transfersize(1) poly8_t const * ptr, poly8x16_t vec, __constrange(0,15) int lane); // VLD1.8 {d0[0]}, [r0] 4588 #define vld1q_lane_p8(ptr, vec, lane) _MM_INSERT_EPI8(vec, *(ptr), lane) 4589 4590 poly16x8_t vld1q_lane_p16(__transfersize(1) poly16_t const * ptr, poly16x8_t vec, __constrange(0,7) int lane); // VLD1.16 {d0[0]}, [r0] 4591 #define vld1q_lane_p16(ptr, vec, lane) _MM_INSERT_EPI16(vec, *(ptr), lane) 4592 4593 //serial solution may be faster 4594 4595 //current IA SIMD doesn't support float16 4596 4597 // ****************** Load single value ( set all lanes of vector with same value from memory)********************** 4598 // ****************************************************************************************************************** 4599 uint8x16_t vld1q_dup_u8(__transfersize(1) uint8_t const * ptr); // VLD1.8 {d0[]}, [r0] 4600 
#define vld1q_dup_u8(ptr) _mm_set1_epi8(*(ptr))

uint16x8_t vld1q_dup_u16(__transfersize(1) uint16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_u16(ptr) _mm_set1_epi16(*(ptr))

uint32x4_t vld1q_dup_u32(__transfersize(1) uint32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_u32(ptr) _mm_set1_epi32(*(ptr))

uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr); // VLD1.64 {d0}, [r0]
_NEON2SSE_INLINE uint64x2_t vld1q_dup_u64(__transfersize(1) uint64_t const * ptr)
{
    _NEON2SSE_ALIGN_16 uint64_t val[2] = {*(ptr), *(ptr)};
    return LOAD_SI128(val);
}

int8x16_t vld1q_dup_s8(__transfersize(1) int8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_s8(ptr) _mm_set1_epi8(*(ptr))

int16x8_t vld1q_dup_s16(__transfersize(1) int16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_s16(ptr) _mm_set1_epi16 (*(ptr))

int32x4_t vld1q_dup_s32(__transfersize(1) int32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_s32(ptr) _mm_set1_epi32 (*(ptr))

int64x2_t vld1q_dup_s64(__transfersize(1) int64_t const * ptr); // VLD1.64 {d0}, [r0]
#define vld1q_dup_s64(ptr) vld1q_dup_u64((uint64_t*)ptr)

float16x8_t vld1q_dup_f16(__transfersize(1) __fp16 const * ptr); // VLD1.16 {d0[]}, [r0]
//current IA SIMD doesn't support float16, need to go to 32 bits

float32x4_t vld1q_dup_f32(__transfersize(1) float32_t const * ptr); // VLD1.32 {d0[]}, [r0]
#define vld1q_dup_f32(ptr) _mm_set1_ps (*(ptr))

poly8x16_t vld1q_dup_p8(__transfersize(1) poly8_t const * ptr); // VLD1.8 {d0[]}, [r0]
#define vld1q_dup_p8(ptr) _mm_set1_epi8(*(ptr))

poly16x8_t vld1q_dup_p16(__transfersize(1) poly16_t const * ptr); // VLD1.16 {d0[]}, [r0]
#define vld1q_dup_p16(ptr) _mm_set1_epi16 (*(ptr))

//current IA SIMD doesn't support float16

//*************************************************************************************
//********************************* Store **********************************************
//*************************************************************************************
// If ptr is 16-byte aligned and you need to store data without cache pollution then use void _mm_stream_si128 ((__m128i*)ptr, val);
//here we assume that a NOT 16-byte aligned ptr is possible. If it is aligned we could use _mm_store_si128 as shown in the following macro
#define STORE_SI128(ptr, val) \
        (((unsigned long)(ptr) & 15) == 0 ) ?
_mm_store_si128 ((__m128i*)(ptr), val) : _mm_storeu_si128 ((__m128i*)(ptr), val); 4648 4649 void vst1q_u8(__transfersize(16) uint8_t * ptr, uint8x16_t val); // VST1.8 {d0, d1}, [r0] 4650 #define vst1q_u8 STORE_SI128 4651 4652 void vst1q_u16(__transfersize(8) uint16_t * ptr, uint16x8_t val); // VST1.16 {d0, d1}, [r0] 4653 #define vst1q_u16 STORE_SI128 4654 4655 void vst1q_u32(__transfersize(4) uint32_t * ptr, uint32x4_t val); // VST1.32 {d0, d1}, [r0] 4656 #define vst1q_u32 STORE_SI128 4657 4658 void vst1q_u64(__transfersize(2) uint64_t * ptr, uint64x2_t val); // VST1.64 {d0, d1}, [r0] 4659 #define vst1q_u64 STORE_SI128 4660 4661 void vst1q_s8(__transfersize(16) int8_t * ptr, int8x16_t val); // VST1.8 {d0, d1}, [r0] 4662 #define vst1q_s8 STORE_SI128 4663 4664 void vst1q_s16(__transfersize(8) int16_t * ptr, int16x8_t val); // VST1.16 {d0, d1}, [r0] 4665 #define vst1q_s16 STORE_SI128 4666 4667 void vst1q_s32(__transfersize(4) int32_t * ptr, int32x4_t val); // VST1.32 {d0, d1}, [r0] 4668 #define vst1q_s32 STORE_SI128 4669 4670 void vst1q_s64(__transfersize(2) int64_t * ptr, int64x2_t val); // VST1.64 {d0, d1}, [r0] 4671 #define vst1q_s64 STORE_SI128 4672 4673 void vst1q_f16(__transfersize(8) __fp16 * ptr, float16x8_t val); // VST1.16 {d0, d1}, [r0] 4674 // IA32 SIMD doesn't work with 16bit floats currently 4675 4676 void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val); // VST1.32 {d0, d1}, [r0] 4677 _NEON2SSE_INLINE void vst1q_f32(__transfersize(4) float32_t * ptr, float32x4_t val) 4678 { 4679 if( ((unsigned long)(ptr) & 15) == 0 ) //16 bits aligned 4680 _mm_store_ps (ptr, val); 4681 else 4682 _mm_storeu_ps (ptr, val); 4683 } 4684 4685 void vst1q_p8(__transfersize(16) poly8_t * ptr, poly8x16_t val); // VST1.8 {d0, d1}, [r0] 4686 #define vst1q_p8 vst1q_u8 4687 4688 void vst1q_p16(__transfersize(8) poly16_t * ptr, poly16x8_t val); // VST1.16 {d0, d1}, [r0] 4689 #define vst1q_p16 vst1q_u16 4690 4691 //current IA SIMD doesn't support float16 4692 4693 //***********Store a lane of a vector into memory (extract given lane) ********************* 4694 //****************************************************************************************** 4695 void vst1q_lane_u8(__transfersize(1) uint8_t * ptr, uint8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] 4696 #define vst1q_lane_u8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) 4697 4698 void vst1q_lane_u16(__transfersize(1) uint16_t * ptr, uint16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 4699 #define vst1q_lane_u16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) 4700 4701 void vst1q_lane_u32(__transfersize(1) uint32_t * ptr, uint32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] 4702 #define vst1q_lane_u32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) 4703 4704 void vst1q_lane_u64(__transfersize(1) uint64_t * ptr, uint64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] 4705 #define vst1q_lane_u64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) 4706 4707 void vst1q_lane_s8(__transfersize(1) int8_t * ptr, int8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] 4708 #define vst1q_lane_s8(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI8 (val, lane) 4709 4710 void vst1q_lane_s16(__transfersize(1) int16_t * ptr, int16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 4711 #define vst1q_lane_s16(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI16 (val, lane) 4712 4713 void vst1q_lane_s32(__transfersize(1) int32_t * ptr, int32x4_t val, 
__constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] 4714 #define vst1q_lane_s32(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI32 (val, lane) 4715 4716 void vst1q_lane_s64(__transfersize(1) int64_t * ptr, int64x2_t val, __constrange(0,1) int lane); // VST1.64 {d0}, [r0] 4717 #define vst1q_lane_s64(ptr, val, lane) *(ptr) = _MM_EXTRACT_EPI64 (val, lane) 4718 4719 void vst1q_lane_f16(__transfersize(1) __fp16 * ptr, float16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 4720 //current IA SIMD doesn't support float16 4721 4722 void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane); // VST1.32 {d0[0]}, [r0] 4723 _NEON2SSE_INLINE void vst1q_lane_f32(__transfersize(1) float32_t * ptr, float32x4_t val, __constrange(0,3) int lane) 4724 { 4725 int32_t ilane; 4726 ilane = _MM_EXTRACT_PS(val,lane); 4727 *(ptr) = *((float*)&ilane); 4728 } 4729 4730 void vst1q_lane_p8(__transfersize(1) poly8_t * ptr, poly8x16_t val, __constrange(0,15) int lane); // VST1.8 {d0[0]}, [r0] 4731 #define vst1q_lane_p8 vst1q_lane_u8 4732 4733 void vst1q_lane_p16(__transfersize(1) poly16_t * ptr, poly16x8_t val, __constrange(0,7) int lane); // VST1.16 {d0[0]}, [r0] 4734 #define vst1q_lane_p16 vst1q_lane_s16 4735 4736 //current IA SIMD doesn't support float16 4737 4738 //*********************************************************************************************** 4739 //**************** Loads and stores of an N-element structure ********************************** 4740 //*********************************************************************************************** 4741 //These intrinsics load or store an n-element structure. The array structures are defined in the beginning 4742 //We assume ptr is NOT aligned in general case, for more details see "Loads and stores of a single vector functions" 4743 //****************** 2 elements load ********************************************* 4744 uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 4745 _NEON2SSE_INLINE uint8x16x2_t vld2q_u8(__transfersize(32) uint8_t const * ptr) // VLD2.8 {d0, d2}, [r0] 4746 { 4747 uint8x16x2_t v; 4748 v.val[0] = vld1q_u8(ptr); 4749 v.val[1] = vld1q_u8((ptr + 16)); 4750 v = vuzpq_s8(v.val[0], v.val[1]); 4751 return v; 4752 } 4753 4754 #if defined(USE_SSSE3) 4755 uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 4756 _NEON2SSE_INLINE uint16x8x2_t vld2q_u16(__transfersize(16) uint16_t const * ptr) // VLD2.16 {d0, d2}, [r0] 4757 { 4758 uint16x8x2_t v; 4759 v.val[0] = vld1q_u16( ptr); 4760 v.val[1] = vld1q_u16( (ptr + 8)); 4761 v = vuzpq_s16(v.val[0], v.val[1]); 4762 return v; 4763 } 4764 #endif 4765 4766 uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 4767 _NEON2SSE_INLINE uint32x4x2_t vld2q_u32(__transfersize(8) uint32_t const * ptr) // VLD2.32 {d0, d2}, [r0] 4768 { 4769 uint32x4x2_t v; 4770 v.val[0] = vld1q_u32 ( ptr); 4771 v.val[1] = vld1q_u32 ( (ptr + 4)); 4772 v = vuzpq_s32(v.val[0], v.val[1]); 4773 return v; 4774 } 4775 4776 int8x16x2_t vld2q_s8(__transfersize(32) int8_t const * ptr); 4777 #define vld2q_s8(ptr) vld2q_u8((uint8_t*) ptr) 4778 4779 #if defined(USE_SSSE3) 4780 int16x8x2_t vld2q_s16(__transfersize(16) int16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 4781 #define vld2q_s16(ptr) vld2q_u16((uint16_t*) ptr) 4782 #endif 4783 4784 int32x4x2_t vld2q_s32(__transfersize(8) int32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 4785 #define vld2q_s32(ptr) vld2q_u32((uint32_t*) ptr) 4786 
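//A usage sketch (added for illustration only; the helper name and buffer layout are our assumptions, not part of the NEON API):
//de-interleave 32 bytes of {x,y} pairs the way VLD2.8 does on ARM - val[0] receives x0..x15, val[1] receives y0..y15 -
//and then store each plane separately.
_NEON2SSE_INLINE void neon2sse_sketch_deinterleave_u8(uint8_t const * xy /*32 bytes*/, uint8_t * x16, uint8_t * y16)
{
    uint8x16x2_t planes = vld2q_u8(xy); //planes.val[0] = even-positioned bytes, planes.val[1] = odd-positioned bytes
    vst1q_u8(x16, planes.val[0]);
    vst1q_u8(y16, planes.val[1]);
}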
4787 float16x8x2_t vld2q_f16(__transfersize(16) __fp16 const * ptr); // VLD2.16 {d0, d2}, [r0] 4788 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 4789 4790 float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr); // VLD2.32 {d0, d2}, [r0] 4791 _NEON2SSE_INLINE float32x4x2_t vld2q_f32(__transfersize(8) float32_t const * ptr) // VLD2.32 {d0, d2}, [r0] 4792 { 4793 float32x4x2_t v; 4794 v.val[0] = vld1q_f32 (ptr); 4795 v.val[1] = vld1q_f32 ((ptr + 4)); 4796 v = vuzpq_f32(v.val[0], v.val[1]); 4797 return v; 4798 } 4799 4800 poly8x16x2_t vld2q_p8(__transfersize(32) poly8_t const * ptr); // VLD2.8 {d0, d2}, [r0] 4801 #define vld2q_p8 vld2q_u8 4802 4803 #if defined(USE_SSSE3) 4804 poly16x8x2_t vld2q_p16(__transfersize(16) poly16_t const * ptr); // VLD2.16 {d0, d2}, [r0] 4805 #define vld2q_p16 vld2q_u16 4806 #endif 4807 4808 #if defined(USE_SSSE3) 4809 uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 4810 _NEON2SSE_INLINE uint8x8x2_t vld2_u8(__transfersize(16) uint8_t const * ptr) 4811 { 4812 uint8x8x2_t v; 4813 _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; 4814 __m128i ld128; 4815 ld128 = vld1q_u8(ptr); //merge two 64-bits in 128 bit 4816 v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask8_even_odd); 4817 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 4818 return v; 4819 } 4820 #endif 4821 4822 #if defined(USE_SSSE3) 4823 uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 4824 _NEON2SSE_INLINE uint16x4x2_t vld2_u16(__transfersize(8) uint16_t const * ptr) 4825 { 4826 uint16x4x2_t v; 4827 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; 4828 __m128i ld128; 4829 ld128 = vld1q_u16(ptr); //merge two 64-bits in 128 bit 4830 v.val[0] = _mm_shuffle_epi8(ld128, *(__m128i*)mask16_even_odd); 4831 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 4832 return v; 4833 } 4834 #endif 4835 4836 uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 4837 _NEON2SSE_INLINE uint32x2x2_t vld2_u32(__transfersize(4) uint32_t const * ptr) 4838 { 4839 uint32x2x2_t v; 4840 __m128i ld128; 4841 ld128 = vld1q_u32(ptr); //merge two 64-bits in 128 bit 4842 v.val[0] = _mm_shuffle_epi32(ld128, 0 | (2 << 2) | (1 << 4) | (3 << 6)); 4843 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 4844 return v; 4845 } 4846 4847 uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 4848 _NEON2SSE_INLINE uint64x1x2_t vld2_u64(__transfersize(2) uint64_t const * ptr) 4849 { 4850 uint64x1x2_t v; 4851 v.val[0] = vld1q_u64(ptr); 4852 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 4853 return v; 4854 } 4855 4856 #if defined(USE_SSSE3) 4857 int8x8x2_t vld2_s8(__transfersize(16) int8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 4858 #define vld2_s8(ptr) vld2_u8((uint8_t*)ptr) 4859 4860 int16x4x2_t vld2_s16(__transfersize(8) int16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 4861 #define vld2_s16(ptr) vld2_u16((uint16_t*)ptr) 4862 #endif 4863 4864 int32x2x2_t vld2_s32(__transfersize(4) int32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 4865 #define vld2_s32(ptr) vld2_u32((uint32_t*)ptr) 4866 4867 int64x1x2_t vld2_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 4868 #define vld2_s64(ptr) vld2_u64((uint64_t*)ptr) 4869 4870 float16x4x2_t vld2_f16(__transfersize(8) 
__fp16 const * ptr); // VLD2.16 {d0, d1}, [r0] 4871 4872 float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr); // VLD2.32 {d0, d1}, [r0] 4873 _NEON2SSE_INLINE float32x2x2_t vld2_f32(__transfersize(4) float32_t const * ptr) 4874 { 4875 float32x2x2_t v; 4876 v.val[0] = vld1q_f32(ptr); 4877 v.val[0] = _mm_shuffle_ps(v.val[0], v.val[0], _MM_SHUFFLE(3,1, 2, 0)); 4878 v.val[1] = _mm_movehl_ps(v.val[0],v.val[0]); 4879 return v; 4880 } 4881 4882 #if defined(USE_SSSE3) 4883 poly8x8x2_t vld2_p8(__transfersize(16) poly8_t const * ptr); // VLD2.8 {d0, d1}, [r0] 4884 #define vld2_p8 vld2_u8 4885 4886 poly16x4x2_t vld2_p16(__transfersize(8) poly16_t const * ptr); // VLD2.16 {d0, d1}, [r0] 4887 #define vld2_p16 vld2_u16 4888 #endif 4889 4890 //******************** Triplets *************************************** 4891 //********************************************************************* 4892 #if defined(USE_SSSE3) 4893 uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 4894 _NEON2SSE_INLINE uint8x16x3_t vld3q_u8(__transfersize(48) uint8_t const * ptr) // VLD3.8 {d0, d2, d4}, [r0] 4895 { //a0,a1,a2,a3,...a7,a8,...a15, b0,b1,b2,...b7,b8,...b15, c0,c1,c2,...c7,c8,...c15 -> 4896 //a:0,3,6,9,12,15,b:2,5,8,11,14, c:1,4,7,10,13 4897 //a:1,4,7,10,13, b:0,3,6,9,12,15,c:2,5,8,11,14, 4898 //a:2,5,8,11,14, b:1,4,7,10,13, c:0,3,6,9,12,15 4899 uint8x16x3_t v; 4900 __m128i tmp0, tmp1,tmp2, tmp3; 4901 _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15,1,4,7,10,13,2,5,8,11,14}; 4902 _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13}; 4903 _NEON2SSE_ALIGN_16 int8_t mask8_2[16] = {1,4,7,10,13,2,5,8,11,14,0,3,6,9,12,15}; 4904 4905 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, ...a15 4906 v.val[1] = vld1q_u8 ((ptr + 16)); //b0,b1,b2,b3...b7, ...b15 4907 v.val[2] = vld1q_u8 ((ptr + 32)); //c0,c1,c2,c3,...c7,...c15 4908 4909 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a:0,3,6,9,12,15,1,4,7,10,13,2,5,8,11 4910 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask8_1); //b:2,5,8,11,14,0,3,6,9,12,15,1,4,7,10,13 4911 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_2); //c:1,4,7,10,13,2,5,8,11,14,3,6,9,12,15 4912 4913 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,0,0,0,0,0,a0,a3,a6,a9,a12,a15 4914 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a:0,3,6,9,12,15,b:2,5,8,11,14,x,x,x,x,x 4915 tmp3 = _mm_slli_si128(tmp3, 5); //0,0,0,0,0,a:0,3,6,9,12,15,b:2,5,8,11,14, 4916 tmp3 = _mm_srli_si128(tmp3, 5); //a:0,3,6,9,12,15,b:2,5,8,11,14,:0,0,0,0,0 4917 v.val[0] = _mm_slli_si128(tmp2, 11); //0,0,0,0,0,0,0,0,0,0,0,0, 1,4,7,10,13, 4918 v.val[0] = _mm_or_si128(v.val[0],tmp3) ;//a:0,3,6,9,12,15,b:2,5,8,11,14,c:1,4,7,10,13, 4919 4920 tmp3 = _mm_slli_si128(tmp0, 5);//0,0,0,0,0,a:0,3,6,9,12,15,1,4,7,10,13, 4921 tmp3 = _mm_srli_si128(tmp3, 11); //a:1,4,7,10,13, 0,0,0,0,0,0,0,0,0,0,0 4922 v.val[1] = _mm_srli_si128(tmp1,5); //b:0,3,6,9,12,15,C:1,4,7,10,13, 0,0,0,0,0 4923 v.val[1] = _mm_slli_si128(v.val[1], 5);//0,0,0,0,0,b:0,3,6,9,12,15,C:1,4,7,10,13, 4924 v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,C:1,4,7,10,13, 4925 v.val[1] = _mm_slli_si128(v.val[1],5);//0,0,0,0,0,a:1,4,7,10,13,b:0,3,6,9,12,15, 4926 v.val[1] = _mm_srli_si128(v.val[1], 5);//a:1,4,7,10,13,b:0,3,6,9,12,15,0,0,0,0,0 4927 tmp3 = _mm_srli_si128(tmp2,5); //c:2,5,8,11,14,0,3,6,9,12,15,0,0,0,0,0 4928 tmp3 = _mm_slli_si128(tmp3,11);//0,0,0,0,0,0,0,0,0,0,0,c:2,5,8,11,14, 4929 v.val[1] = _mm_or_si128(v.val[1],tmp3);//a:1,4,7,10,13,b:0,3,6,9,12,15,c:2,5,8,11,14, 4930 4931 
tmp3 = _mm_srli_si128(tmp2,10); //c:0,3,6,9,12,15, 0,0,0,0,0,0,0,0,0,0, 4932 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,0,0,0,0,0, c:0,3,6,9,12,15, 4933 v.val[2] = _mm_srli_si128(tmp1,11); //b:1,4,7,10,13,0,0,0,0,0,0,0,0,0,0,0 4934 v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,b:1,4,7,10,13, 0,0,0,0,0,0 4935 v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0,0,0,0,b:1,4,7,10,13,c:0,3,6,9,12,15, 4936 tmp0 = _mm_srli_si128(tmp0, 11); //a:2,5,8,11,14, 0,0,0,0,0,0,0,0,0,0,0, 4937 v.val[2] = _mm_or_si128(v.val[2],tmp0);//a:2,5,8,11,14,b:1,4,7,10,13,c:0,3,6,9,12,15, 4938 return v; 4939 } 4940 #endif 4941 4942 #if defined(USE_SSSE3) 4943 uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 4944 _NEON2SSE_INLINE uint16x8x3_t vld3q_u16(__transfersize(24) uint16_t const * ptr) // VLD3.16 {d0, d2, d4}, [r0] 4945 { //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 4946 uint16x8x3_t v; 4947 __m128i tmp0, tmp1,tmp2, tmp3; 4948 _NEON2SSE_ALIGN_16 int8_t mask16_0[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; 4949 _NEON2SSE_ALIGN_16 int8_t mask16_1[16] = {2,3, 8,9, 14,15, 4,5, 10,11, 0,1, 6,7, 12,13}; 4950 _NEON2SSE_ALIGN_16 int8_t mask16_2[16] = {4,5, 10,11, 0,1, 6,7, 12,13, 2,3, 8,9, 14,15}; 4951 4952 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3,...a7, 4953 v.val[1] = vld1q_u16 ((ptr + 8)); //b0,b1,b2,b3...b7 4954 v.val[2] = vld1q_u16 ((ptr + 16)); //c0,c1,c2,c3,...c7 4955 4956 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16_0); //a0,a3,a6,a1,a4,a7,a2,a5, 4957 tmp1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask16_1); //b1,b4,b7,b2,b5,b0,b3,b6 4958 tmp2 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask16_2); //c2,c5, c0,c3,c6, c1,c4,c7 4959 4960 tmp3 = _mm_slli_si128(tmp0,10); //0,0,0,0,0,a0,a3,a6, 4961 tmp3 = _mm_alignr_epi8(tmp1,tmp3, 10); //a0,a3,a6,b1,b4,b7,x,x 4962 tmp3 = _mm_slli_si128(tmp3, 4); //0,0, a0,a3,a6,b1,b4,b7 4963 tmp3 = _mm_srli_si128(tmp3, 4); //a0,a3,a6,b1,b4,b7,0,0 4964 v.val[0] = _mm_slli_si128(tmp2, 12); //0,0,0,0,0,0, c2,c5, 4965 v.val[0] = _mm_or_si128(v.val[0],tmp3);//a0,a3,a6,b1,b4,b7,c2,c5 4966 4967 tmp3 = _mm_slli_si128(tmp0, 4);//0,0,a0,a3,a6,a1,a4,a7 4968 tmp3 = _mm_srli_si128(tmp3,10); //a1,a4,a7, 0,0,0,0,0 4969 v.val[1] = _mm_srli_si128(tmp1,6); //b2,b5,b0,b3,b6,0,0 4970 v.val[1] = _mm_slli_si128(v.val[1], 6); //0,0,0,b2,b5,b0,b3,b6, 4971 v.val[1] = _mm_or_si128(v.val[1],tmp3);//a1,a4,a7,b2,b5,b0,b3,b6, 4972 v.val[1] = _mm_slli_si128(v.val[1],6);//0,0,0,a1,a4,a7,b2,b5, 4973 v.val[1] = _mm_srli_si128(v.val[1], 6);//a1,a4,a7,b2,b5,0,0,0, 4974 tmp3 = _mm_srli_si128(tmp2,4); //c0,c3,c6, c1,c4,c7,0,0 4975 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0,c0,c3,c6, 4976 v.val[1] = _mm_or_si128(v.val[1],tmp3); //a1,a4,a7,b2,b5,c0,c3,c6, 4977 4978 tmp3 = _mm_srli_si128(tmp2,10); //c1,c4,c7, 0,0,0,0,0 4979 tmp3 = _mm_slli_si128(tmp3,10); //0,0,0,0,0, c1,c4,c7, 4980 v.val[2] = _mm_srli_si128(tmp1,10); //b0,b3,b6,0,0, 0,0,0 4981 v.val[2] = _mm_slli_si128(v.val[2],4);//0,0, b0,b3,b6,0,0,0 4982 v.val[2] = _mm_or_si128(v.val[2],tmp3);//0,0, b0,b3,b6,c1,c4,c7, 4983 tmp0 = _mm_srli_si128(tmp0, 12); //a2,a5,0,0,0,0,0,0 4984 v.val[2] = _mm_or_si128(v.val[2],tmp0);//a2,a5,b0,b3,b6,c1,c4,c7, 4985 return v; 4986 } 4987 #endif 4988 4989 uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 4990 _NEON2SSE_INLINE uint32x4x3_t vld3q_u32(__transfersize(12) uint32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] 4991 {//a0,a1,a2,a3, 
b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, 4992 uint32x4x3_t v; 4993 __m128i tmp0, tmp1,tmp2, tmp3; 4994 v.val[0] = vld1q_u32 (ptr); //a0,a1,a2,a3, 4995 v.val[1] = vld1q_u32 ((ptr + 4)); //b0,b1,b2,b3 4996 v.val[2] = vld1q_u32 ((ptr + 8)); //c0,c1,c2,c3, 4997 4998 tmp0 = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,a3,a1,a2 4999 tmp1 = _mm_shuffle_epi32(v.val[1], _SWAP_HI_LOW32); //b2,b3,b0,b1 5000 tmp2 = _mm_shuffle_epi32(v.val[2], 1 | (2 << 2) | (0 << 4) | (3 << 6)); //c1,c2, c0,c3 5001 5002 tmp3 = _mm_unpacklo_epi32(tmp1, tmp2); //b2,c1, b3,c2 5003 v.val[0] = _mm_unpacklo_epi64(tmp0,tmp3); //a0,a3,b2,c1 5004 tmp0 = _mm_unpackhi_epi32(tmp0, tmp1); //a1,b0, a2,b1 5005 v.val[1] = _mm_shuffle_epi32(tmp0, _SWAP_HI_LOW32 ); //a2,b1, a1,b0, 5006 v.val[1] = _mm_unpackhi_epi64(v.val[1], tmp3); //a1,b0, b3,c2 5007 v.val[2] = _mm_unpackhi_epi64(tmp0, tmp2); //a2,b1, c0,c3 5008 return v; 5009 } 5010 5011 #if defined(USE_SSSE3) 5012 int8x16x3_t vld3q_s8(__transfersize(48) int8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 5013 #define vld3q_s8(ptr) vld3q_u8((uint8_t*) (ptr)) 5014 5015 int16x8x3_t vld3q_s16(__transfersize(24) int16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 5016 #define vld3q_s16(ptr) vld3q_u16((uint16_t*) (ptr)) 5017 #endif 5018 5019 int32x4x3_t vld3q_s32(__transfersize(12) int32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 5020 #define vld3q_s32(ptr) vld3q_u32((uint32_t*) (ptr)) 5021 5022 float16x8x3_t vld3q_f16(__transfersize(24) __fp16 const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 5023 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 5024 5025 float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr); // VLD3.32 {d0, d2, d4}, [r0] 5026 _NEON2SSE_INLINE float32x4x3_t vld3q_f32(__transfersize(12) float32_t const * ptr) // VLD3.32 {d0, d2, d4}, [r0] 5027 { //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, 5028 float32x4x3_t v; 5029 __m128 tmp0, tmp1,tmp2, tmp3; 5030 v.val[0] = vld1q_f32 (ptr); //a0,a1,a2,a3, 5031 v.val[1] = vld1q_f32 ((ptr + 4)); //b0,b1,b2,b3 5032 v.val[2] = vld1q_f32 ((ptr + 8)); //c0,c1,c2,c3, 5033 5034 tmp0 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[0]), 0 | (3 << 2) | (1 << 4) | (2 << 6))); //a0,a3,a1,a2 5035 tmp1 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[1]), _SWAP_HI_LOW32)); //b2,b3,b0,b1 5036 tmp2 = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(v.val[2]), 1 | (2 << 2) | (0 << 4) | (3 << 6))); //c1,c2, c0,c3 5037 tmp3 = _mm_unpacklo_ps(tmp1, tmp2); //b2,c1, b3,c2 5038 5039 v.val[0] = _mm_movelh_ps(tmp0,tmp3); //a0,a3,b2,c1 5040 tmp0 = _mm_unpackhi_ps(tmp0, tmp1); //a1,b0, a2,b1 5041 v.val[1] = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(tmp0), _SWAP_HI_LOW32 )); //a2,b1, a1,b0, 5042 v.val[1] = _mm_movehl_ps(tmp3,v.val[1]); //a1,b0, b3,c2 5043 v.val[2] = _mm_movehl_ps(tmp2,tmp0); //a2,b1, c0,c3 5044 return v; 5045 } 5046 5047 #if defined(USE_SSSE3) 5048 poly8x16x3_t vld3q_p8(__transfersize(48) poly8_t const * ptr); // VLD3.8 {d0, d2, d4}, [r0] 5049 #define vld3q_p8 vld3q_u8 5050 5051 poly16x8x3_t vld3q_p16(__transfersize(24) poly16_t const * ptr); // VLD3.16 {d0, d2, d4}, [r0] 5052 #define vld3q_p16 vld3q_u16 5053 #endif 5054 5055 #if defined(USE_SSSE3) 5056 uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 5057 _NEON2SSE_INLINE uint8x8x3_t vld3_u8(__transfersize(24) uint8_t const * ptr) 
// VLD3.8 {d0, d1, d2}, [r0] 5058 { //a0, a1,a2,a3,...a7, b0,b1,b2,b3,...b7, c0,c1,c2,c3...c7 -> a0,a3,a6,b1,b4,b7,c2,c5, a1,a4,a7,b2,b5,c0,c3,c6, a2,a5,b0,b3,b6,c1,c4,c7 5059 uint8x8x3_t v; 5060 __m128i tmp0, tmp1; 5061 _NEON2SSE_ALIGN_16 int8_t mask8_0[16] = {0,3,6,9,12,15, 1,4,7,10,13, 2,5,8,11,14}; 5062 _NEON2SSE_ALIGN_16 int8_t mask8_1[16] = {2,5, 0,3,6, 1,4,7, 0,0,0,0,0,0,0,0}; 5063 v.val[0] = vld1q_u8 (ptr); //a0,a1,a2,a3,...a7, b0,b1,b2,b3...b7 5064 5065 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask8_0); //a0,a3,a6,b1,b4,b7, a1,a4,a7,b2,b5, a2,a5,b0,b3,b6, 5066 tmp1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask8_1); //c2,c5, c0,c3,c6, c1,c4,c7,x,x,x,x,x,x,x,x 5067 v.val[0] = _mm_slli_si128(tmp0,10); 5068 v.val[0] = _mm_srli_si128(v.val[0],10); //a0,a3,a6,b1,b4,b7, 0,0,0,0,0,0,0,0,0,0 5069 v.val[2] = _mm_slli_si128(tmp1,6);//0,0,0,0,0,0,c2,c5,x,x,x,x,x,x,x,x 5070 v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0,a3,a6,b1,b4,b7,c2,c5 x,x,x,x,x,x,x,x 5071 5072 v.val[1] = _mm_slli_si128(tmp0,5); //0,0,0,0,0,0,0,0,0,0,0, a1,a4,a7,b2,b5, 5073 v.val[1] = _mm_srli_si128(v.val[1],11); //a1,a4,a7,b2,b5,0,0,0,0,0,0,0,0,0,0,0, 5074 v.val[2] = _mm_srli_si128(tmp1,2); //c0,c3,c6,c1,c4,c7,x,x,x,x,x,x,x,x,0,0 5075 v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c0,c3,c6,0,0,0,0,0,0,0,0 5076 v.val[1] = _mm_or_si128(v.val[1],v.val[2]) ;//a1,a4,a7,b2,b5,c0,c3,c6,x,x,x,x,x,x,x,x 5077 5078 tmp0 = _mm_srli_si128(tmp0,11); //a2,a5,b0,b3,b6,0,0,0,0,0,0,0,0,0,0,0, 5079 v.val[2] = _mm_srli_si128(tmp1,5); //c1,c4,c7,0,0,0,0,0,0,0,0,0,0,0,0,0 5080 v.val[2] = _mm_slli_si128(v.val[2],5);//0,0,0,0,0,c1,c4,c7, 5081 v.val[2] = _mm_or_si128(tmp0, v.val[2]) ;//a2,a5,b0,b3,b6,c1,c4,c7,x,x,x,x,x,x,x,x 5082 return v; 5083 } 5084 #endif 5085 5086 #if defined(USE_SSSE3) 5087 uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 5088 _NEON2SSE_INLINE uint16x4x3_t vld3_u16(__transfersize(12) uint16_t const * ptr) // VLD3.16 {d0, d1, d2}, [r0] 5089 { //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,a3,b2,c1, a1,b0,b3,c2, a2,b1,c0,c3, 5090 uint16x4x3_t v; 5091 __m128i tmp0, tmp1; 5092 _NEON2SSE_ALIGN_16 int8_t mask16[16] = {0,1, 6,7, 12,13, 2,3, 8,9, 14,15, 4,5, 10,11}; 5093 v.val[0] = vld1q_u16 (ptr); //a0,a1,a2,a3, b0,b1,b2,b3 5094 5095 tmp0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask16); //a0, a3, b2,a1, b0, b3, a2, b1 5096 tmp1 = _mm_shufflelo_epi16(v.val[2], 201); //11 00 10 01 : c1, c2, c0, c3, 5097 v.val[0] = _mm_slli_si128(tmp0,10); 5098 v.val[0] = _mm_srli_si128(v.val[0],10); //a0, a3, b2, 0,0, 0,0, 5099 v.val[2] = _mm_slli_si128(tmp1,14);//0,0,0,0,0,0,0,c1 5100 v.val[2] = _mm_srli_si128(v.val[2],8);//0,0,0,c1,0,0,0,0 5101 v.val[0] = _mm_or_si128(v.val[0],v.val[2]) ;//a0, a3, b2, c1, x,x,x,x 5102 5103 v.val[1] = _mm_slli_si128(tmp0,4); //0,0,0,0,0,a1, b0, b3 5104 v.val[1] = _mm_srli_si128(v.val[1],10); //a1, b0, b3, 0,0, 0,0, 5105 v.val[2] = _mm_srli_si128(tmp1,2);//c2, 0,0,0,0,0,0,0, 5106 v.val[2] = _mm_slli_si128(v.val[2],6);//0,0,0,c2,0,0,0,0 5107 v.val[1] = _mm_or_si128(v.val[1],v.val[2]); //a1, b0, b3, c2, x,x,x,x 5108 5109 tmp0 = _mm_srli_si128(tmp0,12); //a2, b1,0,0,0,0,0,0 5110 tmp1 = _mm_srli_si128(tmp1,4); 5111 tmp1 = _mm_slli_si128(tmp1,4); //0,0,c0, c3, 5112 v.val[2] = _mm_or_si128(tmp0, tmp1); //a2, b1, c0, c3, 5113 return v; 5114 } 5115 #endif 5116 5117 uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 5118 _NEON2SSE_INLINE uint32x2x3_t vld3_u32(__transfersize(6) uint32_t const * ptr) // VLD3.32 {d0, d1, d2}, [r0] 5119 
{ //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 5120 uint32x2x3_t v; 5121 v.val[0] = vld1q_u32 (ptr); //a0,a1, b0,b1, 5122 5123 v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (3 << 2) | (1 << 4) | (2 << 6)); //a0,b1, a1, b0 5124 v.val[2] = _mm_slli_si128(v.val[2], 8); //x, x,c0,c1, 5125 v.val[1] = _mm_unpackhi_epi32(v.val[0],v.val[2]); //a1,c0, b0, c1 5126 v.val[2] = _mm_srli_si128(v.val[1], 8); //b0, c1, x, x, 5127 return v; 5128 } 5129 uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 5130 _NEON2SSE_INLINE uint64x1x3_t vld3_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] 5131 { 5132 uint64x1x3_t v; 5133 v.val[0] = vld1q_u64 (ptr); 5134 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 5135 return v; 5136 } 5137 5138 #if defined(USE_SSSE3) 5139 int8x8x3_t vld3_s8(__transfersize(24) int8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 5140 #define vld3_s8(ptr) vld3_u8((uint8_t*)ptr) 5141 5142 int16x4x3_t vld3_s16(__transfersize(12) int16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 5143 #define vld3_s16(ptr) vld3_u16((uint16_t*)ptr) 5144 #endif 5145 5146 int32x2x3_t vld3_s32(__transfersize(6) int32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 5147 #define vld3_s32(ptr) vld3_u32((uint32_t*)ptr) 5148 5149 int64x1x3_t vld3_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 5150 #define vld3_s64(ptr) vld3_u64((uint64_t*)ptr) 5151 5152 float16x4x3_t vld3_f16(__transfersize(12) __fp16 const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 5153 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 5154 5155 float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr); // VLD3.32 {d0, d1, d2}, [r0] 5156 _NEON2SSE_INLINE float32x2x3_t vld3_f32(__transfersize(6) float32_t const * ptr) 5157 { //a0,a1, b0,b1, c0,c1, -> a0,b1, a1,c0, b0,c1 5158 float32x2x3_t v; 5159 v.val[0] = vld1q_f32 (ptr); //a0,a1, b0,b1, 5160 5161 v.val[0] = _mm_shuffle_ps(v.val[0],v.val[0], _MM_SHUFFLE(2,1, 3, 0)); //a0,b1, a1, b0 5162 v.val[2] = _mm_movelh_ps(v.val[2], v.val[2]); //x, x,c0,c1, 5163 v.val[1] = _mm_unpackhi_ps(v.val[0],v.val[2]); //a1,c0, b0, c1 5164 v.val[2] = _mm_movehl_ps(v.val[1], v.val[1]); //b0, c1, x, x, 5165 return v; 5166 } 5167 5168 #if defined(USE_SSSE3) 5169 poly8x8x3_t vld3_p8(__transfersize(24) poly8_t const * ptr); // VLD3.8 {d0, d1, d2}, [r0] 5170 #define vld3_p8 vld3_u8 5171 5172 poly16x4x3_t vld3_p16(__transfersize(12) poly16_t const * ptr); // VLD3.16 {d0, d1, d2}, [r0] 5173 #define vld3_p16 vld3_u16 5174 #endif 5175 5176 //*************** Quadruples load ******************************** 5177 //***************************************************************** 5178 uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 5179 _NEON2SSE_INLINE uint8x16x4_t vld4q_u8(__transfersize(64) uint8_t const * ptr) // VLD4.8 {d0, d2, d4, d6}, [r0] 5180 { 5181 uint8x16x4_t v; 5182 __m128i tmp3, tmp2, tmp1, tmp0; 5183 5184 v.val[0] = vld1q_u8 ( ptr); //a0,a1,a2,...a7, ...a15 5185 v.val[1] = vld1q_u8 ( (ptr + 16));//b0, b1,b2,...b7.... b15 5186 v.val[2] = vld1q_u8 ( (ptr + 32));//c0, c1,c2,...c7....c15 5187 v.val[3] = vld1q_u8 ( (ptr + 48)); //d0,d1,d2,...d7....d15 5188 5189 tmp0= _mm_unpacklo_epi8(v.val[0],v.val[1]); //a0,b0, a1,b1, a2,b2, a3,b3,....a7,b7 5190 tmp1= _mm_unpacklo_epi8(v.val[2],v.val[3]); //c0,d0, c1,d1, c2,d2, c3,d3,... 
c7,d7 5191 tmp2= _mm_unpackhi_epi8(v.val[0],v.val[1]);//a8,b8, a9,b9, a10,b10, a11,b11,...a15,b15 5192 tmp3= _mm_unpackhi_epi8(v.val[2],v.val[3]);//c8,d8, c9,d9, c10,d10, c11,d11,...c15,d15 5193 5194 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a8, b0,b8, a1,a9, b1,b9, ....a3,a11, b3,b11 5195 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a4,a12, b4,b12, a5,a13, b5,b13,....a7,a15,b7,b15 5196 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //c0,c8, d0,d8, c1,c9, d1,d9.....d3,d11 5197 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //c4,c12,d4,d12, c5,c13, d5,d13,....d7,d15 5198 5199 tmp0 = _mm_unpacklo_epi32(v.val[0] , v.val[2] ); ///a0,a8, b0,b8, c0,c8, d0,d8, a1,a9, b1,b9, c1,c9, d1,d9 5200 tmp1 = _mm_unpackhi_epi32(v.val[0] , v.val[2] ); //a2,a10, b2,b10, c2,c10, d2,d10, a3,a11, b3,b11, c3,c11, d3,d11 5201 tmp2 = _mm_unpacklo_epi32(v.val[1] , v.val[3] ); //a4,a12, b4,b12, c4,c12, d4,d12, a5,a13, b5,b13, c5,c13, d5,d13, 5202 tmp3 = _mm_unpackhi_epi32(v.val[1] , v.val[3] ); //a6,a14, b6,b14, c6,c14, d6,d14, a7,a15,b7,b15,c7,c15,d7,d15 5203 5204 v.val[0] = _mm_unpacklo_epi8(tmp0, tmp2); //a0,a4,a8,a12,b0,b4,b8,b12,c0,c4,c8,c12,d0,d4,d8,d12 5205 v.val[1] = _mm_unpackhi_epi8(tmp0, tmp2); //a1,a5, a9, a13, b1,b5, b9,b13, c1,c5, c9, c13, d1,d5, d9,d13 5206 v.val[2] = _mm_unpacklo_epi8(tmp1, tmp3); //a2,a6, a10,a14, b2,b6, b10,b14,c2,c6, c10,c14, d2,d6, d10,d14 5207 v.val[3] = _mm_unpackhi_epi8(tmp1, tmp3); //a3,a7, a11,a15, b3,b7, b11,b15,c3,c7, c11, c15,d3,d7, d11,d15 5208 return v; 5209 } 5210 5211 uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 5212 _NEON2SSE_INLINE uint16x8x4_t vld4q_u16(__transfersize(32) uint16_t const * ptr) // VLD4.16 {d0, d2, d4, d6}, [r0] 5213 { 5214 uint16x8x4_t v; 5215 __m128i tmp3, tmp2, tmp1, tmp0; 5216 tmp0 = vld1q_u16 (ptr); //a0,a1,a2,...a7 5217 tmp1 = vld1q_u16 ((ptr + 8)); //b0, b1,b2,...b7 5218 tmp2 = vld1q_u16 ((ptr + 16)); //c0, c1,c2,...c7 5219 tmp3 = vld1q_u16 ((ptr + 24)); //d0,d1,d2,...d7 5220 v.val[0]= _mm_unpacklo_epi16(tmp0,tmp1); //a0,b0, a1,b1, a2,b2, a3,b3, 5221 v.val[1]= _mm_unpacklo_epi16(tmp2,tmp3); //c0,d0, c1,d1, c2,d2, c3,d3, 5222 v.val[2]= _mm_unpackhi_epi16(tmp0,tmp1);//a4,b4, a5,b5, a6,b6, a7,b7 5223 v.val[3]= _mm_unpackhi_epi16(tmp2,tmp3);//c4,d4, c5,d5, c6,d6, c7,d7 5224 tmp0 = _mm_unpacklo_epi16(v.val[0], v.val[2]);//a0,a4, b0,b4, a1,a5, b1,b5 5225 tmp1 = _mm_unpackhi_epi16(v.val[0], v.val[2]); //a2,a6, b2,b6, a3,a7, b3,b7 5226 tmp2 = _mm_unpacklo_epi16(v.val[1], v.val[3]); //c0,c4, d0,d4, c1,c5, d1,d5 5227 tmp3 = _mm_unpackhi_epi16(v.val[1], v.val[3]);//c2,c6, d2,d6, c3,c7, d3,d7 5228 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp2); //a0,a4, b0,b4, c0,c4, d0,d4, 5229 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp2); //a1,a5, b1,b5, c1,c5, d1,d5 5230 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp3); //a2,a6, b2,b6, c2,c6, d2,d6, 5231 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp3); //a3,a7, b3,b7, c3,c7, d3,d7 5232 return v; 5233 } 5234 5235 uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 5236 _NEON2SSE_INLINE uint32x4x4_t vld4q_u32(__transfersize(16) uint32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] 5237 { 5238 uint32x4x4_t v; 5239 __m128i tmp3, tmp2, tmp1, tmp0; 5240 v.val[0] = vld1q_u32 (ptr); 5241 v.val[1] = vld1q_u32 ((ptr + 4)); 5242 v.val[2] = vld1q_u32 ((ptr + 8)); 5243 v.val[3] = vld1q_u32 ((ptr + 12)); 5244 tmp0 = _mm_unpacklo_epi32(v.val[0],v.val[1]); 5245 tmp1 = _mm_unpacklo_epi32(v.val[2],v.val[3]); 5246 tmp2 = _mm_unpackhi_epi32(v.val[0],v.val[1]); 5247 
tmp3 = _mm_unpackhi_epi32(v.val[2],v.val[3]); 5248 v.val[0] = _mm_unpacklo_epi64(tmp0, tmp1); 5249 v.val[1] = _mm_unpackhi_epi64(tmp0, tmp1); 5250 v.val[2] = _mm_unpacklo_epi64(tmp2, tmp3); 5251 v.val[3] = _mm_unpackhi_epi64(tmp2, tmp3); 5252 return v; 5253 } 5254 5255 int8x16x4_t vld4q_s8(__transfersize(64) int8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 5256 #define vld4q_s8(ptr) vld4q_u8((uint8_t*)ptr) 5257 5258 int16x8x4_t vld4q_s16(__transfersize(32) int16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 5259 #define vld4q_s16(ptr) vld4q_u16((uint16_t*)ptr) 5260 5261 int32x4x4_t vld4q_s32(__transfersize(16) int32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 5262 #define vld4q_s32(ptr) vld4q_u32((uint32_t*)ptr) 5263 5264 float16x8x4_t vld4q_f16(__transfersize(32) __fp16 const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 5265 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 5266 5267 float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr); // VLD4.32 {d0, d2, d4, d6}, [r0] 5268 _NEON2SSE_INLINE float32x4x4_t vld4q_f32(__transfersize(16) float32_t const * ptr) // VLD4.32 {d0, d2, d4, d6}, [r0] 5269 { 5270 float32x4x4_t v; 5271 __m128 tmp3, tmp2, tmp1, tmp0; 5272 5273 v.val[0] = vld1q_f32 ((float*) ptr); 5274 v.val[1] = vld1q_f32 ((float*) (ptr + 4)); 5275 v.val[2] = vld1q_f32 ((float*) (ptr + 8)); 5276 v.val[3] = vld1q_f32 ((float*) (ptr + 12)); 5277 tmp0 = _mm_unpacklo_ps(v.val[0], v.val[1]); 5278 tmp2 = _mm_unpacklo_ps(v.val[2], v.val[3]); 5279 tmp1 = _mm_unpackhi_ps(v.val[0], v.val[1]); 5280 tmp3 = _mm_unpackhi_ps(v.val[2], v.val[3]); 5281 v.val[0] = _mm_movelh_ps(tmp0, tmp2); 5282 v.val[1] = _mm_movehl_ps(tmp2, tmp0); 5283 v.val[2] = _mm_movelh_ps(tmp1, tmp3); 5284 v.val[3] = _mm_movehl_ps(tmp3, tmp1); 5285 return v; 5286 } 5287 5288 poly8x16x4_t vld4q_p8(__transfersize(64) poly8_t const * ptr); // VLD4.8 {d0, d2, d4, d6}, [r0] 5289 #define vld4q_p8 vld4q_u8 5290 5291 poly16x8x4_t vld4q_p16(__transfersize(32) poly16_t const * ptr); // VLD4.16 {d0, d2, d4, d6}, [r0] 5292 #define vld4q_p16 vld4q_s16 5293 5294 #if defined(USE_SSSE3) 5295 uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 5296 _NEON2SSE_INLINE uint8x8x4_t vld4_u8(__transfersize(32) uint8_t const * ptr) // VLD4.8 {d0, d1, d2, d3}, [r0] 5297 { 5298 uint8x8x4_t v; 5299 __m128i sh0, sh1; 5300 _NEON2SSE_ALIGN_16 int8_t mask4_8[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; 5301 5302 v.val[0] = vld1q_u8(( ptr)); //load first 64-bits in val[0] and val[1] 5303 v.val[1] = vld1q_u8(( ptr + 16)); //load third and forth 64-bits in val[2], val[3] 5304 5305 sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_8); 5306 sh1 = _mm_shuffle_epi8(v.val[1], *(__m128i*)mask4_8); 5307 v.val[0] = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12,16,20,24,28, 1,5,9,13,17,21,25,29 5308 v.val[2] = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14,18,22,26,30, 3,7,11,15,19,23,27,31 5309 v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32); 5310 v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32); 5311 5312 return v; 5313 } 5314 #endif 5315 5316 #if defined(USE_SSSE3) 5317 uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 5318 _NEON2SSE_INLINE uint16x4x4_t vld4_u16(__transfersize(16) uint16_t const * ptr) // VLD4.16 {d0, d1, d2, d3}, [r0] 5319 { 5320 uint16x4x4_t v; 5321 __m128i sh0, sh1; 5322 _NEON2SSE_ALIGN_16 int8_t mask4_16[16] = {0,1, 8,9, 2,3, 
10,11, 4,5, 12,13, 6,7, 14,15}; //0, 4, 1, 5, 2, 6, 3, 7 5323 v.val[0] = vld1q_u16 ( (ptr)); //load first 64-bits in val[0] and val[1] 5324 v.val[2] = vld1q_u16 ( (ptr + 8)); //load third and forth 64-bits in val[2], val[3] 5325 sh0 = _mm_shuffle_epi8(v.val[0], *(__m128i*)mask4_16); 5326 sh1 = _mm_shuffle_epi8(v.val[2], *(__m128i*)mask4_16); 5327 v.val[0] = _mm_unpacklo_epi32(sh0,sh1); //0,4,8,12, 1,5,9,13 5328 v.val[2] = _mm_unpackhi_epi32(sh0,sh1); //2,6,10,14, 3,7,11,15 5329 v.val[1] = _mm_shuffle_epi32(v.val[0],_SWAP_HI_LOW32); 5330 v.val[3] = _mm_shuffle_epi32(v.val[2],_SWAP_HI_LOW32); 5331 return v; 5332 } 5333 #endif 5334 5335 uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 5336 _NEON2SSE_INLINE uint32x2x4_t vld4_u32(__transfersize(8) uint32_t const * ptr) 5337 { //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 5338 uint32x4x4_t v, res; 5339 v.val[0] = vld1q_u32 (ptr); //a0,a1, b0,b1, 5340 v.val[2] = vld1q_u32 ((ptr + 4)); //c0,c1, d0,d1 5341 res.val[0] = _mm_unpacklo_epi32(v.val[0],v.val[2]); //a0, c0, a1,c1, 5342 res.val[2] = _mm_unpackhi_epi32(v.val[0],v.val[2]); //b0,d0, b1, d1 5343 res.val[1] = _mm_shuffle_epi32(res.val[0],_SWAP_HI_LOW32); //a1,c1, a0, c0, 5344 res.val[3] = _mm_shuffle_epi32(res.val[2],_SWAP_HI_LOW32);//b1, d1,b0,d0, 5345 return res; 5346 } 5347 5348 uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 5349 _NEON2SSE_INLINE uint64x1x4_t vld4_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] 5350 { 5351 uint64x1x4_t v; 5352 v.val[0] = vld1q_u64( (ptr)); //load first 64-bits in val[0] and val[1] 5353 v.val[2] = vld1q_u64( (ptr + 2)); //load third and forth 64-bits in val[2], val[3] 5354 return v; 5355 } 5356 5357 #if defined(USE_SSSE3) 5358 int8x8x4_t vld4_s8(__transfersize(32) int8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 5359 #define vld4_s8(ptr) vld4_u8((uint8_t*)ptr) 5360 5361 int16x4x4_t vld4_s16(__transfersize(16) int16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 5362 #define vld4_s16(ptr) vld4_u16((uint16_t*)ptr) 5363 #endif 5364 5365 int32x2x4_t vld4_s32(__transfersize(8) int32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 5366 #define vld4_s32(ptr) vld4_u32((uint32_t*)ptr) 5367 5368 int64x1x4_t vld4_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 5369 #define vld4_s64(ptr) vld4_u64((uint64_t*)ptr) 5370 5371 float16x4x4_t vld4_f16(__transfersize(16) __fp16 const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 5372 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 5373 5374 float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr); // VLD4.32 {d0, d1, d2, d3}, [r0] 5375 _NEON2SSE_INLINE float32x2x4_t vld4_f32(__transfersize(8) float32_t const * ptr) // VLD4.32 {d0, d1, d2, d3}, [r0] 5376 { //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 5377 float32x2x4_t v, res; 5378 v.val[0] = vld1q_f32 ((float*) ptr); //a0,a1, b0,b1, 5379 v.val[2] = vld1q_f32 ((float*) (ptr + 4)); //c0,c1, d0,d1 5380 res.val[0] = _mm_unpacklo_ps(v.val[0],v.val[2]); //a0, c0, a1,c1, 5381 res.val[2] = _mm_unpackhi_ps(v.val[0],v.val[2]); //b0,d0, b1, d1 5382 res.val[1] = _mm_movehl_ps(res.val[0],res.val[0]); // a1,c1, a0, c0, 5383 res.val[3] = _mm_movehl_ps(res.val[2],res.val[2]); // b1, d1, b0,d0, 5384 return res; 5385 } 5386 5387 #if defined(USE_SSSE3) 5388 poly8x8x4_t vld4_p8(__transfersize(32) poly8_t const * ptr); // VLD4.8 {d0, d1, d2, d3}, [r0] 5389 #define vld4_p8 vld4_u8 5390 5391 poly16x4x4_t vld4_p16(__transfersize(16) poly16_t const * ptr); // VLD4.16 {d0, d1, d2, d3}, [r0] 5392 #define vld4_p16 vld4_u16 5393 #endif 5394 5395 //************* Duplicate (or propagate) ptr[0] to all val[0] lanes and ptr[1] to all val[1] lanes ******************* 5396 //******************************************************************************************************************* 5397 uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 5398 _NEON2SSE_INLINE uint8x8x2_t vld2_dup_u8(__transfersize(2) uint8_t const * ptr) // VLD2.8 {d0[], d1[]}, [r0] 5399 { 5400 uint8x8x2_t v; 5401 v.val[0] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x,x,x,x,x, x,x,x,x 5402 v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,x,x,x,x, x,x,x,x,x,x,x,x, 5403 v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,x,x,x,x, x,x,x,x 5404 v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, 5405 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 5406 return v; 5407 } 5408 5409 uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 5410 _NEON2SSE_INLINE uint16x4x2_t vld2_dup_u16(__transfersize(2) uint16_t const * ptr) // VLD2.16 {d0[], d1[]}, [r0] 5411 { 5412 uint16x4x2_t v; 5413 v.val[1] = LOAD_SI128(ptr); //0,1,x,x, x,x,x,x 5414 v.val[0] = _mm_shufflelo_epi16(v.val[1], 0); //00 00 00 00 (all 0) 5415 v.val[1] = _mm_shufflelo_epi16(v.val[1], 85);//01 01 01 01 (all 1) 5416 return v; 5417 } 5418 5419 uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 5420 _NEON2SSE_INLINE uint32x2x2_t vld2_dup_u32(__transfersize(2) uint32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] 5421 { 5422 uint32x2x2_t v; 5423 v.val[0] = LOAD_SI128(ptr); //0,1,x,x 5424 v.val[0] = _mm_shuffle_epi32(v.val[0], 0 | (0 << 2) | (1 << 4) | (1 << 6)); //0,0,1,1 5425 v.val[1] = _mm_srli_si128(v.val[0], 8); //1,1,0x0,0x0 5426 return v; 5427 } 5428 5429 uint64x1x2_t vld2_dup_u64(__transfersize(2) uint64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 5430 #define vld2_dup_u64 vld2_u64 5431 5432 int8x8x2_t vld2_dup_s8(__transfersize(2) int8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 5433 #define vld2_dup_s8(ptr) vld2_dup_u8((uint8_t*)ptr) 5434 5435 int16x4x2_t vld2_dup_s16(__transfersize(2) int16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 5436 #define vld2_dup_s16(ptr) vld2_dup_u16((uint16_t*)ptr) 5437 5438 int32x2x2_t vld2_dup_s32(__transfersize(2) int32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 5439 #define vld2_dup_s32(ptr) 
vld2_dup_u32((uint32_t*)ptr) 5440 5441 int64x1x2_t vld2_dup_s64(__transfersize(2) int64_t const * ptr); // VLD1.64 {d0, d1}, [r0] 5442 #define vld2_dup_s64(ptr) vld2_dup_u64((uint64_t*)ptr) 5443 5444 float16x4x2_t vld2_dup_f16(__transfersize(2) __fp16 const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 5445 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 5446 5447 float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr); // VLD2.32 {d0[], d1[]}, [r0] 5448 _NEON2SSE_INLINE float32x2x2_t vld2_dup_f32(__transfersize(2) float32_t const * ptr) // VLD2.32 {d0[], d1[]}, [r0] 5449 { 5450 float32x2x2_t v; 5451 v.val[0] = vld1q_f32(ptr); //0,1,x,x 5452 v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x 5453 v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,x,x 5454 return v; 5455 } 5456 5457 poly8x8x2_t vld2_dup_p8(__transfersize(2) poly8_t const * ptr); // VLD2.8 {d0[], d1[]}, [r0] 5458 #define vld2_dup_p8 vld2_dup_u8 5459 5460 poly16x4x2_t vld2_dup_p16(__transfersize(2) poly16_t const * ptr); // VLD2.16 {d0[], d1[]}, [r0] 5461 #define vld2_dup_p16 vld2_dup_s16 5462 5463 //************* Duplicate (or propagate)triplets: ******************* 5464 //******************************************************************** 5465 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes and ptr[2] to all val[2] lanes 5466 uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 5467 _NEON2SSE_INLINE uint8x8x3_t vld3_dup_u8(__transfersize(3) uint8_t const * ptr) // VLD3.8 {d0[], d1[], d2[]}, [r0] 5468 { 5469 uint8x8x3_t v; 5470 v.val[0] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x,x,x,x,x, x,x,x,x 5471 v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,x,x, x,x,x,x,x,x,x,x, 5472 v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,x,x,x,x, 5473 v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, 5474 v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, x,x,x,x,x,x,x,x, 5475 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 5476 return v; 5477 } 5478 5479 uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 5480 _NEON2SSE_INLINE uint16x4x3_t vld3_dup_u16(__transfersize(3) uint16_t const * ptr) // VLD3.16 {d0[], d1[], d2[]}, [r0] 5481 { 5482 uint16x4x3_t v; 5483 v.val[2] = LOAD_SI128(ptr); //0,1,2,x, x,x,x,x 5484 v.val[0] = _mm_shufflelo_epi16(v.val[2], 0); //00 00 00 00 (all 0) 5485 v.val[1] = _mm_shufflelo_epi16(v.val[2], 85);//01 01 01 01 (all 1) 5486 v.val[2] = _mm_shufflelo_epi16(v.val[2], 170);//10 10 10 10 (all 2) 5487 return v; 5488 } 5489 5490 uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 5491 _NEON2SSE_INLINE uint32x2x3_t vld3_dup_u32(__transfersize(3) uint32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] 5492 { 5493 uint32x2x3_t v; 5494 v.val[2] = LOAD_SI128(ptr); //0,1,2,x 5495 v.val[0] = _mm_shuffle_epi32(v.val[2], 0 | (0 << 2) | (2 << 4) | (2 << 6)); //0,0,2,2 5496 v.val[1] = _mm_shuffle_epi32(v.val[2], 1 | (1 << 2) | (2 << 4) | (2 << 6)); //1,1,2,2 5497 v.val[2] = _mm_srli_si128(v.val[0], 8); //2,2,0x0,0x0 5498 return v; 5499 } 5500 5501 uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 5502 _NEON2SSE_INLINE uint64x1x3_t vld3_dup_u64(__transfersize(3) uint64_t const * ptr) // VLD1.64 {d0, d1, d2}, [r0] 5503 { 5504 uint64x1x3_t v; 
5505 v.val[0] = LOAD_SI128(ptr);//0,1, 5506 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0 5507 v.val[2] = LOAD_SI128((ptr + 2)); //2,x 5508 return v; 5509 } 5510 5511 int8x8x3_t vld3_dup_s8(__transfersize(3) int8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 5512 #define vld3_dup_s8(ptr) vld3_dup_u8((uint8_t*)ptr) 5513 5514 int16x4x3_t vld3_dup_s16(__transfersize(3) int16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 5515 #define vld3_dup_s16(ptr) vld3_dup_u16((uint16_t*)ptr) 5516 5517 int32x2x3_t vld3_dup_s32(__transfersize(3) int32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 5518 #define vld3_dup_s32(ptr) vld3_dup_u32((uint32_t*)ptr) 5519 5520 int64x1x3_t vld3_dup_s64(__transfersize(3) int64_t const * ptr); // VLD1.64 {d0, d1, d2}, [r0] 5521 #define vld3_dup_s64(ptr) vld3_dup_u64((uint64_t*)ptr) 5522 5523 float16x4x3_t vld3_dup_f16(__transfersize(3) __fp16 const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 5524 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. See vld1q_f16 for example 5525 5526 float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr); // VLD3.32 {d0[], d1[], d2[]}, [r0] 5527 _NEON2SSE_INLINE float32x2x3_t vld3_dup_f32(__transfersize(3) float32_t const * ptr) // VLD3.32 {d0[], d1[], d2[]}, [r0] 5528 { 5529 float32x2x3_t v; 5530 v.val[0] = vld1q_f32(ptr); //0,1,2,x 5531 v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,x,x 5532 v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2 5533 v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0, 5534 return v; 5535 } 5536 5537 poly8x8x3_t vld3_dup_p8(__transfersize(3) poly8_t const * ptr); // VLD3.8 {d0[], d1[], d2[]}, [r0] 5538 #define vld3_dup_p8 vld3_dup_u8 5539 5540 poly16x4x3_t vld3_dup_p16(__transfersize(3) poly16_t const * ptr); // VLD3.16 {d0[], d1[], d2[]}, [r0] 5541 #define vld3_dup_p16 vld3_dup_s16 5542 5543 //************* Duplicate (or propagate) quadruples: ******************* 5544 //*********************************************************************** 5545 //ptr[0] to all val[0] lanes, ptr[1] to all val[1] lanes, ptr[2] to all val[2] lanes and ptr[3] to all val[3] lanes 5546 uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 5547 _NEON2SSE_INLINE uint8x8x4_t vld4_dup_u8(__transfersize(4) uint8_t const * ptr) // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 5548 { 5549 uint8x8x4_t v; 5550 v.val[0] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x,x,x,x,x, x,x,x,x 5551 v.val[1] = _mm_unpacklo_epi8(v.val[0],v.val[0]);//0,0,1,1,2,2,3,3, x,x,x,x,x,x,x,x, 5552 v.val[1] = _mm_unpacklo_epi16(v.val[1],v.val[1]);//0,0,0,0, 1,1,1,1,2,2,2,2,3,3,3,3 5553 v.val[0] = _mm_unpacklo_epi32(v.val[1],v.val[1]);//0,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1, 5554 v.val[2] = _mm_unpackhi_epi32(v.val[1],v.val[1]);// 2,2,2,2,2,2,2,2, 3,3,3,3, 3,3,3,3 5555 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); 5556 v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32); 5557 return v; 5558 } 5559 5560 uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 5561 _NEON2SSE_INLINE uint16x4x4_t vld4_dup_u16(__transfersize(4) uint16_t const * ptr) // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 5562 { 5563 uint16x4x4_t v; 5564 v.val[3] = LOAD_SI128(ptr); //0,1,2,3, x,x,x,x 5565 v.val[0] = _mm_shufflelo_epi16(v.val[3], 0); //00 00 00 00 (all 0) 5566 v.val[1] = _mm_shufflelo_epi16(v.val[3], 85);//01 01 01 01 (all 1) 5567 v.val[2] = _mm_shufflelo_epi16(v.val[3], 170);//10 
10 10 10 (all 2) 5568 v.val[3] = _mm_shufflelo_epi16(v.val[3], 255);//11 11 11 11 (all 3) 5569 return v; 5570 } 5571 5572 uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 5573 _NEON2SSE_INLINE uint32x2x4_t vld4_dup_u32(__transfersize(4) uint32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 5574 { 5575 uint32x2x4_t v; 5576 v.val[3] = LOAD_SI128(ptr) ; //0,1,2,3 5577 v.val[0] = _mm_shuffle_epi32(v.val[3], 0 | (0 << 2) | (2 << 4) | (3 << 6)); //0,0,2,3 5578 v.val[1] = _mm_shuffle_epi32(v.val[3], 1 | (1 << 2) | (2 << 4) | (3 << 6)); //1,1,2,3 5579 v.val[2] = _mm_shuffle_epi32(v.val[3], 2 | (2 << 2) | (3 << 4) | (3 << 6)); //2,2,3,3 5580 v.val[3] = _mm_shuffle_epi32(v.val[3], 3 | (3 << 2) | (3 << 4) | (3 << 6)); //3,3,2,2 5581 return v; 5582 } 5583 5584 uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 5585 _NEON2SSE_INLINE uint64x1x4_t vld4_dup_u64(__transfersize(4) uint64_t const * ptr) // VLD1.64 {d0, d1, d2, d3}, [r0] 5586 { 5587 uint64x1x4_t v; 5588 v.val[0] = LOAD_SI128(ptr); //0,1, 5589 v.val[1] = _mm_shuffle_epi32(v.val[0], _SWAP_HI_LOW32); //1,0 5590 v.val[2] = LOAD_SI128((ptr + 2)); //2,3 5591 v.val[3] = _mm_shuffle_epi32(v.val[2], _SWAP_HI_LOW32); //3,2 5592 return v; 5593 } 5594 5595 int8x8x4_t vld4_dup_s8(__transfersize(4) int8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 5596 #define vld4_dup_s8(ptr) vld4_dup_u8((uint8_t*)ptr) 5597 5598 int16x4x4_t vld4_dup_s16(__transfersize(4) int16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 5599 #define vld4_dup_s16(ptr) vld4_dup_u16((uint16_t*)ptr) 5600 5601 int32x2x4_t vld4_dup_s32(__transfersize(4) int32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 5602 #define vld4_dup_s32(ptr) vld4_dup_u32((uint32_t*)ptr) 5603 5604 int64x1x4_t vld4_dup_s64(__transfersize(4) int64_t const * ptr); // VLD1.64 {d0, d1, d2, d3}, [r0] 5605 #define vld4_dup_s64(ptr) vld4_dup_u64((uint64_t*)ptr) 5606 5607 float16x4x4_t vld4_dup_f16(__transfersize(4) __fp16 const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 5608 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 5609 5610 float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr); // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 5611 _NEON2SSE_INLINE float32x2x4_t vld4_dup_f32(__transfersize(4) float32_t const * ptr) // VLD4.32 {d0[], d1[], d2[], d3[]}, [r0] 5612 { 5613 float32x2x4_t v; 5614 v.val[0] = vld1q_f32(ptr); //0,1,2,3 5615 v.val[1] = _mm_movehdup_ps(v.val[0]); //1,1,3,3 5616 v.val[0] = _mm_moveldup_ps(v.val[0]); //0,0,2,2 5617 v.val[2] = _mm_movehl_ps(v.val[0], v.val[0]); //2,2,0,0, 5618 v.val[3] = _mm_movehl_ps(v.val[1], v.val[1]); //3,3,1,1, 5619 return v; 5620 } 5621 5622 poly8x8x4_t vld4_dup_p8(__transfersize(4) poly8_t const * ptr); // VLD4.8 {d0[], d1[], d2[], d3[]}, [r0] 5623 #define vld4_dup_p8 vld4_dup_u8 5624 5625 poly16x4x4_t vld4_dup_p16(__transfersize(4) poly16_t const * ptr); // VLD4.16 {d0[], d1[], d2[], d3[]}, [r0] 5626 #define vld4_dup_p16 vld4_dup_u16 5627 5628 //********************************************************************************** 5629 //*******************Lane loads for an N-element structures *********************** 5630 //********************************************************************************** 5631 //********************** Lane pairs ************************************************ 5632 //does vld1_lane_xx ptr[0] to src->val[0] at lane positon and ptr[1] to src->val[1] at lane positon 5633 //we assume src is 16 bit aligned 5634 5635 //!!!!!! Microsoft compiler does not allow xxxxxx_2t function arguments resulting in "formal parameter with __declspec(align('16')) won't be aligned" error 5636 //to fix it the all functions below work with xxxxxx_2t pointers and the corresponding original functions are redefined 5637 5638 //uint16x8x2_t vld2q_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] 5639 _NEON2SSE_INLINE uint16x8x2_t vld2q_lane_u16_ptr(__transfersize(2) uint16_t const * ptr, uint16x8x2_t* src,__constrange(0,7) int lane) // VLD2.16 {d0[0], d2[0]}, [r0] 5640 { 5641 uint16x8x2_t v; 5642 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); 5643 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); 5644 return v; 5645 } 5646 #define vld2q_lane_u16(ptr, src, lane) vld2q_lane_u16_ptr(ptr, &src, lane) 5647 5648 //uint32x4x2_t vld2q_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] 5649 _NEON2SSE_INLINE uint32x4x2_t vld2q_lane_u32_ptr(__transfersize(2) uint32_t const * ptr, uint32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] 5650 { 5651 uint32x4x2_t v; 5652 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); 5653 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); 5654 return v; 5655 } 5656 #define vld2q_lane_u32(ptr, src, lane) vld2q_lane_u32_ptr(ptr, &src, lane) 5657 5658 //int16x8x2_t vld2q_lane_s16(__transfersize(2) int16_t const * ptr, int16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] 5659 _NEON2SSE_INLINE int16x8x2_t vld2q_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x8x2_t* src, __constrange(0,7) int lane) 5660 { 5661 int16x8x2_t v; 5662 v.val[0] = vld1q_lane_s16 (ptr, src->val[0], lane); 5663 v.val[1] = vld1q_lane_s16 ((ptr + 1), src->val[1], lane); 5664 return v; 5665 } 5666 #define vld2q_lane_s16(ptr, src, lane) vld2q_lane_s16_ptr(ptr, &src, lane) 5667 5668 //int32x4x2_t vld2q_lane_s32(__transfersize(2) int32_t const * ptr, int32x4x2_t src, __constrange(0,3)int lane);// VLD2.32 {d0[0], 
d2[0]}, [r0] 5669 _NEON2SSE_INLINE int32x4x2_t vld2q_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x4x2_t* src, __constrange(0,3) int lane) 5670 { 5671 int32x4x2_t v; 5672 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); 5673 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); 5674 return v; 5675 } 5676 #define vld2q_lane_s32(ptr, src, lane) vld2q_lane_s32_ptr(ptr, &src, lane) 5677 5678 //float16x8x2_t vld2q_lane_f16(__transfersize(2) __fp16 const * ptr, float16x8x2_t src, __constrange(0,7)int lane);// VLD2.16 {d0[0], d2[0]}, [r0] 5679 //current IA SIMD doesn't support float16 5680 5681 //float32x4x2_t vld2q_lane_f32(__transfersize(2) float32_t const * ptr, float32x4x2_t src,__constrange(0,3) int lane);// VLD2.32 {d0[0], d2[0]}, [r0] 5682 _NEON2SSE_INLINE float32x4x2_t vld2q_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x4x2_t* src,__constrange(0,3) int lane) // VLD2.32 {d0[0], d2[0]}, [r0] 5683 { 5684 float32x4x2_t v; 5685 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); 5686 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); 5687 return v; 5688 } 5689 #define vld2q_lane_f32(ptr, src, lane) vld2q_lane_f32_ptr(ptr, &src, lane) 5690 5691 //poly16x8x2_t vld2q_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x8x2_t src,__constrange(0,7) int lane);// VLD2.16 {d0[0], d2[0]}, [r0] 5692 #define vld2q_lane_p16 vld2q_lane_u16 5693 5694 //uint8x8x2_t vld2_lane_u8(__transfersize(2) uint8_t const * ptr, uint8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] 5695 _NEON2SSE_INLINE uint8x8x2_t vld2_lane_u8_ptr(__transfersize(2) uint8_t const * ptr, uint8x8x2_t* src, __constrange(0,7) int lane) // VLD2.8 {d0[0], d1[0]}, [r0] 5696 { 5697 uint8x8x2_t val; 5698 val.val[0] = _MM_INSERT_EPI8 (src->val[0], (int)ptr[0], lane); 5699 val.val[1] = _MM_INSERT_EPI8 (src->val[1], (int)ptr[1], lane); 5700 return val; 5701 } 5702 #define vld2_lane_u8(ptr, src, lane) vld2_lane_u8_ptr(ptr, &src, lane) 5703 5704 //uint16x4x2_t vld2_lane_u16(__transfersize(2) uint16_t const * ptr, uint16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] 5705 #define vld2_lane_u16 vld2q_lane_u16 5706 5707 //uint32x2x2_t vld2_lane_u32(__transfersize(2) uint32_t const * ptr, uint32x2x2_t src, __constrange(0,1)int lane);// VLD2.32 {d0[0], d1[0]}, [r0] 5708 #define vld2_lane_u32 vld2q_lane_u32 5709 5710 //int8x8x2_t vld2_lane_s8(__transfersize(2) int8_t const * ptr, int8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] 5711 int8x8x2_t vld2_lane_s8_ptr(__transfersize(2) int8_t const * ptr, int8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] 5712 #define vld2_lane_s8(ptr, src, lane) vld2_lane_u8(( uint8_t*) ptr, src, lane) 5713 5714 //int16x4x2_t vld2_lane_s16(__transfersize(2) int16_t const * ptr, int16x4x2_t src, __constrange(0,3) int lane);// VLD2.16 {d0[0], d1[0]}, [r0] 5715 int16x4x2_t vld2_lane_s16_ptr(__transfersize(2) int16_t const * ptr, int16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 5716 #define vld2_lane_s16(ptr, src, lane) vld2_lane_u16(( uint16_t*) ptr, src, lane) 5717 5718 //int32x2x2_t vld2_lane_s32(__transfersize(2) int32_t const * ptr, int32x2x2_t src, __constrange(0,1) int lane);// VLD2.32 {d0[0], d1[0]}, [r0] 5719 int32x2x2_t vld2_lane_s32_ptr(__transfersize(2) int32_t const * ptr, int32x2x2_t * src, __constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] 5720 #define vld2_lane_s32(ptr, src, lane) vld2_lane_u32(( uint32_t*) ptr, src, lane) 5721 5722 
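//Usage note (illustrative sketch only; buffer names below are hypothetical and not part of this header):
//the lane-load wrappers above return the updated structure by value, so a typical round trip looks like
//  uint16_t two[2] = {10, 20};                  //one new element per de-interleaved vector
//  uint16x8x2_t v = vld2q_u16(buf16);           //buf16: previously loaded interleaved uint16_t data
//  v = vld2q_lane_u16(two, v, 3);               //lane 3 of v.val[0] becomes 10, lane 3 of v.val[1] becomes 20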
//float16x4x2_t vld2_lane_f16(__transfersize(2) __fp16 const * ptr, float16x4x2_t src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 5723 //current IA SIMD doesn't support float16 5724 5725 float32x2x2_t vld2_lane_f32_ptr(__transfersize(2) float32_t const * ptr, float32x2x2_t * src,__constrange(0,1) int lane); // VLD2.32 {d0[0], d1[0]}, [r0] 5726 #define vld2_lane_f32 vld2q_lane_f32 5727 5728 //poly8x8x2_t vld2_lane_p8(__transfersize(2) poly8_t const * ptr, poly8x8x2_t src, __constrange(0,7) int lane);// VLD2.8 {d0[0], d1[0]}, [r0] 5729 poly8x8x2_t vld2_lane_p8_ptr(__transfersize(2) poly8_t const * ptr, poly8x8x2_t * src, __constrange(0,7) int lane); // VLD2.8 {d0[0], d1[0]}, [r0] 5730 #define vld2_lane_p8 vld2_lane_u8 5731 5732 //poly16x4x2_t vld2_lane_p16(__transfersize(2) poly16_t const * ptr, poly16x4x2_t src, __constrange(0,3)int lane);// VLD2.16 {d0[0], d1[0]}, [r0] 5733 poly16x4x2_t vld2_lane_p16_ptr(__transfersize(2) poly16_t const * ptr, poly16x4x2_t * src, __constrange(0,3) int lane); // VLD2.16 {d0[0], d1[0]}, [r0] 5734 #define vld2_lane_p16 vld2_lane_u16 5735 5736 //*********** Lane triplets ********************** 5737 //************************************************* 5738 //does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1] and ptr[2] to src->val[2] at lane positon 5739 //we assume src is 16 bit aligned 5740 5741 //uint16x8x3_t vld3q_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x8x3_t src,__constrange(0,7) int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 5742 _NEON2SSE_INLINE uint16x8x3_t vld3q_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x8x3_t* src,__constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 5743 { 5744 uint16x8x3_t v; 5745 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); 5746 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); 5747 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); 5748 return v; 5749 } 5750 #define vld3q_lane_u16(ptr, src, lane) vld3q_lane_u16_ptr(ptr, &src, lane) 5751 5752 //uint32x4x3_t vld3q_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 5753 _NEON2SSE_INLINE uint32x4x3_t vld3q_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 5754 { 5755 uint32x4x3_t v; 5756 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); 5757 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); 5758 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); 5759 return v; 5760 } 5761 #define vld3q_lane_u32(ptr, src, lane) vld3q_lane_u32_ptr(ptr, &src, lane) 5762 5763 //int16x8x3_t vld3q_lane_s16(__transfersize(3) int16_t const * ptr, int16x8x3_t src, __constrange(0,7)int lane);// VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 5764 _NEON2SSE_INLINE int16x8x3_t vld3q_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x8x3_t* src, __constrange(0,7) int lane) // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 5765 { 5766 int16x8x3_t v; 5767 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane); 5768 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane); 5769 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane); 5770 return v; 5771 } 5772 #define vld3q_lane_s16(ptr, src, lane) vld3q_lane_s16_ptr(ptr, &src, lane) 5773 5774 //int32x4x3_t vld3q_lane_s32(__transfersize(3) int32_t const * ptr, int32x4x3_t src, __constrange(0,3)int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 5775 _NEON2SSE_INLINE int32x4x3_t 
vld3q_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x4x3_t* src, __constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 5776 { 5777 int32x4x3_t v; 5778 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane); 5779 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane); 5780 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane); 5781 return v; 5782 } 5783 #define vld3q_lane_s32(ptr, src, lane) vld3q_lane_s32_ptr(ptr, &src, lane) 5784 5785 float16x8x3_t vld3q_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x8x3_t * src, __constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 5786 //current IA SIMD doesn't support float16 5787 #define vld3q_lane_f16(ptr, src, lane) vld3q_lane_f16_ptr(ptr, &src, lane) 5788 5789 //float32x4x3_t vld3q_lane_f32(__transfersize(3) float32_t const * ptr, float32x4x3_t src,__constrange(0,3) int lane);// VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 5790 _NEON2SSE_INLINE float32x4x3_t vld3q_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x4x3_t* src,__constrange(0,3) int lane) // VLD3.32 {d0[0], d2[0], d4[0]}, [r0] 5791 { 5792 float32x4x3_t v; 5793 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); 5794 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); 5795 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); 5796 return v; 5797 } 5798 #define vld3q_lane_f32(ptr, src, lane) vld3q_lane_f32_ptr(ptr, &src, lane) 5799 5800 poly16x8x3_t vld3q_lane_p16_ptr(__transfersize(3) poly16_t const * ptr, poly16x8x3_t * src,__constrange(0,7) int lane); // VLD3.16 {d0[0], d2[0], d4[0]}, [r0] 5801 #define vld3q_lane_p16 vld3q_lane_u16 5802 5803 //uint8x8x3_t vld3_lane_u8(__transfersize(3) uint8_t const * ptr, uint8x8x3_t src, __constrange(0,7) int lane);// VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 5804 _NEON2SSE_INLINE uint8x8x3_t vld3_lane_u8_ptr(__transfersize(3) uint8_t const * ptr, uint8x8x3_t* src, __constrange(0,7) int lane) // VLD3.8 {d0[0], d1[0], d2[0]}, [r0] 5805 { 5806 uint8x8x3_t v; 5807 v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane); 5808 v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane); 5809 v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane); 5810 return v; 5811 } 5812 #define vld3_lane_u8(ptr, src, lane) vld3_lane_u8_ptr(ptr, &src, lane) 5813 5814 //uint16x4x3_t vld3_lane_u16(__transfersize(3) uint16_t const * ptr, uint16x4x3_t src, __constrange(0,3)int lane);// VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 5815 _NEON2SSE_INLINE uint16x4x3_t vld3_lane_u16_ptr(__transfersize(3) uint16_t const * ptr, uint16x4x3_t* src, __constrange(0,3) int lane) // VLD3.16 {d0[0], d1[0], d2[0]}, [r0] 5816 { 5817 uint16x4x3_t v; 5818 v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane); 5819 v.val[1] = _MM_INSERT_EPI16 (src->val[1], ptr[1], lane); 5820 v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane); 5821 return v; 5822 } 5823 #define vld3_lane_u16(ptr, src, lane) vld3_lane_u16_ptr(ptr, &src, lane) 5824 5825 //uint32x2x3_t vld3_lane_u32(__transfersize(3) uint32_t const * ptr, uint32x2x3_t src, __constrange(0,1)int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 5826 _NEON2SSE_INLINE uint32x2x3_t vld3_lane_u32_ptr(__transfersize(3) uint32_t const * ptr, uint32x2x3_t* src, __constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0] 5827 { //need to merge into 128 bit anyway 5828 uint32x2x3_t v; 5829 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane); 5830 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane); 5831 v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane); 5832 return v; 5833 } 5834 #define 
vld3_lane_u32(ptr, src, lane) vld3_lane_u32_ptr(ptr, &src, lane)
5835
5836 int8x8x3_t vld3_lane_s8_ptr(__transfersize(3) int8_t const * ptr, int8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
5837 #define vld3_lane_s8(ptr, src, lane) vld3_lane_u8_ptr(( uint8_t*) ptr, &src, lane)
5838
5839 int16x4x3_t vld3_lane_s16_ptr(__transfersize(3) int16_t const * ptr, int16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
5840 #define vld3_lane_s16(ptr, src, lane) vld3_lane_u16_ptr(( uint16_t*) ptr, &src, lane)
5841
5842 int32x2x3_t vld3_lane_s32_ptr(__transfersize(3) int32_t const * ptr, int32x2x3_t * src, __constrange(0,1) int lane); // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
5843 #define vld3_lane_s32(ptr, src, lane) vld3_lane_u32_ptr(( uint32_t*) ptr, &src, lane)
5844
5845 float16x4x3_t vld3_lane_f16_ptr(__transfersize(3) __fp16 const * ptr, float16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
5846 //current IA SIMD doesn't support float16
5847
5848 //float32x2x3_t vld3_lane_f32(__transfersize(3) float32_t const * ptr, float32x2x3_t src,__constrange(0,1) int lane);// VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
5849 _NEON2SSE_INLINE float32x2x3_t vld3_lane_f32_ptr(__transfersize(3) float32_t const * ptr, float32x2x3_t* src,__constrange(0,1) int lane) // VLD3.32 {d0[0], d1[0], d2[0]}, [r0]
5850 {
5851 float32x2x3_t v;
5852 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane);
 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane); //load all three source values, not val[0] only
 v.val[2] = vld1q_lane_f32((ptr + 2), src->val[2], lane);
5853 return v;
5854 }
5855 #define vld3_lane_f32(ptr, src, lane) vld3_lane_f32_ptr(ptr, &src, lane)
5856
5857 //poly8x8x3_t vld3_lane_p8_ptr(__transfersize(3) poly8_t const * ptr, poly8x8x3_t * src, __constrange(0,7) int lane); // VLD3.8 {d0[0], d1[0], d2[0]}, [r0]
5858 #define vld3_lane_p8 vld3_lane_u8
5859
5860 //poly16x4x3_t vld3_lane_p16(__transfersize(3) poly16_t const * ptr, poly16x4x3_t * src, __constrange(0,3) int lane); // VLD3.16 {d0[0], d1[0], d2[0]}, [r0]
5861 #define vld3_lane_p16 vld3_lane_u16
5862
5863 //******************* Lane Quadruples load ***************************
5864 //*********************************************************************
5865 //does vld1_lane_xx ptr[0] to src->val[0], ptr[1] to src->val[1], ptr[2] to src->val[2] and ptr[3] to src->val[3] at lane position
5866 //we assume src is 16 bit aligned
5867
5868 //uint16x8x4_t vld4q_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x8x4_t src,__constrange(0,7) int lane)// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0]
5869 _NEON2SSE_INLINE uint16x8x4_t vld4q_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x8x4_t* src,__constrange(0,7) int lane)
5870 {
5871 uint16x8x4_t v;
5872 v.val[0] = _MM_INSERT_EPI16 ( src->val[0], ptr[0], lane);
5873 v.val[1] = _MM_INSERT_EPI16 ( src->val[1], ptr[1], lane);
5874 v.val[2] = _MM_INSERT_EPI16 ( src->val[2], ptr[2], lane);
5875 v.val[3] = _MM_INSERT_EPI16 ( src->val[3], ptr[3], lane);
5876 return v;
5877 }
5878 #define vld4q_lane_u16(ptr, src, lane) vld4q_lane_u16_ptr(ptr, &src, lane)
5879
5880 //uint32x4x4_t vld4q_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0]
5881 _NEON2SSE_INLINE uint32x4x4_t vld4q_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x4x4_t* src,__constrange(0,3) int lane)
5882 {
5883 uint32x4x4_t v;
5884 v.val[0] = _MM_INSERT_EPI32 ( src->val[0], ptr[0], lane);
5885 v.val[1] = _MM_INSERT_EPI32 ( src->val[1], ptr[1], lane);
5886 v.val[2] = _MM_INSERT_EPI32 ( src->val[2], ptr[2], lane);
5887 v.val[3]
= _MM_INSERT_EPI32 ( src->val[3], ptr[3], lane); 5888 return v; 5889 } 5890 #define vld4q_lane_u32(ptr, src, lane) vld4q_lane_u32_ptr(ptr, &src, lane) 5891 5892 //int16x8x4_t vld4q_lane_s16(__transfersize(4) int16_t const * ptr, int16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5893 int16x8x4_t vld4q_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5894 #define vld4q_lane_s16(ptr, src, lane) vld4q_lane_u16(( uint16_t*) ptr, src, lane) 5895 5896 //int32x4x4_t vld4q_lane_s32(__transfersize(4) int32_t const * ptr, int32x4x4_t src, __constrange(0,3)int lane);// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5897 int32x4x4_t vld4q_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x4x4_t * src, __constrange(0,3) int lane); // VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5898 #define vld4q_lane_s32(ptr, src, lane) vld4q_lane_u32(( uint32_t*) ptr, src, lane) 5899 5900 //float16x8x4_t vld4q_lane_f16(__transfersize(4) __fp16 const * ptr, float16x8x4_t src, __constrange(0,7)int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5901 float16x8x4_t vld4q_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x8x4_t * src, __constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5902 //current IA SIMD doesn't support float16 5903 5904 //float32x4x4_t vld4q_lane_f32(__transfersize(4) float32_t const * ptr, float32x4x4_t src,__constrange(0,3) int lane)// VLD4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5905 _NEON2SSE_INLINE float32x4x4_t vld4q_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x4x4_t* src,__constrange(0,3) int lane) 5906 { 5907 float32x4x4_t v; 5908 v.val[0] = vld1q_lane_f32(&ptr[0], src->val[0], lane); 5909 v.val[1] = vld1q_lane_f32(&ptr[1], src->val[1], lane); 5910 v.val[2] = vld1q_lane_f32(&ptr[2], src->val[2], lane); 5911 v.val[3] = vld1q_lane_f32(&ptr[3], src->val[3], lane); 5912 return v; 5913 } 5914 #define vld4q_lane_f32(ptr, src, lane) vld4q_lane_f32_ptr(ptr, &src, lane) 5915 5916 //poly16x8x4_t vld4q_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x8x4_t src,__constrange(0,7) int lane);// VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5917 poly16x8x4_t vld4q_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x8x4_t * src,__constrange(0,7) int lane); // VLD4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 5918 #define vld4q_lane_p16 vld4q_lane_u16 5919 5920 //uint8x8x4_t vld4_lane_u8(__transfersize(4) uint8_t const * ptr, uint8x8x4_t src, __constrange(0,7) int lane)// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 5921 _NEON2SSE_INLINE uint8x8x4_t vld4_lane_u8_ptr(__transfersize(4) uint8_t const * ptr, uint8x8x4_t* src, __constrange(0,7) int lane) 5922 { 5923 uint8x8x4_t v; 5924 v.val[0] = _MM_INSERT_EPI8 (src->val[0], ptr[0], lane); 5925 v.val[1] = _MM_INSERT_EPI8 (src->val[1], ptr[1], lane ); 5926 v.val[2] = _MM_INSERT_EPI8 (src->val[2], ptr[2], lane ); 5927 v.val[3] = _MM_INSERT_EPI8 (src->val[3], ptr[3], lane ); 5928 return v; 5929 } 5930 #define vld4_lane_u8(ptr, src, lane) vld4_lane_u8_ptr(ptr, &src, lane) 5931 5932 //uint16x4x4_t vld4_lane_u16(__transfersize(4) uint16_t const * ptr, uint16x4x4_t src, __constrange(0,3)int lane)// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 5933 _NEON2SSE_INLINE uint16x4x4_t vld4_lane_u16_ptr(__transfersize(4) uint16_t const * ptr, uint16x4x4_t* src, __constrange(0,3) int lane) 5934 { 5935 uint16x4x4_t v; 5936 v.val[0] = _MM_INSERT_EPI16 (src->val[0], ptr[0], lane); 5937 v.val[1] = 
_MM_INSERT_EPI16 (src->val[1], ptr[1], lane );
5938 v.val[2] = _MM_INSERT_EPI16 (src->val[2], ptr[2], lane );
5939 v.val[3] = _MM_INSERT_EPI16 (src->val[3], ptr[3], lane );
5940 return v;
5941 }
5942 #define vld4_lane_u16(ptr, src, lane) vld4_lane_u16_ptr(ptr, &src, lane)
5943
5944 //uint32x2x4_t vld4_lane_u32(__transfersize(4) uint32_t const * ptr, uint32x2x4_t src, __constrange(0,1)int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5945 _NEON2SSE_INLINE uint32x2x4_t vld4_lane_u32_ptr(__transfersize(4) uint32_t const * ptr, uint32x2x4_t* src, __constrange(0,1) int lane)
5946 {
5947 uint32x2x4_t v;
5948 v.val[0] = _MM_INSERT_EPI32 (src->val[0], ptr[0], lane);
5949 v.val[1] = _MM_INSERT_EPI32 (src->val[1], ptr[1], lane );
5950 v.val[2] = _MM_INSERT_EPI32 (src->val[2], ptr[2], lane );
5951 v.val[3] = _MM_INSERT_EPI32 (src->val[3], ptr[3], lane );
5952 return v;
5953 }
5954 #define vld4_lane_u32(ptr, src, lane) vld4_lane_u32_ptr(ptr, &src, lane)
5955
5956 //int8x8x4_t vld4_lane_s8(__transfersize(4) int8_t const * ptr, int8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5957 int8x8x4_t vld4_lane_s8_ptr(__transfersize(4) int8_t const * ptr, int8x8x4_t * src, __constrange(0,7) int lane);
5958 #define vld4_lane_s8(ptr,src,lane) vld4_lane_u8((uint8_t*)ptr,src,lane)
5959
5960 //int16x4x4_t vld4_lane_s16(__transfersize(4) int16_t const * ptr, int16x4x4_t src, __constrange(0,3) int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5961 int16x4x4_t vld4_lane_s16_ptr(__transfersize(4) int16_t const * ptr, int16x4x4_t * src, __constrange(0,3) int lane);
5962 #define vld4_lane_s16(ptr,src,lane) vld4_lane_u16((uint16_t*)ptr,src,lane)
5963
5964 //int32x2x4_t vld4_lane_s32(__transfersize(4) int32_t const * ptr, int32x2x4_t src, __constrange(0,1) int lane);// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5965 int32x2x4_t vld4_lane_s32_ptr(__transfersize(4) int32_t const * ptr, int32x2x4_t * src, __constrange(0,1) int lane);
5966 #define vld4_lane_s32(ptr,src,lane) vld4_lane_u32((uint32_t*)ptr,src,lane)
5967
5968 //float16x4x4_t vld4_lane_f16(__transfersize(4) __fp16 const * ptr, float16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5969 float16x4x4_t vld4_lane_f16_ptr(__transfersize(4) __fp16 const * ptr, float16x4x4_t * src, __constrange(0,3) int lane);
5970 //current IA SIMD doesn't support float16
5971
5972 //float32x2x4_t vld4_lane_f32(__transfersize(4) float32_t const * ptr, float32x2x4_t src,__constrange(0,1) int lane)// VLD4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5973 _NEON2SSE_INLINE float32x2x4_t vld4_lane_f32_ptr(__transfersize(4) float32_t const * ptr, float32x2x4_t* src,__constrange(0,1) int lane)
5974 { //serial solution may be faster
5975 float32x2x4_t v;
 v.val[0] = vld1q_lane_f32(ptr, src->val[0], lane); //fill all four values as the NEON intrinsic does, instead of returning an uninitialized structure
 v.val[1] = vld1q_lane_f32((ptr + 1), src->val[1], lane);
 v.val[2] = vld1q_lane_f32((ptr + 2), src->val[2], lane);
 v.val[3] = vld1q_lane_f32((ptr + 3), src->val[3], lane);
5976 return v;
5977 }
5978 #define vld4_lane_f32(ptr, src, lane) vld4_lane_f32_ptr(ptr, &src, lane)
5979
5980 //poly8x8x4_t vld4_lane_p8(__transfersize(4) poly8_t const * ptr, poly8x8x4_t src, __constrange(0,7) int lane);// VLD4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5981 poly8x8x4_t vld4_lane_p8_ptr(__transfersize(4) poly8_t const * ptr, poly8x8x4_t * src, __constrange(0,7) int lane);
5982 #define vld4_lane_p8 vld4_lane_u8
5983
5984 //poly16x4x4_t vld4_lane_p16(__transfersize(4) poly16_t const * ptr, poly16x4x4_t src, __constrange(0,3)int lane);// VLD4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0]
5985 poly16x4x4_t vld4_lane_p16_ptr(__transfersize(4) poly16_t const * ptr, poly16x4x4_t * src, __constrange(0,3) int lane);
5986 #define vld4_lane_p16 vld4_lane_u16
5987
5988
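//Illustrative sketch of a quadruple lane load (hypothetical data and buffer names, for clarity only):
//  uint32_t q[4] = {0, 1, 2, 3};
//  uint32x4x4_t acc = vld4q_u32(buf32);         //buf32: previously loaded 4-way interleaved uint32_t data
//  acc = vld4q_lane_u32(q, acc, 2);             //element 2 of acc.val[i] is replaced by q[i], i = 0..3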
//******************* Store duplets ********************************************* 5989 //******************************************************************************** 5990 //here we assume the ptr is 16bit aligned. If not we need to use _mm_storeu_si128 like shown in vst1q_u8 function 5991 //If necessary you need to modify all store functions accordingly. See more comments to "Store single" functions 5992 //void vst2q_u8(__transfersize(32) uint8_t * ptr, uint8x16x2_t val)// VST2.8 {d0, d2}, [r0] 5993 _NEON2SSE_INLINE void vst2q_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x16x2_t* val) 5994 { 5995 uint8x16x2_t v; 5996 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); 5997 v.val[1] = _mm_unpackhi_epi8(val->val[0], val->val[1]); 5998 vst1q_u8 (ptr, v.val[0]); 5999 vst1q_u8 ((ptr + 16), v.val[1]); 6000 } 6001 #define vst2q_u8(ptr, val) vst2q_u8_ptr(ptr, &val) 6002 6003 //void vst2q_u16(__transfersize(16) uint16_t * ptr, uint16x8x2_t val)// VST2.16 {d0, d2}, [r0] 6004 _NEON2SSE_INLINE void vst2q_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x8x2_t* val) 6005 { 6006 uint16x8x2_t v; 6007 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); 6008 v.val[1] = _mm_unpackhi_epi16(val->val[0], val->val[1]); 6009 vst1q_u16 (ptr, v.val[0]); 6010 vst1q_u16 ((ptr + 8), v.val[1]); 6011 } 6012 #define vst2q_u16(ptr, val) vst2q_u16_ptr(ptr, &val) 6013 6014 //void vst2q_u32(__transfersize(8) uint32_t * ptr, uint32x4x2_t val)// VST2.32 {d0, d2}, [r0] 6015 _NEON2SSE_INLINE void vst2q_u32_ptr(__transfersize(8) uint32_t* ptr, uint32x4x2_t* val) 6016 { 6017 uint32x4x2_t v; 6018 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); 6019 v.val[1] = _mm_unpackhi_epi32(val->val[0], val->val[1]); 6020 vst1q_u32 (ptr, v.val[0]); 6021 vst1q_u32 ((ptr + 4), v.val[1]); 6022 } 6023 #define vst2q_u32(ptr, val) vst2q_u32_ptr(ptr, &val) 6024 6025 //void vst2q_s8(__transfersize(32) int8_t * ptr, int8x16x2_t val); // VST2.8 {d0, d2}, [r0] 6026 void vst2q_s8_ptr(__transfersize(32) int8_t * ptr, int8x16x2_t * val); 6027 #define vst2q_s8(ptr, val) vst2q_u8((uint8_t*)(ptr), val) 6028 6029 //void vst2q_s16(__transfersize(16) int16_t * ptr, int16x8x2_t val);// VST2.16 {d0, d2}, [r0] 6030 void vst2q_s16_ptr(__transfersize(16) int16_t * ptr, int16x8x2_t * val); 6031 #define vst2q_s16(ptr, val) vst2q_u16((uint16_t*)(ptr), val) 6032 6033 //void vst2q_s32(__transfersize(8) int32_t * ptr, int32x4x2_t val);// VST2.32 {d0, d2}, [r0] 6034 void vst2q_s32_ptr(__transfersize(8) int32_t * ptr, int32x4x2_t * val); 6035 #define vst2q_s32(ptr, val) vst2q_u32((uint32_t*)(ptr), val) 6036 6037 //void vst2q_f16(__transfersize(16) __fp16 * ptr, float16x8x2_t val);// VST2.16 {d0, d2}, [r0] 6038 void vst2q_f16_ptr(__transfersize(16) __fp16 * ptr, float16x8x2_t * val); 6039 // IA32 SIMD doesn't work with 16bit floats currently 6040 6041 //void vst2q_f32(__transfersize(8) float32_t * ptr, float32x4x2_t val)// VST2.32 {d0, d2}, [r0] 6042 _NEON2SSE_INLINE void vst2q_f32_ptr(__transfersize(8) float32_t* ptr, float32x4x2_t* val) 6043 { 6044 float32x4x2_t v; 6045 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); 6046 v.val[1] = _mm_unpackhi_ps(val->val[0], val->val[1]); 6047 vst1q_f32 (ptr, v.val[0]); 6048 vst1q_f32 ((ptr + 4), v.val[1]); 6049 } 6050 #define vst2q_f32(ptr, val) vst2q_f32_ptr(ptr, &val) 6051 6052 //void vst2q_p8(__transfersize(32) poly8_t * ptr, poly8x16x2_t val);// VST2.8 {d0, d2}, [r0] 6053 void vst2q_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x16x2_t * val); 6054 #define vst2q_p8 vst2q_u8 6055 6056 //void 
vst2q_p16(__transfersize(16) poly16_t * ptr, poly16x8x2_t val);// VST2.16 {d0, d2}, [r0] 6057 void vst2q_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x8x2_t * val); 6058 #define vst2q_p16 vst2q_u16 6059 6060 //void vst2_u8(__transfersize(16) uint8_t * ptr, uint8x8x2_t val);// VST2.8 {d0, d1}, [r0] 6061 _NEON2SSE_INLINE void vst2_u8_ptr(__transfersize(16) uint8_t * ptr, uint8x8x2_t* val) 6062 { 6063 uint8x8x2_t v; 6064 v.val[0] = _mm_unpacklo_epi8(val->val[0], val->val[1]); 6065 vst1q_u8 (ptr, v.val[0]); 6066 } 6067 #define vst2_u8(ptr, val) vst2_u8_ptr(ptr, &val) 6068 6069 //void vst2_u16(__transfersize(8) uint16_t * ptr, uint16x4x2_t val);// VST2.16 {d0, d1}, [r0] 6070 _NEON2SSE_INLINE void vst2_u16_ptr(__transfersize(8) uint16_t * ptr, uint16x4x2_t* val) 6071 { 6072 uint16x4x2_t v; 6073 v.val[0] = _mm_unpacklo_epi16(val->val[0], val->val[1]); 6074 vst1q_u16 (ptr, v.val[0]); 6075 } 6076 #define vst2_u16(ptr, val) vst2_u16_ptr(ptr, &val) 6077 6078 //void vst2_u32(__transfersize(4) uint32_t * ptr, uint32x2x2_t val);// VST2.32 {d0, d1}, [r0] 6079 _NEON2SSE_INLINE void vst2_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x2_t* val) 6080 { 6081 uint32x2x2_t v; 6082 v.val[0] = _mm_unpacklo_epi32(val->val[0], val->val[1]); 6083 vst1q_u32 (ptr, v.val[0]); 6084 } 6085 #define vst2_u32(ptr, val) vst2_u32_ptr(ptr, &val) 6086 6087 //void vst2_u64(__transfersize(2) uint64_t * ptr, uint64x1x2_t val);// VST1.64 {d0, d1}, [r0] 6088 void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t * val); 6089 _NEON2SSE_INLINE void vst2_u64_ptr(__transfersize(2) uint64_t * ptr, uint64x1x2_t* val) 6090 { 6091 uint64x1x2_t v; 6092 v.val[0] = _mm_unpacklo_epi64(val->val[0], val->val[1]); 6093 vst1q_u64(ptr, v.val[0]); 6094 } 6095 #define vst2_u64(ptr, val) vst2_u64_ptr(ptr, &val) 6096 6097 //void vst2_s8(__transfersize(16) int8_t * ptr, int8x8x2_t val);// VST2.8 {d0, d1}, [r0] 6098 #define vst2_s8(ptr, val) vst2_u8((uint8_t*) ptr, val) 6099 6100 //void vst2_s16(__transfersize(8) int16_t * ptr, int16x4x2_t val); // VST2.16 {d0, d1}, [r0] 6101 #define vst2_s16(ptr,val) vst2_u16((uint16_t*) ptr, val) 6102 6103 //void vst2_s32(__transfersize(4) int32_t * ptr, int32x2x2_t val); // VST2.32 {d0, d1}, [r0] 6104 #define vst2_s32(ptr,val) vst2_u32((uint32_t*) ptr, val) 6105 6106 //void vst2_s64(__transfersize(2) int64_t * ptr, int64x1x2_t val); 6107 #define vst2_s64(ptr,val) vst2_u64((uint64_t*) ptr,val) 6108 6109 //void vst2_f16(__transfersize(8) __fp16 * ptr, float16x4x2_t val); // VST2.16 {d0, d1}, [r0] 6110 //current IA SIMD doesn't support float16 6111 6112 void vst2_f32_ptr(__transfersize(4) float32_t * ptr, float32x2x2_t * val); // VST2.32 {d0, d1}, [r0] 6113 _NEON2SSE_INLINE void vst2_f32_ptr(__transfersize(4) float32_t* ptr, float32x2x2_t* val) 6114 { 6115 float32x4x2_t v; 6116 v.val[0] = _mm_unpacklo_ps(val->val[0], val->val[1]); 6117 vst1q_f32 (ptr, v.val[0]); 6118 } 6119 #define vst2_f32(ptr, val) vst2_f32_ptr(ptr, &val) 6120 6121 //void vst2_p8_ptr(__transfersize(16) poly8_t * ptr, poly8x8x2_t * val); // VST2.8 {d0, d1}, [r0] 6122 #define vst2_p8 vst2_u8 6123 6124 //void vst2_p16_ptr(__transfersize(8) poly16_t * ptr, poly16x4x2_t * val); // VST2.16 {d0, d1}, [r0] 6125 #define vst2_p16 vst2_u16 6126 6127 //******************** Triplets store ***************************************** 6128 //****************************************************************************** 6129 //void vst3q_u8(__transfersize(48) uint8_t * ptr, uint8x16x3_t val)// VST3.8 {d0, d2, d4}, [r0] 6130 #if defined(USE_SSSE3) 6131 
_NEON2SSE_INLINE void vst3q_u8_ptr(__transfersize(48) uint8_t * ptr, uint8x16x3_t* val) 6132 { 6133 uint8x16x3_t v; 6134 __m128i v0,v1,v2, cff, bldmask; 6135 _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0, 1, 0xff, 2, 3,0xff, 4, 5,0xff, 6,7,0xff, 8,9,0xff, 10}; 6136 _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0, 0xff, 1, 2, 0xff, 3, 4, 0xff, 5, 6, 0xff, 7,8,0xff, 9,10}; 6137 _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {0xff, 6, 7, 0xff, 8, 9,0xff, 10, 11,0xff, 12,13,0xff, 14,15,0xff}; 6138 _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0, 0xff,0xff, 1, 0xff,0xff, 2, 0xff,0xff, 3, 0xff,0xff, 4, 0xff}; 6139 _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {0xff, 5, 0xff, 0xff, 6, 0xff,0xff, 7, 0xff,0xff, 8, 0xff,0xff, 9, 0xff, 0xff}; 6140 _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {10, 0xff,0xff, 11, 0xff,0xff, 12, 0xff,0xff, 13, 0xff,0xff, 14, 0xff, 0xff, 15}; 6141 6142 v0 = _mm_unpacklo_epi8(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10, 12,13, 15,16, 18,19, 21,22 6143 v2 = _mm_unpackhi_epi8(val->val[0], val->val[1]); //24,25, 27,28, 30,31, 33,34, 36,37, 39,40, 42,43, 45,46 6144 v1 = _mm_alignr_epi8(v2, v0, 11); //12,13, 15,16, 18,19, 21,22, 24,25, 27,28, 30,31, 33,34 6145 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding 6146 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding 6147 cff = _mm_cmpeq_epi8(v0, v0); //all ff 6148 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask0, cff); 6149 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); 6150 vst1q_u8(ptr, v.val[0]); 6151 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding 6152 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding 6153 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask1, cff); 6154 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); 6155 vst1q_u8((ptr + 16), v.val[1]); 6156 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding 6157 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding 6158 bldmask = _mm_cmpeq_epi8(*(__m128i*)mask2, cff); 6159 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); 6160 vst1q_u8((ptr + 32), v.val[2]); 6161 } 6162 #define vst3q_u8(ptr, val) vst3q_u8_ptr(ptr, &val) 6163 #endif 6164 6165 #if defined(USE_SSSE3) 6166 //void vst3q_u16(__transfersize(24) uint16_t * ptr, uint16x8x3_t val)// VST3.16 {d0, d2, d4}, [r0] 6167 _NEON2SSE_INLINE void vst3q_u16_ptr(__transfersize(24) uint16_t * ptr, uint16x8x3_t* val) 6168 { 6169 uint16x8x3_t v; 6170 __m128i v0,v1,v2, cff, bldmask; 6171 _NEON2SSE_ALIGN_16 uint8_t mask0[16] = {0,1, 2,3, 0xff,0xff, 4,5, 6,7,0xff,0xff, 8,9,10,11}; 6172 _NEON2SSE_ALIGN_16 uint8_t mask1[16] = {0xff, 0xff, 0,1, 2,3, 0xff,0xff, 4,5, 6,7, 0xff,0xff, 8,9}; 6173 _NEON2SSE_ALIGN_16 uint8_t mask2[16] = {6,7,0xff,0xff, 8,9,10,11, 0xff, 0xff, 12,13,14,15, 0xff, 0xff}; 6174 _NEON2SSE_ALIGN_16 uint8_t mask2lo[16] = {0xff,0xff, 0xff,0xff, 0,1, 0xff,0xff, 0xff,0xff, 2,3, 0xff,0xff, 0xff,0xff}; 6175 _NEON2SSE_ALIGN_16 uint8_t mask2med[16] = {4,5, 0xff,0xff,0xff,0xff, 6,7, 0xff, 0xff,0xff,0xff, 8,9, 0xff, 0xff}; 6176 _NEON2SSE_ALIGN_16 uint8_t mask2hi[16] = {0xff, 0xff, 10,11, 0xff, 0xff, 0xff, 0xff, 12,13, 0xff, 0xff, 0xff, 0xff,14,15}; 6177 6178 v0 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 3,4, 6,7, 9,10 6179 v2 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //12,13, 15,16, 18,19, 21,22, 6180 v1 = 
_mm_alignr_epi8(v2, v0, 12); //9,10, 12,13, 15,16, 18,19 6181 v.val[0] = _mm_shuffle_epi8(v0, *(__m128i*)mask0); //make holes for the v.val[2] data embedding 6182 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2lo); //make plugs for the v.val[2] data embedding 6183 cff = _mm_cmpeq_epi16(v0, v0); //all ff 6184 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask0, cff); 6185 v.val[0] = _MM_BLENDV_EPI8(v.val[0], v.val[2], bldmask); 6186 vst1q_u16(ptr, v.val[0]); 6187 v.val[0] = _mm_shuffle_epi8(v1, *(__m128i*)mask1); //make holes for the v.val[2] data embedding 6188 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2med); //make plugs for the v.val[2] data embedding 6189 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask1, cff); 6190 v.val[1] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask); 6191 vst1q_u16((ptr + 8), v.val[1]); 6192 v.val[0] = _mm_shuffle_epi8(v2, *(__m128i*)mask2); //make holes for the v.val[2] data embedding 6193 v.val[2] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask2hi); //make plugs for the v.val[2] data embedding 6194 bldmask = _mm_cmpeq_epi16(*(__m128i*)mask2, cff); 6195 v.val[2] = _MM_BLENDV_EPI8(v.val[0],v.val[2], bldmask ); 6196 vst1q_u16((ptr + 16), v.val[2]); 6197 } 6198 #define vst3q_u16(ptr, val) vst3q_u16_ptr(ptr, &val) 6199 #endif 6200 6201 //void vst3q_u32(__transfersize(12) uint32_t * ptr, uint32x4x3_t val)// VST3.32 {d0, d2, d4}, [r0] 6202 _NEON2SSE_INLINE void vst3q_u32_ptr(__transfersize(12) uint32_t * ptr, uint32x4x3_t* val) 6203 { //a0,a1,a2,a3, b0,b1,b2,b3, c0,c1,c2,c3 -> a0,b0,c0,a1, b1,c1,a2,b2, c2,a3,b3,c3 6204 uint32x4x3_t v; 6205 __m128i tmp0, tmp1,tmp2; 6206 tmp0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //a0,b0,a1,b1 6207 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //a2,b2,a3,b3 6208 tmp2 = _mm_unpacklo_epi32(val->val[1], val->val[2]); //b0,c0,b1,c1 6209 v.val[1] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp2),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(1,0,3,2))); //b1,c1,a2,b2, 6210 v.val[2] = _mm_unpackhi_epi64(tmp1, val->val[2]); //a3,b3, c2,c3 6211 v.val[2] = _mm_shuffle_epi32(v.val[2], 2 | (0 << 2) | (1 << 4) | (3 << 6)); //c2,a3,b3,c3 6212 tmp1 = _mm_unpacklo_epi32(tmp2,val->val[0]); //b0,a0,c0,a1 6213 v.val[0] = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(tmp0),_mm_castsi128_ps(tmp1), _MM_SHUFFLE(3,2,1,0))); //a0,b0,c0,a1, 6214 6215 vst1q_u32(ptr, v.val[0]); 6216 vst1q_u32((ptr + 4), v.val[1]); 6217 vst1q_u32((ptr + 8), v.val[2]); 6218 } 6219 #define vst3q_u32(ptr, val) vst3q_u32_ptr(ptr, &val) 6220 6221 #if defined(USE_SSSE3) 6222 //void vst3q_s8(__transfersize(48) int8_t * ptr, int8x16x3_t val); 6223 void vst3q_s8_ptr(__transfersize(48) int8_t * ptr, int8x16x3_t * val); 6224 #define vst3q_s8(ptr, val) vst3q_u8((uint8_t*)(ptr), val) 6225 6226 //void vst3q_s16(__transfersize(24) int16_t * ptr, int16x8x3_t val); 6227 void vst3q_s16_ptr(__transfersize(24) int16_t * ptr, int16x8x3_t * val); 6228 #define vst3q_s16(ptr, val) vst3q_u16((uint16_t*)(ptr), val) 6229 #endif 6230 6231 //void vst3q_s32(__transfersize(12) int32_t * ptr, int32x4x3_t val); 6232 void vst3q_s32_ptr(__transfersize(12) int32_t * ptr, int32x4x3_t * val); 6233 #define vst3q_s32(ptr, val) vst3q_u32((uint32_t*)(ptr), val) 6234 6235 //void vst3q_f16(__transfersize(24) __fp16 * ptr, float16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] 6236 void vst3q_f16_ptr(__transfersize(24) __fp16 * ptr, float16x8x3_t * val); 6237 // IA32 SIMD doesn't work with 16bit floats currently 6238 6239 //void vst3q_f32(__transfersize(12) float32_t * ptr, float32x4x3_t val)// VST3.32 {d0, 
d2, d4}, [r0] 6240 _NEON2SSE_INLINE void vst3q_f32_ptr(__transfersize(12) float32_t * ptr, float32x4x3_t* val) 6241 { 6242 float32x4x3_t v; 6243 __m128 tmp0, tmp1,tmp2; 6244 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); //a0,b0,a1,b1 6245 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); //a2,b2,a3,b3 6246 tmp2 = _mm_unpacklo_ps(val->val[1], val->val[2]); //b0,c0,b1,c1 6247 v.val[1] = _mm_shuffle_ps(tmp2,tmp1, _MM_SHUFFLE(1,0,3,2)); //b1,c1,a2,b2, 6248 v.val[2] = _mm_movehl_ps(val->val[2],tmp1); //a3,b3, c2,c3 6249 v.val[2] = _mm_shuffle_ps(v.val[2],v.val[2], _MM_SHUFFLE(3,1,0,2)); //c2,a3,b3,c3 6250 tmp1 = _mm_unpacklo_ps(tmp2,val->val[0]); //b0,a0,c0,a1 6251 v.val[0] = _mm_shuffle_ps(tmp0,tmp1, _MM_SHUFFLE(3,2,1,0)); //a0,b0,c0,a1, 6252 6253 vst1q_f32( ptr, v.val[0]); 6254 vst1q_f32( (ptr + 4), v.val[1]); 6255 vst1q_f32( (ptr + 8), v.val[2]); 6256 } 6257 #define vst3q_f32(ptr, val) vst3q_f32_ptr(ptr, &val) 6258 6259 #if defined(USE_SSSE3) 6260 //void vst3q_p8(__transfersize(48) poly8_t * ptr, poly8x16x3_t val);// VST3.8 {d0, d2, d4}, [r0] 6261 void vst3q_p8_ptr(__transfersize(48) poly8_t * ptr, poly8x16x3_t * val); 6262 #define vst3q_p8 vst3q_u8 6263 6264 //void vst3q_p16(__transfersize(24) poly16_t * ptr, poly16x8x3_t val);// VST3.16 {d0, d2, d4}, [r0] 6265 void vst3q_p16_ptr(__transfersize(24) poly16_t * ptr, poly16x8x3_t * val); 6266 #define vst3q_p16 vst3q_u16 6267 #endif 6268 6269 //void vst3_u8(__transfersize(24) uint8_t * ptr, uint8x8x3_t val)// VST3.8 {d0, d1, d2}, [r0] 6270 #if defined(USE_SSSE3) 6271 _NEON2SSE_INLINE void vst3_u8_ptr(__transfersize(24) uint8_t * ptr, uint8x8x3_t* val) 6272 { 6273 uint8x8x3_t v; 6274 __m128i tmp, sh0, sh1; 6275 _NEON2SSE_ALIGN_16 int8_t mask0[16] = { 0, 8, 16, 1, 9, 17, 2, 10, 18, 3, 11, 19, 4, 12, 20, 5}; 6276 _NEON2SSE_ALIGN_16 int8_t mask1[16] = {13, 21, 6, 14, 22, 7, 15, 23, 0,0,0,0,0,0,0,0}; 6277 _NEON2SSE_ALIGN_16 int8_t mask0_sel[16] = {0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0}; 6278 _NEON2SSE_ALIGN_16 int8_t mask1_sel[16] = {0, 0xff, 0, 0, 0xff, 0, 0, 0xff, 0,0,0,0,0,0,0,0}; 6279 tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]); 6280 sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); //for bi>15 bi is wrapped (bi-=15) 6281 sh1 = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0); 6282 v.val[0] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask0_sel); 6283 vst1q_u8(ptr, v.val[0]); //store as 128 bit structure 6284 sh0 = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); //for bi>15 bi is wrapped (bi-=15) 6285 sh1 = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1); 6286 v.val[1] = _MM_BLENDV_EPI8(sh0, sh1, *(__m128i*)mask1_sel); 6287 } 6288 #define vst3_u8(ptr, val) vst3_u8_ptr(ptr, &val) 6289 #endif 6290 6291 //void vst3_u16(__transfersize(12) uint16_t * ptr, uint16x4x3_t val)// VST3.16 {d0, d1, d2}, [r0] 6292 #if defined(USE_SSSE3) 6293 _NEON2SSE_INLINE void vst3_u16_ptr(__transfersize(12) uint16_t * ptr, uint16x4x3_t* val) 6294 { 6295 uint16x4x3_t v; 6296 __m128i tmp; 6297 _NEON2SSE_ALIGN_16 int8_t mask0[16] = {0,1, 8,9, 16,17, 2,3, 10,11, 18,19, 4,5, 12,13}; 6298 _NEON2SSE_ALIGN_16 int8_t mask1[16] = {20,21, 6,7, 14,15, 22,23, 0,0,0,0,0,0,0,0}; 6299 _NEON2SSE_ALIGN_16 uint16_t mask0f[8] = {0xffff, 0xffff, 0, 0xffff, 0xffff, 0, 0xffff, 0xffff}; //if all ones we take the result from v.val[0] otherwise from v.val[1] 6300 _NEON2SSE_ALIGN_16 uint16_t mask1f[8] = {0xffff, 0, 0, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff}; //if all ones we take the result from v.val[1] otherwise from v.val[0] 6301 tmp = _mm_unpacklo_epi64(val->val[0], 
val->val[1]); 6302 v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask0); 6303 v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask0); 6304 v.val[0] = _MM_BLENDV_EPI8(v.val[1], v.val[0], *(__m128i*)mask0f); 6305 vst1q_u16(ptr, v.val[0]); //store as 128 bit structure 6306 v.val[0] = _mm_shuffle_epi8(tmp, *(__m128i*)mask1); 6307 v.val[1] = _mm_shuffle_epi8(val->val[2], *(__m128i*)mask1); 6308 v.val[1] = _MM_BLENDV_EPI8(v.val[0], v.val[1], *(__m128i*)mask1f); //change the operands order 6309 } 6310 #define vst3_u16(ptr, val) vst3_u16_ptr(ptr, &val) 6311 #endif 6312 6313 //void vst3_u32(__transfersize(6) uint32_t * ptr, uint32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] 6314 _NEON2SSE_INLINE void vst3_u32_ptr(__transfersize(6) uint32_t * ptr, uint32x2x3_t* val) 6315 { //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; 6316 uint32x2x3_t res; 6317 res.val[0] = _mm_unpacklo_epi64(val->val[1], val->val[2]); //val[0]: 1,4,2,5 6318 res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (2 << 2) | (1 << 4) | (3 << 6)); //1,2,4,5 6319 res.val[1] = _mm_srli_si128(res.val[0], 8); //4,5, x,x 6320 res.val[0] = _mm_unpacklo_epi32(val->val[0], res.val[0]); //0,1,3,2 6321 res.val[0] = _mm_shuffle_epi32(res.val[0], 0 | (1 << 2) | (3 << 4) | (2 << 6)); //0,1,2, 3 6322 vst1q_u32(ptr, res.val[0]); //store as 128 bit structure 6323 } 6324 #define vst3_u32(ptr, val) vst3_u32_ptr(ptr, &val) 6325 6326 //void vst3_u64(__transfersize(3) uint64_t * ptr, uint64x1x3_t val)// VST1.64 {d0, d1, d2}, [r0] 6327 _NEON2SSE_INLINE void vst3_u64_ptr(__transfersize(3) uint64_t * ptr, uint64x1x3_t* val) 6328 { 6329 __m128i tmp; 6330 tmp = _mm_unpacklo_epi64(val->val[0], val->val[1]); 6331 vst1q_u64(ptr, tmp); //store as 128 bit structure 6332 } 6333 #define vst3_u64(ptr, val) vst3_u64_ptr(ptr, &val) 6334 6335 #if defined(USE_SSSE3) 6336 //void vst3_s8(__transfersize(24) int8_t * ptr, int8x8x3_t val) // VST3.8 {d0, d1, d2}, [r0] 6337 #define vst3_s8(ptr, val) vst3_u8_ptr((uint8_t*)ptr, &val) 6338 6339 //void vst3_s16(__transfersize(12) int16_t * ptr, int16x4x3_t val) // VST3.16 {d0, d1, d2}, [r0] 6340 #define vst3_s16(ptr, val) vst3_u16_ptr((uint16_t*)ptr, &val) 6341 #endif 6342 6343 //void vst3_s32(__transfersize(6) int32_t * ptr, int32x2x3_t val); // VST3.32 {d0, d1, d2}, [r0] 6344 #define vst3_s32(ptr, val) vst3_u32_ptr((uint32_t*)ptr, &val) 6345 6346 //void vst3_s64(__transfersize(3) int64_t * ptr, int64x1x3_t val) // VST1.64 {d0, d1, d2}, [r0] 6347 #define vst3_s64(ptr, val) vst3_u64_ptr((uint64_t*)ptr, &val) 6348 6349 //void vst3_f16(__transfersize(12) __fp16 * ptr, float16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] 6350 void vst3_f16_ptr(__transfersize(12) __fp16 * ptr, float16x4x3_t * val); // VST3.16 {d0, d1, d2}, [r0] 6351 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 6352 6353 //void vst3_f32(__transfersize(6) float32_t * ptr, float32x2x3_t val)// VST3.32 {d0, d1, d2}, [r0] 6354 _NEON2SSE_INLINE void vst3_f32_ptr(__transfersize(6) float32_t * ptr, float32x2x3_t* val) 6355 { //val->val[0]:0,3,val->val[1]:1,4; val->val[2]:2,5,x,x; 6356 float32x2x3_t res; 6357 res.val[0] = _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(val->val[1]), _mm_castps_si128(val->val[2])) ); 6358 res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(3,1,2,0)); //1,2,4,5 6359 res.val[1] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(1,0,3,2)); //4,5, 1,2 6360 res.val[0] = _mm_unpacklo_ps(val->val[0], res.val[0]); //0,1,3, 2 6361 res.val[0] = _mm_shuffle_ps(res.val[0],res.val[0], _MM_SHUFFLE(2,3,1,0)); //0,1,2, 3 6362 vst1q_f32(ptr, res.val[0]); //store as 128 bit structure 6363 } 6364 #define vst3_f32(ptr, val) vst3_f32_ptr(ptr, &val) 6365 6366 #if defined(USE_SSSE3) 6367 //void vst3_p8(__transfersize(24) poly8_t * ptr, poly8x8x3_t val);// VST3.8 {d0, d1, d2}, [r0] 6368 void vst3_p8_ptr(__transfersize(24) poly8_t * ptr, poly8x8x3_t * val); 6369 #define vst3_p8 vst3_u8 6370 6371 //void vst3_p16(__transfersize(12) poly16_t * ptr, poly16x4x3_t val);// VST3.16 {d0, d1, d2}, [r0] 6372 void vst3_p16_ptr(__transfersize(12) poly16_t * ptr, poly16x4x3_t * val); 6373 #define vst3_p16 vst3_s16 6374 #endif 6375 6376 //*************** Quadruples store ******************************** 6377 //********************************************************************* 6378 //void vst4q_u8(__transfersize(64) uint8_t * ptr, uint8x16x4_t val)// VST4.8 {d0, d2, d4, d6}, [r0] 6379 _NEON2SSE_INLINE void vst4q_u8_ptr(__transfersize(64) uint8_t * ptr, uint8x16x4_t* val) 6380 { 6381 __m128i tmp1, tmp2, res; 6382 tmp1 = _mm_unpacklo_epi8(val->val[0], val->val[1]); // 0,1, 4,5, 8,9, 12,13, 16,17, 20,21, 24,25, 28,29 6383 tmp2 = _mm_unpacklo_epi8(val->val[2], val->val[3]); // 2,3, 6,7, 10,11, 14,15, 18,19, 22,23, 26,27, 30,31 6384 res = _mm_unpacklo_epi16(tmp1, tmp2); //0,1, 2,3, 4,5, 6,7, 8,9, 10,11, 12,13, 14,15 6385 vst1q_u8(ptr, res); 6386 res = _mm_unpackhi_epi16(tmp1, tmp2); //16,17, 18,19, 20,21, 22,23, 24,25, 26,27, 28,29, 30,31 6387 vst1q_u8((ptr + 16), res); 6388 tmp1 = _mm_unpackhi_epi8(val->val[0], val->val[1]); // 6389 tmp2 = _mm_unpackhi_epi8(val->val[2], val->val[3]); // 6390 res = _mm_unpacklo_epi16(tmp1, tmp2); // 6391 vst1q_u8((ptr + 32), res); 6392 res = _mm_unpackhi_epi16(tmp1, tmp2); // 6393 vst1q_u8((ptr + 48), res); 6394 } 6395 #define vst4q_u8(ptr, val) vst4q_u8_ptr(ptr, &val) 6396 6397 //void vst4q_u16(__transfersize(32) uint16_t * ptr, uint16x8x4_t val)// VST4.16 {d0, d2, d4, d6}, [r0] 6398 _NEON2SSE_INLINE void vst4q_u16_ptr(__transfersize(32) uint16_t * ptr, uint16x8x4_t* val) 6399 { 6400 uint16x8x4_t v; 6401 __m128i tmp1, tmp2; 6402 tmp1 = _mm_unpacklo_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 6403 tmp2 = _mm_unpacklo_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 6404 v.val[0] = _mm_unpacklo_epi32(tmp1, tmp2); 6405 v.val[1] = _mm_unpackhi_epi32(tmp1, tmp2); 6406 tmp1 = _mm_unpackhi_epi16(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 6407 tmp2 = _mm_unpackhi_epi16(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 6408 v.val[2] = _mm_unpacklo_epi32(tmp1, tmp2); 6409 v.val[3] = _mm_unpackhi_epi32(tmp1, tmp2); 6410 vst1q_u16(ptr, v.val[0]); 6411 vst1q_u16((ptr + 8), v.val[1]); 6412 vst1q_u16((ptr + 16),v.val[2]); 6413 vst1q_u16((ptr + 24), v.val[3]); 6414 } 6415 #define vst4q_u16(ptr, val) vst4q_u16_ptr(ptr, 
&val) 6416 6417 //void vst4q_u32(__transfersize(16) uint32_t * ptr, uint32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] 6418 _NEON2SSE_INLINE void vst4q_u32_ptr(__transfersize(16) uint32_t * ptr, uint32x4x4_t* val) 6419 { 6420 uint16x8x4_t v; 6421 __m128i tmp1, tmp2; 6422 tmp1 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 6423 tmp2 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 6424 v.val[0] = _mm_unpacklo_epi64(tmp1, tmp2); 6425 v.val[1] = _mm_unpackhi_epi64(tmp1, tmp2); 6426 tmp1 = _mm_unpackhi_epi32(val->val[0], val->val[1]); //0,1, 4,5, 8,9, 12,13 6427 tmp2 = _mm_unpackhi_epi32(val->val[2], val->val[3]); //2,3, 6,7 , 10,11, 14,15 6428 v.val[2] = _mm_unpacklo_epi64(tmp1, tmp2); 6429 v.val[3] = _mm_unpackhi_epi64(tmp1, tmp2); 6430 vst1q_u32(ptr, v.val[0]); 6431 vst1q_u32((ptr + 4), v.val[1]); 6432 vst1q_u32((ptr + 8), v.val[2]); 6433 vst1q_u32((ptr + 12), v.val[3]); 6434 } 6435 #define vst4q_u32(ptr, val) vst4q_u32_ptr(ptr, &val) 6436 6437 //void vst4q_s8(__transfersize(64) int8_t * ptr, int8x16x4_t val); 6438 void vst4q_s8_ptr(__transfersize(64) int8_t * ptr, int8x16x4_t * val); 6439 #define vst4q_s8(ptr, val) vst4q_u8((uint8_t*)(ptr), val) 6440 6441 //void vst4q_s16(__transfersize(32) int16_t * ptr, int16x8x4_t val); 6442 void vst4q_s16_ptr(__transfersize(32) int16_t * ptr, int16x8x4_t * val); 6443 #define vst4q_s16(ptr, val) vst4q_u16((uint16_t*)(ptr), val) 6444 6445 //void vst4q_s32(__transfersize(16) int32_t * ptr, int32x4x4_t val); 6446 void vst4q_s32_ptr(__transfersize(16) int32_t * ptr, int32x4x4_t * val); 6447 #define vst4q_s32(ptr, val) vst4q_u32((uint32_t*)(ptr), val) 6448 6449 //void vst4q_f16(__transfersize(32) __fp16 * ptr, float16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] 6450 void vst4q_f16_ptr(__transfersize(32) __fp16 * ptr, float16x8x4_t * val); 6451 // IA32 SIMD doesn't work with 16bit floats currently 6452 6453 //void vst4q_f32(__transfersize(16) float32_t * ptr, float32x4x4_t val)// VST4.32 {d0, d2, d4, d6}, [r0] 6454 _NEON2SSE_INLINE void vst4q_f32_ptr(__transfersize(16) float32_t * ptr, float32x4x4_t* val) 6455 { 6456 __m128 tmp3, tmp2, tmp1, tmp0; 6457 float32x4x4_t v; 6458 tmp0 = _mm_unpacklo_ps(val->val[0], val->val[1]); 6459 tmp2 = _mm_unpacklo_ps(val->val[2], val->val[3]); 6460 tmp1 = _mm_unpackhi_ps(val->val[0], val->val[1]); 6461 tmp3 = _mm_unpackhi_ps(val->val[2], val->val[3]); 6462 v.val[0] = _mm_movelh_ps(tmp0, tmp2); 6463 v.val[1] = _mm_movehl_ps(tmp2, tmp0); 6464 v.val[2] = _mm_movelh_ps(tmp1, tmp3); 6465 v.val[3] = _mm_movehl_ps(tmp3, tmp1); 6466 vst1q_f32(ptr, v.val[0]); 6467 vst1q_f32((ptr + 4), v.val[1]); 6468 vst1q_f32((ptr + 8), v.val[2]); 6469 vst1q_f32((ptr + 12), v.val[3]); 6470 } 6471 #define vst4q_f32(ptr, val) vst4q_f32_ptr(ptr, &val) 6472 6473 //void vst4q_p8(__transfersize(64) poly8_t * ptr, poly8x16x4_t val);// VST4.8 {d0, d2, d4, d6}, [r0] 6474 void vst4q_p8_ptr(__transfersize(64) poly8_t * ptr, poly8x16x4_t * val); 6475 #define vst4q_p8 vst4q_u8 6476 6477 //void vst4q_p16(__transfersize(32) poly16_t * ptr, poly16x8x4_t val);// VST4.16 {d0, d2, d4, d6}, [r0] 6478 void vst4q_p16_ptr(__transfersize(32) poly16_t * ptr, poly16x8x4_t * val); 6479 #define vst4q_p16 vst4q_s16 6480 6481 //void vst4_u8(__transfersize(32) uint8_t * ptr, uint8x8x4_t val)// VST4.8 {d0, d1, d2, d3}, [r0] 6482 _NEON2SSE_INLINE void vst4_u8_ptr(__transfersize(32) uint8_t * ptr, uint8x8x4_t* val) 6483 { 6484 uint8x8x4_t v; 6485 __m128i sh0, sh1; 6486 sh0 = _mm_unpacklo_epi8(val->val[0],val->val[1]); // 
a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,a5,b5, a6,b6,a7,b7, 6487 sh1 = _mm_unpacklo_epi8(val->val[2],val->val[3]); // c0,d0,c1,d1,c2,d2,c3,d3, c4,d4,c5,d5,c6,d6,c7,d7 6488 v.val[0] = _mm_unpacklo_epi16(sh0,sh1); // a0,b0,c0,d0,a1,b1,c1,d1,a2,b2,c2,d2,a3,b3,c3,d3, 6489 v.val[2] = _mm_unpackhi_epi16(sh0,sh1); //a4,b4,c4,d4,a5,b5,c5,d5, a6,b6,c6,d6,a7,b7,c7,d7 6490 vst1q_u8(ptr, v.val[0]); 6491 vst1q_u8((ptr + 16), v.val[2]); 6492 } 6493 #define vst4_u8(ptr, val) vst4_u8_ptr(ptr, &val) 6494 6495 //void vst4_u16(__transfersize(16) uint16_t * ptr, uint16x4x4_t val)// VST4.16 {d0, d1, d2, d3}, [r0] 6496 _NEON2SSE_INLINE void vst4_u16_ptr(__transfersize(16) uint16_t * ptr, uint16x4x4_t* val) 6497 { 6498 uint16x4x4_t v; 6499 __m128i sh0, sh1; 6500 sh0 = _mm_unpacklo_epi16(val->val[0],val->val[1]); //a0,a1,b0,b1,c0,c1,d0,d1, 6501 sh1 = _mm_unpacklo_epi16(val->val[2],val->val[3]); //a2,a3,b2,b3,c2,c3,d2,d3 6502 v.val[0] = _mm_unpacklo_epi32(sh0,sh1); // a0,a1,a2,a3,b0,b1,b2,b3 6503 v.val[2] = _mm_unpackhi_epi32(sh0,sh1); // c0,c1,c2,c3,d0,d1,d2,d3 6504 vst1q_u16(ptr, v.val[0]); //store as 128 bit structure 6505 vst1q_u16((ptr + 8), v.val[2]); 6506 } 6507 #define vst4_u16(ptr, val) vst4_u16_ptr(ptr, &val) 6508 6509 //void vst4_u32(__transfersize(8) uint32_t * ptr, uint32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] 6510 _NEON2SSE_INLINE void vst4_u32_ptr(__transfersize(8) uint32_t * ptr, uint32x2x4_t* val) 6511 { //0,4, 1,5, 2,6, 3,7 6512 uint32x2x4_t v; 6513 __m128i sh0, sh1; 6514 sh0 = _mm_unpacklo_epi32(val->val[0], val->val[1]); //0,1,4,5 6515 sh1 = _mm_unpacklo_epi32(val->val[2], val->val[3]); //2,3,6,7 6516 v.val[0] = _mm_unpacklo_epi64(sh0,sh1); // 6517 v.val[1] = _mm_unpackhi_epi64(sh0,sh1); // 6518 vst1q_u32(ptr, v.val[0]); //store as 128 bit structure 6519 vst1q_u32((ptr + 4), v.val[1]); 6520 } 6521 #define vst4_u32(ptr, val) vst4_u32_ptr(ptr, &val) 6522 6523 //void vst4_u64(__transfersize(4) uint64_t * ptr, uint64x1x4_t val)// VST1.64 {d0, d1, d2, d3}, [r0] 6524 _NEON2SSE_INLINE void vst4_u64_ptr(__transfersize(4) uint64_t * ptr, uint64x1x4_t* val) 6525 { 6526 vst1q_u64(ptr, val->val[0]); 6527 vst1q_u64((ptr + 2), val->val[2]); 6528 } 6529 #define vst4_u64(ptr, val) vst4_u64_ptr(ptr, &val) 6530 6531 //void vst4_s8(__transfersize(32) int8_t * ptr, int8x8x4_t val) //VST4.8 {d0, d1, d2, d3}, [r0] 6532 #define vst4_s8(ptr, val) vst4_u8((uint8_t*)ptr, val) 6533 6534 //void vst4_s16(__transfersize(16) int16_t * ptr, int16x4x4_t val) // VST4.16 {d0, d1, d2, d3}, [r0] 6535 #define vst4_s16(ptr, val) vst4_u16((uint16_t*)ptr, val) 6536 6537 //void vst4_s32(__transfersize(8) int32_t * ptr, int32x2x4_t val) // VST4.32 {d0, d1, d2, d3}, [r0] 6538 #define vst4_s32(ptr, val) vst4_u32((uint32_t*)ptr, val) 6539 6540 //void vst4_s64(__transfersize(4) int64_t * ptr, int64x1x4_t val); // VST1.64 {d0, d1, d2, d3}, [r0] 6541 void vst4_s64_ptr(__transfersize(4) int64_t * ptr, int64x1x4_t * val); 6542 #define vst4_s64(ptr, val) vst4_u64((uint64_t*)ptr, val) 6543 6544 //void vst4_f16(__transfersize(16) __fp16 * ptr, float16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] 6545 void vst4_f16_ptr(__transfersize(16) __fp16 * ptr, float16x4x4_t * val); 6546 // IA32 SIMD doesn't work with 16bit floats currently, so need to go to 32 bit and then work with two 128bit registers. 
See vld1q_f16 for example 6547 6548 //void vst4_f32(__transfersize(8) float32_t * ptr, float32x2x4_t val)// VST4.32 {d0, d1, d2, d3}, [r0] 6549 _NEON2SSE_INLINE void vst4_f32_ptr(__transfersize(8) float32_t * ptr, float32x2x4_t* val) 6550 { //a0,a1, b0,b1, c0,c1, d0,d1 -> a0,c0, a1,c1, b0,d0, b1,d1 6551 float32x2x4_t v; 6552 v.val[0] = _mm_unpacklo_ps(val->val[0],val->val[1]); 6553 v.val[2] = _mm_unpacklo_ps(val->val[2],val->val[3]); 6554 v.val[1] = _mm_movelh_ps (v.val[0], v.val[2]); //a0, c0, a1,c1, 6555 v.val[3] = _mm_movehl_ps (v.val[2],v.val[0]); //b0,d0, b1, d1 6556 vst1q_f32(ptr, v.val[1]); //store as 128 bit structure 6557 vst1q_f32((ptr + 4), v.val[3]); 6558 } 6559 #define vst4_f32(ptr, val) vst4_f32_ptr(ptr, &val) 6560 6561 //void vst4_p8(__transfersize(32) poly8_t * ptr, poly8x8x4_t val);// VST4.8 {d0, d1, d2, d3}, [r0] 6562 void vst4_p8_ptr(__transfersize(32) poly8_t * ptr, poly8x8x4_t * val); 6563 #define vst4_p8 vst4_u8 6564 6565 //void vst4_p16(__transfersize(16) poly16_t * ptr, poly16x4x4_t val);// VST4.16 {d0, d1, d2, d3}, [r0] 6566 void vst4_p16_ptr(__transfersize(16) poly16_t * ptr, poly16x4x4_t * val); 6567 #define vst4_p16 vst4_u16 6568 6569 //*********** Store a lane of a vector into memory (extract given lane) for a couple of vectors ********************* 6570 //******************************************************************************************************************** 6571 //void vst2q_lane_u16(__transfersize(2) uint16_t * ptr, uint16x8x2_t val, __constrange(0,7) int lane)// VST2.16 {d0[0], d2[0]}, [r0] 6572 _NEON2SSE_INLINE void vst2q_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x8x2_t* val, __constrange(0,7) int lane) 6573 { 6574 vst1q_lane_s16(ptr, val->val[0], lane); 6575 vst1q_lane_s16((ptr + 1), val->val[1], lane); 6576 } 6577 #define vst2q_lane_u16(ptr, val, lane) vst2q_lane_u16_ptr(ptr, &val, lane) 6578 6579 //void vst2q_lane_u32(__transfersize(2) uint32_t * ptr, uint32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] 6580 _NEON2SSE_INLINE void vst2q_lane_u32_ptr(__transfersize(2) uint32_t* ptr, uint32x4x2_t* val, __constrange(0,3) int lane) 6581 { 6582 vst1q_lane_u32(ptr, val->val[0], lane); 6583 vst1q_lane_u32((ptr + 1), val->val[1], lane); 6584 } 6585 #define vst2q_lane_u32(ptr, val, lane) vst2q_lane_u32_ptr(ptr, &val, lane) 6586 6587 //void vst2q_lane_s16(__transfersize(2) int16_t * ptr, int16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] 6588 void vst2q_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x8x2_t * val, __constrange(0,7) int lane); 6589 #define vst2q_lane_s16(ptr, val, lane) vst2q_lane_u16((uint16_t*)ptr, val, lane) 6590 6591 //void vst2q_lane_s32(__transfersize(2) int32_t * ptr, int32x4x2_t val, __constrange(0,3) int lane);// VST2.32 {d0[0], d2[0]}, [r0] 6592 void vst2q_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x4x2_t * val, __constrange(0,3) int lane); 6593 #define vst2q_lane_s32(ptr, val, lane) vst2q_lane_u32((uint32_t*)ptr, val, lane) 6594 6595 //void vst2q_lane_f16(__transfersize(2) __fp16 * ptr, float16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] 6596 void vst2q_lane_f16_ptr(__transfersize(2) __fp16 * ptr, float16x8x2_t * val, __constrange(0,7) int lane); 6597 //current IA SIMD doesn't support float16 6598 6599 //void vst2q_lane_f32(__transfersize(2) float32_t * ptr, float32x4x2_t val, __constrange(0,3) int lane)// VST2.32 {d0[0], d2[0]}, [r0] 6600 _NEON2SSE_INLINE void vst2q_lane_f32_ptr(__transfersize(2) float32_t* ptr, float32x4x2_t* 
val, __constrange(0,3) int lane) 6601 { 6602 vst1q_lane_f32(ptr, val->val[0], lane); 6603 vst1q_lane_f32((ptr + 1), val->val[1], lane); 6604 } 6605 #define vst2q_lane_f32(ptr, val, lane) vst2q_lane_f32_ptr(ptr, &val, lane) 6606 6607 //void vst2q_lane_p16(__transfersize(2) poly16_t * ptr, poly16x8x2_t val, __constrange(0,7) int lane);// VST2.16 {d0[0], d2[0]}, [r0] 6608 void vst2q_lane_p16_ptr(__transfersize(2) poly16_t * ptr, poly16x8x2_t * val, __constrange(0,7) int lane); 6609 #define vst2q_lane_p16 vst2q_lane_s16 6610 6611 //void vst2_lane_u16(__transfersize(2) uint16_t * ptr, uint16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] 6612 void vst2_lane_u16_ptr(__transfersize(2) uint16_t * ptr, uint16x4x2_t * val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] 6613 #define vst2_lane_u16 vst2q_lane_u16 6614 6615 //void vst2_lane_u32(__transfersize(2) uint32_t * ptr, uint32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] 6616 void vst2_lane_u32_ptr(__transfersize(2) uint32_t * ptr, uint32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] 6617 #define vst2_lane_u32 vst2q_lane_u32 6618 6619 //void vst2_lane_s8(__transfersize(2) int8_t * ptr, int8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] 6620 void vst2_lane_s8_ptr(__transfersize(2) int8_t * ptr, int8x8x2_t * val, __constrange(0,7) int lane); 6621 #define vst2_lane_s8(ptr, val, lane) vst2_lane_u8((uint8_t*)ptr, val, lane) 6622 6623 //void vst2_lane_s16(__transfersize(2) int16_t * ptr, int16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] 6624 void vst2_lane_s16_ptr(__transfersize(2) int16_t * ptr, int16x4x2_t * val, __constrange(0,3) int lane); 6625 #define vst2_lane_s16 vst2q_lane_s16 6626 6627 //void vst2_lane_s32(__transfersize(2) int32_t * ptr, int32x2x2_t val, __constrange(0,1) int lane);// VST2.32 {d0[0], d1[0]}, [r0] 6628 void vst2_lane_s32_ptr(__transfersize(2) int32_t * ptr, int32x2x2_t * val, __constrange(0,1) int lane); 6629 #define vst2_lane_s32 vst2q_lane_s32 6630 6631 //void vst2_lane_f16(__transfersize(2) __fp16 * ptr, float16x4x2_t val, __constrange(0,3) int lane); // VST2.16 {d0[0], d1[0]}, [r0] 6632 //current IA SIMD doesn't support float16 6633 6634 void vst2_lane_f32_ptr(__transfersize(2) float32_t * ptr, float32x2x2_t * val, __constrange(0,1) int lane); // VST2.32 {d0[0], d1[0]}, [r0] 6635 #define vst2_lane_f32 vst2q_lane_f32 6636 6637 //void vst2_lane_p8(__transfersize(2) poly8_t * ptr, poly8x8x2_t val, __constrange(0,7) int lane);// VST2.8 {d0[0], d1[0]}, [r0] 6638 #define vst2_lane_p8 vst2_lane_u8 6639 6640 //void vst2_lane_p16(__transfersize(2) poly16_t * ptr, poly16x4x2_t val, __constrange(0,3) int lane);// VST2.16 {d0[0], d1[0]}, [r0] 6641 #define vst2_lane_p16 vst2_lane_u16 6642 6643 //************************* Triple lanes stores ******************************************************* 6644 //******************************************************************************************************* 6645 //void vst3q_lane_u16(__transfersize(3) uint16_t * ptr, uint16x8x3_t val, __constrange(0,7) int lane)// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 6646 _NEON2SSE_INLINE void vst3q_lane_u16_ptr(__transfersize(3) uint16_t * ptr, uint16x8x3_t* val, __constrange(0,7) int lane) 6647 { 6648 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val, lane); 6649 vst1q_lane_u16((ptr + 2), val->val[2], lane); 6650 } 6651 #define vst3q_lane_u16(ptr, val, lane) vst3q_lane_u16_ptr(ptr, &val, lane) 6652 6653 //void 
vst3q_lane_u32(__transfersize(3) uint32_t * ptr, uint32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] 6654 _NEON2SSE_INLINE void vst3q_lane_u32_ptr(__transfersize(3) uint32_t * ptr, uint32x4x3_t* val, __constrange(0,3) int lane) 6655 { 6656 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val, lane); 6657 vst1q_lane_u32((ptr + 2), val->val[2], lane); 6658 } 6659 #define vst3q_lane_u32(ptr, val, lane) vst3q_lane_u32_ptr(ptr, &val, lane) 6660 6661 //void vst3q_lane_s16(__transfersize(3) int16_t * ptr, int16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 6662 void vst3q_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x8x3_t * val, __constrange(0,7) int lane); 6663 #define vst3q_lane_s16(ptr, val, lane) vst3q_lane_u16((uint16_t *)ptr, val, lane) 6664 6665 //void vst3q_lane_s32(__transfersize(3) int32_t * ptr, int32x4x3_t val, __constrange(0,3) int lane);// VST3.32 {d0[0], d2[0], d4[0]}, [r0] 6666 void vst3q_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x4x3_t * val, __constrange(0,3) int lane); 6667 #define vst3q_lane_s32(ptr, val, lane) vst3q_lane_u32((uint32_t *)ptr, val, lane) 6668 6669 //void vst3q_lane_f16(__transfersize(3) __fp16 * ptr, float16x8x3_t val, __constrange(0,7) int lane);// VST3.16 {d0[0], d2[0], d4[0]}, [r0] 6670 void vst3q_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x8x3_t * val, __constrange(0,7) int lane); 6671 //current IA SIMD doesn't support float16 6672 6673 //void vst3q_lane_f32(__transfersize(3) float32_t * ptr, float32x4x3_t val, __constrange(0,3) int lane)// VST3.32 {d0[0], d2[0], d4[0]}, [r0] 6674 _NEON2SSE_INLINE void vst3q_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x4x3_t* val, __constrange(0,3) int lane) 6675 { 6676 vst1q_lane_f32(ptr, val->val[0], lane); 6677 vst1q_lane_f32((ptr + 1), val->val[1], lane); 6678 vst1q_lane_f32((ptr + 2), val->val[2], lane); 6679 } 6680 #define vst3q_lane_f32(ptr, val, lane) vst3q_lane_f32_ptr(ptr, &val, lane) 6681 6682 //void vst3_lane_s8(__transfersize(3) int8_t * ptr, int8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] 6683 void vst3_lane_s8_ptr(__transfersize(3) int8_t * ptr, int8x8x3_t * val, __constrange(0,7) int lane); 6684 #define vst3_lane_s8(ptr, val, lane) vst3_lane_u8((uint8_t *)ptr, val, lane) 6685 6686 //void vst3_lane_s16(__transfersize(3) int16_t * ptr, int16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 6687 void vst3_lane_s16_ptr(__transfersize(3) int16_t * ptr, int16x4x3_t * val, __constrange(0,3) int lane); 6688 #define vst3_lane_s16(ptr, val, lane) vst3_lane_u16((uint16_t *)ptr, val, lane) 6689 6690 //void vst3_lane_s32(__transfersize(3) int32_t * ptr, int32x2x3_t val, __constrange(0,1) int lane);// VST3.32 {d0[0], d1[0], d2[0]}, [r0] 6691 void vst3_lane_s32_ptr(__transfersize(3) int32_t * ptr, int32x2x3_t * val, __constrange(0,1) int lane); 6692 #define vst3_lane_s32(ptr, val, lane) vst3_lane_u32((uint32_t *)ptr, val, lane) 6693 6694 //void vst3_lane_f16(__transfersize(3) __fp16 * ptr, float16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 6695 void vst3_lane_f16_ptr(__transfersize(3) __fp16 * ptr, float16x4x3_t * val, __constrange(0,3) int lane); 6696 //current IA SIMD doesn't support float16 6697 6698 //void vst3_lane_f32(__transfersize(3) float32_t * ptr, float32x2x3_t val, __constrange(0,1) int lane)// VST3.32 {d0[0], d1[0], d2[0]}, [r0] 6699 void vst3_lane_f32_ptr(__transfersize(3) float32_t * ptr, float32x2x3_t * val, 
__constrange(0,1) int lane); 6700 #define vst3_lane_f32 vst3q_lane_f32 6701 6702 //void vst3_lane_p8(__transfersize(3) poly8_t * ptr, poly8x8x3_t val, __constrange(0,7) int lane);// VST3.8 {d0[0], d1[0], d2[0]}, [r0] 6703 void vst3_lane_p8_ptr(__transfersize(3) poly8_t * ptr, poly8x8x3_t * val, __constrange(0,7) int lane); 6704 #define vst3_lane_p8 vst3_lane_u8 6705 6706 //void vst3_lane_p16(__transfersize(3) poly16_t * ptr, poly16x4x3_t val, __constrange(0,3) int lane);// VST3.16 {d0[0], d1[0], d2[0]}, [r0] 6707 void vst3_lane_p16_ptr(__transfersize(3) poly16_t * ptr, poly16x4x3_t * val, __constrange(0,3) int lane); 6708 #define vst3_lane_p16 vst3_lane_s16 6709 6710 //******************************** Quadruple lanes stores *********************************************** 6711 //******************************************************************************************************* 6712 //void vst4q_lane_u16(__transfersize(4) uint16_t * ptr, uint16x8x4_t val, __constrange(0,7) int lane)// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6713 _NEON2SSE_INLINE void vst4q_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x8x4_t* val4, __constrange(0,7) int lane) 6714 { 6715 vst2q_lane_u16_ptr(ptr, (uint16x8x2_t*)val4->val, lane); 6716 vst2q_lane_u16_ptr((ptr + 2),((uint16x8x2_t*)val4->val + 1), lane); 6717 } 6718 #define vst4q_lane_u16(ptr, val, lane) vst4q_lane_u16_ptr(ptr, &val, lane) 6719 6720 //void vst4q_lane_u32(__transfersize(4) uint32_t * ptr, uint32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6721 _NEON2SSE_INLINE void vst4q_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x4x4_t* val4, __constrange(0,3) int lane) 6722 { 6723 vst2q_lane_u32_ptr(ptr, (uint32x4x2_t*)val4->val, lane); 6724 vst2q_lane_u32_ptr((ptr + 2), ((uint32x4x2_t*)val4->val + 1), lane); 6725 } 6726 #define vst4q_lane_u32(ptr, val, lane) vst4q_lane_u32_ptr(ptr, &val, lane) 6727 6728 //void vst4q_lane_s16(__transfersize(4) int16_t * ptr, int16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6729 void vst4q_lane_s16_ptr(__transfersize(4) int16_t * ptr, int16x8x4_t * val, __constrange(0,7) int lane); 6730 #define vst4q_lane_s16(ptr,val,lane) vst4q_lane_u16((uint16_t *)ptr,val,lane) 6731 6732 //void vst4q_lane_s32(__transfersize(4) int32_t * ptr, int32x4x4_t val, __constrange(0,3) int lane);// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6733 void vst4q_lane_s32_ptr(__transfersize(4) int32_t * ptr, int32x4x4_t * val, __constrange(0,3) int lane); 6734 #define vst4q_lane_s32(ptr,val,lane) vst4q_lane_u32((uint32_t *)ptr,val,lane) 6735 6736 //void vst4q_lane_f16(__transfersize(4) __fp16 * ptr, float16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6737 void vst4q_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x8x4_t * val, __constrange(0,7) int lane); 6738 //current IA SIMD doesn't support float16 6739 6740 //void vst4q_lane_f32(__transfersize(4) float32_t * ptr, float32x4x4_t val, __constrange(0,3) int lane)// VST4.32 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6741 _NEON2SSE_INLINE void vst4q_lane_f32_ptr(__transfersize(4) float32_t * ptr, float32x4x4_t* val, __constrange(0,3) int lane) 6742 { 6743 vst1q_lane_f32(ptr, val->val[0], lane); 6744 vst1q_lane_f32((ptr + 1), val->val[1], lane); 6745 vst1q_lane_f32((ptr + 2), val->val[2], lane); 6746 vst1q_lane_f32((ptr + 3), val->val[3], lane); 6747 } 6748 #define vst4q_lane_f32(ptr, val, lane) vst4q_lane_f32_ptr(ptr, &val, lane) 6749 6750 //void 
vst4q_lane_p16(__transfersize(4) poly16_t * ptr, poly16x8x4_t val, __constrange(0,7) int lane);// VST4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0] 6751 void vst4q_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x8x4_t * val, __constrange(0,7) int lane); 6752 #define vst4q_lane_p16 vst4q_lane_u16 6753 6754 //void vst4_lane_u8(__transfersize(4) uint8_t * ptr, uint8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6755 _NEON2SSE_INLINE void vst4_lane_u8_ptr(__transfersize(4) uint8_t * ptr, uint8x8x4_t* val, __constrange(0,7) int lane) 6756 { 6757 vst1q_lane_u8(ptr, val->val[0], lane); 6758 vst1q_lane_u8((ptr + 1), val->val[1], lane); 6759 vst1q_lane_u8((ptr + 2), val->val[2], lane); 6760 vst1q_lane_u8((ptr + 3), val->val[3], lane); 6761 } 6762 #define vst4_lane_u8(ptr, val, lane) vst4_lane_u8_ptr(ptr, &val, lane) 6763 6764 //void vst4_lane_u16(__transfersize(4) uint16_t * ptr, uint16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6765 _NEON2SSE_INLINE void vst4_lane_u16_ptr(__transfersize(4) uint16_t * ptr, uint16x4x4_t* val, __constrange(0,3) int lane) 6766 { 6767 vst1q_lane_u16(ptr, val->val[0], lane); 6768 vst1q_lane_u16((ptr + 1),val->val[1], lane); 6769 vst1q_lane_u16((ptr + 2), val->val[2], lane); 6770 vst1q_lane_u16((ptr + 3), val->val[3], lane); 6771 } 6772 #define vst4_lane_u16(ptr, val, lane) vst4_lane_u16_ptr(ptr, &val, lane) 6773 6774 //void vst4_lane_u32(__transfersize(4) uint32_t * ptr, uint32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6775 _NEON2SSE_INLINE void vst4_lane_u32_ptr(__transfersize(4) uint32_t * ptr, uint32x2x4_t* val, __constrange(0,1) int lane) 6776 { 6777 vst1q_lane_u32(ptr, val->val[0], lane); 6778 vst1q_lane_u32((ptr + 1), val->val[1], lane); 6779 vst1q_lane_u32((ptr + 2), val->val[2], lane); 6780 vst1q_lane_u32((ptr + 3), val->val[3], lane); 6781 6782 } 6783 #define vst4_lane_u32(ptr, val, lane) vst4_lane_u32_ptr(ptr, &val, lane) 6784 6785 //void vst4_lane_s8(__transfersize(4) int8_t * ptr, int8x8x4_t val, __constrange(0,7) int lane)// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6786 #define vst4_lane_s8(ptr, val, lane) vst4_lane_u8((uint8_t*)ptr, val, lane) 6787 6788 //void vst4_lane_s16(__transfersize(4) int16_t * ptr, int16x4x4_t val, __constrange(0,3) int lane)// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6789 #define vst4_lane_s16(ptr, val, lane) vst4_lane_u16((uint16_t*)ptr, val, lane) 6790 6791 //void vst4_lane_s32(__transfersize(4) int32_t * ptr, int32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6792 #define vst4_lane_s32(ptr, val, lane) vst4_lane_u32((uint32_t*)ptr, val, lane) 6793 6794 //void vst4_lane_f16(__transfersize(4) __fp16 * ptr, float16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6795 void vst4_lane_f16_ptr(__transfersize(4) __fp16 * ptr, float16x4x4_t * val, __constrange(0,3) int lane); 6796 //current IA SIMD doesn't support float16 6797 6798 //void vst4_lane_f32(__transfersize(4) float32_t * ptr, float32x2x4_t val, __constrange(0,1) int lane)// VST4.32 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6799 #define vst4_lane_f32 vst4q_lane_f32 6800 6801 //void vst4_lane_p8(__transfersize(4) poly8_t * ptr, poly8x8x4_t val, __constrange(0,7) int lane);// VST4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6802 void vst4_lane_p8_ptr(__transfersize(4) poly8_t * ptr, poly8x8x4_t * val, __constrange(0,7) int lane); 6803 #define vst4_lane_p8 vst4_lane_u8 6804 6805 //void 
vst4_lane_p16(__transfersize(4) poly16_t * ptr, poly16x4x4_t val, __constrange(0,3) int lane);// VST4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0] 6806 void vst4_lane_p16_ptr(__transfersize(4) poly16_t * ptr, poly16x4x4_t * val, __constrange(0,3) int lane); 6807 #define vst4_lane_p16 vst4_lane_u16 6808 6809 //************************************************************************************************** 6810 //************************ Extract lanes from a vector ******************************************** 6811 //************************************************************************************************** 6812 //These intrinsics extract a single lane (element) from a vector. 6813 6814 uint8_t vgetq_lane_u8(uint8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 6815 #define vgetq_lane_u8 _MM_EXTRACT_EPI8 6816 6817 uint16_t vgetq_lane_u16(uint16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] 6818 #define vgetq_lane_u16 _MM_EXTRACT_EPI16 6819 6820 uint32_t vgetq_lane_u32(uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 6821 #define vgetq_lane_u32 _MM_EXTRACT_EPI32 6822 6823 int8_t vgetq_lane_s8(int8x16_t vec, __constrange(0,15) int lane); // VMOV.S8 r0, d0[0] 6824 #define vgetq_lane_s8 vgetq_lane_u8 6825 6826 int16_t vgetq_lane_s16(int16x8_t vec, __constrange(0,7) int lane); // VMOV.S16 r0, d0[0] 6827 #define vgetq_lane_s16 vgetq_lane_u16 6828 6829 int32_t vgetq_lane_s32(int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 6830 #define vgetq_lane_s32 vgetq_lane_u32 6831 6832 poly8_t vgetq_lane_p8(poly8x16_t vec, __constrange(0,15) int lane); // VMOV.U8 r0, d0[0] 6833 #define vgetq_lane_p8 vgetq_lane_u8 6834 6835 poly16_t vgetq_lane_p16(poly16x8_t vec, __constrange(0,7) int lane); // VMOV.s16 r0, d0[0] 6836 #define vgetq_lane_p16 vgetq_lane_u16 6837 6838 float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 r0, d0[0] 6839 _NEON2SSE_INLINE float32_t vgetq_lane_f32(float32x4_t vec, __constrange(0,3) int lane) 6840 { 6841 int32_t ilane; 6842 ilane = _MM_EXTRACT_PS(vec,lane); 6843 return *(float*)&ilane; 6844 } 6845 6846 int64_t vgetq_lane_s64(int64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 6847 #define vgetq_lane_s64 (int64_t) vgetq_lane_u64 6848 6849 uint64_t vgetq_lane_u64(uint64x2_t vec, __constrange(0,1) int lane); // VMOV r0,r0,d0 6850 #define vgetq_lane_u64 _MM_EXTRACT_EPI64 6851 6852 // ***************** Set lanes within a vector ******************************************** 6853 // ************************************************************************************** 6854 //These intrinsics set a single lane (element) within a vector. 6855 //same functions as vld1_lane_xx ones, but take the value to be set directly. 

uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE uint8x16_t vsetq_lane_u8(uint8_t value, uint8x16_t vec, __constrange(0,15) int lane)
{
    uint8_t val;
    val = value;
    return vld1q_lane_u8(&val, vec, lane);
}

uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE uint16x8_t vsetq_lane_u16(uint16_t value, uint16x8_t vec, __constrange(0,7) int lane)
{
    uint16_t val;
    val = value;
    return vld1q_lane_u16(&val, vec, lane);
}

uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE uint32x4_t vsetq_lane_u32(uint32_t value, uint32x4_t vec, __constrange(0,3) int lane)
{
    uint32_t val;
    val = value;
    return vld1q_lane_u32(&val, vec, lane);
}

int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
_NEON2SSE_INLINE int8x16_t vsetq_lane_s8(int8_t value, int8x16_t vec, __constrange(0,15) int lane)
{
    int8_t val;
    val = value;
    return vld1q_lane_s8(&val, vec, lane);
}

int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
_NEON2SSE_INLINE int16x8_t vsetq_lane_s16(int16_t value, int16x8_t vec, __constrange(0,7) int lane)
{
    int16_t val;
    val = value;
    return vld1q_lane_s16(&val, vec, lane);
}

int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE int32x4_t vsetq_lane_s32(int32_t value, int32x4_t vec, __constrange(0,3) int lane)
{
    int32_t val;
    val = value;
    return vld1q_lane_s32(&val, vec, lane);
}

poly8x16_t vsetq_lane_p8(poly8_t value, poly8x16_t vec, __constrange(0,15) int lane); // VMOV.8 d0[0],r0
#define vsetq_lane_p8 vsetq_lane_u8

poly16x8_t vsetq_lane_p16(poly16_t value, poly16x8_t vec, __constrange(0,7) int lane); // VMOV.16 d0[0],r0
#define vsetq_lane_p16 vsetq_lane_u16

float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane); // VMOV.32 d0[0],r0
_NEON2SSE_INLINE float32x4_t vsetq_lane_f32(float32_t value, float32x4_t vec, __constrange(0,3) int lane)
{
    float32_t val;
    val = value;
    return vld1q_lane_f32(&val, vec, lane);
}

int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
_NEON2SSE_INLINE int64x2_t vsetq_lane_s64(int64_t value, int64x2_t vec, __constrange(0,1) int lane)
{
    int64_t val;
    val = value;
    return vld1q_lane_s64(&val, vec, lane);
}

uint64x2_t vsetq_lane_u64(uint64_t value, uint64x2_t vec, __constrange(0,1) int lane); // VMOV d0,r0,r0
#define vsetq_lane_u64 vsetq_lane_s64

// *******************************************************************************
// **************** Initialize a vector from bit pattern ***************************
// *******************************************************************************
//These intrinsics create a vector from a literal bit pattern.
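
//Illustrative usage sketch for the lane set/get intrinsics above, not part of the NEON API:
//it writes a scalar into lane 2 of a float vector and reads it back. The function name is
//hypothetical and the lane index must be a compile-time constant.
_NEON2SSE_INLINE float32_t example_set_then_get_lane2(float32x4_t v, float32_t x)
{
    float32x4_t tmp;
    tmp = vsetq_lane_f32(x, v, 2); //replaces lane 2 with x, the other lanes are kept
    return vgetq_lane_f32(tmp, 2); //extracts lane 2 again, so the result equals x
}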
6933 6934 //no IA32 SIMD avalilable 6935 6936 //********************* Set all lanes to same value ******************************** 6937 //********************************************************************************* 6938 //These intrinsics set all lanes to the same value. 6939 6940 uint8x16_t vdupq_n_u8(uint8_t value); // VDUP.8 q0,r0 6941 #define vdupq_n_u8(value) _mm_set1_epi8((uint8_t) (value)) 6942 6943 uint16x8_t vdupq_n_u16(uint16_t value); // VDUP.16 q0,r0 6944 #define vdupq_n_u16(value) _mm_set1_epi16((uint16_t) (value)) 6945 6946 uint32x4_t vdupq_n_u32(uint32_t value); // VDUP.32 q0,r0 6947 #define vdupq_n_u32(value) _mm_set1_epi32((uint32_t) (value)) 6948 6949 int8x16_t vdupq_n_s8(int8_t value); // VDUP.8 q0,r0 6950 #define vdupq_n_s8 _mm_set1_epi8 6951 6952 int16x8_t vdupq_n_s16(int16_t value); // VDUP.16 q0,r0 6953 #define vdupq_n_s16 _mm_set1_epi16 6954 6955 int32x4_t vdupq_n_s32(int32_t value); // VDUP.32 q0,r0 6956 #define vdupq_n_s32 _mm_set1_epi32 6957 6958 poly8x16_t vdupq_n_p8(poly8_t value); // VDUP.8 q0,r0 6959 #define vdupq_n_p8 vdupq_n_u8 6960 6961 poly16x8_t vdupq_n_p16(poly16_t value); // VDUP.16 q0,r0 6962 #define vdupq_n_p16 vdupq_n_u16 6963 6964 float32x4_t vdupq_n_f32(float32_t value); // VDUP.32 q0,r0 6965 #define vdupq_n_f32 _mm_set1_ps 6966 6967 int64x2_t vdupq_n_s64(int64_t value); // VMOV d0,r0,r0 6968 _NEON2SSE_INLINE int64x2_t vdupq_n_s64(int64_t value) 6969 { 6970 _NEON2SSE_ALIGN_16 int64_t value2[2] = {value, value}; //value may be an immediate 6971 return LOAD_SI128(value2); 6972 } 6973 6974 uint64x2_t vdupq_n_u64(uint64_t value); // VMOV d0,r0,r0 6975 _NEON2SSE_INLINE uint64x2_t vdupq_n_u64(uint64_t value) 6976 { 6977 _NEON2SSE_ALIGN_16 uint64_t val[2] = {value, value}; //value may be an immediate 6978 return LOAD_SI128(val); 6979 } 6980 6981 //**** Set all lanes to same value ************************ 6982 //Same functions as above - just aliaces.******************** 6983 //Probably they reflect the fact that 128-bit functions versions use VMOV instruction ********** 6984 6985 uint8x16_t vmovq_n_u8(uint8_t value); // VDUP.8 q0,r0 6986 #define vmovq_n_u8 vdupq_n_u8 6987 6988 uint16x8_t vmovq_n_u16(uint16_t value); // VDUP.16 q0,r0 6989 #define vmovq_n_u16 vdupq_n_s16 6990 6991 uint32x4_t vmovq_n_u32(uint32_t value); // VDUP.32 q0,r0 6992 #define vmovq_n_u32 vdupq_n_u32 6993 6994 int8x16_t vmovq_n_s8(int8_t value); // VDUP.8 q0,r0 6995 #define vmovq_n_s8 vdupq_n_s8 6996 6997 int16x8_t vmovq_n_s16(int16_t value); // VDUP.16 q0,r0 6998 #define vmovq_n_s16 vdupq_n_s16 6999 7000 int32x4_t vmovq_n_s32(int32_t value); // VDUP.32 q0,r0 7001 #define vmovq_n_s32 vdupq_n_s32 7002 7003 poly8x16_t vmovq_n_p8(poly8_t value); // VDUP.8 q0,r0 7004 #define vmovq_n_p8 vdupq_n_u8 7005 7006 poly16x8_t vmovq_n_p16(poly16_t value); // VDUP.16 q0,r0 7007 #define vmovq_n_p16 vdupq_n_s16 7008 7009 float32x4_t vmovq_n_f32(float32_t value); // VDUP.32 q0,r0 7010 #define vmovq_n_f32 vdupq_n_f32 7011 7012 int64x2_t vmovq_n_s64(int64_t value); // VMOV d0,r0,r0 7013 #define vmovq_n_s64 vdupq_n_s64 7014 7015 uint64x2_t vmovq_n_u64(uint64_t value); // VMOV d0,r0,r0 7016 #define vmovq_n_u64 vdupq_n_u64 7017 7018 //**************Set all lanes to the value of one lane of a vector ************* 7019 //**************************************************************************** 7020 //here shuffle is better solution than lane extraction followed by set1 function 7021 7022 // ******************************************************************** 7023 // ******************** Combining 
vectors ***************************** 7024 // ******************************************************************** 7025 //These intrinsics join two 64 bit vectors into a single 128bit vector. 7026 7027 //current IA SIMD doesn't support float16 7028 7029 //********************************************************************** 7030 //************************* Splitting vectors ************************** 7031 //********************************************************************** 7032 //**************** Get high part ****************************************** 7033 //These intrinsics split a 128 bit vector into 2 component 64 bit vectors 7034 7035 // IA32 SIMD doesn't work with 16bit floats currently 7036 7037 //********************** Get low part ********************** 7038 //********************************************************** 7039 7040 // IA32 SIMD doesn't work with 16bit floats currently 7041 7042 //************************************************************************** 7043 //************************ Converting vectors ********************************** 7044 //************************************************************************** 7045 //************* Convert from float *************************************** 7046 // need to set _MM_SET_ROUNDING_MODE ( x) accordingly 7047 7048 int32x4_t vcvtq_s32_f32(float32x4_t a); // VCVT.S32.F32 q0, q0 7049 #define vcvtq_s32_f32 _mm_cvtps_epi32 7050 7051 uint32x4_t vcvtq_u32_f32(float32x4_t a); // VCVT.U32.F32 q0, q0 7052 _NEON2SSE_INLINE uint32x4_t vcvtq_u32_f32(float32x4_t a) // VCVT.U32.F32 q0, q0 7053 { //No single instruction SSE solution but we could implement it as following: 7054 __m128i resi; 7055 __m128 zero, mask, a_pos, mask_f_max_si, res; 7056 _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; 7057 zero = _mm_setzero_ps(); 7058 mask = _mm_cmpgt_ps(a, zero); 7059 a_pos = _mm_and_ps(a, mask); 7060 mask_f_max_si = _mm_cmpgt_ps(a_pos,*(__m128*)c7fffffff); 7061 res = _mm_sub_ps(a_pos, mask_f_max_si); //if the input fits to signed we don't subtract anything 7062 resi = _mm_cvtps_epi32(res); 7063 return _mm_add_epi32(resi, *(__m128i*)&mask_f_max_si); 7064 } 7065 7066 // ***** Convert to the fixed point with the number of fraction bits specified by b *********** 7067 //************************************************************************************************* 7068 //Intel SIMD doesn't support fixed point 7069 7070 int32x4_t vcvtq_n_s32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.S32.F32 q0, q0, #32 7071 uint32x4_t vcvtq_n_u32_f32(float32x4_t a, __constrange(1,32) int b); // VCVT.U32.F32 q0, q0, #32 7072 7073 //***************** Convert to float ************************* 7074 //************************************************************* 7075 7076 float32x4_t vcvtq_f32_s32(int32x4_t a); // VCVT.F32.S32 q0, q0 7077 #define vcvtq_f32_s32(a) _mm_cvtepi32_ps(a) 7078 7079 float32x4_t vcvtq_f32_u32(uint32x4_t a); // VCVT.F32.U32 q0, q0 7080 _NEON2SSE_INLINE float32x4_t vcvtq_f32_u32(uint32x4_t a) // VCVT.F32.U32 q0, q0 7081 { //solution may be not optimal 7082 __m128 two16, fHi, fLo; 7083 __m128i hi, lo; 7084 two16 = _mm_set1_ps((float)0x10000); //2^16 7085 // Avoid double rounding by doing two exact conversions 7086 // of high and low 16-bit segments 7087 hi = _mm_srli_epi32(a, 16); 7088 lo = _mm_srli_epi32(_mm_slli_epi32(a, 16), 16); 7089 fHi = _mm_mul_ps(_mm_cvtepi32_ps(hi), two16); 7090 fLo = _mm_cvtepi32_ps(lo); 7091 // do single rounding according to current rounding mode 7092 
    return _mm_add_ps(fHi, fLo);
}

//**************Convert between floats ***********************
//************************************************************

//Intel SIMD doesn't support 16-bit floats currently

//Intel SIMD doesn't support 16-bit floats currently, the only solution is to store 16-bit floats and load them as 32-bit ones

//************Vector narrow integer conversion (truncation) ******************
//****************************************************************************

//**************** Vector long move ***********************
//***********************************************************

//*************Vector saturating narrow integer*****************
//**************************************************************

//************* Vector saturating narrow integer signed->unsigned **************
//*****************************************************************************

// ********************************************************
// **************** Table look up **************************
// ********************************************************
//VTBL (Vector Table Lookup) uses byte indexes in a control vector to look up byte values
//in a table and generate a new vector. Indexes out of range return 0.
//For Intel SIMD we need to set the MSB of the out-of-range indexes to 1 to get the zero return, see the illustrative sketch below.

//Special trick to avoid the "__declspec(align('8')) won't be aligned" error

//Special trick to avoid the "__declspec(align('16')) won't be aligned" error

//****************** Extended table look up intrinsics ***************************
//**********************************************************************************
//VTBX (Vector Table Extension) works in the same way as VTBL does,
// except that indexes out of range leave the destination element unchanged.
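
//Illustrative sketch of the VTBL out-of-range trick described above, not part of the NEON API:
//_mm_shuffle_epi8 returns 0 for every byte whose index has the MSB set, so forcing the MSB to 1
//for indexes greater than 15 emulates the "out of range gives 0" rule of a 16-byte table lookup.
//The helper name is hypothetical; SSSE3 is required for _mm_shuffle_epi8.
#if defined(USE_SSSE3)
_NEON2SSE_INLINE uint8x16_t example_vtbl16_u8(uint8x16_t table, uint8x16_t idx)
{
    __m128i c15, out_of_range;
    c15 = _mm_set1_epi8(15);
    out_of_range = _mm_cmpgt_epi8(idx, c15); //0xff where idx is 16..127; indexes >= 128 already have the MSB set
    idx = _mm_or_si128(idx, out_of_range); //sets the MSB for the out-of-range indexes
    return _mm_shuffle_epi8(table, idx); //those bytes become 0, as VTBL requires
}
#endif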
7129 7130 //Special trick to avoid __declspec(align('8')) won't be aligned" error 7131 7132 //************************************************************************************************* 7133 // *************************** Operations with a scalar value ********************************* 7134 //************************************************************************************************* 7135 7136 //******* Vector multiply accumulate by scalar ************************************************* 7137 //********************************************************************************************** 7138 7139 //***************** Vector widening multiply accumulate by scalar ********************** 7140 //*************************************************************************************** 7141 7142 // ******** Vector widening saturating doubling multiply accumulate by scalar ******************************* 7143 // ************************************************************************************************ 7144 7145 // ****** Vector multiply subtract by scalar ***************** 7146 // ************************************************************* 7147 7148 // **** Vector widening multiply subtract by scalar **** 7149 // **************************************************** 7150 7151 //********* Vector widening saturating doubling multiply subtract by scalar ************************** 7152 //****************************************************************************************************** 7153 7154 //********** Vector multiply with scalar ***************************** 7155 7156 int16x8_t vmulq_n_s16(int16x8_t a, int16_t b); // VMUL.I16 q0,q0,d0[0] 7157 _NEON2SSE_INLINE int16x8_t vmulq_n_s16(int16x8_t a, int16_t b) // VMUL.I16 q0,q0,d0[0] 7158 { 7159 int16x8_t b16x8; 7160 b16x8 = vdupq_n_s16(b); 7161 return vmulq_s16(a, b16x8); 7162 } 7163 7164 int32x4_t vmulq_n_s32(int32x4_t a, int32_t b); // VMUL.I32 q0,q0,d0[0] 7165 _NEON2SSE_INLINE int32x4_t vmulq_n_s32(int32x4_t a, int32_t b) // VMUL.I32 q0,q0,d0[0] 7166 { 7167 int32x4_t b32x4; 7168 b32x4 = vdupq_n_s32(b); 7169 return vmulq_s32(a, b32x4); 7170 } 7171 7172 float32x4_t vmulq_n_f32(float32x4_t a, float32_t b); // VMUL.F32 q0,q0,d0[0] 7173 _NEON2SSE_INLINE float32x4_t vmulq_n_f32(float32x4_t a, float32_t b) // VMUL.F32 q0,q0,d0[0] 7174 { 7175 float32x4_t b32x4; 7176 b32x4 = vdupq_n_f32(b); 7177 return vmulq_f32(a, b32x4); 7178 } 7179 7180 uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b); // VMUL.I16 q0,q0,d0[0] 7181 _NEON2SSE_INLINE uint16x8_t vmulq_n_u16(uint16x8_t a, uint16_t b) // VMUL.I16 q0,q0,d0[0] 7182 { 7183 uint16x8_t b16x8; 7184 b16x8 = vdupq_n_s16(b); 7185 return vmulq_s16(a, b16x8); 7186 } 7187 7188 uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b); // VMUL.I32 q0,q0,d0[0] 7189 _NEON2SSE_INLINE uint32x4_t vmulq_n_u32(uint32x4_t a, uint32_t b) // VMUL.I32 q0,q0,d0[0] 7190 { 7191 uint32x4_t b32x4; 7192 b32x4 = vdupq_n_u32(b); 7193 return vmulq_u32(a, b32x4); 7194 } 7195 7196 //**** Vector long multiply with scalar ************ 7197 7198 //**** Vector long multiply by scalar **** 7199 7200 //********* Vector saturating doubling long multiply with scalar ******************* 7201 7202 //************* Vector saturating doubling long multiply by scalar *********************************************** 7203 7204 // *****Vector saturating doubling multiply high with scalar ***** 7205 7206 int16x8_t vqdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQDMULH.S16 q0,q0,d0[0] 7207 _NEON2SSE_INLINE int16x8_t vqdmulhq_n_s16(int16x8_t vec1, 
int16_t val2) // VQDMULH.S16 q0,q0,d0[0] 7208 { //solution may be not optimal 7209 int16x8_t scalar; 7210 scalar = vdupq_n_s16(val2); 7211 return vqdmulhq_s16(vec1, scalar); 7212 } 7213 7214 int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQDMULH.S32 q0,q0,d0[0] 7215 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 7216 { 7217 int32x4_t scalar; 7218 scalar = vdupq_n_s32(val2); 7219 return vqdmulhq_s32(vec1, scalar); 7220 } 7221 7222 //***** Vector saturating doubling multiply high by scalar **************** 7223 7224 //******** Vector saturating rounding doubling multiply high with scalar *** 7225 7226 #if defined(USE_SSSE3) 7227 int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2); // VQRDMULH.S16 q0,q0,d0[0] 7228 _NEON2SSE_INLINE int16x8_t vqrdmulhq_n_s16(int16x8_t vec1, int16_t val2) // VQRDMULH.S16 q0,q0,d0[0] 7229 { //solution may be not optimal 7230 int16x8_t scalar; 7231 scalar = vdupq_n_s16(val2); 7232 return vqrdmulhq_s16(vec1, scalar); 7233 } 7234 #endif 7235 7236 #if defined(USE_SSSE3) 7237 int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2); // VQRDMULH.S32 q0,q0,d0[0] 7238 _NEON2SSE_INLINE _NEON2SSE_PERFORMANCE_WARNING(int32x4_t vqrdmulhq_n_s32(int32x4_t vec1, int32_t val2), _NEON2SSE_REASON_SLOW_UNEFFECTIVE) 7239 { 7240 int32x4_t scalar; 7241 scalar = vdupq_n_s32(val2); 7242 return vqrdmulhq_s32(vec1, scalar); 7243 } 7244 #endif 7245 7246 //********* Vector rounding saturating doubling multiply high by scalar **** 7247 7248 //**************Vector multiply accumulate with scalar ******************* 7249 7250 int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLA.I16 q0, q0, d0[0] 7251 _NEON2SSE_INLINE int16x8_t vmlaq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLA.I16 q0, q0, d0[0] 7252 { 7253 int16x8_t scalar; 7254 scalar = vdupq_n_s16(c); 7255 return vmlaq_s16(a,b,scalar); 7256 } 7257 7258 int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLA.I32 q0, q0, d0[0] 7259 _NEON2SSE_INLINE int32x4_t vmlaq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLA.I32 q0, q0, d0[0] 7260 { 7261 int32x4_t scalar; 7262 scalar = vdupq_n_s32(c); 7263 return vmlaq_s32(a,b,scalar); 7264 } 7265 7266 uint16x8_t vmlaq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLA.I16 q0, q0, d0[0] 7267 #define vmlaq_n_u16 vmlaq_n_s16 7268 7269 uint32x4_t vmlaq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLA.I32 q0, q0, d0[0] 7270 #define vmlaq_n_u32 vmlaq_n_s32 7271 7272 float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLA.F32 q0, q0, d0[0] 7273 _NEON2SSE_INLINE float32x4_t vmlaq_n_f32(float32x4_t a, float32x4_t b, float32_t c) // VMLA.F32 q0, q0, d0[0] 7274 { 7275 float32x4_t scalar; 7276 scalar = vdupq_n_f32(c); 7277 return vmlaq_f32(a,b,scalar); 7278 } 7279 7280 //************Vector widening multiply accumulate with scalar**************************** 7281 7282 //************ Vector widening saturating doubling multiply accumulate with scalar ************** 7283 7284 //******** Vector multiply subtract with scalar ************** 7285 7286 int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c); // VMLS.I16 q0, q0, d0[0] 7287 _NEON2SSE_INLINE int16x8_t vmlsq_n_s16(int16x8_t a, int16x8_t b, int16_t c) // VMLS.I16 q0, q0, d0[0] 7288 { 7289 int16x8_t vc; 7290 vc = vdupq_n_s16(c); 7291 return vmlsq_s16(a, b,vc); 7292 } 7293 7294 int32x4_t vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c); // VMLS.I32 q0, q0, d0[0] 7295 _NEON2SSE_INLINE int32x4_t 
vmlsq_n_s32(int32x4_t a, int32x4_t b, int32_t c) // VMLS.I32 q0, q0, d0[0]
{
    int32x4_t vc;
    vc = vdupq_n_s32(c);
    return vmlsq_s32(a,b,vc);
}

uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c); // VMLS.I16 q0, q0, d0[0]
_NEON2SSE_INLINE uint16x8_t vmlsq_n_u16(uint16x8_t a, uint16x8_t b, uint16_t c) // VMLS.I16 q0, q0, d0[0]
{
    uint16x8_t vc;
    vc = vdupq_n_u16(c);
    return vmlsq_u16(a,b,vc);
}

uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c); // VMLS.I32 q0, q0, d0[0]
_NEON2SSE_INLINE uint32x4_t vmlsq_n_u32(uint32x4_t a, uint32x4_t b, uint32_t c) // VMLS.I32 q0, q0, d0[0]
{
    uint32x4_t vc;
    vc = vdupq_n_u32(c);
    return vmlsq_u32(a,b,vc);
}

float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c); // VMLS.F32 q0, q0, d0[0]
_NEON2SSE_INLINE float32x4_t vmlsq_n_f32(float32x4_t a, float32x4_t b, float32_t c)
{
    float32x4_t vc;
    vc = vdupq_n_f32(c);
    return vmlsq_f32(a,b,vc);
}

//**** Vector widening multiply subtract with scalar ******

//***** Vector widening saturating doubling multiply subtract with scalar *********
//**********************************************************************************

//******************* Vector extract ***********************************************
//*************************************************************************************
//VEXT (Vector Extract) extracts elements from the bottom end of the second operand
//vector and the top end of the first, concatenates them, and places the result in the destination vector:
//c elements from the bottom end of the second operand and the remaining (n-c) elements from the top end of the first

#if defined(USE_SSSE3)
//same result tested

#endif

#if defined(USE_SSSE3)
int8x16_t vextq_s8(int8x16_t a, int8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_s8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)

uint8x16_t vextq_u8(uint8x16_t a, uint8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_u8(a,b,c) _MM_ALIGNR_EPI8 (b,a,c)

poly8x16_t vextq_p8(poly8x16_t a, poly8x16_t b, __constrange(0,15) int c); // VEXT.8 q0,q0,q0,#0
#define vextq_p8 vextq_s8

int16x8_t vextq_s16(int16x8_t a, int16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_s16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)

uint16x8_t vextq_u16(uint16x8_t a, uint16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_u16(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 2)

poly16x8_t vextq_p16(poly16x8_t a, poly16x8_t b, __constrange(0,7) int c); // VEXT.16 q0,q0,q0,#0
#define vextq_p16 vextq_s16
#endif

#if defined(USE_SSSE3)
int32x4_t vextq_s32(int32x4_t a, int32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
#define vextq_s32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)

uint32x4_t vextq_u32(uint32x4_t a, uint32x4_t b, __constrange(0,3) int c); // VEXT.32 q0,q0,q0,#0
#define vextq_u32(a,b,c) _MM_ALIGNR_EPI8 (b,a,c * 4)

int64x2_t vextq_s64(int64x2_t a, int64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
#define vextq_s64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)

uint64x2_t vextq_u64(uint64x2_t a, uint64x2_t b, __constrange(0,1) int c); // VEXT.64 q0,q0,q0,#0
#define vextq_u64(a,b,c) _MM_ALIGNR_EPI8(b,a,c * 8)
#endif

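//Illustrative sketch of the vector extract mapping above, not part of the NEON API: on NEON,
//vextq_u8(a, b, 3) yields bytes a[3]..a[15] followed by b[0]..b[2], which is exactly what
//_mm_alignr_epi8(b, a, 3) produces on IA32. The helper name is hypothetical; the element count
//must be a compile-time constant, and the example is guarded because vextq_u8 needs SSSE3 here.
#if defined(USE_SSSE3)
_NEON2SSE_INLINE uint8x16_t example_shift_in_3_bytes(uint8x16_t a, uint8x16_t b)
{
    return vextq_u8(a, b, 3); //sliding 16-byte window over the pair {a, b}, advanced by 3 bytes
}
#endif
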
//************ Reverse vector elements (swap endianness)***************** 7377 //************************************************************************* 7378 //VREVn.m reverses the order of the m-bit lanes within a set that is n bits wide. 7379 7380 #if defined(USE_SSSE3) 7381 int8x16_t vrev64q_s8(int8x16_t vec); // VREV64.8 q0,q0 7382 _NEON2SSE_INLINE int8x16_t vrev64q_s8(int8x16_t vec) // VREV64.8 q0,q0 7383 { 7384 _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {7,6,5,4,3,2,1,0, 15,14,13,12,11,10,9, 8}; 7385 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8); 7386 } 7387 #endif 7388 7389 #if defined(USE_SSSE3) 7390 int16x8_t vrev64q_s16(int16x8_t vec); // VREV64.16 q0,q0 7391 _NEON2SSE_INLINE int16x8_t vrev64q_s16(int16x8_t vec) // VREV64.16 q0,q0 7392 { //no _mm_shuffle_epi16, _mm_shuffle_epi8 to be used with the corresponding mask 7393 _NEON2SSE_ALIGN_16 int8_t mask_rev_e16[16] = {6,7, 4,5,2,3,0,1,14,15,12,13,10,11,8,9}; 7394 return _mm_shuffle_epi8 (vec, *(__m128i*)mask_rev_e16); 7395 } 7396 #endif 7397 7398 int32x4_t vrev64q_s32(int32x4_t vec); // VREV64.32 q0,q0 7399 _NEON2SSE_INLINE int32x4_t vrev64q_s32(int32x4_t vec) // VREV64.32 q0,q0 7400 { 7401 return _mm_shuffle_epi32 (vec, 1 | (0 << 2) | (3 << 4) | (2 << 6) ); 7402 } 7403 7404 #if defined(USE_SSSE3) 7405 uint8x16_t vrev64q_u8(uint8x16_t vec); // VREV64.8 q0,q0 7406 #define vrev64q_u8 vrev64q_s8 7407 7408 uint16x8_t vrev64q_u16(uint16x8_t vec); // VREV64.16 q0,q0 7409 #define vrev64q_u16 vrev64q_s16 7410 #endif 7411 7412 uint32x4_t vrev64q_u32(uint32x4_t vec); // VREV64.32 q0,q0 7413 #define vrev64q_u32 vrev64q_s32 7414 7415 #if defined(USE_SSSE3) 7416 poly8x16_t vrev64q_p8(poly8x16_t vec); // VREV64.8 q0,q0 7417 #define vrev64q_p8 vrev64q_u8 7418 7419 poly16x8_t vrev64q_p16(poly16x8_t vec); // VREV64.16 q0,q0 7420 #define vrev64q_p16 vrev64q_s16 7421 #endif 7422 7423 float32x4_t vrev64q_f32(float32x4_t vec); // VREV64.32 q0,q0 7424 #define vrev64q_f32(vec) _mm_shuffle_ps (vec, vec, _MM_SHUFFLE(2,3, 0,1)) 7425 7426 //******************** 32 bit shuffles ********************** 7427 //************************************************************ 7428 7429 #if defined(USE_SSSE3) 7430 int8x16_t vrev32q_s8(int8x16_t vec); // VREV32.8 q0,q0 7431 _NEON2SSE_INLINE int8x16_t vrev32q_s8(int8x16_t vec) // VREV32.8 q0,q0 7432 { 7433 _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}; 7434 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8); 7435 } 7436 #endif 7437 7438 #if defined(USE_SSSE3) 7439 int16x8_t vrev32q_s16(int16x8_t vec); // VREV32.16 q0,q0 7440 _NEON2SSE_INLINE int16x8_t vrev32q_s16(int16x8_t vec) // VREV32.16 q0,q0 7441 { 7442 _NEON2SSE_ALIGN_16 int8_t mask_rev_e8[16] = {2,3,0,1, 6,7, 4,5, 10,11, 8,9, 14,15,12,13}; 7443 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev_e8); 7444 } 7445 #endif 7446 7447 #if defined(USE_SSSE3) 7448 uint8x16_t vrev32q_u8(uint8x16_t vec); // VREV32.8 q0,q0 7449 #define vrev32q_u8 vrev32q_s8 7450 7451 uint16x8_t vrev32q_u16(uint16x8_t vec); // VREV32.16 q0,q0 7452 #define vrev32q_u16 vrev32q_s16 7453 7454 poly8x16_t vrev32q_p8(poly8x16_t vec); // VREV32.8 q0,q0 7455 #define vrev32q_p8 vrev32q_u8 7456 #endif 7457 7458 //************* 16 bit shuffles ********************** 7459 //****************************************************** 7460 7461 #if defined(USE_SSSE3) 7462 int8x16_t vrev16q_s8(int8x16_t vec); // VREV16.8 q0,q0 7463 _NEON2SSE_INLINE int8x16_t vrev16q_s8(int8x16_t vec) // VREV16.8 q0,q0 7464 { 7465 _NEON2SSE_ALIGN_16 int8_t mask_rev8[16] = 
{1,0, 3,2, 5,4, 7,6, 9,8, 11, 10, 13, 12, 15, 14}; 7466 return _mm_shuffle_epi8 (vec, *(__m128i*) mask_rev8); 7467 } 7468 #endif 7469 7470 #if defined(USE_SSSE3) 7471 uint8x16_t vrev16q_u8(uint8x16_t vec); // VREV16.8 q0,q0 7472 #define vrev16q_u8 vrev16q_s8 7473 7474 poly8x16_t vrev16q_p8(poly8x16_t vec); // VREV16.8 q0,q0 7475 #define vrev16q_p8 vrev16q_u8 7476 #endif 7477 7478 //********************************************************************* 7479 //**************** Other single operand arithmetic ******************* 7480 //********************************************************************* 7481 7482 //*********** Absolute: Vd[i] = |Va[i]| ********************************** 7483 //************************************************************************ 7484 7485 int8x16_t vabsq_s8(int8x16_t a); // VABS.S8 q0,q0 7486 #define vabsq_s8 _mm_abs_epi8 7487 7488 int16x8_t vabsq_s16(int16x8_t a); // VABS.S16 q0,q0 7489 #define vabsq_s16 _mm_abs_epi16 7490 7491 int32x4_t vabsq_s32(int32x4_t a); // VABS.S32 q0,q0 7492 #define vabsq_s32 _mm_abs_epi32 7493 7494 float32x4_t vabsq_f32(float32x4_t a); // VABS.F32 q0,q0 7495 _NEON2SSE_INLINE float32x4_t vabsq_f32(float32x4_t a) // VABS.F32 q0,q0 7496 { 7497 _NEON2SSE_ALIGN_16 int32_t c7fffffff[4] = {0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff}; 7498 return _mm_and_ps (a, *(__m128*)c7fffffff); 7499 } 7500 7501 //****** Saturating absolute: Vd[i] = sat(|Va[i]|) ********************* 7502 //********************************************************************** 7503 //For signed-integer data types, the absolute value of the most negative value is not representable by the data type, saturation takes place 7504 7505 #if defined(USE_SSSE3) 7506 int8x16_t vqabsq_s8(int8x16_t a); // VQABS.S8 q0,q0 7507 _NEON2SSE_INLINE int8x16_t vqabsq_s8(int8x16_t a) // VQABS.S8 q0,q0 7508 { 7509 __m128i c_128, abs, abs_cmp; 7510 c_128 = _mm_set1_epi8 (0x80); //-128 7511 abs = _mm_abs_epi8 (a); 7512 abs_cmp = _mm_cmpeq_epi8 (abs, c_128); 7513 return _mm_xor_si128 (abs, abs_cmp); 7514 } 7515 #endif 7516 7517 #if defined(USE_SSSE3) 7518 int16x8_t vqabsq_s16(int16x8_t a); // VQABS.S16 q0,q0 7519 _NEON2SSE_INLINE int16x8_t vqabsq_s16(int16x8_t a) // VQABS.S16 q0,q0 7520 { 7521 __m128i c_32768, abs, abs_cmp; 7522 c_32768 = _mm_set1_epi16 (0x8000); //-32768 7523 abs = _mm_abs_epi16 (a); 7524 abs_cmp = _mm_cmpeq_epi16 (abs, c_32768); 7525 return _mm_xor_si128 (abs, abs_cmp); 7526 } 7527 #endif 7528 7529 #if defined(USE_SSSE3) 7530 int32x4_t vqabsq_s32(int32x4_t a); // VQABS.S32 q0,q0 7531 _NEON2SSE_INLINE int32x4_t vqabsq_s32(int32x4_t a) // VQABS.S32 q0,q0 7532 { 7533 __m128i c80000000, abs, abs_cmp; 7534 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value 7535 abs = _mm_abs_epi32 (a); 7536 abs_cmp = _mm_cmpeq_epi32 (abs, c80000000); 7537 return _mm_xor_si128 (abs, abs_cmp); 7538 } 7539 #endif 7540 7541 //*************** Negate: Vd[i] = - Va[i] ************************************* 7542 //***************************************************************************** 7543 //several Negate implementations possible for SIMD. 
7544 //e.//function _mm_sign function(a, negative numbers vector), but the following one gives good performance: 7545 7546 int8x16_t vnegq_s8(int8x16_t a); // VNE//q0,q0 7547 _NEON2SSE_INLINE int8x16_t vnegq_s8(int8x16_t a) // VNE//q0,q0 7548 { 7549 __m128i zero; 7550 zero = _mm_setzero_si128 (); 7551 return _mm_sub_epi8 (zero, a); 7552 } //or _mm_sign_epi8 (a, negative numbers vector) 7553 7554 int16x8_t vnegq_s16(int16x8_t a); // VNE//q0,q0 7555 _NEON2SSE_INLINE int16x8_t vnegq_s16(int16x8_t a) // VNE//q0,q0 7556 { 7557 __m128i zero; 7558 zero = _mm_setzero_si128 (); 7559 return _mm_sub_epi16 (zero, a); 7560 } //or _mm_sign_epi16 (a, negative numbers vector) 7561 7562 int32x4_t vnegq_s32(int32x4_t a); // VNE//q0,q0 7563 _NEON2SSE_INLINE int32x4_t vnegq_s32(int32x4_t a) // VNE//q0,q0 7564 { 7565 __m128i zero; 7566 zero = _mm_setzero_si128 (); 7567 return _mm_sub_epi32 (zero, a); 7568 } //or _mm_sign_epi32 (a, negative numbers vector) 7569 7570 float32x4_t vnegq_f32(float32x4_t a); // VNE//q0,q0 7571 _NEON2SSE_INLINE float32x4_t vnegq_f32(float32x4_t a) // VNE//q0,q0 7572 { 7573 _NEON2SSE_ALIGN_16 int32_t c80000000[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000}; 7574 return _mm_xor_ps (a, *(__m128*) c80000000); 7575 } 7576 7577 //************** Saturating Negate: sat(Vd[i] = - Va[i]) ************************** 7578 //*************************************************************************************** 7579 //For signed-integer data types, the negation of the most negative value can't be produced without saturation, while with saturation it is max positive 7580 7581 int8x16_t vqnegq_s8(int8x16_t a); // VQNE//q0,q0 7582 _NEON2SSE_INLINE int8x16_t vqnegq_s8(int8x16_t a) // VQNE//q0,q0 7583 { 7584 __m128i zero; 7585 zero = _mm_setzero_si128 (); 7586 return _mm_subs_epi8 (zero, a); //saturating substraction 7587 } 7588 7589 int16x8_t vqnegq_s16(int16x8_t a); // VQNE//q0,q0 7590 _NEON2SSE_INLINE int16x8_t vqnegq_s16(int16x8_t a) // VQNE//q0,q0 7591 { 7592 __m128i zero; 7593 zero = _mm_setzero_si128 (); 7594 return _mm_subs_epi16 (zero, a); //saturating substraction 7595 } 7596 7597 int32x4_t vqnegq_s32(int32x4_t a); // VQNE//q0,q0 7598 _NEON2SSE_INLINE int32x4_t vqnegq_s32(int32x4_t a) // VQNE//q0,q0 7599 { //solution may be not optimal compared with a serial 7600 __m128i c80000000, zero, sub, cmp; 7601 c80000000 = _mm_set1_epi32 (0x80000000); //most negative value 7602 zero = _mm_setzero_si128 (); 7603 sub = _mm_sub_epi32 (zero, a); //substraction 7604 cmp = _mm_cmpeq_epi32 (a, c80000000); 7605 return _mm_xor_si128 (sub, cmp); 7606 } 7607 7608 //****************** Count leading zeros ******************************** 7609 //************************************************************************** 7610 //no corresponding vector intrinsics in IA32, need to implement it. 
While the implementation is effective for 8 bits, it may be not for 16 and 32 bits 7611 7612 #if defined(USE_SSSE3) 7613 int8x16_t vclzq_s8(int8x16_t a); // VCLZ.I8 q0,q0 7614 _NEON2SSE_INLINE int8x16_t vclzq_s8(int8x16_t a) 7615 { 7616 _NEON2SSE_ALIGN_16 int8_t mask_CLZ[16] = { /* 0 */ 4,/* 1 */ 3,/* 2 */ 2,/* 3 */ 2, 7617 /* 4 */ 1,/* 5 */ 1,/* 6 */ 1,/* 7 */ 1, 7618 /* 8 */ 0,/* 9 */ 0,/* a */ 0,/* b */ 0, 7619 /* c */ 0,/* d */ 0,/* e */ 0,/* f */ 0}; 7620 __m128i maskLOW, c4, lowclz, mask, hiclz; 7621 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, don't need masking low to avoid zero if MSB is set - it happens automatically 7622 c4 = _mm_set1_epi8(4); 7623 lowclz = _mm_shuffle_epi8( *(__m128i*)mask_CLZ, a); //uses low 4 bits anyway 7624 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits 7625 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set 7626 hiclz = _mm_shuffle_epi8( *(__m128i*) mask_CLZ, mask); //uses low 4 bits anyway 7627 mask = _mm_cmpeq_epi8(hiclz, c4); // shows the need to add lowclz zeros 7628 lowclz = _mm_and_si128(lowclz,mask); 7629 return _mm_add_epi8(lowclz, hiclz); 7630 } 7631 #endif 7632 7633 #if defined(USE_SSSE3) 7634 int16x8_t vclzq_s16(int16x8_t a); // VCLZ.I16 q0,q0 7635 _NEON2SSE_INLINE int16x8_t vclzq_s16(int16x8_t a) 7636 { 7637 __m128i c7, res8x16, res8x16_swap; 7638 _NEON2SSE_ALIGN_16 int8_t mask8_sab[16] = { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}; 7639 _NEON2SSE_ALIGN_16 uint16_t mask8bit[8] = {0x00ff, 0x00ff, 0x00ff, 0x00ff,0x00ff, 0x00ff, 0x00ff, 0x00ff}; 7640 c7 = _mm_srli_epi16(*(__m128i*)mask8bit, 5); //7 7641 res8x16 = vclzq_s8(a); 7642 res8x16_swap = _mm_shuffle_epi8 (res8x16, *(__m128i*) mask8_sab); //horisontal pairs swap 7643 res8x16 = _mm_and_si128(res8x16, *(__m128i*)mask8bit); //lowclz 7644 res8x16_swap = _mm_and_si128(res8x16_swap, *(__m128i*)mask8bit); //hiclz 7645 c7 = _mm_cmpgt_epi16(res8x16_swap, c7); // shows the need to add lowclz zeros 7646 res8x16 = _mm_and_si128(res8x16, c7); //lowclz 7647 return _mm_add_epi16(res8x16_swap, res8x16); 7648 } 7649 #endif 7650 7651 int32x4_t vclzq_s32(int32x4_t a); // VCLZ.I32 q0,q0 7652 _NEON2SSE_INLINE int32x4_t vclzq_s32(int32x4_t a) 7653 { 7654 __m128i c55555555, c33333333, c0f0f0f0f, c3f, c32, tmp, tmp1, res; 7655 c55555555 = _mm_set1_epi32(0x55555555); 7656 c33333333 = _mm_set1_epi32(0x33333333); 7657 c0f0f0f0f = _mm_set1_epi32(0x0f0f0f0f); 7658 c3f = _mm_set1_epi32(0x3f); 7659 c32 = _mm_set1_epi32(32); 7660 tmp = _mm_srli_epi32(a, 1); 7661 res = _mm_or_si128(tmp, a); //atmp[i] |= (atmp[i] >> 1); 7662 tmp = _mm_srli_epi32(res, 2); 7663 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 2); 7664 tmp = _mm_srli_epi32(res, 4); 7665 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 4); 7666 tmp = _mm_srli_epi32(res, 8); 7667 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 8); 7668 tmp = _mm_srli_epi32(res, 16); 7669 res = _mm_or_si128(tmp, res); //atmp[i] |= (atmp[i] >> 16); 7670 7671 tmp = _mm_srli_epi32(res, 1); 7672 tmp = _mm_and_si128(tmp, c55555555); 7673 res = _mm_sub_epi32(res, tmp); //atmp[i] -= ((atmp[i] >> 1) & 0x55555555); 7674 7675 tmp = _mm_srli_epi32(res, 2); 7676 tmp = _mm_and_si128(tmp, c33333333); 7677 tmp1 = _mm_and_si128(res, c33333333); 7678 res = _mm_add_epi32(tmp, tmp1); //atmp[i] = (((atmp[i] >> 2) & 0x33333333) + (atmp[i] & 0x33333333)); 7679 7680 tmp = _mm_srli_epi32(res, 4); 7681 tmp = _mm_add_epi32(tmp, res); 7682 res = _mm_and_si128(tmp, c0f0f0f0f); //atmp[i] = (((atmp[i] >> 4) + atmp[i]) & 
0x0f0f0f0f); 7683 7684 tmp = _mm_srli_epi32(res, 8); 7685 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 8); 7686 7687 tmp = _mm_srli_epi32(res, 16); 7688 res = _mm_add_epi32(tmp, res); //atmp[i] += (atmp[i] >> 16); 7689 7690 res = _mm_and_si128(res, c3f); //atmp[i] = atmp[i] & 0x0000003f; 7691 7692 return _mm_sub_epi32(c32, res); //res[i] = 32 - atmp[i]; 7693 } 7694 7695 #if defined(USE_SSSE3) 7696 uint8x16_t vclzq_u8(uint8x16_t a); // VCLZ.I8 q0,q0 7697 #define vclzq_u8 vclzq_s8 7698 7699 uint16x8_t vclzq_u16(uint16x8_t a); // VCLZ.I16 q0,q0 7700 #define vclzq_u16 vclzq_s16 7701 #endif 7702 7703 uint32x4_t vclzq_u32(uint32x4_t a); // VCLZ.I32 q0,q0 7704 #define vclzq_u32 vclzq_s32 7705 7706 //************** Count leading sign bits ************************** 7707 //******************************************************************** 7708 //VCLS (Vector Count Leading Sign bits) counts the number of consecutive bits following 7709 // the topmost bit, that are the same as the topmost bit, in each element in a vector 7710 //No corresponding vector intrinsics in IA32, need to implement it. 7711 //While the implementation is effective for 8 bits, it may be not for 16 and 32 bits 7712 7713 #if defined(USE_SSSE3) 7714 int8x16_t vclsq_s8(int8x16_t a); // VCLS.S8 q0,q0 7715 _NEON2SSE_INLINE int8x16_t vclsq_s8(int8x16_t a) 7716 { 7717 __m128i cff, c80, c1, a_mask, a_neg, a_pos, a_comb; 7718 cff = _mm_cmpeq_epi8 (a,a); //0xff 7719 c80 = _mm_set1_epi8(0x80); 7720 c1 = _mm_set1_epi8(1); 7721 a_mask = _mm_and_si128(a, c80); 7722 a_mask = _mm_cmpeq_epi8(a_mask, c80); //0xff if negative input and 0 if positive 7723 a_neg = _mm_xor_si128(a, cff); 7724 a_neg = _mm_and_si128(a_mask, a_neg); 7725 a_pos = _mm_andnot_si128(a_mask, a); 7726 a_comb = _mm_or_si128(a_pos, a_neg); 7727 a_comb = vclzq_s8(a_comb); 7728 return _mm_sub_epi8(a_comb, c1); 7729 } 7730 #endif 7731 7732 #if defined(USE_SSSE3) 7733 int16x8_t vclsq_s16(int16x8_t a); // VCLS.S16 q0,q0 7734 _NEON2SSE_INLINE int16x8_t vclsq_s16(int16x8_t a) 7735 { 7736 __m128i cffff, c8000, c1, a_mask, a_neg, a_pos, a_comb; 7737 cffff = _mm_cmpeq_epi16(a,a); 7738 c8000 = _mm_slli_epi16(cffff, 15); //0x8000 7739 c1 = _mm_srli_epi16(cffff,15); //0x1 7740 a_mask = _mm_and_si128(a, c8000); 7741 a_mask = _mm_cmpeq_epi16(a_mask, c8000); //0xffff if negative input and 0 if positive 7742 a_neg = _mm_xor_si128(a, cffff); 7743 a_neg = _mm_and_si128(a_mask, a_neg); 7744 a_pos = _mm_andnot_si128(a_mask, a); 7745 a_comb = _mm_or_si128(a_pos, a_neg); 7746 a_comb = vclzq_s16(a_comb); 7747 return _mm_sub_epi16(a_comb, c1); 7748 } 7749 #endif 7750 7751 int32x4_t vclsq_s32(int32x4_t a); // VCLS.S32 q0,q0 7752 _NEON2SSE_INLINE int32x4_t vclsq_s32(int32x4_t a) 7753 { 7754 __m128i cffffffff, c80000000, c1, a_mask, a_neg, a_pos, a_comb; 7755 cffffffff = _mm_cmpeq_epi32(a,a); 7756 c80000000 = _mm_slli_epi32(cffffffff, 31); //0x80000000 7757 c1 = _mm_srli_epi32(cffffffff,31); //0x1 7758 a_mask = _mm_and_si128(a, c80000000); 7759 a_mask = _mm_cmpeq_epi32(a_mask, c80000000); //0xffffffff if negative input and 0 if positive 7760 a_neg = _mm_xor_si128(a, cffffffff); 7761 a_neg = _mm_and_si128(a_mask, a_neg); 7762 a_pos = _mm_andnot_si128(a_mask, a); 7763 a_comb = _mm_or_si128(a_pos, a_neg); 7764 a_comb = vclzq_s32(a_comb); 7765 return _mm_sub_epi32(a_comb, c1); 7766 } 7767 7768 //************************* Count number of set bits ******************************** 7769 //************************************************************************************* 7770 //No corresponding SIMD 
solution. One option is to extract the elements one by one, convert each to 32 bits and then use the SSE4.2 _mm_popcnt_u32 (unsigned int v) intrinsic for each element 7771 //another option is to do the following algorithm: 7772 7773 #if defined(USE_SSSE3) 7774 uint8x16_t vcntq_u8(uint8x16_t a); // VCNT.8 q0,q0 7775 _NEON2SSE_INLINE uint8x16_t vcntq_u8(uint8x16_t a) 7776 { 7777 _NEON2SSE_ALIGN_16 int8_t mask_POPCOUNT[16] = { /* 0 */ 0,/* 1 */ 1,/* 2 */ 1,/* 3 */ 2, 7778 /* 4 */ 1,/* 5 */ 2,/* 6 */ 2,/* 7 */ 3, 7779 /* 8 */ 1,/* 9 */ 2,/* a */ 2,/* b */ 3, 7780 /* c */ 2,/* d */ 3,/* e */ 3,/* f */ 4}; 7781 __m128i maskLOW, mask, lowpopcnt, hipopcnt; 7782 maskLOW = _mm_set1_epi8(0x0f); //low 4 bits, need masking to avoid zero if MSB is set 7783 mask = _mm_and_si128(a, maskLOW); 7784 lowpopcnt = _mm_shuffle_epi8( *(__m128i*)mask_POPCOUNT, mask); //uses low 4 bits anyway 7785 mask = _mm_srli_epi16(a, 4); //get high 4 bits as low bits 7786 mask = _mm_and_si128(mask, maskLOW); //low 4 bits, need masking to avoid zero if MSB is set 7787 hipopcnt = _mm_shuffle_epi8( *(__m128i*) mask_POPCOUNT, mask); //uses low 4 bits anyway 7788 return _mm_add_epi8(lowpopcnt, hipopcnt); 7789 } 7790 #endif 7791 7792 #if defined(USE_SSSE3) 7793 int8x16_t vcntq_s8(int8x16_t a); // VCNT.8 q0,q0 7794 #define vcntq_s8 vcntq_u8 7795 7796 poly8x16_t vcntq_p8(poly8x16_t a); // VCNT.8 q0,q0 7797 #define vcntq_p8 vcntq_u8 7798 #endif 7799 7800 //************************************************************************************** 7801 //*********************** Logical operations **************************************** 7802 //************************************************************************************** 7803 //************************** Bitwise not *********************************** 7804 //several Bitwise not implementations possible for SIMD.
Eg "xor" with all ones, but the following one gives good performance 7805 7806 int8x16_t vmvnq_s8(int8x16_t a); // VMVN q0,q0 7807 _NEON2SSE_INLINE int8x16_t vmvnq_s8(int8x16_t a) // VMVN q0,q0 7808 { 7809 __m128i c1; 7810 c1 = _mm_cmpeq_epi8 (a,a); //0xff 7811 return _mm_andnot_si128 (a, c1); 7812 } 7813 7814 int16x8_t vmvnq_s16(int16x8_t a); // VMVN q0,q0 7815 _NEON2SSE_INLINE int16x8_t vmvnq_s16(int16x8_t a) // VMVN q0,q0 7816 { 7817 __m128i c1; 7818 c1 = _mm_cmpeq_epi16 (a,a); //0xffff 7819 return _mm_andnot_si128 (a, c1); 7820 } 7821 7822 int32x4_t vmvnq_s32(int32x4_t a); // VMVN q0,q0 7823 _NEON2SSE_INLINE int32x4_t vmvnq_s32(int32x4_t a) // VMVN q0,q0 7824 { 7825 __m128i c1; 7826 c1 = _mm_cmpeq_epi32 (a,a); //0xffffffff 7827 return _mm_andnot_si128 (a, c1); 7828 } 7829 7830 uint8x16_t vmvnq_u8(uint8x16_t a); // VMVN q0,q0 7831 #define vmvnq_u8 vmvnq_s8 7832 7833 uint16x8_t vmvnq_u16(uint16x8_t a); // VMVN q0,q0 7834 #define vmvnq_u16 vmvnq_s16 7835 7836 uint32x4_t vmvnq_u32(uint32x4_t a); // VMVN q0,q0 7837 #define vmvnq_u32 vmvnq_s32 7838 7839 poly8x16_t vmvnq_p8(poly8x16_t a); // VMVN q0,q0 7840 #define vmvnq_p8 vmvnq_u8 7841 7842 //****************** Bitwise and *********************** 7843 //****************************************************** 7844 7845 int8x16_t vandq_s8(int8x16_t a, int8x16_t b); // VAND q0,q0,q0 7846 #define vandq_s8 _mm_and_si128 7847 7848 int16x8_t vandq_s16(int16x8_t a, int16x8_t b); // VAND q0,q0,q0 7849 #define vandq_s16 _mm_and_si128 7850 7851 int32x4_t vandq_s32(int32x4_t a, int32x4_t b); // VAND q0,q0,q0 7852 #define vandq_s32 _mm_and_si128 7853 7854 int64x2_t vandq_s64(int64x2_t a, int64x2_t b); // VAND q0,q0,q0 7855 #define vandq_s64 _mm_and_si128 7856 7857 uint8x16_t vandq_u8(uint8x16_t a, uint8x16_t b); // VAND q0,q0,q0 7858 #define vandq_u8 _mm_and_si128 7859 7860 uint16x8_t vandq_u16(uint16x8_t a, uint16x8_t b); // VAND q0,q0,q0 7861 #define vandq_u16 _mm_and_si128 7862 7863 uint32x4_t vandq_u32(uint32x4_t a, uint32x4_t b); // VAND q0,q0,q0 7864 #define vandq_u32 _mm_and_si128 7865 7866 uint64x2_t vandq_u64(uint64x2_t a, uint64x2_t b); // VAND q0,q0,q0 7867 #define vandq_u64 _mm_and_si128 7868 7869 //******************** Bitwise or ********************************* 7870 //****************************************************************** 7871 7872 int8x16_t vorrq_s8(int8x16_t a, int8x16_t b); // VORR q0,q0,q0 7873 #define vorrq_s8 _mm_or_si128 7874 7875 int16x8_t vorrq_s16(int16x8_t a, int16x8_t b); // VORR q0,q0,q0 7876 #define vorrq_s16 _mm_or_si128 7877 7878 int32x4_t vorrq_s32(int32x4_t a, int32x4_t b); // VORR q0,q0,q0 7879 #define vorrq_s32 _mm_or_si128 7880 7881 int64x2_t vorrq_s64(int64x2_t a, int64x2_t b); // VORR q0,q0,q0 7882 #define vorrq_s64 _mm_or_si128 7883 7884 uint8x16_t vorrq_u8(uint8x16_t a, uint8x16_t b); // VORR q0,q0,q0 7885 #define vorrq_u8 _mm_or_si128 7886 7887 uint16x8_t vorrq_u16(uint16x8_t a, uint16x8_t b); // VORR q0,q0,q0 7888 #define vorrq_u16 _mm_or_si128 7889 7890 uint32x4_t vorrq_u32(uint32x4_t a, uint32x4_t b); // VORR q0,q0,q0 7891 #define vorrq_u32 _mm_or_si128 7892 7893 uint64x2_t vorrq_u64(uint64x2_t a, uint64x2_t b); // VORR q0,q0,q0 7894 #define vorrq_u64 _mm_or_si128 7895 7896 //************* Bitwise exclusive or (EOR or XOR) ****************** 7897 //******************************************************************* 7898 7899 int8x16_t veorq_s8(int8x16_t a, int8x16_t b); // VEOR q0,q0,q0 7900 #define veorq_s8 _mm_xor_si128 7901 7902 int16x8_t veorq_s16(int16x8_t a, int16x8_t b); // VEOR q0,q0,q0 7903 
#define veorq_s16 _mm_xor_si128 7904 7905 int32x4_t veorq_s32(int32x4_t a, int32x4_t b); // VEOR q0,q0,q0 7906 #define veorq_s32 _mm_xor_si128 7907 7908 int64x2_t veorq_s64(int64x2_t a, int64x2_t b); // VEOR q0,q0,q0 7909 #define veorq_s64 _mm_xor_si128 7910 7911 uint8x16_t veorq_u8(uint8x16_t a, uint8x16_t b); // VEOR q0,q0,q0 7912 #define veorq_u8 _mm_xor_si128 7913 7914 uint16x8_t veorq_u16(uint16x8_t a, uint16x8_t b); // VEOR q0,q0,q0 7915 #define veorq_u16 _mm_xor_si128 7916 7917 uint32x4_t veorq_u32(uint32x4_t a, uint32x4_t b); // VEOR q0,q0,q0 7918 #define veorq_u32 _mm_xor_si128 7919 7920 uint64x2_t veorq_u64(uint64x2_t a, uint64x2_t b); // VEOR q0,q0,q0 7921 #define veorq_u64 _mm_xor_si128 7922 7923 //********************** Bit Clear ********************************** 7924 //******************************************************************* 7925 //Logical AND complement (AND negation or AND NOT) 7926 7943 int8x16_t vbicq_s8(int8x16_t a, int8x16_t b); // VBIC q0,q0,q0 7944 #define vbicq_s8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7945 7946 int16x8_t vbicq_s16(int16x8_t a, int16x8_t b); // VBIC q0,q0,q0 7947 #define vbicq_s16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7948 7949 int32x4_t vbicq_s32(int32x4_t a, int32x4_t b); // VBIC q0,q0,q0 7950 #define vbicq_s32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7951 7952 int64x2_t vbicq_s64(int64x2_t a, int64x2_t b); // VBIC q0,q0,q0 7953 #define vbicq_s64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7954 7955 uint8x16_t vbicq_u8(uint8x16_t a, uint8x16_t b); // VBIC q0,q0,q0 7956 #define vbicq_u8(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7957 7958 uint16x8_t vbicq_u16(uint16x8_t a, uint16x8_t b); // VBIC q0,q0,q0 7959 #define vbicq_u16(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7960 7961 uint32x4_t vbicq_u32(uint32x4_t a, uint32x4_t b); // VBIC q0,q0,q0 7962 #define vbicq_u32(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7963 7964 uint64x2_t vbicq_u64(uint64x2_t a, uint64x2_t b); // VBIC q0,q0,q0 7965 #define vbicq_u64(a,b) _mm_andnot_si128 (b,a) //notice arguments "swap" 7966 7967 //**************** Bitwise OR complement ******************************** 7968 //**************************************** ******************************** 7969 //no exact IA 32 match, need to implement it as following 7970 7971 int8x16_t vornq_s8(int8x16_t a, int8x16_t b); // VORN q0,q0,q0 7972 _NEON2SSE_INLINE int8x16_t vornq_s8(int8x16_t a, int8x16_t b) // VORN q0,q0,q0 7973 { 7974 __m128i b1; 7975 b1 = vmvnq_s8( b); //bitwise not for b 7976 return _mm_or_si128 (a, b1); 7977 } 7978 7979 int16x8_t vornq_s16(int16x8_t a, int16x8_t b); // VORN q0,q0,q0 7980 _NEON2SSE_INLINE int16x8_t vornq_s16(int16x8_t a, int16x8_t b) // VORN q0,q0,q0 7981 { 7982 __m128i b1; 7983 b1 = vmvnq_s16( b); //bitwise not for b 7984 return _mm_or_si128 (a, b1); 7985 } 7986 7987 int32x4_t vornq_s32(int32x4_t a, int32x4_t b); // VORN q0,q0,q0 7988 _NEON2SSE_INLINE int32x4_t vornq_s32(int32x4_t a, int32x4_t b) // VORN q0,q0,q0 7989 { 7990 __m128i b1; 7991 b1 = vmvnq_s32( b); //bitwise not for b 7992 return _mm_or_si128 (a, b1); 7993 } 7994 7995 int64x2_t vornq_s64(int64x2_t a, int64x2_t b); // VORN q0,q0,q0 7996 _NEON2SSE_INLINE
int64x2_t vornq_s64(int64x2_t a, int64x2_t b) 7997 { 7998 __m128i c1, b1; 7999 c1 = _mm_cmpeq_epi8 (a, a); //all ones 0xfffffff...fffff 8000 b1 = _mm_andnot_si128 (b, c1); 8001 return _mm_or_si128 (a, b1); 8002 } 8003 8004 uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b); // VORN q0,q0,q0 8005 _NEON2SSE_INLINE uint8x16_t vornq_u8(uint8x16_t a, uint8x16_t b) // VORN q0,q0,q0 8006 { 8007 __m128i b1; 8008 b1 = vmvnq_u8( b); //bitwise not for b 8009 return _mm_or_si128 (a, b1); 8010 } 8011 8012 uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b); // VORN q0,q0,q0 8013 _NEON2SSE_INLINE uint16x8_t vornq_u16(uint16x8_t a, uint16x8_t b) // VORN q0,q0,q0 8014 { 8015 __m128i b1; 8016 b1 = vmvnq_s16( b); //bitwise not for b 8017 return _mm_or_si128 (a, b1); 8018 } 8019 8020 uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b); // VORN q0,q0,q0 8021 _NEON2SSE_INLINE uint32x4_t vornq_u32(uint32x4_t a, uint32x4_t b) // VORN q0,q0,q0 8022 { 8023 __m128i b1; 8024 b1 = vmvnq_u32( b); //bitwise not for b 8025 return _mm_or_si128 (a, b1); 8026 } 8027 uint64x2_t vornq_u64(uint64x2_t a, uint64x2_t b); // VORN q0,q0,q0 8028 #define vornq_u64 vornq_s64 8029 8030 //********************* Bitwise Select ***************************** 8031 //****************************************************************** 8032 //Note This intrinsic can compile to any of VBSL/VBIF/VBIT depending on register allocation.(?????????) 8033 8034 //VBSL (Bitwise Select) selects each bit for the destination from the first operand if the 8035 //corresponding bit of the destination is 1, or from the second operand if the corresponding bit of the destination is 0. 8036 8037 //VBIF (Bitwise Insert if False) inserts each bit from the first operand into the destination 8038 //if the corresponding bit of the second operand is 0, otherwise leaves the destination bit unchanged 8039 8040 //VBIT (Bitwise Insert if True) inserts each bit from the first operand into the destination 8041 //if the corresponding bit of the second operand is 1, otherwise leaves the destination bit unchanged. 
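//Illustrative sketch (the helper name is hypothetical, not part of NEON or this mapping): once the
//selection mask is fully materialized, all three forms above reduce to the same bitwise formula
//(mask & b) | (~mask & c). The SSE2 version of that formula, which is also what the vbslq_* implementations
//below use, looks as follows:
_NEON2SSE_INLINE __m128i neon2sse_example_bitselect(__m128i mask, __m128i b, __m128i c)
{
    __m128i from_b, from_c;
    from_b = _mm_and_si128 (mask, b); //bits where the mask is 1 are taken from b
    from_c = _mm_andnot_si128 (mask, c); //bits where the mask is 0 are taken from c
    return _mm_or_si128 (from_b, from_c);
}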
8042 8043 //VBSL only is implemented for SIMD 8044 8045 int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c); // VBSL q0,q0,q0 8046 _NEON2SSE_INLINE int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) // VBSL q0,q0,q0 8047 { 8048 __m128i sel1, sel2; 8049 sel1 = _mm_and_si128 (a, b); 8050 sel2 = _mm_andnot_si128 (a, c); 8051 return _mm_or_si128 (sel1, sel2); 8052 } 8053 8054 int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c); // VBSL q0,q0,q0 8055 #define vbslq_s16 vbslq_s8 8056 8057 int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c); // VBSL q0,q0,q0 8058 #define vbslq_s32 vbslq_s8 8059 8060 int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c); // VBSL q0,q0,q0 8061 #define vbslq_s64 vbslq_s8 8062 8063 uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c); // VBSL q0,q0,q0 8064 #define vbslq_u8 vbslq_s8 8065 8066 uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c); // VBSL q0,q0,q0 8067 #define vbslq_u16 vbslq_s8 8068 8069 uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c); // VBSL q0,q0,q0 8070 #define vbslq_u32 vbslq_s8 8071 8072 uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c); // VBSL q0,q0,q0 8073 #define vbslq_u64 vbslq_s8 8074 8075 float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c); // VBSL q0,q0,q0 8076 _NEON2SSE_INLINE float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) // VBSL q0,q0,q0 8077 { 8078 __m128 sel1, sel2; 8079 sel1 = _mm_and_ps (*(__m128*)&a, b); 8080 sel2 = _mm_andnot_ps (*(__m128*)&a, c); 8081 return _mm_or_ps (sel1, sel2); 8082 } 8083 8084 poly8x16_t vbslq_p8(uint8x16_t a, poly8x16_t b, poly8x16_t c); // VBSL q0,q0,q0 8085 #define vbslq_p8 vbslq_u8 8086 8087 poly16x8_t vbslq_p16(uint16x8_t a, poly16x8_t b, poly16x8_t c); // VBSL q0,q0,q0 8088 #define vbslq_p16 vbslq_s8 8089 8090 //************************************************************************************ 8091 //**************** Transposition operations **************************************** 8092 //************************************************************************************ 8093 //***************** Vector Transpose ************************************************ 8094 //************************************************************************************ 8095 //VTRN (Vector Transpose) treats the elements of its operand vectors as elements of 2 x 2 matrices, and transposes the matrices. 8096 // making the result look as (a0, b0, a2, b2, a4, b4,....) (a1, b1, a3, b3, a5, b5,.....) 
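//For reference, the behaviour described above written out serially for 32-bit lanes (an illustrative
//sketch only; the helper name is hypothetical and not part of NEON or this mapping):
_NEON2SSE_INLINE void neon2sse_example_trn_reference(const int32_t a[4], const int32_t b[4], int32_t res0[4], int32_t res1[4])
{
    int i;
    for (i = 0; i < 4; i += 2) { //each 2 x 2 block of the pair (a, b) is transposed
        res0[i] = a[i];     res0[i + 1] = b[i];     //even lanes: a0, b0, a2, b2
        res1[i] = a[i + 1]; res1[i + 1] = b[i + 1]; //odd  lanes: a1, b1, a3, b3
    }
}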
8097 8098 #if defined(USE_SSSE3) 8099 //int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b); // VTRN.8 q0,q0 8100 _NEON2SSE_INLINE int8x16x2_t vtrnq_s8(int8x16_t a, int8x16_t b) // VTRN.8 q0,q0 8101 { 8102 int8x16x2_t r8x16; 8103 __m128i a_sh, b_sh; 8104 _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; 8105 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 8106 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 8107 8108 r8x16.val[0] = _mm_unpacklo_epi8(a_sh, b_sh); //(a0, b0, a2, b2, a4, b4, a6, b6, a8,b8, a10,b10, a12,b12, a14,b14) 8109 r8x16.val[1] = _mm_unpackhi_epi8(a_sh, b_sh); // (a1, b1, a3, b3, a5, b5, a7, b7, a9,b9, a11,b11, a13,b13, a15,b15) 8110 return r8x16; 8111 } 8112 #endif 8113 8114 #if defined(USE_SSSE3) 8115 int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b); // VTRN.16 q0,q0 8116 _NEON2SSE_INLINE int16x8x2_t vtrnq_s16(int16x8_t a, int16x8_t b) // VTRN.16 q0,q0 8117 { 8118 int16x8x2_t v16x8; 8119 __m128i a_sh, b_sh; 8120 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; 8121 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 8122 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 8123 v16x8.val[0] = _mm_unpacklo_epi16(a_sh, b_sh); //a0, b0, a2, b2, a4, b4, a6, b6 8124 v16x8.val[1] = _mm_unpackhi_epi16(a_sh, b_sh); //a1, b1, a3, b3, a5, b5, a7, b7 8125 return v16x8; 8126 } 8127 #endif 8128 8129 int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b); // VTRN.32 q0,q0 8130 _NEON2SSE_INLINE int32x4x2_t vtrnq_s32(int32x4_t a, int32x4_t b) // VTRN.32 q0,q0 8131 { //may be not optimal solution compared with serial 8132 int32x4x2_t v32x4; 8133 __m128i a_sh, b_sh; 8134 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 8135 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 8136 8137 v32x4.val[0] = _mm_unpacklo_epi32(a_sh, b_sh); //a0, b0, a2, b2 8138 v32x4.val[1] = _mm_unpackhi_epi32(a_sh, b_sh); //a1, b1, a3, b3 8139 return v32x4; 8140 } 8141 8142 #if defined(USE_SSSE3) 8143 uint8x16x2_t vtrnq_u8(uint8x16_t a, uint8x16_t b); // VTRN.8 q0,q0 8144 #define vtrnq_u8 vtrnq_s8 8145 8146 uint16x8x2_t vtrnq_u16(uint16x8_t a, uint16x8_t b); // VTRN.16 q0,q0 8147 #define vtrnq_u16 vtrnq_s16 8148 #endif 8149 8150 uint32x4x2_t vtrnq_u32(uint32x4_t a, uint32x4_t b); // VTRN.32 q0,q0 8151 #define vtrnq_u32 vtrnq_s32 8152 8153 float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b); // VTRN.32 q0,q0 8154 _NEON2SSE_INLINE float32x4x2_t vtrnq_f32(float32x4_t a, float32x4_t b) // VTRN.32 q0,q0 8155 { //may be not optimal solution compared with serial 8156 float32x4x2_t f32x4; 8157 __m128 a_sh, b_sh; 8158 a_sh = _mm_shuffle_ps (a, a, _MM_SHUFFLE(3,1, 2, 0)); //a0, a2, a1, a3, need to check endiness 8159 b_sh = _mm_shuffle_ps (b, b, _MM_SHUFFLE(3,1, 2, 0)); //b0, b2, b1, b3, need to check endiness 8160 8161 f32x4.val[0] = _mm_unpacklo_ps(a_sh, b_sh); //a0, b0, a2, b2 8162 f32x4.val[1] = _mm_unpackhi_ps(a_sh, b_sh); //a1, b1, a3, b3 8163 return f32x4; 8164 } 8165 8166 #if defined(USE_SSSE3) 8167 poly8x16x2_t vtrnq_p8(poly8x16_t a, poly8x16_t b); // VTRN.8 q0,q0 8168 #define vtrnq_p8 vtrnq_s8 8169 8170 poly16x8x2_t vtrnq_p16(poly16x8_t a, poly16x8_t b); // VTRN.16 q0,q0 8171 #define vtrnq_p16 vtrnq_s16 8172 #endif 8173 8174 //***************** Interleave elements 
*************************** 8175 //***************************************************************** 8176 //output has (a0,b0,a1,b1, a2,b2,.....) 8177 8178 int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b); // VZIP.8 q0,q0 8179 _NEON2SSE_INLINE int8x16x2_t vzipq_s8(int8x16_t a, int8x16_t b) // VZIP.8 q0,q0 8180 { 8181 int8x16x2_t r8x16; 8182 r8x16.val[0] = _mm_unpacklo_epi8(a, b); 8183 r8x16.val[1] = _mm_unpackhi_epi8(a, b); 8184 return r8x16; 8185 } 8186 8187 int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b); // VZIP.16 q0,q0 8188 _NEON2SSE_INLINE int16x8x2_t vzipq_s16(int16x8_t a, int16x8_t b) // VZIP.16 q0,q0 8189 { 8190 int16x8x2_t r16x8; 8191 r16x8.val[0] = _mm_unpacklo_epi16(a, b); 8192 r16x8.val[1] = _mm_unpackhi_epi16(a, b); 8193 return r16x8; 8194 } 8195 8196 int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b); // VZIP.32 q0,q0 8197 _NEON2SSE_INLINE int32x4x2_t vzipq_s32(int32x4_t a, int32x4_t b) // VZIP.32 q0,q0 8198 { 8199 int32x4x2_t r32x4; 8200 r32x4.val[0] = _mm_unpacklo_epi32(a, b); 8201 r32x4.val[1] = _mm_unpackhi_epi32(a, b); 8202 return r32x4; 8203 } 8204 8205 uint8x16x2_t vzipq_u8(uint8x16_t a, uint8x16_t b); // VZIP.8 q0,q0 8206 #define vzipq_u8 vzipq_s8 8207 8208 uint16x8x2_t vzipq_u16(uint16x8_t a, uint16x8_t b); // VZIP.16 q0,q0 8209 #define vzipq_u16 vzipq_s16 8210 8211 uint32x4x2_t vzipq_u32(uint32x4_t a, uint32x4_t b); // VZIP.32 q0,q0 8212 #define vzipq_u32 vzipq_s32 8213 8214 float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b); // VZIP.32 q0,q0 8215 _NEON2SSE_INLINE float32x4x2_t vzipq_f32(float32x4_t a, float32x4_t b) // VZIP.32 q0,q0 8216 { 8217 float32x4x2_t f32x4; 8218 f32x4.val[0] = _mm_unpacklo_ps ( a, b); 8219 f32x4.val[1] = _mm_unpackhi_ps ( a, b); 8220 return f32x4; 8221 } 8222 8223 poly8x16x2_t vzipq_p8(poly8x16_t a, poly8x16_t b); // VZIP.8 q0,q0 8224 #define vzipq_p8 vzipq_u8 8225 8226 poly16x8x2_t vzipq_p16(poly16x8_t a, poly16x8_t b); // VZIP.16 q0,q0 8227 #define vzipq_p16 vzipq_u16 8228 8229 //*********************** De-Interleave elements ************************* 8230 //************************************************************************* 8231 //As the result of these functions first val contains (a0,a2,a4,....,b0,b2, b4,...) and the second val (a1,a3,a5,....b1,b3,b5...) 
8232 //no such functions in IA32 SIMD, shuffle is required 8233 8234 #if defined(USE_SSSE3) 8235 int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b); // VUZP.8 q0,q0 8236 _NEON2SSE_INLINE int8x16x2_t vuzpq_s8(int8x16_t a, int8x16_t b) // VUZP.8 q0,q0 8237 { 8238 int8x16x2_t v8x16; 8239 __m128i a_sh, b_sh; 8240 _NEON2SSE_ALIGN_16 int8_t mask8_even_odd[16] = { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3,5, 7, 9, 11, 13, 15}; 8241 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask8_even_odd); //a0, a2, a4, a6, a8, a10, a12, a14, a1, a3, a5, a7, a9, a11, a13, a15 8242 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask8_even_odd); //b0, b2, b4, b6, b8, b10, b12, b14, b1, b3, b5, b7, b9, b11, b13, b15 8243 //we need unpack64 to combine lower (upper) 64 bits from a with lower (upper) 64 bits from b 8244 v8x16.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); ///a0, a2, a4, a6, a8, a10, a12, a14, b0, b2, b4, b6, b8, b10, b12, b14, 8245 v8x16.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, a9, a11, a13, a15, b1, b3, b5, b7, b9, b11, b13, b15 8246 return v8x16; 8247 } 8248 #endif 8249 8250 #if defined(USE_SSSE3) 8251 int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b); // VUZP.16 q0,q0 8252 _NEON2SSE_INLINE int16x8x2_t vuzpq_s16(int16x8_t a, int16x8_t b) // VUZP.16 q0,q0 8253 { 8254 int16x8x2_t v16x8; 8255 __m128i a_sh, b_sh; 8256 _NEON2SSE_ALIGN_16 int8_t mask16_even_odd[16] = { 0,1, 4,5, 8,9, 12,13, 2,3, 6,7, 10,11, 14,15}; 8257 a_sh = _mm_shuffle_epi8 (a, *(__m128i*)mask16_even_odd); //a0, a2, a4, a6, a1, a3, a5, a7 8258 b_sh = _mm_shuffle_epi8 (b, *(__m128i*)mask16_even_odd); //b0, b2, b4, b6, b1, b3, b5, b7 8259 v16x8.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, a4, a6, b0, b2, b4, b6 8260 v16x8.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, a5, a7, b1, b3, b5, b7 8261 return v16x8; 8262 } 8263 #endif 8264 8265 int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b); // VUZP.32 q0,q0 8266 _NEON2SSE_INLINE int32x4x2_t vuzpq_s32(int32x4_t a, int32x4_t b) // VUZP.32 q0,q0 8267 { //may be not optimal solution compared with serial 8268 int32x4x2_t v32x4; 8269 __m128i a_sh, b_sh; 8270 a_sh = _mm_shuffle_epi32 (a, 216); //a0, a2, a1, a3 8271 b_sh = _mm_shuffle_epi32 (b, 216); //b0, b2, b1, b3 8272 8273 v32x4.val[0] = _mm_unpacklo_epi64(a_sh, b_sh); //a0, a2, b0, b2 8274 v32x4.val[1] = _mm_unpackhi_epi64(a_sh, b_sh); //a1, a3, b1, b3 8275 return v32x4; 8276 } 8277 8278 #if defined(USE_SSSE3) 8279 uint8x16x2_t vuzpq_u8(uint8x16_t a, uint8x16_t b); // VUZP.8 q0,q0 8280 #define vuzpq_u8 vuzpq_s8 8281 8282 uint16x8x2_t vuzpq_u16(uint16x8_t a, uint16x8_t b); // VUZP.16 q0,q0 8283 #define vuzpq_u16 vuzpq_s16 8284 #endif 8285 8286 uint32x4x2_t vuzpq_u32(uint32x4_t a, uint32x4_t b); // VUZP.32 q0,q0 8287 #define vuzpq_u32 vuzpq_s32 8288 8289 float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b); // VUZP.32 q0,q0 8290 _NEON2SSE_INLINE float32x4x2_t vuzpq_f32(float32x4_t a, float32x4_t b) // VUZP.32 q0,q0 8291 { 8292 float32x4x2_t v32x4; 8293 v32x4.val[0] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(2,0, 2, 0)); //a0, a2, b0, b2 , need to check endianess however 8294 v32x4.val[1] = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3,1, 3, 1)); //a1, a3, b1, b3, need to check endianess however 8295 return v32x4; 8296 } 8297 8298 #if defined(USE_SSSE3) 8299 poly8x16x2_t vuzpq_p8(poly8x16_t a, poly8x16_t b); // VUZP.8 q0,q0 8300 #define vuzpq_p8 vuzpq_u8 8301 8302 poly16x8x2_t vuzpq_p16(poly16x8_t a, poly16x8_t b); // VUZP.16 q0,q0 8303 #define vuzpq_p16 vuzpq_u16 8304 #endif 8305 8306 
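//Usage sketch (illustrative only; the helper name is hypothetical): vzipq_* and vuzpq_* above are inverse
//permutations of each other, so interleaving two vectors and then de-interleaving the two halves of the
//result restores the original lane order:
_NEON2SSE_INLINE int32x4x2_t neon2sse_example_zip_unzip_roundtrip(int32x4_t a, int32x4_t b)
{
    int32x4x2_t zipped, restored;
    zipped = vzipq_s32(a, b); //val[0] = (a0,b0,a1,b1), val[1] = (a2,b2,a3,b3)
    restored = vuzpq_s32(zipped.val[0], zipped.val[1]); //val[0] = (a0,a1,a2,a3), val[1] = (b0,b1,b2,b3)
    return restored; //restored.val[0] equals a, restored.val[1] equals b
}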
//############################################################################################## 8307 //*********************** Reinterpret cast intrinsics.****************************************** 8308 //############################################################################################## 8309 // Not a part of oficial NEON instruction set but available in gcc compiler ********************* 8310 8311 poly8x16_t vreinterpretq_p8_u32 (uint32x4_t t); 8312 #define vreinterpretq_p8_u32 8313 8314 poly8x16_t vreinterpretq_p8_u16 (uint16x8_t t); 8315 #define vreinterpretq_p8_u16 8316 8317 poly8x16_t vreinterpretq_p8_u8 (uint8x16_t t); 8318 #define vreinterpretq_p8_u8 8319 8320 poly8x16_t vreinterpretq_p8_s32 (int32x4_t t); 8321 #define vreinterpretq_p8_s32 8322 8323 poly8x16_t vreinterpretq_p8_s16 (int16x8_t t); 8324 #define vreinterpretq_p8_s16 8325 8326 poly8x16_t vreinterpretq_p8_s8 (int8x16_t t); 8327 #define vreinterpretq_p8_s8 8328 8329 poly8x16_t vreinterpretq_p8_u64 (uint64x2_t t); 8330 #define vreinterpretq_p8_u64 8331 8332 poly8x16_t vreinterpretq_p8_s64 (int64x2_t t); 8333 #define vreinterpretq_p8_s64 8334 8335 poly8x16_t vreinterpretq_p8_f32 (float32x4_t t); 8336 #define vreinterpretq_p8_f32(t) _M128i(t) 8337 8338 poly8x16_t vreinterpretq_p8_p16 (poly16x8_t t); 8339 #define vreinterpretq_p8_p16 8340 8341 poly16x8_t vreinterpretq_p16_u32 (uint32x4_t t); 8342 #define vreinterpretq_p16_u32 8343 8344 poly16x8_t vreinterpretq_p16_u16 (uint16x8_t t); 8345 #define vreinterpretq_p16_u16 8346 8347 poly16x8_t vreinterpretq_p16_s32 (int32x4_t t); 8348 #define vreinterpretq_p16_s32 8349 8350 poly16x8_t vreinterpretq_p16_s16 (int16x8_t t); 8351 #define vreinterpretq_p16_s16 8352 8353 poly16x8_t vreinterpretq_p16_s8 (int8x16_t t); 8354 #define vreinterpretq_p16_s8 8355 8356 poly16x8_t vreinterpretq_p16_u64 (uint64x2_t t); 8357 #define vreinterpretq_p16_u64 8358 8359 poly16x8_t vreinterpretq_p16_s64 (int64x2_t t); 8360 #define vreinterpretq_p16_s64 8361 8362 poly16x8_t vreinterpretq_p16_f32 (float32x4_t t); 8363 #define vreinterpretq_p16_f32(t) _M128i(t) 8364 8365 poly16x8_t vreinterpretq_p16_p8 (poly8x16_t t); 8366 #define vreinterpretq_p16_p8 vreinterpretq_s16_p8 8367 8368 //**** Integer to float ****** 8369 8370 float32x4_t vreinterpretq_f32_u32 (uint32x4_t t); 8371 #define vreinterpretq_f32_u32(t) *(__m128*)&(t) 8372 8373 float32x4_t vreinterpretq_f32_u16 (uint16x8_t t); 8374 #define vreinterpretq_f32_u16 vreinterpretq_f32_u32 8375 8376 float32x4_t vreinterpretq_f32_u8 (uint8x16_t t); 8377 #define vreinterpretq_f32_u8 vreinterpretq_f32_u32 8378 8379 float32x4_t vreinterpretq_f32_s32 (int32x4_t t); 8380 #define vreinterpretq_f32_s32 vreinterpretq_f32_u32 8381 8382 float32x4_t vreinterpretq_f32_s16 (int16x8_t t); 8383 #define vreinterpretq_f32_s16 vreinterpretq_f32_u32 8384 8385 float32x4_t vreinterpretq_f32_s8 (int8x16_t t); 8386 #define vreinterpretq_f32_s8 vreinterpretq_f32_u32 8387 8388 float32x4_t vreinterpretq_f32_u64 (uint64x2_t t); 8389 #define vreinterpretq_f32_u64 vreinterpretq_f32_u32 8390 8391 float32x4_t vreinterpretq_f32_s64 (int64x2_t t); 8392 #define vreinterpretq_f32_s64 vreinterpretq_f32_u32 8393 8394 float32x4_t vreinterpretq_f32_p16 (poly16x8_t t); 8395 #define vreinterpretq_f32_p16 vreinterpretq_f32_u32 8396 8397 float32x4_t vreinterpretq_f32_p8 (poly8x16_t t); 8398 #define vreinterpretq_f32_p8 vreinterpretq_f32_u32 8399 8400 //*** Integer type conversions ****************** 8401 //no conversion necessary for the following functions because it is same data type 8402 
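//Because all the integer vector types above are typedefs of the same __m128i, the integer-to-integer
//macros below are defined to nothing: a call such as vreinterpretq_s64_u32(x) expands to just (x) and
//costs no instructions. Only the conversions from float32x4_t need a real cast (_M128i(t)).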
8403 int64x2_t vreinterpretq_s64_u32 (uint32x4_t t); 8404 #define vreinterpretq_s64_u32 8405 8406 int64x2_t vreinterpretq_s64_s16 (uint16x8_t t); 8407 #define vreinterpretq_s64_s16 8408 8409 int64x2_t vreinterpretq_s64_u8 (uint8x16_t t); 8410 #define vreinterpretq_s64_u8 8411 8412 int64x2_t vreinterpretq_s64_s32 (int32x4_t t); 8413 #define vreinterpretq_s64_s32 8414 8415 int64x2_t vreinterpretq_s64_u16 (int16x8_t t); 8416 #define vreinterpretq_s64_u16 8417 8418 int64x2_t vreinterpretq_s64_s8 (int8x16_t t); 8419 #define vreinterpretq_s64_s8 8420 8421 int64x2_t vreinterpretq_s64_u64 (uint64x2_t t); 8422 #define vreinterpretq_s64_u64 8423 8424 int64x2_t vreinterpretq_s64_f32 (float32x4_t t); 8425 #define vreinterpretq_s64_f32(t) _M128i(t) 8426 8427 int64x2_t vreinterpretq_s64_p16 (poly16x8_t t); 8428 #define vreinterpretq_s64_p16 8429 8430 int64x2_t vreinterpretq_s64_p8 (poly8x16_t t); 8431 #define vreinterpretq_s64_p8 8432 8433 uint64x2_t vreinterpretq_u64_u32 (uint32x4_t t); 8434 #define vreinterpretq_u64_u32 8435 8436 uint64x2_t vreinterpretq_u64_u16 (uint16x8_t t); 8437 #define vreinterpretq_u64_u16 8438 8439 uint64x2_t vreinterpretq_u64_u8 (uint8x16_t t); 8440 #define vreinterpretq_u64_u8 8441 8442 uint64x2_t vreinterpretq_u64_s32 (int32x4_t t); 8443 #define vreinterpretq_u64_s32 8444 8445 uint64x2_t vreinterpretq_u64_s16 (int16x8_t t); 8446 #define vreinterpretq_u64_s16 8447 8448 uint64x2_t vreinterpretq_u64_s8 (int8x16_t t); 8449 #define vreinterpretq_u64_s8 8450 8451 uint64x2_t vreinterpretq_u64_s64 (int64x2_t t); 8452 #define vreinterpretq_u64_s64 8453 8454 uint64x2_t vreinterpretq_u64_f32 (float32x4_t t); 8455 #define vreinterpretq_u64_f32(t) _M128i(t) 8456 8457 uint64x2_t vreinterpretq_u64_p16 (poly16x8_t t); 8458 #define vreinterpretq_u64_p16 8459 8460 uint64x2_t vreinterpretq_u64_p8 (poly8x16_t t); 8461 #define vreinterpretq_u64_p8 8462 8463 int8x16_t vreinterpretq_s8_u32 (uint32x4_t t); 8464 #define vreinterpretq_s8_u32 8465 8466 int8x16_t vreinterpretq_s8_u16 (uint16x8_t t); 8467 #define vreinterpretq_s8_u16 8468 8469 int8x16_t vreinterpretq_s8_u8 (uint8x16_t t); 8470 #define vreinterpretq_s8_u8 8471 8472 int8x16_t vreinterpretq_s8_s32 (int32x4_t t); 8473 #define vreinterpretq_s8_s32 8474 8475 int8x16_t vreinterpretq_s8_s16 (int16x8_t t); 8476 #define vreinterpretq_s8_s16 8477 8478 int8x16_t vreinterpretq_s8_u64 (uint64x2_t t); 8479 #define vreinterpretq_s8_u64 8480 8481 int8x16_t vreinterpretq_s8_s64 (int64x2_t t); 8482 #define vreinterpretq_s8_s64 8483 8484 int8x16_t vreinterpretq_s8_f32 (float32x4_t t); 8485 #define vreinterpretq_s8_f32(t) _M128i(t) 8486 8487 int8x16_t vreinterpretq_s8_p16 (poly16x8_t t); 8488 #define vreinterpretq_s8_p16 8489 8490 int8x16_t vreinterpretq_s8_p8 (poly8x16_t t); 8491 #define vreinterpretq_s8_p8 8492 8493 int16x8_t vreinterpretq_s16_u32 (uint32x4_t t); 8494 #define vreinterpretq_s16_u32 8495 8496 int16x8_t vreinterpretq_s16_u16 (uint16x8_t t); 8497 #define vreinterpretq_s16_u16 8498 8499 int16x8_t vreinterpretq_s16_u8 (uint8x16_t t); 8500 #define vreinterpretq_s16_u8 8501 8502 int16x8_t vreinterpretq_s16_s32 (int32x4_t t); 8503 #define vreinterpretq_s16_s32 8504 8505 int16x8_t vreinterpretq_s16_s8 (int8x16_t t); 8506 #define vreinterpretq_s16_s8 8507 8508 int16x8_t vreinterpretq_s16_u64 (uint64x2_t t); 8509 #define vreinterpretq_s16_u64 8510 8511 int16x8_t vreinterpretq_s16_s64 (int64x2_t t); 8512 #define vreinterpretq_s16_s64 8513 8514 int16x8_t vreinterpretq_s16_f32 (float32x4_t t); 8515 #define vreinterpretq_s16_f32(t) _M128i(t) 8516 8517 
int16x8_t vreinterpretq_s16_p16 (poly16x8_t t); 8518 #define vreinterpretq_s16_p16 8519 8520 int16x8_t vreinterpretq_s16_p8 (poly8x16_t t); 8521 #define vreinterpretq_s16_p8 8522 8523 int32x4_t vreinterpretq_s32_u32 (uint32x4_t t); 8524 #define vreinterpretq_s32_u32 8525 8526 int32x4_t vreinterpretq_s32_u16 (uint16x8_t t); 8527 #define vreinterpretq_s32_u16 8528 8529 int32x4_t vreinterpretq_s32_u8 (uint8x16_t t); 8530 #define vreinterpretq_s32_u8 8531 8532 int32x4_t vreinterpretq_s32_s16 (int16x8_t t); 8533 #define vreinterpretq_s32_s16 8534 8535 int32x4_t vreinterpretq_s32_s8 (int8x16_t t); 8536 #define vreinterpretq_s32_s8 8537 8538 int32x4_t vreinterpretq_s32_u64 (uint64x2_t t); 8539 #define vreinterpretq_s32_u64 8540 8541 int32x4_t vreinterpretq_s32_s64 (int64x2_t t); 8542 #define vreinterpretq_s32_s64 8543 8544 int32x4_t vreinterpretq_s32_f32 (float32x4_t t); 8545 #define vreinterpretq_s32_f32(t) _mm_castps_si128(t) //(*(__m128i*)&(t)) 8546 8547 int32x4_t vreinterpretq_s32_p16 (poly16x8_t t); 8548 #define vreinterpretq_s32_p16 8549 8550 int32x4_t vreinterpretq_s32_p8 (poly8x16_t t); 8551 #define vreinterpretq_s32_p8 8552 8553 uint8x16_t vreinterpretq_u8_u32 (uint32x4_t t); 8554 #define vreinterpretq_u8_u32 8555 8556 uint8x16_t vreinterpretq_u8_u16 (uint16x8_t t); 8557 #define vreinterpretq_u8_u16 8558 8559 uint8x16_t vreinterpretq_u8_s32 (int32x4_t t); 8560 #define vreinterpretq_u8_s32 8561 8562 uint8x16_t vreinterpretq_u8_s16 (int16x8_t t); 8563 #define vreinterpretq_u8_s16 8564 8565 uint8x16_t vreinterpretq_u8_s8 (int8x16_t t); 8566 #define vreinterpretq_u8_s8 8567 8568 uint8x16_t vreinterpretq_u8_u64 (uint64x2_t t); 8569 #define vreinterpretq_u8_u64 8570 8571 uint8x16_t vreinterpretq_u8_s64 (int64x2_t t); 8572 #define vreinterpretq_u8_s64 8573 8574 uint8x16_t vreinterpretq_u8_f32 (float32x4_t t); 8575 #define vreinterpretq_u8_f32(t) _M128i(t) 8576 8577 uint8x16_t vreinterpretq_u8_p16 (poly16x8_t t); 8578 #define vreinterpretq_u8_p16 8579 8580 uint8x16_t vreinterpretq_u8_p8 (poly8x16_t t); 8581 #define vreinterpretq_u8_p8 8582 8583 uint16x8_t vreinterpretq_u16_u32 (uint32x4_t t); 8584 #define vreinterpretq_u16_u32 8585 8586 uint16x8_t vreinterpretq_u16_u8 (uint8x16_t t); 8587 #define vreinterpretq_u16_u8 8588 8589 uint16x8_t vreinterpretq_u16_s32 (int32x4_t t); 8590 #define vreinterpretq_u16_s32 8591 8592 uint16x8_t vreinterpretq_u16_s16 (int16x8_t t); 8593 #define vreinterpretq_u16_s16 8594 8595 uint16x8_t vreinterpretq_u16_s8 (int8x16_t t); 8596 #define vreinterpretq_u16_s8 8597 8598 uint16x8_t vreinterpretq_u16_u64 (uint64x2_t t); 8599 #define vreinterpretq_u16_u64 8600 8601 uint16x8_t vreinterpretq_u16_s64 (int64x2_t t); 8602 #define vreinterpretq_u16_s64 8603 8604 uint16x8_t vreinterpretq_u16_f32 (float32x4_t t); 8605 #define vreinterpretq_u16_f32(t) _M128i(t) 8606 8607 uint16x8_t vreinterpretq_u16_p16 (poly16x8_t t); 8608 #define vreinterpretq_u16_p16 8609 8610 uint16x8_t vreinterpretq_u16_p8 (poly8x16_t t); 8611 #define vreinterpretq_u16_p8 8612 8613 uint32x4_t vreinterpretq_u32_u16 (uint16x8_t t); 8614 #define vreinterpretq_u32_u16 8615 8616 uint32x4_t vreinterpretq_u32_u8 (uint8x16_t t); 8617 #define vreinterpretq_u32_u8 8618 8619 uint32x4_t vreinterpretq_u32_s32 (int32x4_t t); 8620 #define vreinterpretq_u32_s32 8621 8622 uint32x4_t vreinterpretq_u32_s16 (int16x8_t t); 8623 #define vreinterpretq_u32_s16 8624 8625 uint32x4_t vreinterpretq_u32_s8 (int8x16_t t); 8626 #define vreinterpretq_u32_s8 8627 8628 uint32x4_t vreinterpretq_u32_u64 (uint64x2_t t); 8629 #define 
vreinterpretq_u32_u64 8630 8631 uint32x4_t vreinterpretq_u32_s64 (int64x2_t t); 8632 #define vreinterpretq_u32_s64 8633 8634 uint32x4_t vreinterpretq_u32_f32 (float32x4_t t); 8635 #define vreinterpretq_u32_f32(t) _M128i(t) 8636 8637 uint32x4_t vreinterpretq_u32_p16 (poly16x8_t t); 8638 #define vreinterpretq_u32_p16 8639 8640 uint32x4_t vreinterpretq_u32_p8 (poly8x16_t t); 8641 #define vreinterpretq_u32_p8 8642 8643 #endif /* NEON2SSE_H */ 8644