/*===---- arm_neon.h - ARM Neon intrinsics ---------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
     23 
     24 #ifndef __ARM_NEON_H
     25 #define __ARM_NEON_H
     26 
     27 #ifndef __ARM_NEON__
     28 #error "NEON support not enabled"
     29 #endif
     30 
     31 #include <stdint.h>
     32 
     33 typedef float float32_t;
     34 typedef int8_t poly8_t;
     35 typedef int16_t poly16_t;
     36 typedef uint16_t float16_t;
     37 typedef __attribute__((neon_vector_type(8)))  int8_t int8x8_t;
     38 typedef __attribute__((neon_vector_type(16))) int8_t int8x16_t;
     39 typedef __attribute__((neon_vector_type(4)))  int16_t int16x4_t;
     40 typedef __attribute__((neon_vector_type(8)))  int16_t int16x8_t;
     41 typedef __attribute__((neon_vector_type(2)))  int32_t int32x2_t;
     42 typedef __attribute__((neon_vector_type(4)))  int32_t int32x4_t;
     43 typedef __attribute__((neon_vector_type(1)))  int64_t int64x1_t;
     44 typedef __attribute__((neon_vector_type(2)))  int64_t int64x2_t;
     45 typedef __attribute__((neon_vector_type(8)))  uint8_t uint8x8_t;
     46 typedef __attribute__((neon_vector_type(16))) uint8_t uint8x16_t;
     47 typedef __attribute__((neon_vector_type(4)))  uint16_t uint16x4_t;
     48 typedef __attribute__((neon_vector_type(8)))  uint16_t uint16x8_t;
     49 typedef __attribute__((neon_vector_type(2)))  uint32_t uint32x2_t;
     50 typedef __attribute__((neon_vector_type(4)))  uint32_t uint32x4_t;
     51 typedef __attribute__((neon_vector_type(1)))  uint64_t uint64x1_t;
     52 typedef __attribute__((neon_vector_type(2)))  uint64_t uint64x2_t;
     53 typedef __attribute__((neon_vector_type(4)))  float16_t float16x4_t;
     54 typedef __attribute__((neon_vector_type(8)))  float16_t float16x8_t;
     55 typedef __attribute__((neon_vector_type(2)))  float32_t float32x2_t;
     56 typedef __attribute__((neon_vector_type(4)))  float32_t float32x4_t;
     57 typedef __attribute__((neon_polyvector_type(8)))  poly8_t poly8x8_t;
     58 typedef __attribute__((neon_polyvector_type(16))) poly8_t poly8x16_t;
     59 typedef __attribute__((neon_polyvector_type(4)))  poly16_t poly16x4_t;
     60 typedef __attribute__((neon_polyvector_type(8)))  poly16_t poly16x8_t;
     61 
     62 typedef struct int8x8x2_t {
     63   int8x8_t val[2];
     64 } int8x8x2_t;
     65 
     66 typedef struct int8x16x2_t {
     67   int8x16_t val[2];
     68 } int8x16x2_t;
     69 
     70 typedef struct int16x4x2_t {
     71   int16x4_t val[2];
     72 } int16x4x2_t;
     73 
     74 typedef struct int16x8x2_t {
     75   int16x8_t val[2];
     76 } int16x8x2_t;
     77 
     78 typedef struct int32x2x2_t {
     79   int32x2_t val[2];
     80 } int32x2x2_t;
     81 
     82 typedef struct int32x4x2_t {
     83   int32x4_t val[2];
     84 } int32x4x2_t;
     85 
     86 typedef struct int64x1x2_t {
     87   int64x1_t val[2];
     88 } int64x1x2_t;
     89 
     90 typedef struct int64x2x2_t {
     91   int64x2_t val[2];
     92 } int64x2x2_t;
     93 
     94 typedef struct uint8x8x2_t {
     95   uint8x8_t val[2];
     96 } uint8x8x2_t;
     97 
     98 typedef struct uint8x16x2_t {
     99   uint8x16_t val[2];
    100 } uint8x16x2_t;
    101 
    102 typedef struct uint16x4x2_t {
    103   uint16x4_t val[2];
    104 } uint16x4x2_t;
    105 
    106 typedef struct uint16x8x2_t {
    107   uint16x8_t val[2];
    108 } uint16x8x2_t;
    109 
    110 typedef struct uint32x2x2_t {
    111   uint32x2_t val[2];
    112 } uint32x2x2_t;
    113 
    114 typedef struct uint32x4x2_t {
    115   uint32x4_t val[2];
    116 } uint32x4x2_t;
    117 
    118 typedef struct uint64x1x2_t {
    119   uint64x1_t val[2];
    120 } uint64x1x2_t;
    121 
    122 typedef struct uint64x2x2_t {
    123   uint64x2_t val[2];
    124 } uint64x2x2_t;
    125 
    126 typedef struct float16x4x2_t {
    127   float16x4_t val[2];
    128 } float16x4x2_t;
    129 
    130 typedef struct float16x8x2_t {
    131   float16x8_t val[2];
    132 } float16x8x2_t;
    133 
    134 typedef struct float32x2x2_t {
    135   float32x2_t val[2];
    136 } float32x2x2_t;
    137 
    138 typedef struct float32x4x2_t {
    139   float32x4_t val[2];
    140 } float32x4x2_t;
    141 
    142 typedef struct poly8x8x2_t {
    143   poly8x8_t val[2];
    144 } poly8x8x2_t;
    145 
    146 typedef struct poly8x16x2_t {
    147   poly8x16_t val[2];
    148 } poly8x16x2_t;
    149 
    150 typedef struct poly16x4x2_t {
    151   poly16x4_t val[2];
    152 } poly16x4x2_t;
    153 
    154 typedef struct poly16x8x2_t {
    155   poly16x8_t val[2];
    156 } poly16x8x2_t;
    157 
    158 typedef struct int8x8x3_t {
    159   int8x8_t val[3];
    160 } int8x8x3_t;
    161 
    162 typedef struct int8x16x3_t {
    163   int8x16_t val[3];
    164 } int8x16x3_t;
    165 
    166 typedef struct int16x4x3_t {
    167   int16x4_t val[3];
    168 } int16x4x3_t;
    169 
    170 typedef struct int16x8x3_t {
    171   int16x8_t val[3];
    172 } int16x8x3_t;
    173 
    174 typedef struct int32x2x3_t {
    175   int32x2_t val[3];
    176 } int32x2x3_t;
    177 
    178 typedef struct int32x4x3_t {
    179   int32x4_t val[3];
    180 } int32x4x3_t;
    181 
    182 typedef struct int64x1x3_t {
    183   int64x1_t val[3];
    184 } int64x1x3_t;
    185 
    186 typedef struct int64x2x3_t {
    187   int64x2_t val[3];
    188 } int64x2x3_t;
    189 
    190 typedef struct uint8x8x3_t {
    191   uint8x8_t val[3];
    192 } uint8x8x3_t;
    193 
    194 typedef struct uint8x16x3_t {
    195   uint8x16_t val[3];
    196 } uint8x16x3_t;
    197 
    198 typedef struct uint16x4x3_t {
    199   uint16x4_t val[3];
    200 } uint16x4x3_t;
    201 
    202 typedef struct uint16x8x3_t {
    203   uint16x8_t val[3];
    204 } uint16x8x3_t;
    205 
    206 typedef struct uint32x2x3_t {
    207   uint32x2_t val[3];
    208 } uint32x2x3_t;
    209 
    210 typedef struct uint32x4x3_t {
    211   uint32x4_t val[3];
    212 } uint32x4x3_t;
    213 
    214 typedef struct uint64x1x3_t {
    215   uint64x1_t val[3];
    216 } uint64x1x3_t;
    217 
    218 typedef struct uint64x2x3_t {
    219   uint64x2_t val[3];
    220 } uint64x2x3_t;
    221 
    222 typedef struct float16x4x3_t {
    223   float16x4_t val[3];
    224 } float16x4x3_t;
    225 
    226 typedef struct float16x8x3_t {
    227   float16x8_t val[3];
    228 } float16x8x3_t;
    229 
    230 typedef struct float32x2x3_t {
    231   float32x2_t val[3];
    232 } float32x2x3_t;
    233 
    234 typedef struct float32x4x3_t {
    235   float32x4_t val[3];
    236 } float32x4x3_t;
    237 
    238 typedef struct poly8x8x3_t {
    239   poly8x8_t val[3];
    240 } poly8x8x3_t;
    241 
    242 typedef struct poly8x16x3_t {
    243   poly8x16_t val[3];
    244 } poly8x16x3_t;
    245 
    246 typedef struct poly16x4x3_t {
    247   poly16x4_t val[3];
    248 } poly16x4x3_t;
    249 
    250 typedef struct poly16x8x3_t {
    251   poly16x8_t val[3];
    252 } poly16x8x3_t;
    253 
    254 typedef struct int8x8x4_t {
    255   int8x8_t val[4];
    256 } int8x8x4_t;
    257 
    258 typedef struct int8x16x4_t {
    259   int8x16_t val[4];
    260 } int8x16x4_t;
    261 
    262 typedef struct int16x4x4_t {
    263   int16x4_t val[4];
    264 } int16x4x4_t;
    265 
    266 typedef struct int16x8x4_t {
    267   int16x8_t val[4];
    268 } int16x8x4_t;
    269 
    270 typedef struct int32x2x4_t {
    271   int32x2_t val[4];
    272 } int32x2x4_t;
    273 
    274 typedef struct int32x4x4_t {
    275   int32x4_t val[4];
    276 } int32x4x4_t;
    277 
    278 typedef struct int64x1x4_t {
    279   int64x1_t val[4];
    280 } int64x1x4_t;
    281 
    282 typedef struct int64x2x4_t {
    283   int64x2_t val[4];
    284 } int64x2x4_t;
    285 
    286 typedef struct uint8x8x4_t {
    287   uint8x8_t val[4];
    288 } uint8x8x4_t;
    289 
    290 typedef struct uint8x16x4_t {
    291   uint8x16_t val[4];
    292 } uint8x16x4_t;
    293 
    294 typedef struct uint16x4x4_t {
    295   uint16x4_t val[4];
    296 } uint16x4x4_t;
    297 
    298 typedef struct uint16x8x4_t {
    299   uint16x8_t val[4];
    300 } uint16x8x4_t;
    301 
    302 typedef struct uint32x2x4_t {
    303   uint32x2_t val[4];
    304 } uint32x2x4_t;
    305 
    306 typedef struct uint32x4x4_t {
    307   uint32x4_t val[4];
    308 } uint32x4x4_t;
    309 
    310 typedef struct uint64x1x4_t {
    311   uint64x1_t val[4];
    312 } uint64x1x4_t;
    313 
    314 typedef struct uint64x2x4_t {
    315   uint64x2_t val[4];
    316 } uint64x2x4_t;
    317 
    318 typedef struct float16x4x4_t {
    319   float16x4_t val[4];
    320 } float16x4x4_t;
    321 
    322 typedef struct float16x8x4_t {
    323   float16x8_t val[4];
    324 } float16x8x4_t;
    325 
    326 typedef struct float32x2x4_t {
    327   float32x2_t val[4];
    328 } float32x2x4_t;
    329 
    330 typedef struct float32x4x4_t {
    331   float32x4_t val[4];
    332 } float32x4x4_t;
    333 
    334 typedef struct poly8x8x4_t {
    335   poly8x8_t val[4];
    336 } poly8x8x4_t;
    337 
    338 typedef struct poly8x16x4_t {
    339   poly8x16_t val[4];
    340 } poly8x16x4_t;
    341 
    342 typedef struct poly16x4x4_t {
    343   poly16x4_t val[4];
    344 } poly16x4x4_t;
    345 
    346 typedef struct poly16x8x4_t {
    347   poly16x8_t val[4];
    348 } poly16x8x4_t;
    349 
    350 #define __ai static inline __attribute__((__always_inline__, __nodebug__))
    351 
    352 __ai int16x8_t vmovl_s8(int8x8_t __a) {
    353   return (int16x8_t)__builtin_neon_vmovl_v(__a, 33); }
    354 __ai int32x4_t vmovl_s16(int16x4_t __a) {
    355   return (int32x4_t)__builtin_neon_vmovl_v((int8x8_t)__a, 34); }
    356 __ai int64x2_t vmovl_s32(int32x2_t __a) {
    357   return (int64x2_t)__builtin_neon_vmovl_v((int8x8_t)__a, 35); }
    358 __ai uint16x8_t vmovl_u8(uint8x8_t __a) {
    359   return (uint16x8_t)__builtin_neon_vmovl_v((int8x8_t)__a, 49); }
    360 __ai uint32x4_t vmovl_u16(uint16x4_t __a) {
    361   return (uint32x4_t)__builtin_neon_vmovl_v((int8x8_t)__a, 50); }
    362 __ai uint64x2_t vmovl_u32(uint32x2_t __a) {
    363   return (uint64x2_t)__builtin_neon_vmovl_v((int8x8_t)__a, 51); }
    364 
    365 __ai int16x8_t vmull_s8(int8x8_t __a, int8x8_t __b) {
    366   return (int16x8_t)__builtin_neon_vmull_v(__a, __b, 33); }
    367 __ai int32x4_t vmull_s16(int16x4_t __a, int16x4_t __b) {
    368   return (int32x4_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)__b, 34); }
    369 __ai int64x2_t vmull_s32(int32x2_t __a, int32x2_t __b) {
    370   return (int64x2_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)__b, 35); }
    371 __ai uint16x8_t vmull_u8(uint8x8_t __a, uint8x8_t __b) {
    372   return (uint16x8_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)__b, 49); }
    373 __ai uint32x4_t vmull_u16(uint16x4_t __a, uint16x4_t __b) {
    374   return (uint32x4_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)__b, 50); }
    375 __ai uint64x2_t vmull_u32(uint32x2_t __a, uint32x2_t __b) {
    376   return (uint64x2_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)__b, 51); }
    377 __ai poly16x8_t vmull_p8(poly8x8_t __a, poly8x8_t __b) {
    378   return (poly16x8_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)__b, 37); }
    379 
    380 __ai int8x8_t vabd_s8(int8x8_t __a, int8x8_t __b) {
    381   return (int8x8_t)__builtin_neon_vabd_v(__a, __b, 0); }
    382 __ai int16x4_t vabd_s16(int16x4_t __a, int16x4_t __b) {
    383   return (int16x4_t)__builtin_neon_vabd_v((int8x8_t)__a, (int8x8_t)__b, 1); }
    384 __ai int32x2_t vabd_s32(int32x2_t __a, int32x2_t __b) {
    385   return (int32x2_t)__builtin_neon_vabd_v((int8x8_t)__a, (int8x8_t)__b, 2); }
    386 __ai uint8x8_t vabd_u8(uint8x8_t __a, uint8x8_t __b) {
    387   return (uint8x8_t)__builtin_neon_vabd_v((int8x8_t)__a, (int8x8_t)__b, 16); }
    388 __ai uint16x4_t vabd_u16(uint16x4_t __a, uint16x4_t __b) {
    389   return (uint16x4_t)__builtin_neon_vabd_v((int8x8_t)__a, (int8x8_t)__b, 17); }
    390 __ai uint32x2_t vabd_u32(uint32x2_t __a, uint32x2_t __b) {
    391   return (uint32x2_t)__builtin_neon_vabd_v((int8x8_t)__a, (int8x8_t)__b, 18); }
    392 __ai float32x2_t vabd_f32(float32x2_t __a, float32x2_t __b) {
    393   return (float32x2_t)__builtin_neon_vabd_v((int8x8_t)__a, (int8x8_t)__b, 7); }
    394 __ai int8x16_t vabdq_s8(int8x16_t __a, int8x16_t __b) {
    395   return (int8x16_t)__builtin_neon_vabdq_v(__a, __b, 32); }
    396 __ai int16x8_t vabdq_s16(int16x8_t __a, int16x8_t __b) {
    397   return (int16x8_t)__builtin_neon_vabdq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
    398 __ai int32x4_t vabdq_s32(int32x4_t __a, int32x4_t __b) {
    399   return (int32x4_t)__builtin_neon_vabdq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
    400 __ai uint8x16_t vabdq_u8(uint8x16_t __a, uint8x16_t __b) {
    401   return (uint8x16_t)__builtin_neon_vabdq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
    402 __ai uint16x8_t vabdq_u16(uint16x8_t __a, uint16x8_t __b) {
    403   return (uint16x8_t)__builtin_neon_vabdq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
    404 __ai uint32x4_t vabdq_u32(uint32x4_t __a, uint32x4_t __b) {
    405   return (uint32x4_t)__builtin_neon_vabdq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
    406 __ai float32x4_t vabdq_f32(float32x4_t __a, float32x4_t __b) {
    407   return (float32x4_t)__builtin_neon_vabdq_v((int8x16_t)__a, (int8x16_t)__b, 39); }
    408 
    409 __ai int8x8_t vaba_s8(int8x8_t __a, int8x8_t __b, int8x8_t __c) {
    410   return __a + vabd_s8(__b, __c); }
    411 __ai int16x4_t vaba_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) {
    412   return __a + vabd_s16(__b, __c); }
    413 __ai int32x2_t vaba_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) {
    414   return __a + vabd_s32(__b, __c); }
    415 __ai uint8x8_t vaba_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) {
    416   return __a + vabd_u8(__b, __c); }
    417 __ai uint16x4_t vaba_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) {
    418   return __a + vabd_u16(__b, __c); }
    419 __ai uint32x2_t vaba_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) {
    420   return __a + vabd_u32(__b, __c); }
    421 __ai int8x16_t vabaq_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) {
    422   return __a + vabdq_s8(__b, __c); }
    423 __ai int16x8_t vabaq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) {
    424   return __a + vabdq_s16(__b, __c); }
    425 __ai int32x4_t vabaq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) {
    426   return __a + vabdq_s32(__b, __c); }
    427 __ai uint8x16_t vabaq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) {
    428   return __a + vabdq_u8(__b, __c); }
    429 __ai uint16x8_t vabaq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) {
    430   return __a + vabdq_u16(__b, __c); }
    431 __ai uint32x4_t vabaq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) {
    432   return __a + vabdq_u32(__b, __c); }
    433 
    434 __ai int16x8_t vabal_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c) {
    435   return __a + (int16x8_t)vmovl_u8((uint8x8_t)vabd_s8(__b, __c)); }
    436 __ai int32x4_t vabal_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
    437   return __a + (int32x4_t)vmovl_u16((uint16x4_t)vabd_s16(__b, __c)); }
    438 __ai int64x2_t vabal_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
    439   return __a + (int64x2_t)vmovl_u32((uint32x2_t)vabd_s32(__b, __c)); }
    440 __ai uint16x8_t vabal_u8(uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) {
    441   return __a + vmovl_u8(vabd_u8(__b, __c)); }
    442 __ai uint32x4_t vabal_u16(uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) {
    443   return __a + vmovl_u16(vabd_u16(__b, __c)); }
    444 __ai uint64x2_t vabal_u32(uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) {
    445   return __a + vmovl_u32(vabd_u32(__b, __c)); }
    446 
    447 __ai int16x8_t vabdl_s8(int8x8_t __a, int8x8_t __b) {
    448   return (int16x8_t)vmovl_u8((uint8x8_t)vabd_s8(__a, __b)); }
    449 __ai int32x4_t vabdl_s16(int16x4_t __a, int16x4_t __b) {
    450   return (int32x4_t)vmovl_u16((uint16x4_t)vabd_s16(__a, __b)); }
    451 __ai int64x2_t vabdl_s32(int32x2_t __a, int32x2_t __b) {
    452   return (int64x2_t)vmovl_u32((uint32x2_t)vabd_s32(__a, __b)); }
    453 __ai uint16x8_t vabdl_u8(uint8x8_t __a, uint8x8_t __b) {
    454   return vmovl_u8(vabd_u8(__a, __b)); }
    455 __ai uint32x4_t vabdl_u16(uint16x4_t __a, uint16x4_t __b) {
    456   return vmovl_u16(vabd_u16(__a, __b)); }
    457 __ai uint64x2_t vabdl_u32(uint32x2_t __a, uint32x2_t __b) {
    458   return vmovl_u32(vabd_u32(__a, __b)); }
    459 
    460 __ai int8x8_t vabs_s8(int8x8_t __a) {
    461   return (int8x8_t)__builtin_neon_vabs_v(__a, 0); }
    462 __ai int16x4_t vabs_s16(int16x4_t __a) {
    463   return (int16x4_t)__builtin_neon_vabs_v((int8x8_t)__a, 1); }
    464 __ai int32x2_t vabs_s32(int32x2_t __a) {
    465   return (int32x2_t)__builtin_neon_vabs_v((int8x8_t)__a, 2); }
    466 __ai float32x2_t vabs_f32(float32x2_t __a) {
    467   return (float32x2_t)__builtin_neon_vabs_v((int8x8_t)__a, 7); }
    468 __ai int8x16_t vabsq_s8(int8x16_t __a) {
    469   return (int8x16_t)__builtin_neon_vabsq_v(__a, 32); }
    470 __ai int16x8_t vabsq_s16(int16x8_t __a) {
    471   return (int16x8_t)__builtin_neon_vabsq_v((int8x16_t)__a, 33); }
    472 __ai int32x4_t vabsq_s32(int32x4_t __a) {
    473   return (int32x4_t)__builtin_neon_vabsq_v((int8x16_t)__a, 34); }
    474 __ai float32x4_t vabsq_f32(float32x4_t __a) {
    475   return (float32x4_t)__builtin_neon_vabsq_v((int8x16_t)__a, 39); }
    476 
    477 __ai int8x8_t vadd_s8(int8x8_t __a, int8x8_t __b) {
    478   return __a + __b; }
    479 __ai int16x4_t vadd_s16(int16x4_t __a, int16x4_t __b) {
    480   return __a + __b; }
    481 __ai int32x2_t vadd_s32(int32x2_t __a, int32x2_t __b) {
    482   return __a + __b; }
    483 __ai int64x1_t vadd_s64(int64x1_t __a, int64x1_t __b) {
    484   return __a + __b; }
    485 __ai float32x2_t vadd_f32(float32x2_t __a, float32x2_t __b) {
    486   return __a + __b; }
    487 __ai uint8x8_t vadd_u8(uint8x8_t __a, uint8x8_t __b) {
    488   return __a + __b; }
    489 __ai uint16x4_t vadd_u16(uint16x4_t __a, uint16x4_t __b) {
    490   return __a + __b; }
    491 __ai uint32x2_t vadd_u32(uint32x2_t __a, uint32x2_t __b) {
    492   return __a + __b; }
    493 __ai uint64x1_t vadd_u64(uint64x1_t __a, uint64x1_t __b) {
    494   return __a + __b; }
    495 __ai int8x16_t vaddq_s8(int8x16_t __a, int8x16_t __b) {
    496   return __a + __b; }
    497 __ai int16x8_t vaddq_s16(int16x8_t __a, int16x8_t __b) {
    498   return __a + __b; }
    499 __ai int32x4_t vaddq_s32(int32x4_t __a, int32x4_t __b) {
    500   return __a + __b; }
    501 __ai int64x2_t vaddq_s64(int64x2_t __a, int64x2_t __b) {
    502   return __a + __b; }
    503 __ai float32x4_t vaddq_f32(float32x4_t __a, float32x4_t __b) {
    504   return __a + __b; }
    505 __ai uint8x16_t vaddq_u8(uint8x16_t __a, uint8x16_t __b) {
    506   return __a + __b; }
    507 __ai uint16x8_t vaddq_u16(uint16x8_t __a, uint16x8_t __b) {
    508   return __a + __b; }
    509 __ai uint32x4_t vaddq_u32(uint32x4_t __a, uint32x4_t __b) {
    510   return __a + __b; }
    511 __ai uint64x2_t vaddq_u64(uint64x2_t __a, uint64x2_t __b) {
    512   return __a + __b; }
    513 
    514 __ai int8x8_t vaddhn_s16(int16x8_t __a, int16x8_t __b) {
    515   return (int8x8_t)__builtin_neon_vaddhn_v((int8x16_t)__a, (int8x16_t)__b, 0); }
    516 __ai int16x4_t vaddhn_s32(int32x4_t __a, int32x4_t __b) {
    517   return (int16x4_t)__builtin_neon_vaddhn_v((int8x16_t)__a, (int8x16_t)__b, 1); }
    518 __ai int32x2_t vaddhn_s64(int64x2_t __a, int64x2_t __b) {
    519   return (int32x2_t)__builtin_neon_vaddhn_v((int8x16_t)__a, (int8x16_t)__b, 2); }
    520 __ai uint8x8_t vaddhn_u16(uint16x8_t __a, uint16x8_t __b) {
    521   return (uint8x8_t)__builtin_neon_vaddhn_v((int8x16_t)__a, (int8x16_t)__b, 16); }
    522 __ai uint16x4_t vaddhn_u32(uint32x4_t __a, uint32x4_t __b) {
    523   return (uint16x4_t)__builtin_neon_vaddhn_v((int8x16_t)__a, (int8x16_t)__b, 17); }
    524 __ai uint32x2_t vaddhn_u64(uint64x2_t __a, uint64x2_t __b) {
    525   return (uint32x2_t)__builtin_neon_vaddhn_v((int8x16_t)__a, (int8x16_t)__b, 18); }
    526 
    527 __ai int16x8_t vaddl_s8(int8x8_t __a, int8x8_t __b) {
    528   return vmovl_s8(__a) + vmovl_s8(__b); }
    529 __ai int32x4_t vaddl_s16(int16x4_t __a, int16x4_t __b) {
    530   return vmovl_s16(__a) + vmovl_s16(__b); }
    531 __ai int64x2_t vaddl_s32(int32x2_t __a, int32x2_t __b) {
    532   return vmovl_s32(__a) + vmovl_s32(__b); }
    533 __ai uint16x8_t vaddl_u8(uint8x8_t __a, uint8x8_t __b) {
    534   return vmovl_u8(__a) + vmovl_u8(__b); }
    535 __ai uint32x4_t vaddl_u16(uint16x4_t __a, uint16x4_t __b) {
    536   return vmovl_u16(__a) + vmovl_u16(__b); }
    537 __ai uint64x2_t vaddl_u32(uint32x2_t __a, uint32x2_t __b) {
    538   return vmovl_u32(__a) + vmovl_u32(__b); }
    539 
    540 __ai int16x8_t vaddw_s8(int16x8_t __a, int8x8_t __b) {
    541   return __a + vmovl_s8(__b); }
    542 __ai int32x4_t vaddw_s16(int32x4_t __a, int16x4_t __b) {
    543   return __a + vmovl_s16(__b); }
    544 __ai int64x2_t vaddw_s32(int64x2_t __a, int32x2_t __b) {
    545   return __a + vmovl_s32(__b); }
    546 __ai uint16x8_t vaddw_u8(uint16x8_t __a, uint8x8_t __b) {
    547   return __a + vmovl_u8(__b); }
    548 __ai uint32x4_t vaddw_u16(uint32x4_t __a, uint16x4_t __b) {
    549   return __a + vmovl_u16(__b); }
    550 __ai uint64x2_t vaddw_u32(uint64x2_t __a, uint32x2_t __b) {
    551   return __a + vmovl_u32(__b); }
    552 
    553 __ai int8x8_t vand_s8(int8x8_t __a, int8x8_t __b) {
    554   return __a & __b; }
    555 __ai int16x4_t vand_s16(int16x4_t __a, int16x4_t __b) {
    556   return __a & __b; }
    557 __ai int32x2_t vand_s32(int32x2_t __a, int32x2_t __b) {
    558   return __a & __b; }
    559 __ai int64x1_t vand_s64(int64x1_t __a, int64x1_t __b) {
    560   return __a & __b; }
    561 __ai uint8x8_t vand_u8(uint8x8_t __a, uint8x8_t __b) {
    562   return __a & __b; }
    563 __ai uint16x4_t vand_u16(uint16x4_t __a, uint16x4_t __b) {
    564   return __a & __b; }
    565 __ai uint32x2_t vand_u32(uint32x2_t __a, uint32x2_t __b) {
    566   return __a & __b; }
    567 __ai uint64x1_t vand_u64(uint64x1_t __a, uint64x1_t __b) {
    568   return __a & __b; }
    569 __ai int8x16_t vandq_s8(int8x16_t __a, int8x16_t __b) {
    570   return __a & __b; }
    571 __ai int16x8_t vandq_s16(int16x8_t __a, int16x8_t __b) {
    572   return __a & __b; }
    573 __ai int32x4_t vandq_s32(int32x4_t __a, int32x4_t __b) {
    574   return __a & __b; }
    575 __ai int64x2_t vandq_s64(int64x2_t __a, int64x2_t __b) {
    576   return __a & __b; }
    577 __ai uint8x16_t vandq_u8(uint8x16_t __a, uint8x16_t __b) {
    578   return __a & __b; }
    579 __ai uint16x8_t vandq_u16(uint16x8_t __a, uint16x8_t __b) {
    580   return __a & __b; }
    581 __ai uint32x4_t vandq_u32(uint32x4_t __a, uint32x4_t __b) {
    582   return __a & __b; }
    583 __ai uint64x2_t vandq_u64(uint64x2_t __a, uint64x2_t __b) {
    584   return __a & __b; }
    585 
    586 __ai int8x8_t vbic_s8(int8x8_t __a, int8x8_t __b) {
    587   return __a & ~__b; }
    588 __ai int16x4_t vbic_s16(int16x4_t __a, int16x4_t __b) {
    589   return __a & ~__b; }
    590 __ai int32x2_t vbic_s32(int32x2_t __a, int32x2_t __b) {
    591   return __a & ~__b; }
    592 __ai int64x1_t vbic_s64(int64x1_t __a, int64x1_t __b) {
    593   return __a & ~__b; }
    594 __ai uint8x8_t vbic_u8(uint8x8_t __a, uint8x8_t __b) {
    595   return __a & ~__b; }
    596 __ai uint16x4_t vbic_u16(uint16x4_t __a, uint16x4_t __b) {
    597   return __a & ~__b; }
    598 __ai uint32x2_t vbic_u32(uint32x2_t __a, uint32x2_t __b) {
    599   return __a & ~__b; }
    600 __ai uint64x1_t vbic_u64(uint64x1_t __a, uint64x1_t __b) {
    601   return __a & ~__b; }
    602 __ai int8x16_t vbicq_s8(int8x16_t __a, int8x16_t __b) {
    603   return __a & ~__b; }
    604 __ai int16x8_t vbicq_s16(int16x8_t __a, int16x8_t __b) {
    605   return __a & ~__b; }
    606 __ai int32x4_t vbicq_s32(int32x4_t __a, int32x4_t __b) {
    607   return __a & ~__b; }
    608 __ai int64x2_t vbicq_s64(int64x2_t __a, int64x2_t __b) {
    609   return __a & ~__b; }
    610 __ai uint8x16_t vbicq_u8(uint8x16_t __a, uint8x16_t __b) {
    611   return __a & ~__b; }
    612 __ai uint16x8_t vbicq_u16(uint16x8_t __a, uint16x8_t __b) {
    613   return __a & ~__b; }
    614 __ai uint32x4_t vbicq_u32(uint32x4_t __a, uint32x4_t __b) {
    615   return __a & ~__b; }
    616 __ai uint64x2_t vbicq_u64(uint64x2_t __a, uint64x2_t __b) {
    617   return __a & ~__b; }
    618 
    619 __ai int8x8_t vbsl_s8(uint8x8_t __a, int8x8_t __b, int8x8_t __c) {
    620   return (int8x8_t)__builtin_neon_vbsl_v((int8x8_t)__a, __b, __c, 0); }
    621 __ai int16x4_t vbsl_s16(uint16x4_t __a, int16x4_t __b, int16x4_t __c) {
    622   return (int16x4_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 1); }
    623 __ai int32x2_t vbsl_s32(uint32x2_t __a, int32x2_t __b, int32x2_t __c) {
    624   return (int32x2_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 2); }
    625 __ai int64x1_t vbsl_s64(uint64x1_t __a, int64x1_t __b, int64x1_t __c) {
    626   return (int64x1_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 3); }
    627 __ai uint8x8_t vbsl_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) {
    628   return (uint8x8_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 16); }
    629 __ai uint16x4_t vbsl_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) {
    630   return (uint16x4_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 17); }
    631 __ai uint32x2_t vbsl_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) {
    632   return (uint32x2_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 18); }
    633 __ai uint64x1_t vbsl_u64(uint64x1_t __a, uint64x1_t __b, uint64x1_t __c) {
    634   return (uint64x1_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 19); }
    635 __ai float32x2_t vbsl_f32(uint32x2_t __a, float32x2_t __b, float32x2_t __c) {
    636   return (float32x2_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 7); }
    637 __ai poly8x8_t vbsl_p8(uint8x8_t __a, poly8x8_t __b, poly8x8_t __c) {
    638   return (poly8x8_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 4); }
    639 __ai poly16x4_t vbsl_p16(uint16x4_t __a, poly16x4_t __b, poly16x4_t __c) {
    640   return (poly16x4_t)__builtin_neon_vbsl_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 5); }
    641 __ai int8x16_t vbslq_s8(uint8x16_t __a, int8x16_t __b, int8x16_t __c) {
    642   return (int8x16_t)__builtin_neon_vbslq_v((int8x16_t)__a, __b, __c, 32); }
    643 __ai int16x8_t vbslq_s16(uint16x8_t __a, int16x8_t __b, int16x8_t __c) {
    644   return (int16x8_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 33); }
    645 __ai int32x4_t vbslq_s32(uint32x4_t __a, int32x4_t __b, int32x4_t __c) {
    646   return (int32x4_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 34); }
    647 __ai int64x2_t vbslq_s64(uint64x2_t __a, int64x2_t __b, int64x2_t __c) {
    648   return (int64x2_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 35); }
    649 __ai uint8x16_t vbslq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) {
    650   return (uint8x16_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 48); }
    651 __ai uint16x8_t vbslq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) {
    652   return (uint16x8_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 49); }
    653 __ai uint32x4_t vbslq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) {
    654   return (uint32x4_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 50); }
    655 __ai uint64x2_t vbslq_u64(uint64x2_t __a, uint64x2_t __b, uint64x2_t __c) {
    656   return (uint64x2_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 51); }
    657 __ai float32x4_t vbslq_f32(uint32x4_t __a, float32x4_t __b, float32x4_t __c) {
    658   return (float32x4_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 39); }
    659 __ai poly8x16_t vbslq_p8(uint8x16_t __a, poly8x16_t __b, poly8x16_t __c) {
    660   return (poly8x16_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 36); }
    661 __ai poly16x8_t vbslq_p16(uint16x8_t __a, poly16x8_t __b, poly16x8_t __c) {
    662   return (poly16x8_t)__builtin_neon_vbslq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 37); }
    663 
    664 __ai uint32x2_t vcage_f32(float32x2_t __a, float32x2_t __b) {
    665   return (uint32x2_t)__builtin_neon_vcage_v((int8x8_t)__a, (int8x8_t)__b, 18); }
    666 __ai uint32x4_t vcageq_f32(float32x4_t __a, float32x4_t __b) {
    667   return (uint32x4_t)__builtin_neon_vcageq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
    668 
    669 __ai uint32x2_t vcagt_f32(float32x2_t __a, float32x2_t __b) {
    670   return (uint32x2_t)__builtin_neon_vcagt_v((int8x8_t)__a, (int8x8_t)__b, 18); }
    671 __ai uint32x4_t vcagtq_f32(float32x4_t __a, float32x4_t __b) {
    672   return (uint32x4_t)__builtin_neon_vcagtq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
    673 
    674 __ai uint32x2_t vcale_f32(float32x2_t __a, float32x2_t __b) {
    675   return (uint32x2_t)__builtin_neon_vcale_v((int8x8_t)__a, (int8x8_t)__b, 18); }
    676 __ai uint32x4_t vcaleq_f32(float32x4_t __a, float32x4_t __b) {
    677   return (uint32x4_t)__builtin_neon_vcaleq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
    678 
    679 __ai uint32x2_t vcalt_f32(float32x2_t __a, float32x2_t __b) {
    680   return (uint32x2_t)__builtin_neon_vcalt_v((int8x8_t)__a, (int8x8_t)__b, 18); }
    681 __ai uint32x4_t vcaltq_f32(float32x4_t __a, float32x4_t __b) {
    682   return (uint32x4_t)__builtin_neon_vcaltq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
    683 
    684 __ai uint8x8_t vceq_s8(int8x8_t __a, int8x8_t __b) {
    685   return (uint8x8_t)(__a == __b); }
    686 __ai uint16x4_t vceq_s16(int16x4_t __a, int16x4_t __b) {
    687   return (uint16x4_t)(__a == __b); }
    688 __ai uint32x2_t vceq_s32(int32x2_t __a, int32x2_t __b) {
    689   return (uint32x2_t)(__a == __b); }
    690 __ai uint32x2_t vceq_f32(float32x2_t __a, float32x2_t __b) {
    691   return (uint32x2_t)(__a == __b); }
    692 __ai uint8x8_t vceq_u8(uint8x8_t __a, uint8x8_t __b) {
    693   return (uint8x8_t)(__a == __b); }
    694 __ai uint16x4_t vceq_u16(uint16x4_t __a, uint16x4_t __b) {
    695   return (uint16x4_t)(__a == __b); }
    696 __ai uint32x2_t vceq_u32(uint32x2_t __a, uint32x2_t __b) {
    697   return (uint32x2_t)(__a == __b); }
    698 __ai uint8x8_t vceq_p8(poly8x8_t __a, poly8x8_t __b) {
    699   return (uint8x8_t)(__a == __b); }
    700 __ai uint8x16_t vceqq_s8(int8x16_t __a, int8x16_t __b) {
    701   return (uint8x16_t)(__a == __b); }
    702 __ai uint16x8_t vceqq_s16(int16x8_t __a, int16x8_t __b) {
    703   return (uint16x8_t)(__a == __b); }
    704 __ai uint32x4_t vceqq_s32(int32x4_t __a, int32x4_t __b) {
    705   return (uint32x4_t)(__a == __b); }
    706 __ai uint32x4_t vceqq_f32(float32x4_t __a, float32x4_t __b) {
    707   return (uint32x4_t)(__a == __b); }
    708 __ai uint8x16_t vceqq_u8(uint8x16_t __a, uint8x16_t __b) {
    709   return (uint8x16_t)(__a == __b); }
    710 __ai uint16x8_t vceqq_u16(uint16x8_t __a, uint16x8_t __b) {
    711   return (uint16x8_t)(__a == __b); }
    712 __ai uint32x4_t vceqq_u32(uint32x4_t __a, uint32x4_t __b) {
    713   return (uint32x4_t)(__a == __b); }
    714 __ai uint8x16_t vceqq_p8(poly8x16_t __a, poly8x16_t __b) {
    715   return (uint8x16_t)(__a == __b); }
    716 
    717 __ai uint8x8_t vcge_s8(int8x8_t __a, int8x8_t __b) {
    718   return (uint8x8_t)(__a >= __b); }
    719 __ai uint16x4_t vcge_s16(int16x4_t __a, int16x4_t __b) {
    720   return (uint16x4_t)(__a >= __b); }
    721 __ai uint32x2_t vcge_s32(int32x2_t __a, int32x2_t __b) {
    722   return (uint32x2_t)(__a >= __b); }
    723 __ai uint32x2_t vcge_f32(float32x2_t __a, float32x2_t __b) {
    724   return (uint32x2_t)(__a >= __b); }
    725 __ai uint8x8_t vcge_u8(uint8x8_t __a, uint8x8_t __b) {
    726   return (uint8x8_t)(__a >= __b); }
    727 __ai uint16x4_t vcge_u16(uint16x4_t __a, uint16x4_t __b) {
    728   return (uint16x4_t)(__a >= __b); }
    729 __ai uint32x2_t vcge_u32(uint32x2_t __a, uint32x2_t __b) {
    730   return (uint32x2_t)(__a >= __b); }
    731 __ai uint8x16_t vcgeq_s8(int8x16_t __a, int8x16_t __b) {
    732   return (uint8x16_t)(__a >= __b); }
    733 __ai uint16x8_t vcgeq_s16(int16x8_t __a, int16x8_t __b) {
    734   return (uint16x8_t)(__a >= __b); }
    735 __ai uint32x4_t vcgeq_s32(int32x4_t __a, int32x4_t __b) {
    736   return (uint32x4_t)(__a >= __b); }
    737 __ai uint32x4_t vcgeq_f32(float32x4_t __a, float32x4_t __b) {
    738   return (uint32x4_t)(__a >= __b); }
    739 __ai uint8x16_t vcgeq_u8(uint8x16_t __a, uint8x16_t __b) {
    740   return (uint8x16_t)(__a >= __b); }
    741 __ai uint16x8_t vcgeq_u16(uint16x8_t __a, uint16x8_t __b) {
    742   return (uint16x8_t)(__a >= __b); }
    743 __ai uint32x4_t vcgeq_u32(uint32x4_t __a, uint32x4_t __b) {
    744   return (uint32x4_t)(__a >= __b); }
    745 
    746 __ai uint8x8_t vcgt_s8(int8x8_t __a, int8x8_t __b) {
    747   return (uint8x8_t)(__a > __b); }
    748 __ai uint16x4_t vcgt_s16(int16x4_t __a, int16x4_t __b) {
    749   return (uint16x4_t)(__a > __b); }
    750 __ai uint32x2_t vcgt_s32(int32x2_t __a, int32x2_t __b) {
    751   return (uint32x2_t)(__a > __b); }
    752 __ai uint32x2_t vcgt_f32(float32x2_t __a, float32x2_t __b) {
    753   return (uint32x2_t)(__a > __b); }
    754 __ai uint8x8_t vcgt_u8(uint8x8_t __a, uint8x8_t __b) {
    755   return (uint8x8_t)(__a > __b); }
    756 __ai uint16x4_t vcgt_u16(uint16x4_t __a, uint16x4_t __b) {
    757   return (uint16x4_t)(__a > __b); }
    758 __ai uint32x2_t vcgt_u32(uint32x2_t __a, uint32x2_t __b) {
    759   return (uint32x2_t)(__a > __b); }
    760 __ai uint8x16_t vcgtq_s8(int8x16_t __a, int8x16_t __b) {
    761   return (uint8x16_t)(__a > __b); }
    762 __ai uint16x8_t vcgtq_s16(int16x8_t __a, int16x8_t __b) {
    763   return (uint16x8_t)(__a > __b); }
    764 __ai uint32x4_t vcgtq_s32(int32x4_t __a, int32x4_t __b) {
    765   return (uint32x4_t)(__a > __b); }
    766 __ai uint32x4_t vcgtq_f32(float32x4_t __a, float32x4_t __b) {
    767   return (uint32x4_t)(__a > __b); }
    768 __ai uint8x16_t vcgtq_u8(uint8x16_t __a, uint8x16_t __b) {
    769   return (uint8x16_t)(__a > __b); }
    770 __ai uint16x8_t vcgtq_u16(uint16x8_t __a, uint16x8_t __b) {
    771   return (uint16x8_t)(__a > __b); }
    772 __ai uint32x4_t vcgtq_u32(uint32x4_t __a, uint32x4_t __b) {
    773   return (uint32x4_t)(__a > __b); }
    774 
    775 __ai uint8x8_t vcle_s8(int8x8_t __a, int8x8_t __b) {
    776   return (uint8x8_t)(__a <= __b); }
    777 __ai uint16x4_t vcle_s16(int16x4_t __a, int16x4_t __b) {
    778   return (uint16x4_t)(__a <= __b); }
    779 __ai uint32x2_t vcle_s32(int32x2_t __a, int32x2_t __b) {
    780   return (uint32x2_t)(__a <= __b); }
    781 __ai uint32x2_t vcle_f32(float32x2_t __a, float32x2_t __b) {
    782   return (uint32x2_t)(__a <= __b); }
    783 __ai uint8x8_t vcle_u8(uint8x8_t __a, uint8x8_t __b) {
    784   return (uint8x8_t)(__a <= __b); }
    785 __ai uint16x4_t vcle_u16(uint16x4_t __a, uint16x4_t __b) {
    786   return (uint16x4_t)(__a <= __b); }
    787 __ai uint32x2_t vcle_u32(uint32x2_t __a, uint32x2_t __b) {
    788   return (uint32x2_t)(__a <= __b); }
    789 __ai uint8x16_t vcleq_s8(int8x16_t __a, int8x16_t __b) {
    790   return (uint8x16_t)(__a <= __b); }
    791 __ai uint16x8_t vcleq_s16(int16x8_t __a, int16x8_t __b) {
    792   return (uint16x8_t)(__a <= __b); }
    793 __ai uint32x4_t vcleq_s32(int32x4_t __a, int32x4_t __b) {
    794   return (uint32x4_t)(__a <= __b); }
    795 __ai uint32x4_t vcleq_f32(float32x4_t __a, float32x4_t __b) {
    796   return (uint32x4_t)(__a <= __b); }
    797 __ai uint8x16_t vcleq_u8(uint8x16_t __a, uint8x16_t __b) {
    798   return (uint8x16_t)(__a <= __b); }
    799 __ai uint16x8_t vcleq_u16(uint16x8_t __a, uint16x8_t __b) {
    800   return (uint16x8_t)(__a <= __b); }
    801 __ai uint32x4_t vcleq_u32(uint32x4_t __a, uint32x4_t __b) {
    802   return (uint32x4_t)(__a <= __b); }
    803 
    804 __ai int8x8_t vcls_s8(int8x8_t __a) {
    805   return (int8x8_t)__builtin_neon_vcls_v(__a, 0); }
    806 __ai int16x4_t vcls_s16(int16x4_t __a) {
    807   return (int16x4_t)__builtin_neon_vcls_v((int8x8_t)__a, 1); }
    808 __ai int32x2_t vcls_s32(int32x2_t __a) {
    809   return (int32x2_t)__builtin_neon_vcls_v((int8x8_t)__a, 2); }
    810 __ai int8x16_t vclsq_s8(int8x16_t __a) {
    811   return (int8x16_t)__builtin_neon_vclsq_v(__a, 32); }
    812 __ai int16x8_t vclsq_s16(int16x8_t __a) {
    813   return (int16x8_t)__builtin_neon_vclsq_v((int8x16_t)__a, 33); }
    814 __ai int32x4_t vclsq_s32(int32x4_t __a) {
    815   return (int32x4_t)__builtin_neon_vclsq_v((int8x16_t)__a, 34); }
    816 
    817 __ai uint8x8_t vclt_s8(int8x8_t __a, int8x8_t __b) {
    818   return (uint8x8_t)(__a < __b); }
    819 __ai uint16x4_t vclt_s16(int16x4_t __a, int16x4_t __b) {
    820   return (uint16x4_t)(__a < __b); }
    821 __ai uint32x2_t vclt_s32(int32x2_t __a, int32x2_t __b) {
    822   return (uint32x2_t)(__a < __b); }
    823 __ai uint32x2_t vclt_f32(float32x2_t __a, float32x2_t __b) {
    824   return (uint32x2_t)(__a < __b); }
    825 __ai uint8x8_t vclt_u8(uint8x8_t __a, uint8x8_t __b) {
    826   return (uint8x8_t)(__a < __b); }
    827 __ai uint16x4_t vclt_u16(uint16x4_t __a, uint16x4_t __b) {
    828   return (uint16x4_t)(__a < __b); }
    829 __ai uint32x2_t vclt_u32(uint32x2_t __a, uint32x2_t __b) {
    830   return (uint32x2_t)(__a < __b); }
    831 __ai uint8x16_t vcltq_s8(int8x16_t __a, int8x16_t __b) {
    832   return (uint8x16_t)(__a < __b); }
    833 __ai uint16x8_t vcltq_s16(int16x8_t __a, int16x8_t __b) {
    834   return (uint16x8_t)(__a < __b); }
    835 __ai uint32x4_t vcltq_s32(int32x4_t __a, int32x4_t __b) {
    836   return (uint32x4_t)(__a < __b); }
    837 __ai uint32x4_t vcltq_f32(float32x4_t __a, float32x4_t __b) {
    838   return (uint32x4_t)(__a < __b); }
    839 __ai uint8x16_t vcltq_u8(uint8x16_t __a, uint8x16_t __b) {
    840   return (uint8x16_t)(__a < __b); }
    841 __ai uint16x8_t vcltq_u16(uint16x8_t __a, uint16x8_t __b) {
    842   return (uint16x8_t)(__a < __b); }
    843 __ai uint32x4_t vcltq_u32(uint32x4_t __a, uint32x4_t __b) {
    844   return (uint32x4_t)(__a < __b); }
    845 
    846 __ai int8x8_t vclz_s8(int8x8_t __a) {
    847   return (int8x8_t)__builtin_neon_vclz_v(__a, 0); }
    848 __ai int16x4_t vclz_s16(int16x4_t __a) {
    849   return (int16x4_t)__builtin_neon_vclz_v((int8x8_t)__a, 1); }
    850 __ai int32x2_t vclz_s32(int32x2_t __a) {
    851   return (int32x2_t)__builtin_neon_vclz_v((int8x8_t)__a, 2); }
    852 __ai uint8x8_t vclz_u8(uint8x8_t __a) {
    853   return (uint8x8_t)__builtin_neon_vclz_v((int8x8_t)__a, 16); }
    854 __ai uint16x4_t vclz_u16(uint16x4_t __a) {
    855   return (uint16x4_t)__builtin_neon_vclz_v((int8x8_t)__a, 17); }
    856 __ai uint32x2_t vclz_u32(uint32x2_t __a) {
    857   return (uint32x2_t)__builtin_neon_vclz_v((int8x8_t)__a, 18); }
    858 __ai int8x16_t vclzq_s8(int8x16_t __a) {
    859   return (int8x16_t)__builtin_neon_vclzq_v(__a, 32); }
    860 __ai int16x8_t vclzq_s16(int16x8_t __a) {
    861   return (int16x8_t)__builtin_neon_vclzq_v((int8x16_t)__a, 33); }
    862 __ai int32x4_t vclzq_s32(int32x4_t __a) {
    863   return (int32x4_t)__builtin_neon_vclzq_v((int8x16_t)__a, 34); }
    864 __ai uint8x16_t vclzq_u8(uint8x16_t __a) {
    865   return (uint8x16_t)__builtin_neon_vclzq_v((int8x16_t)__a, 48); }
    866 __ai uint16x8_t vclzq_u16(uint16x8_t __a) {
    867   return (uint16x8_t)__builtin_neon_vclzq_v((int8x16_t)__a, 49); }
    868 __ai uint32x4_t vclzq_u32(uint32x4_t __a) {
    869   return (uint32x4_t)__builtin_neon_vclzq_v((int8x16_t)__a, 50); }
    870 
    871 __ai uint8x8_t vcnt_u8(uint8x8_t __a) {
    872   return (uint8x8_t)__builtin_neon_vcnt_v((int8x8_t)__a, 16); }
    873 __ai int8x8_t vcnt_s8(int8x8_t __a) {
    874   return (int8x8_t)__builtin_neon_vcnt_v(__a, 0); }
    875 __ai poly8x8_t vcnt_p8(poly8x8_t __a) {
    876   return (poly8x8_t)__builtin_neon_vcnt_v((int8x8_t)__a, 4); }
    877 __ai uint8x16_t vcntq_u8(uint8x16_t __a) {
    878   return (uint8x16_t)__builtin_neon_vcntq_v((int8x16_t)__a, 48); }
    879 __ai int8x16_t vcntq_s8(int8x16_t __a) {
    880   return (int8x16_t)__builtin_neon_vcntq_v(__a, 32); }
    881 __ai poly8x16_t vcntq_p8(poly8x16_t __a) {
    882   return (poly8x16_t)__builtin_neon_vcntq_v((int8x16_t)__a, 36); }
    883 
    884 __ai int8x16_t vcombine_s8(int8x8_t __a, int8x8_t __b) {
    885   return (int8x16_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    886 __ai int16x8_t vcombine_s16(int16x4_t __a, int16x4_t __b) {
    887   return (int16x8_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    888 __ai int32x4_t vcombine_s32(int32x2_t __a, int32x2_t __b) {
    889   return (int32x4_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    890 __ai int64x2_t vcombine_s64(int64x1_t __a, int64x1_t __b) {
    891   return (int64x2_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    892 __ai float16x8_t vcombine_f16(float16x4_t __a, float16x4_t __b) {
    893   return (float16x8_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    894 __ai float32x4_t vcombine_f32(float32x2_t __a, float32x2_t __b) {
    895   return (float32x4_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    896 __ai uint8x16_t vcombine_u8(uint8x8_t __a, uint8x8_t __b) {
    897   return (uint8x16_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    898 __ai uint16x8_t vcombine_u16(uint16x4_t __a, uint16x4_t __b) {
    899   return (uint16x8_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    900 __ai uint32x4_t vcombine_u32(uint32x2_t __a, uint32x2_t __b) {
    901   return (uint32x4_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    902 __ai uint64x2_t vcombine_u64(uint64x1_t __a, uint64x1_t __b) {
    903   return (uint64x2_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    904 __ai poly8x16_t vcombine_p8(poly8x8_t __a, poly8x8_t __b) {
    905   return (poly8x16_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    906 __ai poly16x8_t vcombine_p16(poly16x4_t __a, poly16x4_t __b) {
    907   return (poly16x8_t)__builtin_shufflevector((int64x1_t)__a, (int64x1_t)__b, 0, 1); }
    908 
    909 __ai int8x8_t vcreate_s8(uint64_t __a) {
    910   return (int8x8_t)__a; }
    911 __ai int16x4_t vcreate_s16(uint64_t __a) {
    912   return (int16x4_t)__a; }
    913 __ai int32x2_t vcreate_s32(uint64_t __a) {
    914   return (int32x2_t)__a; }
    915 __ai float16x4_t vcreate_f16(uint64_t __a) {
    916   return (float16x4_t)__a; }
    917 __ai float32x2_t vcreate_f32(uint64_t __a) {
    918   return (float32x2_t)__a; }
    919 __ai uint8x8_t vcreate_u8(uint64_t __a) {
    920   return (uint8x8_t)__a; }
    921 __ai uint16x4_t vcreate_u16(uint64_t __a) {
    922   return (uint16x4_t)__a; }
    923 __ai uint32x2_t vcreate_u32(uint64_t __a) {
    924   return (uint32x2_t)__a; }
    925 __ai uint64x1_t vcreate_u64(uint64_t __a) {
    926   return (uint64x1_t)__a; }
    927 __ai poly8x8_t vcreate_p8(uint64_t __a) {
    928   return (poly8x8_t)__a; }
    929 __ai poly16x4_t vcreate_p16(uint64_t __a) {
    930   return (poly16x4_t)__a; }
    931 __ai int64x1_t vcreate_s64(uint64_t __a) {
    932   return (int64x1_t)__a; }
    933 
    934 __ai float16x4_t vcvt_f16_f32(float32x4_t __a) {
    935   return (float16x4_t)__builtin_neon_vcvt_f16_v((int8x16_t)__a, 6); }
    936 
    937 __ai float32x2_t vcvt_f32_s32(int32x2_t __a) {
    938   return (float32x2_t)__builtin_neon_vcvt_f32_v((int8x8_t)__a, 2); }
    939 __ai float32x2_t vcvt_f32_u32(uint32x2_t __a) {
    940   return (float32x2_t)__builtin_neon_vcvt_f32_v((int8x8_t)__a, 18); }
    941 __ai float32x4_t vcvtq_f32_s32(int32x4_t __a) {
    942   return (float32x4_t)__builtin_neon_vcvtq_f32_v((int8x16_t)__a, 34); }
    943 __ai float32x4_t vcvtq_f32_u32(uint32x4_t __a) {
    944   return (float32x4_t)__builtin_neon_vcvtq_f32_v((int8x16_t)__a, 50); }
    945 
    946 __ai float32x4_t vcvt_f32_f16(float16x4_t __a) {
    947   return (float32x4_t)__builtin_neon_vcvt_f32_f16((int8x8_t)__a, 6); }
    948 
    949 #define vcvt_n_f32_s32(a, __b) __extension__ ({ \
    950   int32x2_t __a = (a); \
    951   (float32x2_t)__builtin_neon_vcvt_n_f32_v((int8x8_t)__a, __b, 2); })
    952 #define vcvt_n_f32_u32(a, __b) __extension__ ({ \
    953   uint32x2_t __a = (a); \
    954   (float32x2_t)__builtin_neon_vcvt_n_f32_v((int8x8_t)__a, __b, 18); })
    955 #define vcvtq_n_f32_s32(a, __b) __extension__ ({ \
    956   int32x4_t __a = (a); \
    957   (float32x4_t)__builtin_neon_vcvtq_n_f32_v((int8x16_t)__a, __b, 34); })
    958 #define vcvtq_n_f32_u32(a, __b) __extension__ ({ \
    959   uint32x4_t __a = (a); \
    960   (float32x4_t)__builtin_neon_vcvtq_n_f32_v((int8x16_t)__a, __b, 50); })
    961 
    962 #define vcvt_n_s32_f32(a, __b) __extension__ ({ \
    963   float32x2_t __a = (a); \
    964   (int32x2_t)__builtin_neon_vcvt_n_s32_v((int8x8_t)__a, __b, 2); })
    965 #define vcvtq_n_s32_f32(a, __b) __extension__ ({ \
    966   float32x4_t __a = (a); \
    967   (int32x4_t)__builtin_neon_vcvtq_n_s32_v((int8x16_t)__a, __b, 34); })
    968 
    969 #define vcvt_n_u32_f32(a, __b) __extension__ ({ \
    970   float32x2_t __a = (a); \
    971   (uint32x2_t)__builtin_neon_vcvt_n_u32_v((int8x8_t)__a, __b, 18); })
    972 #define vcvtq_n_u32_f32(a, __b) __extension__ ({ \
    973   float32x4_t __a = (a); \
    974   (uint32x4_t)__builtin_neon_vcvtq_n_u32_v((int8x16_t)__a, __b, 50); })
    975 
    976 __ai int32x2_t vcvt_s32_f32(float32x2_t __a) {
    977   return (int32x2_t)__builtin_neon_vcvt_s32_v((int8x8_t)__a, 2); }
    978 __ai int32x4_t vcvtq_s32_f32(float32x4_t __a) {
    979   return (int32x4_t)__builtin_neon_vcvtq_s32_v((int8x16_t)__a, 34); }
    980 
    981 __ai uint32x2_t vcvt_u32_f32(float32x2_t __a) {
    982   return (uint32x2_t)__builtin_neon_vcvt_u32_v((int8x8_t)__a, 18); }
    983 __ai uint32x4_t vcvtq_u32_f32(float32x4_t __a) {
    984   return (uint32x4_t)__builtin_neon_vcvtq_u32_v((int8x16_t)__a, 50); }
    985 
    986 #define vdup_lane_u8(a, __b) __extension__ ({ \
    987   uint8x8_t __a = (a); \
    988   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b); })
    989 #define vdup_lane_u16(a, __b) __extension__ ({ \
    990   uint16x4_t __a = (a); \
    991   __builtin_shufflevector(__a, __a, __b, __b, __b, __b); })
    992 #define vdup_lane_u32(a, __b) __extension__ ({ \
    993   uint32x2_t __a = (a); \
    994   __builtin_shufflevector(__a, __a, __b, __b); })
    995 #define vdup_lane_s8(a, __b) __extension__ ({ \
    996   int8x8_t __a = (a); \
    997   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b); })
    998 #define vdup_lane_s16(a, __b) __extension__ ({ \
    999   int16x4_t __a = (a); \
   1000   __builtin_shufflevector(__a, __a, __b, __b, __b, __b); })
   1001 #define vdup_lane_s32(a, __b) __extension__ ({ \
   1002   int32x2_t __a = (a); \
   1003   __builtin_shufflevector(__a, __a, __b, __b); })
   1004 #define vdup_lane_p8(a, __b) __extension__ ({ \
   1005   poly8x8_t __a = (a); \
   1006   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b); })
   1007 #define vdup_lane_p16(a, __b) __extension__ ({ \
   1008   poly16x4_t __a = (a); \
   1009   __builtin_shufflevector(__a, __a, __b, __b, __b, __b); })
   1010 #define vdup_lane_f32(a, __b) __extension__ ({ \
   1011   float32x2_t __a = (a); \
   1012   __builtin_shufflevector(__a, __a, __b, __b); })
   1013 #define vdupq_lane_u8(a, __b) __extension__ ({ \
   1014   uint8x8_t __a = (a); \
   1015   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); })
   1016 #define vdupq_lane_u16(a, __b) __extension__ ({ \
   1017   uint16x4_t __a = (a); \
   1018   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b); })
   1019 #define vdupq_lane_u32(a, __b) __extension__ ({ \
   1020   uint32x2_t __a = (a); \
   1021   __builtin_shufflevector(__a, __a, __b, __b, __b, __b); })
   1022 #define vdupq_lane_s8(a, __b) __extension__ ({ \
   1023   int8x8_t __a = (a); \
   1024   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); })
   1025 #define vdupq_lane_s16(a, __b) __extension__ ({ \
   1026   int16x4_t __a = (a); \
   1027   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b); })
   1028 #define vdupq_lane_s32(a, __b) __extension__ ({ \
   1029   int32x2_t __a = (a); \
   1030   __builtin_shufflevector(__a, __a, __b, __b, __b, __b); })
   1031 #define vdupq_lane_p8(a, __b) __extension__ ({ \
   1032   poly8x8_t __a = (a); \
   1033   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); })
   1034 #define vdupq_lane_p16(a, __b) __extension__ ({ \
   1035   poly16x4_t __a = (a); \
   1036   __builtin_shufflevector(__a, __a, __b, __b, __b, __b, __b, __b, __b, __b); })
   1037 #define vdupq_lane_f32(a, __b) __extension__ ({ \
   1038   float32x2_t __a = (a); \
   1039   __builtin_shufflevector(__a, __a, __b, __b, __b, __b); })
   1040 #define vdup_lane_s64(a, __b) __extension__ ({ \
   1041   int64x1_t __a = (a); \
   1042   __builtin_shufflevector(__a, __a, __b); })
   1043 #define vdup_lane_u64(a, __b) __extension__ ({ \
   1044   uint64x1_t __a = (a); \
   1045   __builtin_shufflevector(__a, __a, __b); })
   1046 #define vdupq_lane_s64(a, __b) __extension__ ({ \
   1047   int64x1_t __a = (a); \
   1048   __builtin_shufflevector(__a, __a, __b, __b); })
   1049 #define vdupq_lane_u64(a, __b) __extension__ ({ \
   1050   uint64x1_t __a = (a); \
   1051   __builtin_shufflevector(__a, __a, __b, __b); })
   1052 
   1053 __ai uint8x8_t vdup_n_u8(uint8_t __a) {
   1054   return (uint8x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   1055 __ai uint16x4_t vdup_n_u16(uint16_t __a) {
   1056   return (uint16x4_t){ __a, __a, __a, __a }; }
   1057 __ai uint32x2_t vdup_n_u32(uint32_t __a) {
   1058   return (uint32x2_t){ __a, __a }; }
   1059 __ai int8x8_t vdup_n_s8(int8_t __a) {
   1060   return (int8x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   1061 __ai int16x4_t vdup_n_s16(int16_t __a) {
   1062   return (int16x4_t){ __a, __a, __a, __a }; }
   1063 __ai int32x2_t vdup_n_s32(int32_t __a) {
   1064   return (int32x2_t){ __a, __a }; }
   1065 __ai poly8x8_t vdup_n_p8(poly8_t __a) {
   1066   return (poly8x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   1067 __ai poly16x4_t vdup_n_p16(poly16_t __a) {
   1068   return (poly16x4_t){ __a, __a, __a, __a }; }
   1069 __ai float32x2_t vdup_n_f32(float32_t __a) {
   1070   return (float32x2_t){ __a, __a }; }
   1071 __ai uint8x16_t vdupq_n_u8(uint8_t __a) {
   1072   return (uint8x16_t){ __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a }; }
   1073 __ai uint16x8_t vdupq_n_u16(uint16_t __a) {
   1074   return (uint16x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   1075 __ai uint32x4_t vdupq_n_u32(uint32_t __a) {
   1076   return (uint32x4_t){ __a, __a, __a, __a }; }
   1077 __ai int8x16_t vdupq_n_s8(int8_t __a) {
   1078   return (int8x16_t){ __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a }; }
   1079 __ai int16x8_t vdupq_n_s16(int16_t __a) {
   1080   return (int16x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   1081 __ai int32x4_t vdupq_n_s32(int32_t __a) {
   1082   return (int32x4_t){ __a, __a, __a, __a }; }
   1083 __ai poly8x16_t vdupq_n_p8(poly8_t __a) {
   1084   return (poly8x16_t){ __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a }; }
   1085 __ai poly16x8_t vdupq_n_p16(poly16_t __a) {
   1086   return (poly16x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   1087 __ai float32x4_t vdupq_n_f32(float32_t __a) {
   1088   return (float32x4_t){ __a, __a, __a, __a }; }
   1089 __ai int64x1_t vdup_n_s64(int64_t __a) {
   1090   return (int64x1_t){ __a }; }
   1091 __ai uint64x1_t vdup_n_u64(uint64_t __a) {
   1092   return (uint64x1_t){ __a }; }
   1093 __ai int64x2_t vdupq_n_s64(int64_t __a) {
   1094   return (int64x2_t){ __a, __a }; }
   1095 __ai uint64x2_t vdupq_n_u64(uint64_t __a) {
   1096   return (uint64x2_t){ __a, __a }; }
   1097 
   1098 __ai int8x8_t veor_s8(int8x8_t __a, int8x8_t __b) {
   1099   return __a ^ __b; }
   1100 __ai int16x4_t veor_s16(int16x4_t __a, int16x4_t __b) {
   1101   return __a ^ __b; }
   1102 __ai int32x2_t veor_s32(int32x2_t __a, int32x2_t __b) {
   1103   return __a ^ __b; }
   1104 __ai int64x1_t veor_s64(int64x1_t __a, int64x1_t __b) {
   1105   return __a ^ __b; }
   1106 __ai uint8x8_t veor_u8(uint8x8_t __a, uint8x8_t __b) {
   1107   return __a ^ __b; }
   1108 __ai uint16x4_t veor_u16(uint16x4_t __a, uint16x4_t __b) {
   1109   return __a ^ __b; }
   1110 __ai uint32x2_t veor_u32(uint32x2_t __a, uint32x2_t __b) {
   1111   return __a ^ __b; }
   1112 __ai uint64x1_t veor_u64(uint64x1_t __a, uint64x1_t __b) {
   1113   return __a ^ __b; }
   1114 __ai int8x16_t veorq_s8(int8x16_t __a, int8x16_t __b) {
   1115   return __a ^ __b; }
   1116 __ai int16x8_t veorq_s16(int16x8_t __a, int16x8_t __b) {
   1117   return __a ^ __b; }
   1118 __ai int32x4_t veorq_s32(int32x4_t __a, int32x4_t __b) {
   1119   return __a ^ __b; }
   1120 __ai int64x2_t veorq_s64(int64x2_t __a, int64x2_t __b) {
   1121   return __a ^ __b; }
   1122 __ai uint8x16_t veorq_u8(uint8x16_t __a, uint8x16_t __b) {
   1123   return __a ^ __b; }
   1124 __ai uint16x8_t veorq_u16(uint16x8_t __a, uint16x8_t __b) {
   1125   return __a ^ __b; }
   1126 __ai uint32x4_t veorq_u32(uint32x4_t __a, uint32x4_t __b) {
   1127   return __a ^ __b; }
   1128 __ai uint64x2_t veorq_u64(uint64x2_t __a, uint64x2_t __b) {
   1129   return __a ^ __b; }
   1130 
   1131 #define vext_s8(a, b, __c) __extension__ ({ \
   1132   int8x8_t __a = (a); int8x8_t __b = (b); \
   1133   (int8x8_t)__builtin_neon_vext_v(__a, __b, __c, 0); })
   1134 #define vext_u8(a, b, __c) __extension__ ({ \
   1135   uint8x8_t __a = (a); uint8x8_t __b = (b); \
   1136   (uint8x8_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 16); })
   1137 #define vext_p8(a, b, __c) __extension__ ({ \
   1138   poly8x8_t __a = (a); poly8x8_t __b = (b); \
   1139   (poly8x8_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 4); })
   1140 #define vext_s16(a, b, __c) __extension__ ({ \
   1141   int16x4_t __a = (a); int16x4_t __b = (b); \
   1142   (int16x4_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 1); })
   1143 #define vext_u16(a, b, __c) __extension__ ({ \
   1144   uint16x4_t __a = (a); uint16x4_t __b = (b); \
   1145   (uint16x4_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 17); })
   1146 #define vext_p16(a, b, __c) __extension__ ({ \
   1147   poly16x4_t __a = (a); poly16x4_t __b = (b); \
   1148   (poly16x4_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 5); })
   1149 #define vext_s32(a, b, __c) __extension__ ({ \
   1150   int32x2_t __a = (a); int32x2_t __b = (b); \
   1151   (int32x2_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 2); })
   1152 #define vext_u32(a, b, __c) __extension__ ({ \
   1153   uint32x2_t __a = (a); uint32x2_t __b = (b); \
   1154   (uint32x2_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 18); })
   1155 #define vext_s64(a, b, __c) __extension__ ({ \
   1156   int64x1_t __a = (a); int64x1_t __b = (b); \
   1157   (int64x1_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 3); })
   1158 #define vext_u64(a, b, __c) __extension__ ({ \
   1159   uint64x1_t __a = (a); uint64x1_t __b = (b); \
   1160   (uint64x1_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 19); })
   1161 #define vext_f32(a, b, __c) __extension__ ({ \
   1162   float32x2_t __a = (a); float32x2_t __b = (b); \
   1163   (float32x2_t)__builtin_neon_vext_v((int8x8_t)__a, (int8x8_t)__b, __c, 7); })
   1164 #define vextq_s8(a, b, __c) __extension__ ({ \
   1165   int8x16_t __a = (a); int8x16_t __b = (b); \
   1166   (int8x16_t)__builtin_neon_vextq_v(__a, __b, __c, 32); })
   1167 #define vextq_u8(a, b, __c) __extension__ ({ \
   1168   uint8x16_t __a = (a); uint8x16_t __b = (b); \
   1169   (uint8x16_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 48); })
   1170 #define vextq_p8(a, b, __c) __extension__ ({ \
   1171   poly8x16_t __a = (a); poly8x16_t __b = (b); \
   1172   (poly8x16_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 36); })
   1173 #define vextq_s16(a, b, __c) __extension__ ({ \
   1174   int16x8_t __a = (a); int16x8_t __b = (b); \
   1175   (int16x8_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 33); })
   1176 #define vextq_u16(a, b, __c) __extension__ ({ \
   1177   uint16x8_t __a = (a); uint16x8_t __b = (b); \
   1178   (uint16x8_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 49); })
   1179 #define vextq_p16(a, b, __c) __extension__ ({ \
   1180   poly16x8_t __a = (a); poly16x8_t __b = (b); \
   1181   (poly16x8_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 37); })
   1182 #define vextq_s32(a, b, __c) __extension__ ({ \
   1183   int32x4_t __a = (a); int32x4_t __b = (b); \
   1184   (int32x4_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 34); })
   1185 #define vextq_u32(a, b, __c) __extension__ ({ \
   1186   uint32x4_t __a = (a); uint32x4_t __b = (b); \
   1187   (uint32x4_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 50); })
   1188 #define vextq_s64(a, b, __c) __extension__ ({ \
   1189   int64x2_t __a = (a); int64x2_t __b = (b); \
   1190   (int64x2_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 35); })
   1191 #define vextq_u64(a, b, __c) __extension__ ({ \
   1192   uint64x2_t __a = (a); uint64x2_t __b = (b); \
   1193   (uint64x2_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 51); })
   1194 #define vextq_f32(a, b, __c) __extension__ ({ \
   1195   float32x4_t __a = (a); float32x4_t __b = (b); \
   1196   (float32x4_t)__builtin_neon_vextq_v((int8x16_t)__a, (int8x16_t)__b, __c, 39); })
   1197 
   1198 __ai float32x2_t vfma_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c) {
   1199   return (float32x2_t)__builtin_neon_vfma_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 7); }
   1200 __ai float32x4_t vfmaq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c) {
   1201   return (float32x4_t)__builtin_neon_vfmaq_v((int8x16_t)__a, (int8x16_t)__b, (int8x16_t)__c, 39); }
   1202 
   1203 __ai int8x8_t vget_high_s8(int8x16_t __a) {
   1204   return (int8x8_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1205 __ai int16x4_t vget_high_s16(int16x8_t __a) {
   1206   return (int16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1207 __ai int32x2_t vget_high_s32(int32x4_t __a) {
   1208   return (int32x2_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1209 __ai int64x1_t vget_high_s64(int64x2_t __a) {
   1210   return (int64x1_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1211 __ai float16x4_t vget_high_f16(float16x8_t __a) {
   1212   return (float16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1213 __ai float32x2_t vget_high_f32(float32x4_t __a) {
   1214   return (float32x2_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1215 __ai uint8x8_t vget_high_u8(uint8x16_t __a) {
   1216   return (uint8x8_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1217 __ai uint16x4_t vget_high_u16(uint16x8_t __a) {
   1218   return (uint16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1219 __ai uint32x2_t vget_high_u32(uint32x4_t __a) {
   1220   return (uint32x2_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1221 __ai uint64x1_t vget_high_u64(uint64x2_t __a) {
   1222   return (uint64x1_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1223 __ai poly8x8_t vget_high_p8(poly8x16_t __a) {
   1224   return (poly8x8_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1225 __ai poly16x4_t vget_high_p16(poly16x8_t __a) {
   1226   return (poly16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 1); }
   1227 
   1228 #define vget_lane_u8(a, __b) __extension__ ({ \
   1229   uint8x8_t __a = (a); \
   1230   (uint8_t)__builtin_neon_vget_lane_i8((int8x8_t)__a, __b); })
   1231 #define vget_lane_u16(a, __b) __extension__ ({ \
   1232   uint16x4_t __a = (a); \
   1233   (uint16_t)__builtin_neon_vget_lane_i16((int16x4_t)__a, __b); })
   1234 #define vget_lane_u32(a, __b) __extension__ ({ \
   1235   uint32x2_t __a = (a); \
   1236   (uint32_t)__builtin_neon_vget_lane_i32((int32x2_t)__a, __b); })
   1237 #define vget_lane_s8(a, __b) __extension__ ({ \
   1238   int8x8_t __a = (a); \
   1239   (int8_t)__builtin_neon_vget_lane_i8(__a, __b); })
   1240 #define vget_lane_s16(a, __b) __extension__ ({ \
   1241   int16x4_t __a = (a); \
   1242   (int16_t)__builtin_neon_vget_lane_i16(__a, __b); })
   1243 #define vget_lane_s32(a, __b) __extension__ ({ \
   1244   int32x2_t __a = (a); \
   1245   (int32_t)__builtin_neon_vget_lane_i32(__a, __b); })
   1246 #define vget_lane_p8(a, __b) __extension__ ({ \
   1247   poly8x8_t __a = (a); \
   1248   (poly8_t)__builtin_neon_vget_lane_i8((int8x8_t)__a, __b); })
   1249 #define vget_lane_p16(a, __b) __extension__ ({ \
   1250   poly16x4_t __a = (a); \
   1251   (poly16_t)__builtin_neon_vget_lane_i16((int16x4_t)__a, __b); })
   1252 #define vget_lane_f32(a, __b) __extension__ ({ \
   1253   float32x2_t __a = (a); \
   1254   (float32_t)__builtin_neon_vget_lane_f32(__a, __b); })
   1255 #define vgetq_lane_u8(a, __b) __extension__ ({ \
   1256   uint8x16_t __a = (a); \
   1257   (uint8_t)__builtin_neon_vgetq_lane_i8((int8x16_t)__a, __b); })
   1258 #define vgetq_lane_u16(a, __b) __extension__ ({ \
   1259   uint16x8_t __a = (a); \
   1260   (uint16_t)__builtin_neon_vgetq_lane_i16((int16x8_t)__a, __b); })
   1261 #define vgetq_lane_u32(a, __b) __extension__ ({ \
   1262   uint32x4_t __a = (a); \
   1263   (uint32_t)__builtin_neon_vgetq_lane_i32((int32x4_t)__a, __b); })
   1264 #define vgetq_lane_s8(a, __b) __extension__ ({ \
   1265   int8x16_t __a = (a); \
   1266   (int8_t)__builtin_neon_vgetq_lane_i8(__a, __b); })
   1267 #define vgetq_lane_s16(a, __b) __extension__ ({ \
   1268   int16x8_t __a = (a); \
   1269   (int16_t)__builtin_neon_vgetq_lane_i16(__a, __b); })
   1270 #define vgetq_lane_s32(a, __b) __extension__ ({ \
   1271   int32x4_t __a = (a); \
   1272   (int32_t)__builtin_neon_vgetq_lane_i32(__a, __b); })
   1273 #define vgetq_lane_p8(a, __b) __extension__ ({ \
   1274   poly8x16_t __a = (a); \
   1275   (poly8_t)__builtin_neon_vgetq_lane_i8((int8x16_t)__a, __b); })
   1276 #define vgetq_lane_p16(a, __b) __extension__ ({ \
   1277   poly16x8_t __a = (a); \
   1278   (poly16_t)__builtin_neon_vgetq_lane_i16((int16x8_t)__a, __b); })
   1279 #define vgetq_lane_f32(a, __b) __extension__ ({ \
   1280   float32x4_t __a = (a); \
   1281   (float32_t)__builtin_neon_vgetq_lane_f32(__a, __b); })
   1282 #define vget_lane_s64(a, __b) __extension__ ({ \
   1283   int64x1_t __a = (a); \
   1284   (int64_t)__builtin_neon_vget_lane_i64(__a, __b); })
   1285 #define vget_lane_u64(a, __b) __extension__ ({ \
   1286   uint64x1_t __a = (a); \
   1287   (uint64_t)__builtin_neon_vget_lane_i64((int64x1_t)__a, __b); })
   1288 #define vgetq_lane_s64(a, __b) __extension__ ({ \
   1289   int64x2_t __a = (a); \
   1290   (int64_t)__builtin_neon_vgetq_lane_i64(__a, __b); })
   1291 #define vgetq_lane_u64(a, __b) __extension__ ({ \
   1292   uint64x2_t __a = (a); \
   1293   (uint64_t)__builtin_neon_vgetq_lane_i64((int64x2_t)__a, __b); })
   1294 
   1295 __ai int8x8_t vget_low_s8(int8x16_t __a) {
   1296   return (int8x8_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1297 __ai int16x4_t vget_low_s16(int16x8_t __a) {
   1298   return (int16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1299 __ai int32x2_t vget_low_s32(int32x4_t __a) {
   1300   return (int32x2_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1301 __ai int64x1_t vget_low_s64(int64x2_t __a) {
   1302   return (int64x1_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1303 __ai float16x4_t vget_low_f16(float16x8_t __a) {
   1304   return (float16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1305 __ai float32x2_t vget_low_f32(float32x4_t __a) {
   1306   return (float32x2_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1307 __ai uint8x8_t vget_low_u8(uint8x16_t __a) {
   1308   return (uint8x8_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1309 __ai uint16x4_t vget_low_u16(uint16x8_t __a) {
   1310   return (uint16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1311 __ai uint32x2_t vget_low_u32(uint32x4_t __a) {
   1312   return (uint32x2_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1313 __ai uint64x1_t vget_low_u64(uint64x2_t __a) {
   1314   return (uint64x1_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1315 __ai poly8x8_t vget_low_p8(poly8x16_t __a) {
   1316   return (poly8x8_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1317 __ai poly16x4_t vget_low_p16(poly16x8_t __a) {
   1318   return (poly16x4_t)__builtin_shufflevector((int64x2_t)__a, (int64x2_t)__a, 0); }
   1319 
   1320 __ai int8x8_t vhadd_s8(int8x8_t __a, int8x8_t __b) {
   1321   return (int8x8_t)__builtin_neon_vhadd_v(__a, __b, 0); }
   1322 __ai int16x4_t vhadd_s16(int16x4_t __a, int16x4_t __b) {
   1323   return (int16x4_t)__builtin_neon_vhadd_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   1324 __ai int32x2_t vhadd_s32(int32x2_t __a, int32x2_t __b) {
   1325   return (int32x2_t)__builtin_neon_vhadd_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   1326 __ai uint8x8_t vhadd_u8(uint8x8_t __a, uint8x8_t __b) {
   1327   return (uint8x8_t)__builtin_neon_vhadd_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   1328 __ai uint16x4_t vhadd_u16(uint16x4_t __a, uint16x4_t __b) {
   1329   return (uint16x4_t)__builtin_neon_vhadd_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   1330 __ai uint32x2_t vhadd_u32(uint32x2_t __a, uint32x2_t __b) {
   1331   return (uint32x2_t)__builtin_neon_vhadd_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   1332 __ai int8x16_t vhaddq_s8(int8x16_t __a, int8x16_t __b) {
   1333   return (int8x16_t)__builtin_neon_vhaddq_v(__a, __b, 32); }
   1334 __ai int16x8_t vhaddq_s16(int16x8_t __a, int16x8_t __b) {
   1335   return (int16x8_t)__builtin_neon_vhaddq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   1336 __ai int32x4_t vhaddq_s32(int32x4_t __a, int32x4_t __b) {
   1337   return (int32x4_t)__builtin_neon_vhaddq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   1338 __ai uint8x16_t vhaddq_u8(uint8x16_t __a, uint8x16_t __b) {
   1339   return (uint8x16_t)__builtin_neon_vhaddq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   1340 __ai uint16x8_t vhaddq_u16(uint16x8_t __a, uint16x8_t __b) {
   1341   return (uint16x8_t)__builtin_neon_vhaddq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   1342 __ai uint32x4_t vhaddq_u32(uint32x4_t __a, uint32x4_t __b) {
   1343   return (uint32x4_t)__builtin_neon_vhaddq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   1344 
   1345 __ai int8x8_t vhsub_s8(int8x8_t __a, int8x8_t __b) {
   1346   return (int8x8_t)__builtin_neon_vhsub_v(__a, __b, 0); }
   1347 __ai int16x4_t vhsub_s16(int16x4_t __a, int16x4_t __b) {
   1348   return (int16x4_t)__builtin_neon_vhsub_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   1349 __ai int32x2_t vhsub_s32(int32x2_t __a, int32x2_t __b) {
   1350   return (int32x2_t)__builtin_neon_vhsub_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   1351 __ai uint8x8_t vhsub_u8(uint8x8_t __a, uint8x8_t __b) {
   1352   return (uint8x8_t)__builtin_neon_vhsub_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   1353 __ai uint16x4_t vhsub_u16(uint16x4_t __a, uint16x4_t __b) {
   1354   return (uint16x4_t)__builtin_neon_vhsub_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   1355 __ai uint32x2_t vhsub_u32(uint32x2_t __a, uint32x2_t __b) {
   1356   return (uint32x2_t)__builtin_neon_vhsub_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   1357 __ai int8x16_t vhsubq_s8(int8x16_t __a, int8x16_t __b) {
   1358   return (int8x16_t)__builtin_neon_vhsubq_v(__a, __b, 32); }
   1359 __ai int16x8_t vhsubq_s16(int16x8_t __a, int16x8_t __b) {
   1360   return (int16x8_t)__builtin_neon_vhsubq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   1361 __ai int32x4_t vhsubq_s32(int32x4_t __a, int32x4_t __b) {
   1362   return (int32x4_t)__builtin_neon_vhsubq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   1363 __ai uint8x16_t vhsubq_u8(uint8x16_t __a, uint8x16_t __b) {
   1364   return (uint8x16_t)__builtin_neon_vhsubq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   1365 __ai uint16x8_t vhsubq_u16(uint16x8_t __a, uint16x8_t __b) {
   1366   return (uint16x8_t)__builtin_neon_vhsubq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   1367 __ai uint32x4_t vhsubq_u32(uint32x4_t __a, uint32x4_t __b) {
   1368   return (uint32x4_t)__builtin_neon_vhsubq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   1369 
   1370 #define vld1q_u8(__a) __extension__ ({ \
   1371   (uint8x16_t)__builtin_neon_vld1q_v(__a, 48); })
   1372 #define vld1q_u16(__a) __extension__ ({ \
   1373   (uint16x8_t)__builtin_neon_vld1q_v(__a, 49); })
   1374 #define vld1q_u32(__a) __extension__ ({ \
   1375   (uint32x4_t)__builtin_neon_vld1q_v(__a, 50); })
   1376 #define vld1q_u64(__a) __extension__ ({ \
   1377   (uint64x2_t)__builtin_neon_vld1q_v(__a, 51); })
   1378 #define vld1q_s8(__a) __extension__ ({ \
   1379   (int8x16_t)__builtin_neon_vld1q_v(__a, 32); })
   1380 #define vld1q_s16(__a) __extension__ ({ \
   1381   (int16x8_t)__builtin_neon_vld1q_v(__a, 33); })
   1382 #define vld1q_s32(__a) __extension__ ({ \
   1383   (int32x4_t)__builtin_neon_vld1q_v(__a, 34); })
   1384 #define vld1q_s64(__a) __extension__ ({ \
   1385   (int64x2_t)__builtin_neon_vld1q_v(__a, 35); })
   1386 #define vld1q_f16(__a) __extension__ ({ \
   1387   (float16x8_t)__builtin_neon_vld1q_v(__a, 38); })
   1388 #define vld1q_f32(__a) __extension__ ({ \
   1389   (float32x4_t)__builtin_neon_vld1q_v(__a, 39); })
   1390 #define vld1q_p8(__a) __extension__ ({ \
   1391   (poly8x16_t)__builtin_neon_vld1q_v(__a, 36); })
   1392 #define vld1q_p16(__a) __extension__ ({ \
   1393   (poly16x8_t)__builtin_neon_vld1q_v(__a, 37); })
   1394 #define vld1_u8(__a) __extension__ ({ \
   1395   (uint8x8_t)__builtin_neon_vld1_v(__a, 16); })
   1396 #define vld1_u16(__a) __extension__ ({ \
   1397   (uint16x4_t)__builtin_neon_vld1_v(__a, 17); })
   1398 #define vld1_u32(__a) __extension__ ({ \
   1399   (uint32x2_t)__builtin_neon_vld1_v(__a, 18); })
   1400 #define vld1_u64(__a) __extension__ ({ \
   1401   (uint64x1_t)__builtin_neon_vld1_v(__a, 19); })
   1402 #define vld1_s8(__a) __extension__ ({ \
   1403   (int8x8_t)__builtin_neon_vld1_v(__a, 0); })
   1404 #define vld1_s16(__a) __extension__ ({ \
   1405   (int16x4_t)__builtin_neon_vld1_v(__a, 1); })
   1406 #define vld1_s32(__a) __extension__ ({ \
   1407   (int32x2_t)__builtin_neon_vld1_v(__a, 2); })
   1408 #define vld1_s64(__a) __extension__ ({ \
   1409   (int64x1_t)__builtin_neon_vld1_v(__a, 3); })
   1410 #define vld1_f16(__a) __extension__ ({ \
   1411   (float16x4_t)__builtin_neon_vld1_v(__a, 6); })
   1412 #define vld1_f32(__a) __extension__ ({ \
   1413   (float32x2_t)__builtin_neon_vld1_v(__a, 7); })
   1414 #define vld1_p8(__a) __extension__ ({ \
   1415   (poly8x8_t)__builtin_neon_vld1_v(__a, 4); })
   1416 #define vld1_p16(__a) __extension__ ({ \
   1417   (poly16x4_t)__builtin_neon_vld1_v(__a, 5); })
   1418 
   1419 #define vld1q_dup_u8(__a) __extension__ ({ \
   1420   (uint8x16_t)__builtin_neon_vld1q_dup_v(__a, 48); })
   1421 #define vld1q_dup_u16(__a) __extension__ ({ \
   1422   (uint16x8_t)__builtin_neon_vld1q_dup_v(__a, 49); })
   1423 #define vld1q_dup_u32(__a) __extension__ ({ \
   1424   (uint32x4_t)__builtin_neon_vld1q_dup_v(__a, 50); })
   1425 #define vld1q_dup_u64(__a) __extension__ ({ \
   1426   (uint64x2_t)__builtin_neon_vld1q_dup_v(__a, 51); })
   1427 #define vld1q_dup_s8(__a) __extension__ ({ \
   1428   (int8x16_t)__builtin_neon_vld1q_dup_v(__a, 32); })
   1429 #define vld1q_dup_s16(__a) __extension__ ({ \
   1430   (int16x8_t)__builtin_neon_vld1q_dup_v(__a, 33); })
   1431 #define vld1q_dup_s32(__a) __extension__ ({ \
   1432   (int32x4_t)__builtin_neon_vld1q_dup_v(__a, 34); })
   1433 #define vld1q_dup_s64(__a) __extension__ ({ \
   1434   (int64x2_t)__builtin_neon_vld1q_dup_v(__a, 35); })
   1435 #define vld1q_dup_f16(__a) __extension__ ({ \
   1436   (float16x8_t)__builtin_neon_vld1q_dup_v(__a, 38); })
   1437 #define vld1q_dup_f32(__a) __extension__ ({ \
   1438   (float32x4_t)__builtin_neon_vld1q_dup_v(__a, 39); })
   1439 #define vld1q_dup_p8(__a) __extension__ ({ \
   1440   (poly8x16_t)__builtin_neon_vld1q_dup_v(__a, 36); })
   1441 #define vld1q_dup_p16(__a) __extension__ ({ \
   1442   (poly16x8_t)__builtin_neon_vld1q_dup_v(__a, 37); })
   1443 #define vld1_dup_u8(__a) __extension__ ({ \
   1444   (uint8x8_t)__builtin_neon_vld1_dup_v(__a, 16); })
   1445 #define vld1_dup_u16(__a) __extension__ ({ \
   1446   (uint16x4_t)__builtin_neon_vld1_dup_v(__a, 17); })
   1447 #define vld1_dup_u32(__a) __extension__ ({ \
   1448   (uint32x2_t)__builtin_neon_vld1_dup_v(__a, 18); })
   1449 #define vld1_dup_u64(__a) __extension__ ({ \
   1450   (uint64x1_t)__builtin_neon_vld1_dup_v(__a, 19); })
   1451 #define vld1_dup_s8(__a) __extension__ ({ \
   1452   (int8x8_t)__builtin_neon_vld1_dup_v(__a, 0); })
   1453 #define vld1_dup_s16(__a) __extension__ ({ \
   1454   (int16x4_t)__builtin_neon_vld1_dup_v(__a, 1); })
   1455 #define vld1_dup_s32(__a) __extension__ ({ \
   1456   (int32x2_t)__builtin_neon_vld1_dup_v(__a, 2); })
   1457 #define vld1_dup_s64(__a) __extension__ ({ \
   1458   (int64x1_t)__builtin_neon_vld1_dup_v(__a, 3); })
   1459 #define vld1_dup_f16(__a) __extension__ ({ \
   1460   (float16x4_t)__builtin_neon_vld1_dup_v(__a, 6); })
   1461 #define vld1_dup_f32(__a) __extension__ ({ \
   1462   (float32x2_t)__builtin_neon_vld1_dup_v(__a, 7); })
   1463 #define vld1_dup_p8(__a) __extension__ ({ \
   1464   (poly8x8_t)__builtin_neon_vld1_dup_v(__a, 4); })
   1465 #define vld1_dup_p16(__a) __extension__ ({ \
   1466   (poly16x4_t)__builtin_neon_vld1_dup_v(__a, 5); })
   1467 
   1468 #define vld1q_lane_u8(__a, b, __c) __extension__ ({ \
   1469   uint8x16_t __b = (b); \
   1470   (uint8x16_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 48); })
   1471 #define vld1q_lane_u16(__a, b, __c) __extension__ ({ \
   1472   uint16x8_t __b = (b); \
   1473   (uint16x8_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 49); })
   1474 #define vld1q_lane_u32(__a, b, __c) __extension__ ({ \
   1475   uint32x4_t __b = (b); \
   1476   (uint32x4_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 50); })
   1477 #define vld1q_lane_u64(__a, b, __c) __extension__ ({ \
   1478   uint64x2_t __b = (b); \
   1479   (uint64x2_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 51); })
   1480 #define vld1q_lane_s8(__a, b, __c) __extension__ ({ \
   1481   int8x16_t __b = (b); \
   1482   (int8x16_t)__builtin_neon_vld1q_lane_v(__a, __b, __c, 32); })
   1483 #define vld1q_lane_s16(__a, b, __c) __extension__ ({ \
   1484   int16x8_t __b = (b); \
   1485   (int16x8_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 33); })
   1486 #define vld1q_lane_s32(__a, b, __c) __extension__ ({ \
   1487   int32x4_t __b = (b); \
   1488   (int32x4_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 34); })
   1489 #define vld1q_lane_s64(__a, b, __c) __extension__ ({ \
   1490   int64x2_t __b = (b); \
   1491   (int64x2_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 35); })
   1492 #define vld1q_lane_f16(__a, b, __c) __extension__ ({ \
   1493   float16x8_t __b = (b); \
   1494   (float16x8_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 38); })
   1495 #define vld1q_lane_f32(__a, b, __c) __extension__ ({ \
   1496   float32x4_t __b = (b); \
   1497   (float32x4_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 39); })
   1498 #define vld1q_lane_p8(__a, b, __c) __extension__ ({ \
   1499   poly8x16_t __b = (b); \
   1500   (poly8x16_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 36); })
   1501 #define vld1q_lane_p16(__a, b, __c) __extension__ ({ \
   1502   poly16x8_t __b = (b); \
   1503   (poly16x8_t)__builtin_neon_vld1q_lane_v(__a, (int8x16_t)__b, __c, 37); })
   1504 #define vld1_lane_u8(__a, b, __c) __extension__ ({ \
   1505   uint8x8_t __b = (b); \
   1506   (uint8x8_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 16); })
   1507 #define vld1_lane_u16(__a, b, __c) __extension__ ({ \
   1508   uint16x4_t __b = (b); \
   1509   (uint16x4_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 17); })
   1510 #define vld1_lane_u32(__a, b, __c) __extension__ ({ \
   1511   uint32x2_t __b = (b); \
   1512   (uint32x2_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 18); })
   1513 #define vld1_lane_u64(__a, b, __c) __extension__ ({ \
   1514   uint64x1_t __b = (b); \
   1515   (uint64x1_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 19); })
   1516 #define vld1_lane_s8(__a, b, __c) __extension__ ({ \
   1517   int8x8_t __b = (b); \
   1518   (int8x8_t)__builtin_neon_vld1_lane_v(__a, __b, __c, 0); })
   1519 #define vld1_lane_s16(__a, b, __c) __extension__ ({ \
   1520   int16x4_t __b = (b); \
   1521   (int16x4_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 1); })
   1522 #define vld1_lane_s32(__a, b, __c) __extension__ ({ \
   1523   int32x2_t __b = (b); \
   1524   (int32x2_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 2); })
   1525 #define vld1_lane_s64(__a, b, __c) __extension__ ({ \
   1526   int64x1_t __b = (b); \
   1527   (int64x1_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 3); })
   1528 #define vld1_lane_f16(__a, b, __c) __extension__ ({ \
   1529   float16x4_t __b = (b); \
   1530   (float16x4_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 6); })
   1531 #define vld1_lane_f32(__a, b, __c) __extension__ ({ \
   1532   float32x2_t __b = (b); \
   1533   (float32x2_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 7); })
   1534 #define vld1_lane_p8(__a, b, __c) __extension__ ({ \
   1535   poly8x8_t __b = (b); \
   1536   (poly8x8_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 4); })
   1537 #define vld1_lane_p16(__a, b, __c) __extension__ ({ \
   1538   poly16x4_t __b = (b); \
   1539   (poly16x4_t)__builtin_neon_vld1_lane_v(__a, (int8x8_t)__b, __c, 5); })
   1540 
   1541 #define vld2q_u8(__a) __extension__ ({ \
   1542   uint8x16x2_t r; __builtin_neon_vld2q_v(&r, __a, 48); r; })
   1543 #define vld2q_u16(__a) __extension__ ({ \
   1544   uint16x8x2_t r; __builtin_neon_vld2q_v(&r, __a, 49); r; })
   1545 #define vld2q_u32(__a) __extension__ ({ \
   1546   uint32x4x2_t r; __builtin_neon_vld2q_v(&r, __a, 50); r; })
   1547 #define vld2q_s8(__a) __extension__ ({ \
   1548   int8x16x2_t r; __builtin_neon_vld2q_v(&r, __a, 32); r; })
   1549 #define vld2q_s16(__a) __extension__ ({ \
   1550   int16x8x2_t r; __builtin_neon_vld2q_v(&r, __a, 33); r; })
   1551 #define vld2q_s32(__a) __extension__ ({ \
   1552   int32x4x2_t r; __builtin_neon_vld2q_v(&r, __a, 34); r; })
   1553 #define vld2q_f16(__a) __extension__ ({ \
   1554   float16x8x2_t r; __builtin_neon_vld2q_v(&r, __a, 38); r; })
   1555 #define vld2q_f32(__a) __extension__ ({ \
   1556   float32x4x2_t r; __builtin_neon_vld2q_v(&r, __a, 39); r; })
   1557 #define vld2q_p8(__a) __extension__ ({ \
   1558   poly8x16x2_t r; __builtin_neon_vld2q_v(&r, __a, 36); r; })
   1559 #define vld2q_p16(__a) __extension__ ({ \
   1560   poly16x8x2_t r; __builtin_neon_vld2q_v(&r, __a, 37); r; })
   1561 #define vld2_u8(__a) __extension__ ({ \
   1562   uint8x8x2_t r; __builtin_neon_vld2_v(&r, __a, 16); r; })
   1563 #define vld2_u16(__a) __extension__ ({ \
   1564   uint16x4x2_t r; __builtin_neon_vld2_v(&r, __a, 17); r; })
   1565 #define vld2_u32(__a) __extension__ ({ \
   1566   uint32x2x2_t r; __builtin_neon_vld2_v(&r, __a, 18); r; })
   1567 #define vld2_u64(__a) __extension__ ({ \
   1568   uint64x1x2_t r; __builtin_neon_vld2_v(&r, __a, 19); r; })
   1569 #define vld2_s8(__a) __extension__ ({ \
   1570   int8x8x2_t r; __builtin_neon_vld2_v(&r, __a, 0); r; })
   1571 #define vld2_s16(__a) __extension__ ({ \
   1572   int16x4x2_t r; __builtin_neon_vld2_v(&r, __a, 1); r; })
   1573 #define vld2_s32(__a) __extension__ ({ \
   1574   int32x2x2_t r; __builtin_neon_vld2_v(&r, __a, 2); r; })
   1575 #define vld2_s64(__a) __extension__ ({ \
   1576   int64x1x2_t r; __builtin_neon_vld2_v(&r, __a, 3); r; })
   1577 #define vld2_f16(__a) __extension__ ({ \
   1578   float16x4x2_t r; __builtin_neon_vld2_v(&r, __a, 6); r; })
   1579 #define vld2_f32(__a) __extension__ ({ \
   1580   float32x2x2_t r; __builtin_neon_vld2_v(&r, __a, 7); r; })
   1581 #define vld2_p8(__a) __extension__ ({ \
   1582   poly8x8x2_t r; __builtin_neon_vld2_v(&r, __a, 4); r; })
   1583 #define vld2_p16(__a) __extension__ ({ \
   1584   poly16x4x2_t r; __builtin_neon_vld2_v(&r, __a, 5); r; })
   1585 
   1586 #define vld2_dup_u8(__a) __extension__ ({ \
   1587   uint8x8x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 16); r; })
   1588 #define vld2_dup_u16(__a) __extension__ ({ \
   1589   uint16x4x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 17); r; })
   1590 #define vld2_dup_u32(__a) __extension__ ({ \
   1591   uint32x2x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 18); r; })
   1592 #define vld2_dup_u64(__a) __extension__ ({ \
   1593   uint64x1x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 19); r; })
   1594 #define vld2_dup_s8(__a) __extension__ ({ \
   1595   int8x8x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 0); r; })
   1596 #define vld2_dup_s16(__a) __extension__ ({ \
   1597   int16x4x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 1); r; })
   1598 #define vld2_dup_s32(__a) __extension__ ({ \
   1599   int32x2x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 2); r; })
   1600 #define vld2_dup_s64(__a) __extension__ ({ \
   1601   int64x1x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 3); r; })
   1602 #define vld2_dup_f16(__a) __extension__ ({ \
   1603   float16x4x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 6); r; })
   1604 #define vld2_dup_f32(__a) __extension__ ({ \
   1605   float32x2x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 7); r; })
   1606 #define vld2_dup_p8(__a) __extension__ ({ \
   1607   poly8x8x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 4); r; })
   1608 #define vld2_dup_p16(__a) __extension__ ({ \
   1609   poly16x4x2_t r; __builtin_neon_vld2_dup_v(&r, __a, 5); r; })
   1610 
   1611 #define vld2q_lane_u16(__a, b, __c) __extension__ ({ \
   1612   uint16x8x2_t __b = (b); \
   1613   uint16x8x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 49); r; })
   1614 #define vld2q_lane_u32(__a, b, __c) __extension__ ({ \
   1615   uint32x4x2_t __b = (b); \
   1616   uint32x4x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 50); r; })
   1617 #define vld2q_lane_s16(__a, b, __c) __extension__ ({ \
   1618   int16x8x2_t __b = (b); \
   1619   int16x8x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 33); r; })
   1620 #define vld2q_lane_s32(__a, b, __c) __extension__ ({ \
   1621   int32x4x2_t __b = (b); \
   1622   int32x4x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 34); r; })
   1623 #define vld2q_lane_f16(__a, b, __c) __extension__ ({ \
   1624   float16x8x2_t __b = (b); \
   1625   float16x8x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 38); r; })
   1626 #define vld2q_lane_f32(__a, b, __c) __extension__ ({ \
   1627   float32x4x2_t __b = (b); \
   1628   float32x4x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 39); r; })
   1629 #define vld2q_lane_p16(__a, b, __c) __extension__ ({ \
   1630   poly16x8x2_t __b = (b); \
   1631   poly16x8x2_t r; __builtin_neon_vld2q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], __c, 37); r; })
   1632 #define vld2_lane_u8(__a, b, __c) __extension__ ({ \
   1633   uint8x8x2_t __b = (b); \
   1634   uint8x8x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 16); r; })
   1635 #define vld2_lane_u16(__a, b, __c) __extension__ ({ \
   1636   uint16x4x2_t __b = (b); \
   1637   uint16x4x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 17); r; })
   1638 #define vld2_lane_u32(__a, b, __c) __extension__ ({ \
   1639   uint32x2x2_t __b = (b); \
   1640   uint32x2x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 18); r; })
   1641 #define vld2_lane_s8(__a, b, __c) __extension__ ({ \
   1642   int8x8x2_t __b = (b); \
   1643   int8x8x2_t r; __builtin_neon_vld2_lane_v(&r, __a, __b.val[0], __b.val[1], __c, 0); r; })
   1644 #define vld2_lane_s16(__a, b, __c) __extension__ ({ \
   1645   int16x4x2_t __b = (b); \
   1646   int16x4x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 1); r; })
   1647 #define vld2_lane_s32(__a, b, __c) __extension__ ({ \
   1648   int32x2x2_t __b = (b); \
   1649   int32x2x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 2); r; })
   1650 #define vld2_lane_f16(__a, b, __c) __extension__ ({ \
   1651   float16x4x2_t __b = (b); \
   1652   float16x4x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 6); r; })
   1653 #define vld2_lane_f32(__a, b, __c) __extension__ ({ \
   1654   float32x2x2_t __b = (b); \
   1655   float32x2x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 7); r; })
   1656 #define vld2_lane_p8(__a, b, __c) __extension__ ({ \
   1657   poly8x8x2_t __b = (b); \
   1658   poly8x8x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 4); r; })
   1659 #define vld2_lane_p16(__a, b, __c) __extension__ ({ \
   1660   poly16x4x2_t __b = (b); \
   1661   poly16x4x2_t r; __builtin_neon_vld2_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 5); r; })
   1662 
   1663 #define vld3q_u8(__a) __extension__ ({ \
   1664   uint8x16x3_t r; __builtin_neon_vld3q_v(&r, __a, 48); r; })
   1665 #define vld3q_u16(__a) __extension__ ({ \
   1666   uint16x8x3_t r; __builtin_neon_vld3q_v(&r, __a, 49); r; })
   1667 #define vld3q_u32(__a) __extension__ ({ \
   1668   uint32x4x3_t r; __builtin_neon_vld3q_v(&r, __a, 50); r; })
   1669 #define vld3q_s8(__a) __extension__ ({ \
   1670   int8x16x3_t r; __builtin_neon_vld3q_v(&r, __a, 32); r; })
   1671 #define vld3q_s16(__a) __extension__ ({ \
   1672   int16x8x3_t r; __builtin_neon_vld3q_v(&r, __a, 33); r; })
   1673 #define vld3q_s32(__a) __extension__ ({ \
   1674   int32x4x3_t r; __builtin_neon_vld3q_v(&r, __a, 34); r; })
   1675 #define vld3q_f16(__a) __extension__ ({ \
   1676   float16x8x3_t r; __builtin_neon_vld3q_v(&r, __a, 38); r; })
   1677 #define vld3q_f32(__a) __extension__ ({ \
   1678   float32x4x3_t r; __builtin_neon_vld3q_v(&r, __a, 39); r; })
   1679 #define vld3q_p8(__a) __extension__ ({ \
   1680   poly8x16x3_t r; __builtin_neon_vld3q_v(&r, __a, 36); r; })
   1681 #define vld3q_p16(__a) __extension__ ({ \
   1682   poly16x8x3_t r; __builtin_neon_vld3q_v(&r, __a, 37); r; })
   1683 #define vld3_u8(__a) __extension__ ({ \
   1684   uint8x8x3_t r; __builtin_neon_vld3_v(&r, __a, 16); r; })
   1685 #define vld3_u16(__a) __extension__ ({ \
   1686   uint16x4x3_t r; __builtin_neon_vld3_v(&r, __a, 17); r; })
   1687 #define vld3_u32(__a) __extension__ ({ \
   1688   uint32x2x3_t r; __builtin_neon_vld3_v(&r, __a, 18); r; })
   1689 #define vld3_u64(__a) __extension__ ({ \
   1690   uint64x1x3_t r; __builtin_neon_vld3_v(&r, __a, 19); r; })
   1691 #define vld3_s8(__a) __extension__ ({ \
   1692   int8x8x3_t r; __builtin_neon_vld3_v(&r, __a, 0); r; })
   1693 #define vld3_s16(__a) __extension__ ({ \
   1694   int16x4x3_t r; __builtin_neon_vld3_v(&r, __a, 1); r; })
   1695 #define vld3_s32(__a) __extension__ ({ \
   1696   int32x2x3_t r; __builtin_neon_vld3_v(&r, __a, 2); r; })
   1697 #define vld3_s64(__a) __extension__ ({ \
   1698   int64x1x3_t r; __builtin_neon_vld3_v(&r, __a, 3); r; })
   1699 #define vld3_f16(__a) __extension__ ({ \
   1700   float16x4x3_t r; __builtin_neon_vld3_v(&r, __a, 6); r; })
   1701 #define vld3_f32(__a) __extension__ ({ \
   1702   float32x2x3_t r; __builtin_neon_vld3_v(&r, __a, 7); r; })
   1703 #define vld3_p8(__a) __extension__ ({ \
   1704   poly8x8x3_t r; __builtin_neon_vld3_v(&r, __a, 4); r; })
   1705 #define vld3_p16(__a) __extension__ ({ \
   1706   poly16x4x3_t r; __builtin_neon_vld3_v(&r, __a, 5); r; })
   1707 
   1708 #define vld3_dup_u8(__a) __extension__ ({ \
   1709   uint8x8x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 16); r; })
   1710 #define vld3_dup_u16(__a) __extension__ ({ \
   1711   uint16x4x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 17); r; })
   1712 #define vld3_dup_u32(__a) __extension__ ({ \
   1713   uint32x2x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 18); r; })
   1714 #define vld3_dup_u64(__a) __extension__ ({ \
   1715   uint64x1x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 19); r; })
   1716 #define vld3_dup_s8(__a) __extension__ ({ \
   1717   int8x8x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 0); r; })
   1718 #define vld3_dup_s16(__a) __extension__ ({ \
   1719   int16x4x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 1); r; })
   1720 #define vld3_dup_s32(__a) __extension__ ({ \
   1721   int32x2x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 2); r; })
   1722 #define vld3_dup_s64(__a) __extension__ ({ \
   1723   int64x1x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 3); r; })
   1724 #define vld3_dup_f16(__a) __extension__ ({ \
   1725   float16x4x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 6); r; })
   1726 #define vld3_dup_f32(__a) __extension__ ({ \
   1727   float32x2x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 7); r; })
   1728 #define vld3_dup_p8(__a) __extension__ ({ \
   1729   poly8x8x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 4); r; })
   1730 #define vld3_dup_p16(__a) __extension__ ({ \
   1731   poly16x4x3_t r; __builtin_neon_vld3_dup_v(&r, __a, 5); r; })
   1732 
   1733 #define vld3q_lane_u16(__a, b, __c) __extension__ ({ \
   1734   uint16x8x3_t __b = (b); \
   1735   uint16x8x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 49); r; })
   1736 #define vld3q_lane_u32(__a, b, __c) __extension__ ({ \
   1737   uint32x4x3_t __b = (b); \
   1738   uint32x4x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 50); r; })
   1739 #define vld3q_lane_s16(__a, b, __c) __extension__ ({ \
   1740   int16x8x3_t __b = (b); \
   1741   int16x8x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 33); r; })
   1742 #define vld3q_lane_s32(__a, b, __c) __extension__ ({ \
   1743   int32x4x3_t __b = (b); \
   1744   int32x4x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 34); r; })
   1745 #define vld3q_lane_f16(__a, b, __c) __extension__ ({ \
   1746   float16x8x3_t __b = (b); \
   1747   float16x8x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 38); r; })
   1748 #define vld3q_lane_f32(__a, b, __c) __extension__ ({ \
   1749   float32x4x3_t __b = (b); \
   1750   float32x4x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 39); r; })
   1751 #define vld3q_lane_p16(__a, b, __c) __extension__ ({ \
   1752   poly16x8x3_t __b = (b); \
   1753   poly16x8x3_t r; __builtin_neon_vld3q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 37); r; })
   1754 #define vld3_lane_u8(__a, b, __c) __extension__ ({ \
   1755   uint8x8x3_t __b = (b); \
   1756   uint8x8x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 16); r; })
   1757 #define vld3_lane_u16(__a, b, __c) __extension__ ({ \
   1758   uint16x4x3_t __b = (b); \
   1759   uint16x4x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 17); r; })
   1760 #define vld3_lane_u32(__a, b, __c) __extension__ ({ \
   1761   uint32x2x3_t __b = (b); \
   1762   uint32x2x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 18); r; })
   1763 #define vld3_lane_s8(__a, b, __c) __extension__ ({ \
   1764   int8x8x3_t __b = (b); \
   1765   int8x8x3_t r; __builtin_neon_vld3_lane_v(&r, __a, __b.val[0], __b.val[1], __b.val[2], __c, 0); r; })
   1766 #define vld3_lane_s16(__a, b, __c) __extension__ ({ \
   1767   int16x4x3_t __b = (b); \
   1768   int16x4x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 1); r; })
   1769 #define vld3_lane_s32(__a, b, __c) __extension__ ({ \
   1770   int32x2x3_t __b = (b); \
   1771   int32x2x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 2); r; })
   1772 #define vld3_lane_f16(__a, b, __c) __extension__ ({ \
   1773   float16x4x3_t __b = (b); \
   1774   float16x4x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 6); r; })
   1775 #define vld3_lane_f32(__a, b, __c) __extension__ ({ \
   1776   float32x2x3_t __b = (b); \
   1777   float32x2x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 7); r; })
   1778 #define vld3_lane_p8(__a, b, __c) __extension__ ({ \
   1779   poly8x8x3_t __b = (b); \
   1780   poly8x8x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 4); r; })
   1781 #define vld3_lane_p16(__a, b, __c) __extension__ ({ \
   1782   poly16x4x3_t __b = (b); \
   1783   poly16x4x3_t r; __builtin_neon_vld3_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 5); r; })
   1784 
   1785 #define vld4q_u8(__a) __extension__ ({ \
   1786   uint8x16x4_t r; __builtin_neon_vld4q_v(&r, __a, 48); r; })
   1787 #define vld4q_u16(__a) __extension__ ({ \
   1788   uint16x8x4_t r; __builtin_neon_vld4q_v(&r, __a, 49); r; })
   1789 #define vld4q_u32(__a) __extension__ ({ \
   1790   uint32x4x4_t r; __builtin_neon_vld4q_v(&r, __a, 50); r; })
   1791 #define vld4q_s8(__a) __extension__ ({ \
   1792   int8x16x4_t r; __builtin_neon_vld4q_v(&r, __a, 32); r; })
   1793 #define vld4q_s16(__a) __extension__ ({ \
   1794   int16x8x4_t r; __builtin_neon_vld4q_v(&r, __a, 33); r; })
   1795 #define vld4q_s32(__a) __extension__ ({ \
   1796   int32x4x4_t r; __builtin_neon_vld4q_v(&r, __a, 34); r; })
   1797 #define vld4q_f16(__a) __extension__ ({ \
   1798   float16x8x4_t r; __builtin_neon_vld4q_v(&r, __a, 38); r; })
   1799 #define vld4q_f32(__a) __extension__ ({ \
   1800   float32x4x4_t r; __builtin_neon_vld4q_v(&r, __a, 39); r; })
   1801 #define vld4q_p8(__a) __extension__ ({ \
   1802   poly8x16x4_t r; __builtin_neon_vld4q_v(&r, __a, 36); r; })
   1803 #define vld4q_p16(__a) __extension__ ({ \
   1804   poly16x8x4_t r; __builtin_neon_vld4q_v(&r, __a, 37); r; })
   1805 #define vld4_u8(__a) __extension__ ({ \
   1806   uint8x8x4_t r; __builtin_neon_vld4_v(&r, __a, 16); r; })
   1807 #define vld4_u16(__a) __extension__ ({ \
   1808   uint16x4x4_t r; __builtin_neon_vld4_v(&r, __a, 17); r; })
   1809 #define vld4_u32(__a) __extension__ ({ \
   1810   uint32x2x4_t r; __builtin_neon_vld4_v(&r, __a, 18); r; })
   1811 #define vld4_u64(__a) __extension__ ({ \
   1812   uint64x1x4_t r; __builtin_neon_vld4_v(&r, __a, 19); r; })
   1813 #define vld4_s8(__a) __extension__ ({ \
   1814   int8x8x4_t r; __builtin_neon_vld4_v(&r, __a, 0); r; })
   1815 #define vld4_s16(__a) __extension__ ({ \
   1816   int16x4x4_t r; __builtin_neon_vld4_v(&r, __a, 1); r; })
   1817 #define vld4_s32(__a) __extension__ ({ \
   1818   int32x2x4_t r; __builtin_neon_vld4_v(&r, __a, 2); r; })
   1819 #define vld4_s64(__a) __extension__ ({ \
   1820   int64x1x4_t r; __builtin_neon_vld4_v(&r, __a, 3); r; })
   1821 #define vld4_f16(__a) __extension__ ({ \
   1822   float16x4x4_t r; __builtin_neon_vld4_v(&r, __a, 6); r; })
   1823 #define vld4_f32(__a) __extension__ ({ \
   1824   float32x2x4_t r; __builtin_neon_vld4_v(&r, __a, 7); r; })
   1825 #define vld4_p8(__a) __extension__ ({ \
   1826   poly8x8x4_t r; __builtin_neon_vld4_v(&r, __a, 4); r; })
   1827 #define vld4_p16(__a) __extension__ ({ \
   1828   poly16x4x4_t r; __builtin_neon_vld4_v(&r, __a, 5); r; })
   1829 
   1830 #define vld4_dup_u8(__a) __extension__ ({ \
   1831   uint8x8x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 16); r; })
   1832 #define vld4_dup_u16(__a) __extension__ ({ \
   1833   uint16x4x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 17); r; })
   1834 #define vld4_dup_u32(__a) __extension__ ({ \
   1835   uint32x2x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 18); r; })
   1836 #define vld4_dup_u64(__a) __extension__ ({ \
   1837   uint64x1x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 19); r; })
   1838 #define vld4_dup_s8(__a) __extension__ ({ \
   1839   int8x8x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 0); r; })
   1840 #define vld4_dup_s16(__a) __extension__ ({ \
   1841   int16x4x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 1); r; })
   1842 #define vld4_dup_s32(__a) __extension__ ({ \
   1843   int32x2x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 2); r; })
   1844 #define vld4_dup_s64(__a) __extension__ ({ \
   1845   int64x1x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 3); r; })
   1846 #define vld4_dup_f16(__a) __extension__ ({ \
   1847   float16x4x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 6); r; })
   1848 #define vld4_dup_f32(__a) __extension__ ({ \
   1849   float32x2x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 7); r; })
   1850 #define vld4_dup_p8(__a) __extension__ ({ \
   1851   poly8x8x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 4); r; })
   1852 #define vld4_dup_p16(__a) __extension__ ({ \
   1853   poly16x4x4_t r; __builtin_neon_vld4_dup_v(&r, __a, 5); r; })
   1854 
   1855 #define vld4q_lane_u16(__a, b, __c) __extension__ ({ \
   1856   uint16x8x4_t __b = (b); \
   1857   uint16x8x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 49); r; })
   1858 #define vld4q_lane_u32(__a, b, __c) __extension__ ({ \
   1859   uint32x4x4_t __b = (b); \
   1860   uint32x4x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 50); r; })
   1861 #define vld4q_lane_s16(__a, b, __c) __extension__ ({ \
   1862   int16x8x4_t __b = (b); \
   1863   int16x8x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 33); r; })
   1864 #define vld4q_lane_s32(__a, b, __c) __extension__ ({ \
   1865   int32x4x4_t __b = (b); \
   1866   int32x4x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 34); r; })
   1867 #define vld4q_lane_f16(__a, b, __c) __extension__ ({ \
   1868   float16x8x4_t __b = (b); \
   1869   float16x8x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 38); r; })
   1870 #define vld4q_lane_f32(__a, b, __c) __extension__ ({ \
   1871   float32x4x4_t __b = (b); \
   1872   float32x4x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 39); r; })
   1873 #define vld4q_lane_p16(__a, b, __c) __extension__ ({ \
   1874   poly16x8x4_t __b = (b); \
   1875   poly16x8x4_t r; __builtin_neon_vld4q_lane_v(&r, __a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 37); r; })
   1876 #define vld4_lane_u8(__a, b, __c) __extension__ ({ \
   1877   uint8x8x4_t __b = (b); \
   1878   uint8x8x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 16); r; })
   1879 #define vld4_lane_u16(__a, b, __c) __extension__ ({ \
   1880   uint16x4x4_t __b = (b); \
   1881   uint16x4x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 17); r; })
   1882 #define vld4_lane_u32(__a, b, __c) __extension__ ({ \
   1883   uint32x2x4_t __b = (b); \
   1884   uint32x2x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 18); r; })
   1885 #define vld4_lane_s8(__a, b, __c) __extension__ ({ \
   1886   int8x8x4_t __b = (b); \
   1887   int8x8x4_t r; __builtin_neon_vld4_lane_v(&r, __a, __b.val[0], __b.val[1], __b.val[2], __b.val[3], __c, 0); r; })
   1888 #define vld4_lane_s16(__a, b, __c) __extension__ ({ \
   1889   int16x4x4_t __b = (b); \
   1890   int16x4x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 1); r; })
   1891 #define vld4_lane_s32(__a, b, __c) __extension__ ({ \
   1892   int32x2x4_t __b = (b); \
   1893   int32x2x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 2); r; })
   1894 #define vld4_lane_f16(__a, b, __c) __extension__ ({ \
   1895   float16x4x4_t __b = (b); \
   1896   float16x4x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 6); r; })
   1897 #define vld4_lane_f32(__a, b, __c) __extension__ ({ \
   1898   float32x2x4_t __b = (b); \
   1899   float32x2x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 7); r; })
   1900 #define vld4_lane_p8(__a, b, __c) __extension__ ({ \
   1901   poly8x8x4_t __b = (b); \
   1902   poly8x8x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 4); r; })
   1903 #define vld4_lane_p16(__a, b, __c) __extension__ ({ \
   1904   poly16x4x4_t __b = (b); \
   1905   poly16x4x4_t r; __builtin_neon_vld4_lane_v(&r, __a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 5); r; })
   1906 
   1907 __ai int8x8_t vmax_s8(int8x8_t __a, int8x8_t __b) {
   1908   return (int8x8_t)__builtin_neon_vmax_v(__a, __b, 0); }
   1909 __ai int16x4_t vmax_s16(int16x4_t __a, int16x4_t __b) {
   1910   return (int16x4_t)__builtin_neon_vmax_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   1911 __ai int32x2_t vmax_s32(int32x2_t __a, int32x2_t __b) {
   1912   return (int32x2_t)__builtin_neon_vmax_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   1913 __ai uint8x8_t vmax_u8(uint8x8_t __a, uint8x8_t __b) {
   1914   return (uint8x8_t)__builtin_neon_vmax_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   1915 __ai uint16x4_t vmax_u16(uint16x4_t __a, uint16x4_t __b) {
   1916   return (uint16x4_t)__builtin_neon_vmax_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   1917 __ai uint32x2_t vmax_u32(uint32x2_t __a, uint32x2_t __b) {
   1918   return (uint32x2_t)__builtin_neon_vmax_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   1919 __ai float32x2_t vmax_f32(float32x2_t __a, float32x2_t __b) {
   1920   return (float32x2_t)__builtin_neon_vmax_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   1921 __ai int8x16_t vmaxq_s8(int8x16_t __a, int8x16_t __b) {
   1922   return (int8x16_t)__builtin_neon_vmaxq_v(__a, __b, 32); }
   1923 __ai int16x8_t vmaxq_s16(int16x8_t __a, int16x8_t __b) {
   1924   return (int16x8_t)__builtin_neon_vmaxq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   1925 __ai int32x4_t vmaxq_s32(int32x4_t __a, int32x4_t __b) {
   1926   return (int32x4_t)__builtin_neon_vmaxq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   1927 __ai uint8x16_t vmaxq_u8(uint8x16_t __a, uint8x16_t __b) {
   1928   return (uint8x16_t)__builtin_neon_vmaxq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   1929 __ai uint16x8_t vmaxq_u16(uint16x8_t __a, uint16x8_t __b) {
   1930   return (uint16x8_t)__builtin_neon_vmaxq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   1931 __ai uint32x4_t vmaxq_u32(uint32x4_t __a, uint32x4_t __b) {
   1932   return (uint32x4_t)__builtin_neon_vmaxq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   1933 __ai float32x4_t vmaxq_f32(float32x4_t __a, float32x4_t __b) {
   1934   return (float32x4_t)__builtin_neon_vmaxq_v((int8x16_t)__a, (int8x16_t)__b, 39); }
   1935 
   1936 __ai int8x8_t vmin_s8(int8x8_t __a, int8x8_t __b) {
   1937   return (int8x8_t)__builtin_neon_vmin_v(__a, __b, 0); }
   1938 __ai int16x4_t vmin_s16(int16x4_t __a, int16x4_t __b) {
   1939   return (int16x4_t)__builtin_neon_vmin_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   1940 __ai int32x2_t vmin_s32(int32x2_t __a, int32x2_t __b) {
   1941   return (int32x2_t)__builtin_neon_vmin_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   1942 __ai uint8x8_t vmin_u8(uint8x8_t __a, uint8x8_t __b) {
   1943   return (uint8x8_t)__builtin_neon_vmin_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   1944 __ai uint16x4_t vmin_u16(uint16x4_t __a, uint16x4_t __b) {
   1945   return (uint16x4_t)__builtin_neon_vmin_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   1946 __ai uint32x2_t vmin_u32(uint32x2_t __a, uint32x2_t __b) {
   1947   return (uint32x2_t)__builtin_neon_vmin_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   1948 __ai float32x2_t vmin_f32(float32x2_t __a, float32x2_t __b) {
   1949   return (float32x2_t)__builtin_neon_vmin_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   1950 __ai int8x16_t vminq_s8(int8x16_t __a, int8x16_t __b) {
   1951   return (int8x16_t)__builtin_neon_vminq_v(__a, __b, 32); }
   1952 __ai int16x8_t vminq_s16(int16x8_t __a, int16x8_t __b) {
   1953   return (int16x8_t)__builtin_neon_vminq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   1954 __ai int32x4_t vminq_s32(int32x4_t __a, int32x4_t __b) {
   1955   return (int32x4_t)__builtin_neon_vminq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   1956 __ai uint8x16_t vminq_u8(uint8x16_t __a, uint8x16_t __b) {
   1957   return (uint8x16_t)__builtin_neon_vminq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   1958 __ai uint16x8_t vminq_u16(uint16x8_t __a, uint16x8_t __b) {
   1959   return (uint16x8_t)__builtin_neon_vminq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   1960 __ai uint32x4_t vminq_u32(uint32x4_t __a, uint32x4_t __b) {
   1961   return (uint32x4_t)__builtin_neon_vminq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   1962 __ai float32x4_t vminq_f32(float32x4_t __a, float32x4_t __b) {
   1963   return (float32x4_t)__builtin_neon_vminq_v((int8x16_t)__a, (int8x16_t)__b, 39); }
   1964 
   1965 __ai int8x8_t vmla_s8(int8x8_t __a, int8x8_t __b, int8x8_t __c) {
   1966   return __a + (__b * __c); }
   1967 __ai int16x4_t vmla_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) {
   1968   return __a + (__b * __c); }
   1969 __ai int32x2_t vmla_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) {
   1970   return __a + (__b * __c); }
   1971 __ai float32x2_t vmla_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c) {
   1972   return __a + (__b * __c); }
   1973 __ai uint8x8_t vmla_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) {
   1974   return __a + (__b * __c); }
   1975 __ai uint16x4_t vmla_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) {
   1976   return __a + (__b * __c); }
   1977 __ai uint32x2_t vmla_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) {
   1978   return __a + (__b * __c); }
   1979 __ai int8x16_t vmlaq_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) {
   1980   return __a + (__b * __c); }
   1981 __ai int16x8_t vmlaq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) {
   1982   return __a + (__b * __c); }
   1983 __ai int32x4_t vmlaq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) {
   1984   return __a + (__b * __c); }
   1985 __ai float32x4_t vmlaq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c) {
   1986   return __a + (__b * __c); }
   1987 __ai uint8x16_t vmlaq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) {
   1988   return __a + (__b * __c); }
   1989 __ai uint16x8_t vmlaq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) {
   1990   return __a + (__b * __c); }
   1991 __ai uint32x4_t vmlaq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) {
   1992   return __a + (__b * __c); }
   1993 
   1994 __ai int16x8_t vmlal_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c) {
   1995   return __a + vmull_s8(__b, __c); }
   1996 __ai int32x4_t vmlal_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
   1997   return __a + vmull_s16(__b, __c); }
   1998 __ai int64x2_t vmlal_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
   1999   return __a + vmull_s32(__b, __c); }
   2000 __ai uint16x8_t vmlal_u8(uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) {
   2001   return __a + vmull_u8(__b, __c); }
   2002 __ai uint32x4_t vmlal_u16(uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) {
   2003   return __a + vmull_u16(__b, __c); }
   2004 __ai uint64x2_t vmlal_u32(uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) {
   2005   return __a + vmull_u32(__b, __c); }
   2006 
   2007 #define vmlal_lane_s16(a, b, c, __d) __extension__ ({ \
   2008   int32x4_t __a = (a); int16x4_t __b = (b); int16x4_t __c = (c); \
   2009   __a + vmull_s16(__b, __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2010 #define vmlal_lane_s32(a, b, c, __d) __extension__ ({ \
   2011   int64x2_t __a = (a); int32x2_t __b = (b); int32x2_t __c = (c); \
   2012   __a + vmull_s32(__b, __builtin_shufflevector(__c, __c, __d, __d)); })
   2013 #define vmlal_lane_u16(a, b, c, __d) __extension__ ({ \
   2014   uint32x4_t __a = (a); uint16x4_t __b = (b); uint16x4_t __c = (c); \
   2015   __a + vmull_u16(__b, __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2016 #define vmlal_lane_u32(a, b, c, __d) __extension__ ({ \
   2017   uint64x2_t __a = (a); uint32x2_t __b = (b); uint32x2_t __c = (c); \
   2018   __a + vmull_u32(__b, __builtin_shufflevector(__c, __c, __d, __d)); })
   2019 
   2020 __ai int32x4_t vmlal_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
   2021   return __a + vmull_s16(__b, (int16x4_t){ __c, __c, __c, __c }); }
   2022 __ai int64x2_t vmlal_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
   2023   return __a + vmull_s32(__b, (int32x2_t){ __c, __c }); }
   2024 __ai uint32x4_t vmlal_n_u16(uint32x4_t __a, uint16x4_t __b, uint16_t __c) {
   2025   return __a + vmull_u16(__b, (uint16x4_t){ __c, __c, __c, __c }); }
   2026 __ai uint64x2_t vmlal_n_u32(uint64x2_t __a, uint32x2_t __b, uint32_t __c) {
   2027   return __a + vmull_u32(__b, (uint32x2_t){ __c, __c }); }
   2028 
   2029 #define vmla_lane_s16(a, b, c, __d) __extension__ ({ \
   2030   int16x4_t __a = (a); int16x4_t __b = (b); int16x4_t __c = (c); \
   2031   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2032 #define vmla_lane_s32(a, b, c, __d) __extension__ ({ \
   2033   int32x2_t __a = (a); int32x2_t __b = (b); int32x2_t __c = (c); \
   2034   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d)); })
   2035 #define vmla_lane_u16(a, b, c, __d) __extension__ ({ \
   2036   uint16x4_t __a = (a); uint16x4_t __b = (b); uint16x4_t __c = (c); \
   2037   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2038 #define vmla_lane_u32(a, b, c, __d) __extension__ ({ \
   2039   uint32x2_t __a = (a); uint32x2_t __b = (b); uint32x2_t __c = (c); \
   2040   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d)); })
   2041 #define vmla_lane_f32(a, b, c, __d) __extension__ ({ \
   2042   float32x2_t __a = (a); float32x2_t __b = (b); float32x2_t __c = (c); \
   2043   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d)); })
   2044 #define vmlaq_lane_s16(a, b, c, __d) __extension__ ({ \
   2045   int16x8_t __a = (a); int16x8_t __b = (b); int16x4_t __c = (c); \
   2046   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d, __d, __d, __d, __d)); })
   2047 #define vmlaq_lane_s32(a, b, c, __d) __extension__ ({ \
   2048   int32x4_t __a = (a); int32x4_t __b = (b); int32x2_t __c = (c); \
   2049   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2050 #define vmlaq_lane_u16(a, b, c, __d) __extension__ ({ \
   2051   uint16x8_t __a = (a); uint16x8_t __b = (b); uint16x4_t __c = (c); \
   2052   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d, __d, __d, __d, __d)); })
   2053 #define vmlaq_lane_u32(a, b, c, __d) __extension__ ({ \
   2054   uint32x4_t __a = (a); uint32x4_t __b = (b); uint32x2_t __c = (c); \
   2055   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2056 #define vmlaq_lane_f32(a, b, c, __d) __extension__ ({ \
   2057   float32x4_t __a = (a); float32x4_t __b = (b); float32x2_t __c = (c); \
   2058   __a + (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2059 
   2060 __ai int16x4_t vmla_n_s16(int16x4_t __a, int16x4_t __b, int16_t __c) {
   2061   return __a + (__b * (int16x4_t){ __c, __c, __c, __c }); }
   2062 __ai int32x2_t vmla_n_s32(int32x2_t __a, int32x2_t __b, int32_t __c) {
   2063   return __a + (__b * (int32x2_t){ __c, __c }); }
   2064 __ai uint16x4_t vmla_n_u16(uint16x4_t __a, uint16x4_t __b, uint16_t __c) {
   2065   return __a + (__b * (uint16x4_t){ __c, __c, __c, __c }); }
   2066 __ai uint32x2_t vmla_n_u32(uint32x2_t __a, uint32x2_t __b, uint32_t __c) {
   2067   return __a + (__b * (uint32x2_t){ __c, __c }); }
   2068 __ai float32x2_t vmla_n_f32(float32x2_t __a, float32x2_t __b, float32_t __c) {
   2069   return __a + (__b * (float32x2_t){ __c, __c }); }
   2070 __ai int16x8_t vmlaq_n_s16(int16x8_t __a, int16x8_t __b, int16_t __c) {
   2071   return __a + (__b * (int16x8_t){ __c, __c, __c, __c, __c, __c, __c, __c }); }
   2072 __ai int32x4_t vmlaq_n_s32(int32x4_t __a, int32x4_t __b, int32_t __c) {
   2073   return __a + (__b * (int32x4_t){ __c, __c, __c, __c }); }
   2074 __ai uint16x8_t vmlaq_n_u16(uint16x8_t __a, uint16x8_t __b, uint16_t __c) {
   2075   return __a + (__b * (uint16x8_t){ __c, __c, __c, __c, __c, __c, __c, __c }); }
   2076 __ai uint32x4_t vmlaq_n_u32(uint32x4_t __a, uint32x4_t __b, uint32_t __c) {
   2077   return __a + (__b * (uint32x4_t){ __c, __c, __c, __c }); }
   2078 __ai float32x4_t vmlaq_n_f32(float32x4_t __a, float32x4_t __b, float32_t __c) {
   2079   return __a + (__b * (float32x4_t){ __c, __c, __c, __c }); }
   2080 
   2081 __ai int8x8_t vmls_s8(int8x8_t __a, int8x8_t __b, int8x8_t __c) {
   2082   return __a - (__b * __c); }
   2083 __ai int16x4_t vmls_s16(int16x4_t __a, int16x4_t __b, int16x4_t __c) {
   2084   return __a - (__b * __c); }
   2085 __ai int32x2_t vmls_s32(int32x2_t __a, int32x2_t __b, int32x2_t __c) {
   2086   return __a - (__b * __c); }
   2087 __ai float32x2_t vmls_f32(float32x2_t __a, float32x2_t __b, float32x2_t __c) {
   2088   return __a - (__b * __c); }
   2089 __ai uint8x8_t vmls_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) {
   2090   return __a - (__b * __c); }
   2091 __ai uint16x4_t vmls_u16(uint16x4_t __a, uint16x4_t __b, uint16x4_t __c) {
   2092   return __a - (__b * __c); }
   2093 __ai uint32x2_t vmls_u32(uint32x2_t __a, uint32x2_t __b, uint32x2_t __c) {
   2094   return __a - (__b * __c); }
   2095 __ai int8x16_t vmlsq_s8(int8x16_t __a, int8x16_t __b, int8x16_t __c) {
   2096   return __a - (__b * __c); }
   2097 __ai int16x8_t vmlsq_s16(int16x8_t __a, int16x8_t __b, int16x8_t __c) {
   2098   return __a - (__b * __c); }
   2099 __ai int32x4_t vmlsq_s32(int32x4_t __a, int32x4_t __b, int32x4_t __c) {
   2100   return __a - (__b * __c); }
   2101 __ai float32x4_t vmlsq_f32(float32x4_t __a, float32x4_t __b, float32x4_t __c) {
   2102   return __a - (__b * __c); }
   2103 __ai uint8x16_t vmlsq_u8(uint8x16_t __a, uint8x16_t __b, uint8x16_t __c) {
   2104   return __a - (__b * __c); }
   2105 __ai uint16x8_t vmlsq_u16(uint16x8_t __a, uint16x8_t __b, uint16x8_t __c) {
   2106   return __a - (__b * __c); }
   2107 __ai uint32x4_t vmlsq_u32(uint32x4_t __a, uint32x4_t __b, uint32x4_t __c) {
   2108   return __a - (__b * __c); }
   2109 
   2110 __ai int16x8_t vmlsl_s8(int16x8_t __a, int8x8_t __b, int8x8_t __c) {
   2111   return __a - vmull_s8(__b, __c); }
   2112 __ai int32x4_t vmlsl_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
   2113   return __a - vmull_s16(__b, __c); }
   2114 __ai int64x2_t vmlsl_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
   2115   return __a - vmull_s32(__b, __c); }
   2116 __ai uint16x8_t vmlsl_u8(uint16x8_t __a, uint8x8_t __b, uint8x8_t __c) {
   2117   return __a - vmull_u8(__b, __c); }
   2118 __ai uint32x4_t vmlsl_u16(uint32x4_t __a, uint16x4_t __b, uint16x4_t __c) {
   2119   return __a - vmull_u16(__b, __c); }
   2120 __ai uint64x2_t vmlsl_u32(uint64x2_t __a, uint32x2_t __b, uint32x2_t __c) {
   2121   return __a - vmull_u32(__b, __c); }
   2122 
   2123 #define vmlsl_lane_s16(a, b, c, __d) __extension__ ({ \
   2124   int32x4_t __a = (a); int16x4_t __b = (b); int16x4_t __c = (c); \
   2125   __a - vmull_s16(__b, __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2126 #define vmlsl_lane_s32(a, b, c, __d) __extension__ ({ \
   2127   int64x2_t __a = (a); int32x2_t __b = (b); int32x2_t __c = (c); \
   2128   __a - vmull_s32(__b, __builtin_shufflevector(__c, __c, __d, __d)); })
   2129 #define vmlsl_lane_u16(a, b, c, __d) __extension__ ({ \
   2130   uint32x4_t __a = (a); uint16x4_t __b = (b); uint16x4_t __c = (c); \
   2131   __a - vmull_u16(__b, __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2132 #define vmlsl_lane_u32(a, b, c, __d) __extension__ ({ \
   2133   uint64x2_t __a = (a); uint32x2_t __b = (b); uint32x2_t __c = (c); \
   2134   __a - vmull_u32(__b, __builtin_shufflevector(__c, __c, __d, __d)); })
   2135 
   2136 __ai int32x4_t vmlsl_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
   2137   return __a - vmull_s16(__b, (int16x4_t){ __c, __c, __c, __c }); }
   2138 __ai int64x2_t vmlsl_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
   2139   return __a - vmull_s32(__b, (int32x2_t){ __c, __c }); }
   2140 __ai uint32x4_t vmlsl_n_u16(uint32x4_t __a, uint16x4_t __b, uint16_t __c) {
   2141   return __a - vmull_u16(__b, (uint16x4_t){ __c, __c, __c, __c }); }
   2142 __ai uint64x2_t vmlsl_n_u32(uint64x2_t __a, uint32x2_t __b, uint32_t __c) {
   2143   return __a - vmull_u32(__b, (uint32x2_t){ __c, __c }); }
   2144 
   2145 #define vmls_lane_s16(a, b, c, __d) __extension__ ({ \
   2146   int16x4_t __a = (a); int16x4_t __b = (b); int16x4_t __c = (c); \
   2147   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2148 #define vmls_lane_s32(a, b, c, __d) __extension__ ({ \
   2149   int32x2_t __a = (a); int32x2_t __b = (b); int32x2_t __c = (c); \
   2150   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d)); })
   2151 #define vmls_lane_u16(a, b, c, __d) __extension__ ({ \
   2152   uint16x4_t __a = (a); uint16x4_t __b = (b); uint16x4_t __c = (c); \
   2153   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2154 #define vmls_lane_u32(a, b, c, __d) __extension__ ({ \
   2155   uint32x2_t __a = (a); uint32x2_t __b = (b); uint32x2_t __c = (c); \
   2156   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d)); })
   2157 #define vmls_lane_f32(a, b, c, __d) __extension__ ({ \
   2158   float32x2_t __a = (a); float32x2_t __b = (b); float32x2_t __c = (c); \
   2159   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d)); })
   2160 #define vmlsq_lane_s16(a, b, c, __d) __extension__ ({ \
   2161   int16x8_t __a = (a); int16x8_t __b = (b); int16x4_t __c = (c); \
   2162   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d, __d, __d, __d, __d)); })
   2163 #define vmlsq_lane_s32(a, b, c, __d) __extension__ ({ \
   2164   int32x4_t __a = (a); int32x4_t __b = (b); int32x2_t __c = (c); \
   2165   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2166 #define vmlsq_lane_u16(a, b, c, __d) __extension__ ({ \
   2167   uint16x8_t __a = (a); uint16x8_t __b = (b); uint16x4_t __c = (c); \
   2168   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d, __d, __d, __d, __d)); })
   2169 #define vmlsq_lane_u32(a, b, c, __d) __extension__ ({ \
   2170   uint32x4_t __a = (a); uint32x4_t __b = (b); uint32x2_t __c = (c); \
   2171   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2172 #define vmlsq_lane_f32(a, b, c, __d) __extension__ ({ \
   2173   float32x4_t __a = (a); float32x4_t __b = (b); float32x2_t __c = (c); \
   2174   __a - (__b * __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2175 
   2176 __ai int16x4_t vmls_n_s16(int16x4_t __a, int16x4_t __b, int16_t __c) {
   2177   return __a - (__b * (int16x4_t){ __c, __c, __c, __c }); }
   2178 __ai int32x2_t vmls_n_s32(int32x2_t __a, int32x2_t __b, int32_t __c) {
   2179   return __a - (__b * (int32x2_t){ __c, __c }); }
   2180 __ai uint16x4_t vmls_n_u16(uint16x4_t __a, uint16x4_t __b, uint16_t __c) {
   2181   return __a - (__b * (uint16x4_t){ __c, __c, __c, __c }); }
   2182 __ai uint32x2_t vmls_n_u32(uint32x2_t __a, uint32x2_t __b, uint32_t __c) {
   2183   return __a - (__b * (uint32x2_t){ __c, __c }); }
   2184 __ai float32x2_t vmls_n_f32(float32x2_t __a, float32x2_t __b, float32_t __c) {
   2185   return __a - (__b * (float32x2_t){ __c, __c }); }
   2186 __ai int16x8_t vmlsq_n_s16(int16x8_t __a, int16x8_t __b, int16_t __c) {
   2187   return __a - (__b * (int16x8_t){ __c, __c, __c, __c, __c, __c, __c, __c }); }
   2188 __ai int32x4_t vmlsq_n_s32(int32x4_t __a, int32x4_t __b, int32_t __c) {
   2189   return __a - (__b * (int32x4_t){ __c, __c, __c, __c }); }
   2190 __ai uint16x8_t vmlsq_n_u16(uint16x8_t __a, uint16x8_t __b, uint16_t __c) {
   2191   return __a - (__b * (uint16x8_t){ __c, __c, __c, __c, __c, __c, __c, __c }); }
   2192 __ai uint32x4_t vmlsq_n_u32(uint32x4_t __a, uint32x4_t __b, uint32_t __c) {
   2193   return __a - (__b * (uint32x4_t){ __c, __c, __c, __c }); }
   2194 __ai float32x4_t vmlsq_n_f32(float32x4_t __a, float32x4_t __b, float32_t __c) {
   2195   return __a - (__b * (float32x4_t){ __c, __c, __c, __c }); }
   2196 
   2197 __ai int8x8_t vmovn_s16(int16x8_t __a) {
   2198   return (int8x8_t)__builtin_neon_vmovn_v((int8x16_t)__a, 0); }
   2199 __ai int16x4_t vmovn_s32(int32x4_t __a) {
   2200   return (int16x4_t)__builtin_neon_vmovn_v((int8x16_t)__a, 1); }
   2201 __ai int32x2_t vmovn_s64(int64x2_t __a) {
   2202   return (int32x2_t)__builtin_neon_vmovn_v((int8x16_t)__a, 2); }
   2203 __ai uint8x8_t vmovn_u16(uint16x8_t __a) {
   2204   return (uint8x8_t)__builtin_neon_vmovn_v((int8x16_t)__a, 16); }
   2205 __ai uint16x4_t vmovn_u32(uint32x4_t __a) {
   2206   return (uint16x4_t)__builtin_neon_vmovn_v((int8x16_t)__a, 17); }
   2207 __ai uint32x2_t vmovn_u64(uint64x2_t __a) {
   2208   return (uint32x2_t)__builtin_neon_vmovn_v((int8x16_t)__a, 18); }
   2209 
   /* vmov_n_<type> / vmovq_n_<type>: return a vector with the scalar __a in
    * every lane.  Each body is a compound literal of the return type that
    * lists __a once per lane (the "xN" in the return type name gives the
    * count: 1, 2, 4, 8 or 16).  No compiler builtin is called here.
    * __ai is a macro defined outside this excerpt. */
   2210 __ai uint8x8_t vmov_n_u8(uint8_t __a) {
   2211   return (uint8x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   2212 __ai uint16x4_t vmov_n_u16(uint16_t __a) {
   2213   return (uint16x4_t){ __a, __a, __a, __a }; }
   2214 __ai uint32x2_t vmov_n_u32(uint32_t __a) {
   2215   return (uint32x2_t){ __a, __a }; }
   2216 __ai int8x8_t vmov_n_s8(int8_t __a) {
   2217   return (int8x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   2218 __ai int16x4_t vmov_n_s16(int16_t __a) {
   2219   return (int16x4_t){ __a, __a, __a, __a }; }
   2220 __ai int32x2_t vmov_n_s32(int32_t __a) {
   2221   return (int32x2_t){ __a, __a }; }
   2222 __ai poly8x8_t vmov_n_p8(poly8_t __a) {
   2223   return (poly8x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   2224 __ai poly16x4_t vmov_n_p16(poly16_t __a) {
   2225   return (poly16x4_t){ __a, __a, __a, __a }; }
   2226 __ai float32x2_t vmov_n_f32(float32_t __a) {
   2227   return (float32x2_t){ __a, __a }; }
   /* q forms: same pattern, with twice as many initializers per literal. */
   2228 __ai uint8x16_t vmovq_n_u8(uint8_t __a) {
   2229   return (uint8x16_t){ __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a }; }
   2230 __ai uint16x8_t vmovq_n_u16(uint16_t __a) {
   2231   return (uint16x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   2232 __ai uint32x4_t vmovq_n_u32(uint32_t __a) {
   2233   return (uint32x4_t){ __a, __a, __a, __a }; }
   2234 __ai int8x16_t vmovq_n_s8(int8_t __a) {
   2235   return (int8x16_t){ __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a }; }
   2236 __ai int16x8_t vmovq_n_s16(int16_t __a) {
   2237   return (int16x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   2238 __ai int32x4_t vmovq_n_s32(int32_t __a) {
   2239   return (int32x4_t){ __a, __a, __a, __a }; }
   2240 __ai poly8x16_t vmovq_n_p8(poly8_t __a) {
   2241   return (poly8x16_t){ __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a, __a }; }
   2242 __ai poly16x8_t vmovq_n_p16(poly16_t __a) {
   2243   return (poly16x8_t){ __a, __a, __a, __a, __a, __a, __a, __a }; }
   2244 __ai float32x4_t vmovq_n_f32(float32_t __a) {
   2245   return (float32x4_t){ __a, __a, __a, __a }; }
   /* 64-bit element forms: one lane for the x1 types, two for the x2 types. */
   2246 __ai int64x1_t vmov_n_s64(int64_t __a) {
   2247   return (int64x1_t){ __a }; }
   2248 __ai uint64x1_t vmov_n_u64(uint64_t __a) {
   2249   return (uint64x1_t){ __a }; }
   2250 __ai int64x2_t vmovq_n_s64(int64_t __a) {
   2251   return (int64x2_t){ __a, __a }; }
   2252 __ai uint64x2_t vmovq_n_u64(uint64_t __a) {
   2253   return (uint64x2_t){ __a, __a }; }
   2254 
   /* vmul_<type> / vmulq_<type>: return __a * __b.  The product is written
    * with the plain `*` operator applied to the two vector operands, which
    * share one type with the result; no builtin is called.  Covers s8, s16,
    * s32, f32, u8, u16 and u32 element types in both the plain and q forms.
    * NOTE(review): lane-wise behaviour comes from the compiler's vector-type
    * support, not from anything visible in this file. */
   2255 __ai int8x8_t vmul_s8(int8x8_t __a, int8x8_t __b) {
   2256   return __a * __b; }
   2257 __ai int16x4_t vmul_s16(int16x4_t __a, int16x4_t __b) {
   2258   return __a * __b; }
   2259 __ai int32x2_t vmul_s32(int32x2_t __a, int32x2_t __b) {
   2260   return __a * __b; }
   2261 __ai float32x2_t vmul_f32(float32x2_t __a, float32x2_t __b) {
   2262   return __a * __b; }
   2263 __ai uint8x8_t vmul_u8(uint8x8_t __a, uint8x8_t __b) {
   2264   return __a * __b; }
   2265 __ai uint16x4_t vmul_u16(uint16x4_t __a, uint16x4_t __b) {
   2266   return __a * __b; }
   2267 __ai uint32x2_t vmul_u32(uint32x2_t __a, uint32x2_t __b) {
   2268   return __a * __b; }
   2269 __ai int8x16_t vmulq_s8(int8x16_t __a, int8x16_t __b) {
   2270   return __a * __b; }
   2271 __ai int16x8_t vmulq_s16(int16x8_t __a, int16x8_t __b) {
   2272   return __a * __b; }
   2273 __ai int32x4_t vmulq_s32(int32x4_t __a, int32x4_t __b) {
   2274   return __a * __b; }
   2275 __ai float32x4_t vmulq_f32(float32x4_t __a, float32x4_t __b) {
   2276   return __a * __b; }
   2277 __ai uint8x16_t vmulq_u8(uint8x16_t __a, uint8x16_t __b) {
   2278   return __a * __b; }
   2279 __ai uint16x8_t vmulq_u16(uint16x8_t __a, uint16x8_t __b) {
   2280   return __a * __b; }
   2281 __ai uint32x4_t vmulq_u32(uint32x4_t __a, uint32x4_t __b) {
   2282   return __a * __b; }
   2283 
   /* vmull_lane_<type>(a, b, __c): statement-expression macros.  a and b are
    * each evaluated once into the typed locals __a and __b.  __b is then
    * passed twice to __builtin_shufflevector with the index __c repeated once
    * per lane (4 times for the 16-bit forms, 2 times for the 32-bit forms),
    * and that shuffled vector is handed to vmull_<type> together with __a.
    * The value of the macro is the vmull_<type> result.
    * __c is pasted textually into the index list.
    * NOTE(review): presumably __c must be an integer constant expression, as
    * __builtin_shufflevector indices normally are - confirm in the Clang docs.
    * vmull_<type> is defined outside this excerpt. */
   2284 #define vmull_lane_s16(a, b, __c) __extension__ ({ \
   2285   int16x4_t __a = (a); int16x4_t __b = (b); \
   2286   vmull_s16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2287 #define vmull_lane_s32(a, b, __c) __extension__ ({ \
   2288   int32x2_t __a = (a); int32x2_t __b = (b); \
   2289   vmull_s32(__a, __builtin_shufflevector(__b, __b, __c, __c)); })
   2290 #define vmull_lane_u16(a, b, __c) __extension__ ({ \
   2291   uint16x4_t __a = (a); uint16x4_t __b = (b); \
   2292   vmull_u16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2293 #define vmull_lane_u32(a, b, __c) __extension__ ({ \
   2294   uint32x2_t __a = (a); uint32x2_t __b = (b); \
   2295   vmull_u32(__a, __builtin_shufflevector(__b, __b, __c, __c)); })
   2296 
   /* vmull_n_<type>: the scalar __b is expanded into a compound-literal
    * vector of __a's type with __b in every lane; both operands are then cast
    * to int8x8_t and passed to __builtin_neon_vmull_v.  The builtin's result
    * is cast to the return type, whose lanes are twice as wide as __a's.
    * NOTE(review): the trailing constant (34, 35, 50, 51) looks like a code
    * for the result vector type - 2/3 for 32/64-bit lanes, +16 for unsigned,
    * +32 for the q-sized result - judging by how it varies across this file;
    * confirm against the compiler's builtin definitions. */
   2297 __ai int32x4_t vmull_n_s16(int16x4_t __a, int16_t __b) {
   2298   return (int32x4_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)(int16x4_t){ __b, __b, __b, __b }, 34); }
   2299 __ai int64x2_t vmull_n_s32(int32x2_t __a, int32_t __b) {
   2300   return (int64x2_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)(int32x2_t){ __b, __b }, 35); }
   2301 __ai uint32x4_t vmull_n_u16(uint16x4_t __a, uint16_t __b) {
   2302   return (uint32x4_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)(uint16x4_t){ __b, __b, __b, __b }, 50); }
   2303 __ai uint64x2_t vmull_n_u32(uint32x2_t __a, uint32_t __b) {
   2304   return (uint64x2_t)__builtin_neon_vmull_v((int8x8_t)__a, (int8x8_t)(uint32x2_t){ __b, __b }, 51); }
   2305 
   /* vmul_p8 / vmulq_p8: multiply for the poly8 vector types.  These do not
    * use the `*` operator; they cast both operands to int8x8_t / int8x16_t,
    * call __builtin_neon_vmul_v / __builtin_neon_vmulq_v, and cast the result
    * back to the poly8 vector type.
    * NOTE(review): the constants 4 and 36 look like type codes for poly8 and
    * its q form (4 + 32) - confirm against the builtin definitions.  The
    * multiplication performed is whatever the builtin implements for that
    * code (presumably polynomial multiply); it is not visible here. */
   2306 __ai poly8x8_t vmul_p8(poly8x8_t __a, poly8x8_t __b) {
   2307   return (poly8x8_t)__builtin_neon_vmul_v((int8x8_t)__a, (int8x8_t)__b, 4); }
   2308 __ai poly8x16_t vmulq_p8(poly8x16_t __a, poly8x16_t __b) {
   2309   return (poly8x16_t)__builtin_neon_vmulq_v((int8x16_t)__a, (int8x16_t)__b, 36); }
   2310 
   /* vmul_lane_<type>(a, b, __c) / vmulq_lane_<type>(a, b, __c):
    * statement-expression macros whose value is __a * (shuffle of __b).
    * a and b are each evaluated once into the typed locals __a and __b.
    * __builtin_shufflevector(__b, __b, ...) is given the index __c repeated
    * once per lane of __a, so the shuffled operand has __a's lane count.
    * In the q forms __a is the wide type (x8 / x4) while __b keeps the narrow
    * type (x4 / x2); the index list is correspondingly 8 or 4 entries long.
    * __c is pasted textually into the index list.
    * NOTE(review): presumably __c must be an integer constant expression that
    * names a valid lane of __b - confirm in the Clang docs. */
   2311 #define vmul_lane_s16(a, b, __c) __extension__ ({ \
   2312   int16x4_t __a = (a); int16x4_t __b = (b); \
   2313   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c); })
   2314 #define vmul_lane_s32(a, b, __c) __extension__ ({ \
   2315   int32x2_t __a = (a); int32x2_t __b = (b); \
   2316   __a * __builtin_shufflevector(__b, __b, __c, __c); })
   2317 #define vmul_lane_f32(a, b, __c) __extension__ ({ \
   2318   float32x2_t __a = (a); float32x2_t __b = (b); \
   2319   __a * __builtin_shufflevector(__b, __b, __c, __c); })
   2320 #define vmul_lane_u16(a, b, __c) __extension__ ({ \
   2321   uint16x4_t __a = (a); uint16x4_t __b = (b); \
   2322   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c); })
   2323 #define vmul_lane_u32(a, b, __c) __extension__ ({ \
   2324   uint32x2_t __a = (a); uint32x2_t __b = (b); \
   2325   __a * __builtin_shufflevector(__b, __b, __c, __c); })
   2326 #define vmulq_lane_s16(a, b, __c) __extension__ ({ \
   2327   int16x8_t __a = (a); int16x4_t __b = (b); \
   2328   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c, __c, __c, __c, __c); })
   2329 #define vmulq_lane_s32(a, b, __c) __extension__ ({ \
   2330   int32x4_t __a = (a); int32x2_t __b = (b); \
   2331   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c); })
   2332 #define vmulq_lane_f32(a, b, __c) __extension__ ({ \
   2333   float32x4_t __a = (a); float32x2_t __b = (b); \
   2334   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c); })
   2335 #define vmulq_lane_u16(a, b, __c) __extension__ ({ \
   2336   uint16x8_t __a = (a); uint16x4_t __b = (b); \
   2337   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c, __c, __c, __c, __c); })
   2338 #define vmulq_lane_u32(a, b, __c) __extension__ ({ \
   2339   uint32x4_t __a = (a); uint32x2_t __b = (b); \
   2340   __a * __builtin_shufflevector(__b, __b, __c, __c, __c, __c); })
   2341 
   /* vmul_n_<type> / vmulq_n_<type>: multiply vector __a by scalar __b.
    * __b is expanded into a compound literal of __a's vector type with __b in
    * every lane, and the result is __a * that literal using the plain `*`
    * operator.  The return type equals __a's type.  No builtin is called. */
   2342 __ai int16x4_t vmul_n_s16(int16x4_t __a, int16_t __b) {
   2343   return __a * (int16x4_t){ __b, __b, __b, __b }; }
   2344 __ai int32x2_t vmul_n_s32(int32x2_t __a, int32_t __b) {
   2345   return __a * (int32x2_t){ __b, __b }; }
   2346 __ai float32x2_t vmul_n_f32(float32x2_t __a, float32_t __b) {
   2347   return __a * (float32x2_t){ __b, __b }; }
   2348 __ai uint16x4_t vmul_n_u16(uint16x4_t __a, uint16_t __b) {
   2349   return __a * (uint16x4_t){ __b, __b, __b, __b }; }
   2350 __ai uint32x2_t vmul_n_u32(uint32x2_t __a, uint32_t __b) {
   2351   return __a * (uint32x2_t){ __b, __b }; }
   2352 __ai int16x8_t vmulq_n_s16(int16x8_t __a, int16_t __b) {
   2353   return __a * (int16x8_t){ __b, __b, __b, __b, __b, __b, __b, __b }; }
   2354 __ai int32x4_t vmulq_n_s32(int32x4_t __a, int32_t __b) {
   2355   return __a * (int32x4_t){ __b, __b, __b, __b }; }
   2356 __ai float32x4_t vmulq_n_f32(float32x4_t __a, float32_t __b) {
   2357   return __a * (float32x4_t){ __b, __b, __b, __b }; }
   2358 __ai uint16x8_t vmulq_n_u16(uint16x8_t __a, uint16_t __b) {
   2359   return __a * (uint16x8_t){ __b, __b, __b, __b, __b, __b, __b, __b }; }
   2360 __ai uint32x4_t vmulq_n_u32(uint32x4_t __a, uint32_t __b) {
   2361   return __a * (uint32x4_t){ __b, __b, __b, __b }; }
   2362 
   /* vmvn_<type> / vmvnq_<type>: return ~__a, i.e. the unary `~` operator
    * applied directly to the vector operand.  Argument and return types are
    * identical.  Provided for s8/s16/s32, u8/u16/u32 and p8 in both the plain
    * and q forms; no builtin is called. */
   2363 __ai int8x8_t vmvn_s8(int8x8_t __a) {
   2364   return  ~__a; }
   2365 __ai int16x4_t vmvn_s16(int16x4_t __a) {
   2366   return  ~__a; }
   2367 __ai int32x2_t vmvn_s32(int32x2_t __a) {
   2368   return  ~__a; }
   2369 __ai uint8x8_t vmvn_u8(uint8x8_t __a) {
   2370   return  ~__a; }
   2371 __ai uint16x4_t vmvn_u16(uint16x4_t __a) {
   2372   return  ~__a; }
   2373 __ai uint32x2_t vmvn_u32(uint32x2_t __a) {
   2374   return  ~__a; }
   2375 __ai poly8x8_t vmvn_p8(poly8x8_t __a) {
   2376   return  ~__a; }
   2377 __ai int8x16_t vmvnq_s8(int8x16_t __a) {
   2378   return  ~__a; }
   2379 __ai int16x8_t vmvnq_s16(int16x8_t __a) {
   2380   return  ~__a; }
   2381 __ai int32x4_t vmvnq_s32(int32x4_t __a) {
   2382   return  ~__a; }
   2383 __ai uint8x16_t vmvnq_u8(uint8x16_t __a) {
   2384   return  ~__a; }
   2385 __ai uint16x8_t vmvnq_u16(uint16x8_t __a) {
   2386   return  ~__a; }
   2387 __ai uint32x4_t vmvnq_u32(uint32x4_t __a) {
   2388   return  ~__a; }
   2389 __ai poly8x16_t vmvnq_p8(poly8x16_t __a) {
   2390   return  ~__a; }
   2391 
   /* vneg_<type> / vnegq_<type>: return -__a, i.e. unary minus applied
    * directly to the vector operand.  Argument and return types are
    * identical.  Only signed integer (s8/s16/s32) and f32 forms are defined
    * here; no builtin is called. */
   2392 __ai int8x8_t vneg_s8(int8x8_t __a) {
   2393   return  -__a; }
   2394 __ai int16x4_t vneg_s16(int16x4_t __a) {
   2395   return  -__a; }
   2396 __ai int32x2_t vneg_s32(int32x2_t __a) {
   2397   return  -__a; }
   2398 __ai float32x2_t vneg_f32(float32x2_t __a) {
   2399   return  -__a; }
   2400 __ai int8x16_t vnegq_s8(int8x16_t __a) {
   2401   return  -__a; }
   2402 __ai int16x8_t vnegq_s16(int16x8_t __a) {
   2403   return  -__a; }
   2404 __ai int32x4_t vnegq_s32(int32x4_t __a) {
   2405   return  -__a; }
   2406 __ai float32x4_t vnegq_f32(float32x4_t __a) {
   2407   return  -__a; }
   2408 
   /* vorn_<type> / vornq_<type>: return __a | ~__b.  Only the second operand
    * is complemented; the expression uses the `|` and `~` operators directly
    * on the vector operands.  Both operands and the result share one type.
    * Defined for signed and unsigned 8/16/32/64-bit element types in both the
    * plain and q forms; no builtin is called. */
   2409 __ai int8x8_t vorn_s8(int8x8_t __a, int8x8_t __b) {
   2410   return __a | ~__b; }
   2411 __ai int16x4_t vorn_s16(int16x4_t __a, int16x4_t __b) {
   2412   return __a | ~__b; }
   2413 __ai int32x2_t vorn_s32(int32x2_t __a, int32x2_t __b) {
   2414   return __a | ~__b; }
   2415 __ai int64x1_t vorn_s64(int64x1_t __a, int64x1_t __b) {
   2416   return __a | ~__b; }
   2417 __ai uint8x8_t vorn_u8(uint8x8_t __a, uint8x8_t __b) {
   2418   return __a | ~__b; }
   2419 __ai uint16x4_t vorn_u16(uint16x4_t __a, uint16x4_t __b) {
   2420   return __a | ~__b; }
   2421 __ai uint32x2_t vorn_u32(uint32x2_t __a, uint32x2_t __b) {
   2422   return __a | ~__b; }
   2423 __ai uint64x1_t vorn_u64(uint64x1_t __a, uint64x1_t __b) {
   2424   return __a | ~__b; }
   2425 __ai int8x16_t vornq_s8(int8x16_t __a, int8x16_t __b) {
   2426   return __a | ~__b; }
   2427 __ai int16x8_t vornq_s16(int16x8_t __a, int16x8_t __b) {
   2428   return __a | ~__b; }
   2429 __ai int32x4_t vornq_s32(int32x4_t __a, int32x4_t __b) {
   2430   return __a | ~__b; }
   2431 __ai int64x2_t vornq_s64(int64x2_t __a, int64x2_t __b) {
   2432   return __a | ~__b; }
   2433 __ai uint8x16_t vornq_u8(uint8x16_t __a, uint8x16_t __b) {
   2434   return __a | ~__b; }
   2435 __ai uint16x8_t vornq_u16(uint16x8_t __a, uint16x8_t __b) {
   2436   return __a | ~__b; }
   2437 __ai uint32x4_t vornq_u32(uint32x4_t __a, uint32x4_t __b) {
   2438   return __a | ~__b; }
   2439 __ai uint64x2_t vornq_u64(uint64x2_t __a, uint64x2_t __b) {
   2440   return __a | ~__b; }
   2441 
   /* vorr_<type> / vorrq_<type>: return __a | __b, using the `|` operator
    * directly on the vector operands.  Both operands and the result share one
    * type.  Defined for signed and unsigned 8/16/32/64-bit element types in
    * both the plain and q forms; no builtin is called. */
   2442 __ai int8x8_t vorr_s8(int8x8_t __a, int8x8_t __b) {
   2443   return __a | __b; }
   2444 __ai int16x4_t vorr_s16(int16x4_t __a, int16x4_t __b) {
   2445   return __a | __b; }
   2446 __ai int32x2_t vorr_s32(int32x2_t __a, int32x2_t __b) {
   2447   return __a | __b; }
   2448 __ai int64x1_t vorr_s64(int64x1_t __a, int64x1_t __b) {
   2449   return __a | __b; }
   2450 __ai uint8x8_t vorr_u8(uint8x8_t __a, uint8x8_t __b) {
   2451   return __a | __b; }
   2452 __ai uint16x4_t vorr_u16(uint16x4_t __a, uint16x4_t __b) {
   2453   return __a | __b; }
   2454 __ai uint32x2_t vorr_u32(uint32x2_t __a, uint32x2_t __b) {
   2455   return __a | __b; }
   2456 __ai uint64x1_t vorr_u64(uint64x1_t __a, uint64x1_t __b) {
   2457   return __a | __b; }
   2458 __ai int8x16_t vorrq_s8(int8x16_t __a, int8x16_t __b) {
   2459   return __a | __b; }
   2460 __ai int16x8_t vorrq_s16(int16x8_t __a, int16x8_t __b) {
   2461   return __a | __b; }
   2462 __ai int32x4_t vorrq_s32(int32x4_t __a, int32x4_t __b) {
   2463   return __a | __b; }
   2464 __ai int64x2_t vorrq_s64(int64x2_t __a, int64x2_t __b) {
   2465   return __a | __b; }
   2466 __ai uint8x16_t vorrq_u8(uint8x16_t __a, uint8x16_t __b) {
   2467   return __a | __b; }
   2468 __ai uint16x8_t vorrq_u16(uint16x8_t __a, uint16x8_t __b) {
   2469   return __a | __b; }
   2470 __ai uint32x4_t vorrq_u32(uint32x4_t __a, uint32x4_t __b) {
   2471   return __a | __b; }
   2472 __ai uint64x2_t vorrq_u64(uint64x2_t __a, uint64x2_t __b) {
   2473   return __a | __b; }
   2474 
   /* vpadal_<type> / vpadalq_<type>: wrappers around __builtin_neon_vpadal_v
    * and __builtin_neon_vpadalq_v.  __a has the return type; __b has lanes
    * half as wide and twice as many.  Both operands are cast to int8x8_t
    * (plain) or int8x16_t (q) before the call - the s8 forms pass __b uncast
    * because it already has that type - and the builtin's result is cast back
    * to the return type.
    * NOTE(review): the trailing constant tracks the return type (1/2/3 for
    * 16/32/64-bit lanes, +16 unsigned, +32 for q) and looks like a type code;
    * confirm against the builtin definitions.  The arithmetic itself
    * (presumably pairwise add of __b accumulated into __a, per ARM naming) is
    * inside the builtin and not visible here. */
   2475 __ai int16x4_t vpadal_s8(int16x4_t __a, int8x8_t __b) {
   2476   return (int16x4_t)__builtin_neon_vpadal_v((int8x8_t)__a, __b, 1); }
   2477 __ai int32x2_t vpadal_s16(int32x2_t __a, int16x4_t __b) {
   2478   return (int32x2_t)__builtin_neon_vpadal_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2479 __ai int64x1_t vpadal_s32(int64x1_t __a, int32x2_t __b) {
   2480   return (int64x1_t)__builtin_neon_vpadal_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   2481 __ai uint16x4_t vpadal_u8(uint16x4_t __a, uint8x8_t __b) {
   2482   return (uint16x4_t)__builtin_neon_vpadal_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2483 __ai uint32x2_t vpadal_u16(uint32x2_t __a, uint16x4_t __b) {
   2484   return (uint32x2_t)__builtin_neon_vpadal_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2485 __ai uint64x1_t vpadal_u32(uint64x1_t __a, uint32x2_t __b) {
   2486   return (uint64x1_t)__builtin_neon_vpadal_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   2487 __ai int16x8_t vpadalq_s8(int16x8_t __a, int8x16_t __b) {
   2488   return (int16x8_t)__builtin_neon_vpadalq_v((int8x16_t)__a, __b, 33); }
   2489 __ai int32x4_t vpadalq_s16(int32x4_t __a, int16x8_t __b) {
   2490   return (int32x4_t)__builtin_neon_vpadalq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2491 __ai int64x2_t vpadalq_s32(int64x2_t __a, int32x4_t __b) {
   2492   return (int64x2_t)__builtin_neon_vpadalq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   2493 __ai uint16x8_t vpadalq_u8(uint16x8_t __a, uint8x16_t __b) {
   2494   return (uint16x8_t)__builtin_neon_vpadalq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   2495 __ai uint32x4_t vpadalq_u16(uint32x4_t __a, uint16x8_t __b) {
   2496   return (uint32x4_t)__builtin_neon_vpadalq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   2497 __ai uint64x2_t vpadalq_u32(uint64x2_t __a, uint32x4_t __b) {
   2498   return (uint64x2_t)__builtin_neon_vpadalq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   2499 
   /* vpadd_<type>: wrappers around __builtin_neon_vpadd_v.  Both operands and
    * the result share one vector type; operands are cast to int8x8_t (the s8
    * form passes them uncast) and the result is cast back.  Only non-q forms
    * are defined in this group.
    * NOTE(review): the trailing constant (0/1/2 signed, 16/17/18 unsigned,
    * 7 for f32) looks like an element-type code - confirm against the builtin
    * definitions.  The operation (presumably pairwise add, per ARM naming) is
    * inside the builtin and not visible here. */
   2500 __ai int8x8_t vpadd_s8(int8x8_t __a, int8x8_t __b) {
   2501   return (int8x8_t)__builtin_neon_vpadd_v(__a, __b, 0); }
   2502 __ai int16x4_t vpadd_s16(int16x4_t __a, int16x4_t __b) {
   2503   return (int16x4_t)__builtin_neon_vpadd_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2504 __ai int32x2_t vpadd_s32(int32x2_t __a, int32x2_t __b) {
   2505   return (int32x2_t)__builtin_neon_vpadd_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2506 __ai uint8x8_t vpadd_u8(uint8x8_t __a, uint8x8_t __b) {
   2507   return (uint8x8_t)__builtin_neon_vpadd_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   2508 __ai uint16x4_t vpadd_u16(uint16x4_t __a, uint16x4_t __b) {
   2509   return (uint16x4_t)__builtin_neon_vpadd_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2510 __ai uint32x2_t vpadd_u32(uint32x2_t __a, uint32x2_t __b) {
   2511   return (uint32x2_t)__builtin_neon_vpadd_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2512 __ai float32x2_t vpadd_f32(float32x2_t __a, float32x2_t __b) {
   2513   return (float32x2_t)__builtin_neon_vpadd_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   2514 
   /* vpaddl_<type> / vpaddlq_<type>: single-operand wrappers around
    * __builtin_neon_vpaddl_v and __builtin_neon_vpaddlq_v.  The return type
    * has lanes twice as wide as __a's and half as many.  __a is cast to
    * int8x8_t / int8x16_t (the s8 forms pass it uncast) and the result is
    * cast to the return type.
    * NOTE(review): the trailing constant tracks the return type (1/2/3 for
    * 16/32/64-bit lanes, +16 unsigned, +32 for q) and looks like a type code;
    * confirm against the builtin definitions.  The operation (presumably
    * pairwise widening add, per ARM naming) is inside the builtin. */
   2515 __ai int16x4_t vpaddl_s8(int8x8_t __a) {
   2516   return (int16x4_t)__builtin_neon_vpaddl_v(__a, 1); }
   2517 __ai int32x2_t vpaddl_s16(int16x4_t __a) {
   2518   return (int32x2_t)__builtin_neon_vpaddl_v((int8x8_t)__a, 2); }
   2519 __ai int64x1_t vpaddl_s32(int32x2_t __a) {
   2520   return (int64x1_t)__builtin_neon_vpaddl_v((int8x8_t)__a, 3); }
   2521 __ai uint16x4_t vpaddl_u8(uint8x8_t __a) {
   2522   return (uint16x4_t)__builtin_neon_vpaddl_v((int8x8_t)__a, 17); }
   2523 __ai uint32x2_t vpaddl_u16(uint16x4_t __a) {
   2524   return (uint32x2_t)__builtin_neon_vpaddl_v((int8x8_t)__a, 18); }
   2525 __ai uint64x1_t vpaddl_u32(uint32x2_t __a) {
   2526   return (uint64x1_t)__builtin_neon_vpaddl_v((int8x8_t)__a, 19); }
   2527 __ai int16x8_t vpaddlq_s8(int8x16_t __a) {
   2528   return (int16x8_t)__builtin_neon_vpaddlq_v(__a, 33); }
   2529 __ai int32x4_t vpaddlq_s16(int16x8_t __a) {
   2530   return (int32x4_t)__builtin_neon_vpaddlq_v((int8x16_t)__a, 34); }
   2531 __ai int64x2_t vpaddlq_s32(int32x4_t __a) {
   2532   return (int64x2_t)__builtin_neon_vpaddlq_v((int8x16_t)__a, 35); }
   2533 __ai uint16x8_t vpaddlq_u8(uint8x16_t __a) {
   2534   return (uint16x8_t)__builtin_neon_vpaddlq_v((int8x16_t)__a, 49); }
   2535 __ai uint32x4_t vpaddlq_u16(uint16x8_t __a) {
   2536   return (uint32x4_t)__builtin_neon_vpaddlq_v((int8x16_t)__a, 50); }
   2537 __ai uint64x2_t vpaddlq_u32(uint32x4_t __a) {
   2538   return (uint64x2_t)__builtin_neon_vpaddlq_v((int8x16_t)__a, 51); }
   2539 
   /* vpmax_<type>: wrappers around __builtin_neon_vpmax_v.  Both operands and
    * the result share one vector type; operands are cast to int8x8_t (the s8
    * form passes them uncast) and the result is cast back.  Only non-q forms
    * are defined in this group.
    * NOTE(review): the trailing constant (0/1/2 signed, 16/17/18 unsigned,
    * 7 for f32) looks like an element-type code - confirm against the builtin
    * definitions.  The operation (presumably pairwise maximum, per ARM
    * naming) is inside the builtin and not visible here. */
   2540 __ai int8x8_t vpmax_s8(int8x8_t __a, int8x8_t __b) {
   2541   return (int8x8_t)__builtin_neon_vpmax_v(__a, __b, 0); }
   2542 __ai int16x4_t vpmax_s16(int16x4_t __a, int16x4_t __b) {
   2543   return (int16x4_t)__builtin_neon_vpmax_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2544 __ai int32x2_t vpmax_s32(int32x2_t __a, int32x2_t __b) {
   2545   return (int32x2_t)__builtin_neon_vpmax_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2546 __ai uint8x8_t vpmax_u8(uint8x8_t __a, uint8x8_t __b) {
   2547   return (uint8x8_t)__builtin_neon_vpmax_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   2548 __ai uint16x4_t vpmax_u16(uint16x4_t __a, uint16x4_t __b) {
   2549   return (uint16x4_t)__builtin_neon_vpmax_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2550 __ai uint32x2_t vpmax_u32(uint32x2_t __a, uint32x2_t __b) {
   2551   return (uint32x2_t)__builtin_neon_vpmax_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2552 __ai float32x2_t vpmax_f32(float32x2_t __a, float32x2_t __b) {
   2553   return (float32x2_t)__builtin_neon_vpmax_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   2554 
   /* vpmin_<type>: wrappers around __builtin_neon_vpmin_v.  Both operands and
    * the result share one vector type; operands are cast to int8x8_t (the s8
    * form passes them uncast) and the result is cast back.  Only non-q forms
    * are defined in this group.
    * NOTE(review): the trailing constant (0/1/2 signed, 16/17/18 unsigned,
    * 7 for f32) looks like an element-type code - confirm against the builtin
    * definitions.  The operation (presumably pairwise minimum, per ARM
    * naming) is inside the builtin and not visible here. */
   2555 __ai int8x8_t vpmin_s8(int8x8_t __a, int8x8_t __b) {
   2556   return (int8x8_t)__builtin_neon_vpmin_v(__a, __b, 0); }
   2557 __ai int16x4_t vpmin_s16(int16x4_t __a, int16x4_t __b) {
   2558   return (int16x4_t)__builtin_neon_vpmin_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2559 __ai int32x2_t vpmin_s32(int32x2_t __a, int32x2_t __b) {
   2560   return (int32x2_t)__builtin_neon_vpmin_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2561 __ai uint8x8_t vpmin_u8(uint8x8_t __a, uint8x8_t __b) {
   2562   return (uint8x8_t)__builtin_neon_vpmin_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   2563 __ai uint16x4_t vpmin_u16(uint16x4_t __a, uint16x4_t __b) {
   2564   return (uint16x4_t)__builtin_neon_vpmin_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2565 __ai uint32x2_t vpmin_u32(uint32x2_t __a, uint32x2_t __b) {
   2566   return (uint32x2_t)__builtin_neon_vpmin_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2567 __ai float32x2_t vpmin_f32(float32x2_t __a, float32x2_t __b) {
   2568   return (float32x2_t)__builtin_neon_vpmin_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   2569 
   /* vqabs_<type> / vqabsq_<type>: single-operand wrappers around
    * __builtin_neon_vqabs_v and __builtin_neon_vqabsq_v, defined for signed
    * 8/16/32-bit element types only.  __a is cast to int8x8_t / int8x16_t
    * (the s8 forms pass it uncast) and the result is cast back to __a's type.
    * NOTE(review): the trailing constant (0/1/2, or 32/33/34 for q) looks
    * like a type code - confirm against the builtin definitions.  The
    * operation (presumably saturating absolute value, per ARM naming) is
    * inside the builtin and not visible here. */
   2570 __ai int8x8_t vqabs_s8(int8x8_t __a) {
   2571   return (int8x8_t)__builtin_neon_vqabs_v(__a, 0); }
   2572 __ai int16x4_t vqabs_s16(int16x4_t __a) {
   2573   return (int16x4_t)__builtin_neon_vqabs_v((int8x8_t)__a, 1); }
   2574 __ai int32x2_t vqabs_s32(int32x2_t __a) {
   2575   return (int32x2_t)__builtin_neon_vqabs_v((int8x8_t)__a, 2); }
   2576 __ai int8x16_t vqabsq_s8(int8x16_t __a) {
   2577   return (int8x16_t)__builtin_neon_vqabsq_v(__a, 32); }
   2578 __ai int16x8_t vqabsq_s16(int16x8_t __a) {
   2579   return (int16x8_t)__builtin_neon_vqabsq_v((int8x16_t)__a, 33); }
   2580 __ai int32x4_t vqabsq_s32(int32x4_t __a) {
   2581   return (int32x4_t)__builtin_neon_vqabsq_v((int8x16_t)__a, 34); }
   2582 
   /* vqadd_<type> / vqaddq_<type>: wrappers around __builtin_neon_vqadd_v and
    * __builtin_neon_vqaddq_v.  Both operands and the result share one vector
    * type; operands are cast to int8x8_t / int8x16_t (the s8 forms pass them
    * uncast) and the result is cast back.  Defined for signed and unsigned
    * 8/16/32/64-bit element types.
    * NOTE(review): the trailing constant (0..3 signed, 16..19 unsigned, +32
    * for q) looks like a type code - confirm against the builtin definitions.
    * The operation (presumably saturating add, per ARM naming) is inside the
    * builtin and not visible here. */
   2583 __ai int8x8_t vqadd_s8(int8x8_t __a, int8x8_t __b) {
   2584   return (int8x8_t)__builtin_neon_vqadd_v(__a, __b, 0); }
   2585 __ai int16x4_t vqadd_s16(int16x4_t __a, int16x4_t __b) {
   2586   return (int16x4_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2587 __ai int32x2_t vqadd_s32(int32x2_t __a, int32x2_t __b) {
   2588   return (int32x2_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2589 __ai int64x1_t vqadd_s64(int64x1_t __a, int64x1_t __b) {
   2590   return (int64x1_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   2591 __ai uint8x8_t vqadd_u8(uint8x8_t __a, uint8x8_t __b) {
   2592   return (uint8x8_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   2593 __ai uint16x4_t vqadd_u16(uint16x4_t __a, uint16x4_t __b) {
   2594   return (uint16x4_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2595 __ai uint32x2_t vqadd_u32(uint32x2_t __a, uint32x2_t __b) {
   2596   return (uint32x2_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2597 __ai uint64x1_t vqadd_u64(uint64x1_t __a, uint64x1_t __b) {
   2598   return (uint64x1_t)__builtin_neon_vqadd_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   2599 __ai int8x16_t vqaddq_s8(int8x16_t __a, int8x16_t __b) {
   2600   return (int8x16_t)__builtin_neon_vqaddq_v(__a, __b, 32); }
   2601 __ai int16x8_t vqaddq_s16(int16x8_t __a, int16x8_t __b) {
   2602   return (int16x8_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   2603 __ai int32x4_t vqaddq_s32(int32x4_t __a, int32x4_t __b) {
   2604   return (int32x4_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2605 __ai int64x2_t vqaddq_s64(int64x2_t __a, int64x2_t __b) {
   2606   return (int64x2_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   2607 __ai uint8x16_t vqaddq_u8(uint8x16_t __a, uint8x16_t __b) {
   2608   return (uint8x16_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   2609 __ai uint16x8_t vqaddq_u16(uint16x8_t __a, uint16x8_t __b) {
   2610   return (uint16x8_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   2611 __ai uint32x4_t vqaddq_u32(uint32x4_t __a, uint32x4_t __b) {
   2612   return (uint32x4_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   2613 __ai uint64x2_t vqaddq_u64(uint64x2_t __a, uint64x2_t __b) {
   2614   return (uint64x2_t)__builtin_neon_vqaddq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   2615 
   /* vqdmlal_<type>: three-operand wrappers around __builtin_neon_vqdmlal_v.
    * __a has the (wide, q-sized) return type and is cast to int8x16_t; __b
    * and __c have lanes half as wide and are cast to int8x8_t.  The builtin's
    * result is cast back to the return type.
    * NOTE(review): the constants 34 / 35 match the int32x4_t / int64x2_t
    * result and look like type codes - confirm against the builtin
    * definitions.  The arithmetic (presumably saturating doubling
    * multiply-accumulate long, per ARM naming) is inside the builtin. */
   2616 __ai int32x4_t vqdmlal_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
   2617   return (int32x4_t)__builtin_neon_vqdmlal_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)__c, 34); }
   2618 __ai int64x2_t vqdmlal_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
   2619   return (int64x2_t)__builtin_neon_vqdmlal_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)__c, 35); }
   2620 
   /* vqdmlal_lane_<type>(a, b, c, __d): statement-expression macros.  a, b
    * and c are each evaluated once into the typed locals __a, __b and __c.
    * __c is passed twice to __builtin_shufflevector with the index __d
    * repeated once per lane, and the shuffled vector becomes the third
    * argument of vqdmlal_<type>(__a, __b, ...), whose result is the macro's
    * value.  __d is pasted textually into the index list.
    * NOTE(review): presumably __d must be an integer constant expression
    * naming a valid lane of __c - confirm in the Clang docs. */
   2621 #define vqdmlal_lane_s16(a, b, c, __d) __extension__ ({ \
   2622   int32x4_t __a = (a); int16x4_t __b = (b); int16x4_t __c = (c); \
   2623   vqdmlal_s16(__a, __b, __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2624 #define vqdmlal_lane_s32(a, b, c, __d) __extension__ ({ \
   2625   int64x2_t __a = (a); int32x2_t __b = (b); int32x2_t __c = (c); \
   2626   vqdmlal_s32(__a, __b, __builtin_shufflevector(__c, __c, __d, __d)); })
   2627 
   /* vqdmlal_n_<type>: same builtin call as vqdmlal_<type>, but the third
    * operand is built from the scalar __c: a compound literal of __b's vector
    * type with __c in every lane, cast to int8x8_t.  __a is cast to
    * int8x16_t, __b to int8x8_t, and the result is cast to the return type.
    * NOTE(review): 34 / 35 look like type codes for the int32x4_t /
    * int64x2_t result - confirm against the builtin definitions. */
   2628 __ai int32x4_t vqdmlal_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
   2629   return (int32x4_t)__builtin_neon_vqdmlal_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)(int16x4_t){ __c, __c, __c, __c }, 34); }
   2630 __ai int64x2_t vqdmlal_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
   2631   return (int64x2_t)__builtin_neon_vqdmlal_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)(int32x2_t){ __c, __c }, 35); }
   2632 
   /* vqdmlsl_<type>: three-operand wrappers around __builtin_neon_vqdmlsl_v.
    * __a has the (wide, q-sized) return type and is cast to int8x16_t; __b
    * and __c have lanes half as wide and are cast to int8x8_t.  The builtin's
    * result is cast back to the return type.
    * NOTE(review): the constants 34 / 35 match the int32x4_t / int64x2_t
    * result and look like type codes - confirm against the builtin
    * definitions.  The arithmetic (presumably saturating doubling
    * multiply-subtract long, per ARM naming) is inside the builtin. */
   2633 __ai int32x4_t vqdmlsl_s16(int32x4_t __a, int16x4_t __b, int16x4_t __c) {
   2634   return (int32x4_t)__builtin_neon_vqdmlsl_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)__c, 34); }
   2635 __ai int64x2_t vqdmlsl_s32(int64x2_t __a, int32x2_t __b, int32x2_t __c) {
   2636   return (int64x2_t)__builtin_neon_vqdmlsl_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)__c, 35); }
   2637 
   /* vqdmlsl_lane_<type>(a, b, c, __d): statement-expression macros.  a, b
    * and c are each evaluated once into the typed locals __a, __b and __c.
    * __c is passed twice to __builtin_shufflevector with the index __d
    * repeated once per lane, and the shuffled vector becomes the third
    * argument of vqdmlsl_<type>(__a, __b, ...), whose result is the macro's
    * value.  __d is pasted textually into the index list.
    * NOTE(review): presumably __d must be an integer constant expression
    * naming a valid lane of __c - confirm in the Clang docs. */
   2638 #define vqdmlsl_lane_s16(a, b, c, __d) __extension__ ({ \
   2639   int32x4_t __a = (a); int16x4_t __b = (b); int16x4_t __c = (c); \
   2640   vqdmlsl_s16(__a, __b, __builtin_shufflevector(__c, __c, __d, __d, __d, __d)); })
   2641 #define vqdmlsl_lane_s32(a, b, c, __d) __extension__ ({ \
   2642   int64x2_t __a = (a); int32x2_t __b = (b); int32x2_t __c = (c); \
   2643   vqdmlsl_s32(__a, __b, __builtin_shufflevector(__c, __c, __d, __d)); })
   2644 
   /* vqdmlsl_n_<type>: same builtin call as vqdmlsl_<type>, but the third
    * operand is built from the scalar __c: a compound literal of __b's vector
    * type with __c in every lane, cast to int8x8_t.  __a is cast to
    * int8x16_t, __b to int8x8_t, and the result is cast to the return type.
    * NOTE(review): 34 / 35 look like type codes for the int32x4_t /
    * int64x2_t result - confirm against the builtin definitions. */
   2645 __ai int32x4_t vqdmlsl_n_s16(int32x4_t __a, int16x4_t __b, int16_t __c) {
   2646   return (int32x4_t)__builtin_neon_vqdmlsl_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)(int16x4_t){ __c, __c, __c, __c }, 34); }
   2647 __ai int64x2_t vqdmlsl_n_s32(int64x2_t __a, int32x2_t __b, int32_t __c) {
   2648   return (int64x2_t)__builtin_neon_vqdmlsl_v((int8x16_t)__a, (int8x8_t)__b, (int8x8_t)(int32x2_t){ __c, __c }, 35); }
   2649 
   /* vqdmulh_<type> / vqdmulhq_<type>: wrappers around
    * __builtin_neon_vqdmulh_v and __builtin_neon_vqdmulhq_v for signed 16-
    * and 32-bit element types.  Both operands and the result share one vector
    * type; operands are cast to int8x8_t / int8x16_t and the result is cast
    * back.
    * NOTE(review): the constants 1/2 (plain) and 33/34 (q) look like type
    * codes - confirm against the builtin definitions.  The arithmetic
    * (presumably saturating doubling multiply returning the high half, per
    * ARM naming) is inside the builtin and not visible here. */
   2650 __ai int16x4_t vqdmulh_s16(int16x4_t __a, int16x4_t __b) {
   2651   return (int16x4_t)__builtin_neon_vqdmulh_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2652 __ai int32x2_t vqdmulh_s32(int32x2_t __a, int32x2_t __b) {
   2653   return (int32x2_t)__builtin_neon_vqdmulh_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2654 __ai int16x8_t vqdmulhq_s16(int16x8_t __a, int16x8_t __b) {
   2655   return (int16x8_t)__builtin_neon_vqdmulhq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   2656 __ai int32x4_t vqdmulhq_s32(int32x4_t __a, int32x4_t __b) {
   2657   return (int32x4_t)__builtin_neon_vqdmulhq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2658 
   /* vqdmulh_lane_<type>(a, b, __c) / vqdmulhq_lane_<type>(a, b, __c):
    * statement-expression macros.  a and b are each evaluated once into the
    * typed locals __a and __b.  __b is passed twice to
    * __builtin_shufflevector with the index __c repeated once per lane of
    * __a, and the shuffled vector is handed to vqdmulh_<type> /
    * vqdmulhq_<type> together with __a.  In the q forms __a is the wide type
    * while __b keeps the narrow (x4 / x2) type.
    * __c is pasted textually into the index list.
    * NOTE(review): presumably __c must be an integer constant expression
    * naming a valid lane of __b - confirm in the Clang docs. */
   2659 #define vqdmulh_lane_s16(a, b, __c) __extension__ ({ \
   2660   int16x4_t __a = (a); int16x4_t __b = (b); \
   2661   vqdmulh_s16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2662 #define vqdmulh_lane_s32(a, b, __c) __extension__ ({ \
   2663   int32x2_t __a = (a); int32x2_t __b = (b); \
   2664   vqdmulh_s32(__a, __builtin_shufflevector(__b, __b, __c, __c)); })
   2665 #define vqdmulhq_lane_s16(a, b, __c) __extension__ ({ \
   2666   int16x8_t __a = (a); int16x4_t __b = (b); \
   2667   vqdmulhq_s16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c, __c, __c, __c, __c)); })
   2668 #define vqdmulhq_lane_s32(a, b, __c) __extension__ ({ \
   2669   int32x4_t __a = (a); int32x2_t __b = (b); \
   2670   vqdmulhq_s32(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2671 
   /* vqdmulh_n_<type> / vqdmulhq_n_<type>: same builtin calls as
    * vqdmulh_<type> / vqdmulhq_<type>, but the second operand is built from
    * the scalar __b: a compound literal of __a's vector type with __b in
    * every lane, cast to int8x8_t / int8x16_t.  The result is cast back to
    * __a's type.
    * NOTE(review): 1/2 and 33/34 look like type codes - confirm against the
    * builtin definitions. */
   2672 __ai int16x4_t vqdmulh_n_s16(int16x4_t __a, int16_t __b) {
   2673   return (int16x4_t)__builtin_neon_vqdmulh_v((int8x8_t)__a, (int8x8_t)(int16x4_t){ __b, __b, __b, __b }, 1); }
   2674 __ai int32x2_t vqdmulh_n_s32(int32x2_t __a, int32_t __b) {
   2675   return (int32x2_t)__builtin_neon_vqdmulh_v((int8x8_t)__a, (int8x8_t)(int32x2_t){ __b, __b }, 2); }
   2676 __ai int16x8_t vqdmulhq_n_s16(int16x8_t __a, int16_t __b) {
   2677   return (int16x8_t)__builtin_neon_vqdmulhq_v((int8x16_t)__a, (int8x16_t)(int16x8_t){ __b, __b, __b, __b, __b, __b, __b, __b }, 33); }
   2678 __ai int32x4_t vqdmulhq_n_s32(int32x4_t __a, int32_t __b) {
   2679   return (int32x4_t)__builtin_neon_vqdmulhq_v((int8x16_t)__a, (int8x16_t)(int32x4_t){ __b, __b, __b, __b }, 34); }
   2680 
   /* vqdmull_<type>: wrappers around __builtin_neon_vqdmull_v.  Both operands
    * are 64-bit-sized vectors cast to int8x8_t; the result is cast to a
    * return type whose lanes are twice as wide (int32x4_t / int64x2_t).
    * NOTE(review): the constants 34 / 35 match the result type and look like
    * type codes - confirm against the builtin definitions.  The arithmetic
    * (presumably saturating doubling multiply long, per ARM naming) is inside
    * the builtin and not visible here. */
   2681 __ai int32x4_t vqdmull_s16(int16x4_t __a, int16x4_t __b) {
   2682   return (int32x4_t)__builtin_neon_vqdmull_v((int8x8_t)__a, (int8x8_t)__b, 34); }
   2683 __ai int64x2_t vqdmull_s32(int32x2_t __a, int32x2_t __b) {
   2684   return (int64x2_t)__builtin_neon_vqdmull_v((int8x8_t)__a, (int8x8_t)__b, 35); }
   2685 
   /* vqdmull_lane_<type>(a, b, __c): statement-expression macros.  a and b
    * are each evaluated once into the typed locals __a and __b.  __b is
    * passed twice to __builtin_shufflevector with the index __c repeated once
    * per lane, and the shuffled vector is handed to vqdmull_<type> together
    * with __a; that call's result is the macro's value.
    * __c is pasted textually into the index list.
    * NOTE(review): presumably __c must be an integer constant expression
    * naming a valid lane of __b - confirm in the Clang docs. */
   2686 #define vqdmull_lane_s16(a, b, __c) __extension__ ({ \
   2687   int16x4_t __a = (a); int16x4_t __b = (b); \
   2688   vqdmull_s16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2689 #define vqdmull_lane_s32(a, b, __c) __extension__ ({ \
   2690   int32x2_t __a = (a); int32x2_t __b = (b); \
   2691   vqdmull_s32(__a, __builtin_shufflevector(__b, __b, __c, __c)); })
   2692 
   /* vqdmull_n_<type>: same builtin call as vqdmull_<type>, but the second
    * operand is built from the scalar __b: a compound literal of __a's vector
    * type with __b in every lane, cast to int8x8_t.  The result is cast to
    * the wider return type.
    * NOTE(review): 34 / 35 look like type codes for the int32x4_t /
    * int64x2_t result - confirm against the builtin definitions. */
   2693 __ai int32x4_t vqdmull_n_s16(int16x4_t __a, int16_t __b) {
   2694   return (int32x4_t)__builtin_neon_vqdmull_v((int8x8_t)__a, (int8x8_t)(int16x4_t){ __b, __b, __b, __b }, 34); }
   2695 __ai int64x2_t vqdmull_n_s32(int32x2_t __a, int32_t __b) {
   2696   return (int64x2_t)__builtin_neon_vqdmull_v((int8x8_t)__a, (int8x8_t)(int32x2_t){ __b, __b }, 35); }
   2697 
   /* vqmovn_<type>: single-operand wrappers around __builtin_neon_vqmovn_v.
    * __a is a q-sized vector cast to int8x16_t; the return type has the same
    * lane count with lanes half as wide and the same signedness.  The
    * function suffix names the input element type (s16 -> int8x8_t, etc.).
    * NOTE(review): the trailing constant (0/1/2 signed, 16/17/18 unsigned)
    * tracks the narrow return type and looks like a type code - confirm
    * against the builtin definitions.  The operation (presumably saturating
    * narrow, per ARM naming) is inside the builtin and not visible here. */
   2698 __ai int8x8_t vqmovn_s16(int16x8_t __a) {
   2699   return (int8x8_t)__builtin_neon_vqmovn_v((int8x16_t)__a, 0); }
   2700 __ai int16x4_t vqmovn_s32(int32x4_t __a) {
   2701   return (int16x4_t)__builtin_neon_vqmovn_v((int8x16_t)__a, 1); }
   2702 __ai int32x2_t vqmovn_s64(int64x2_t __a) {
   2703   return (int32x2_t)__builtin_neon_vqmovn_v((int8x16_t)__a, 2); }
   2704 __ai uint8x8_t vqmovn_u16(uint16x8_t __a) {
   2705   return (uint8x8_t)__builtin_neon_vqmovn_v((int8x16_t)__a, 16); }
   2706 __ai uint16x4_t vqmovn_u32(uint32x4_t __a) {
   2707   return (uint16x4_t)__builtin_neon_vqmovn_v((int8x16_t)__a, 17); }
   2708 __ai uint32x2_t vqmovn_u64(uint64x2_t __a) {
   2709   return (uint32x2_t)__builtin_neon_vqmovn_v((int8x16_t)__a, 18); }
   2710 
   /* vqmovun_<type>: single-operand wrappers around
    * __builtin_neon_vqmovun_v.  __a is a signed q-sized vector cast to
    * int8x16_t; the return type is unsigned, with the same lane count and
    * lanes half as wide.
    * NOTE(review): the constants 16/17/18 track the unsigned narrow return
    * type and look like type codes - confirm against the builtin
    * definitions.  The operation (presumably signed-to-unsigned saturating
    * narrow, per ARM naming) is inside the builtin and not visible here. */
   2711 __ai uint8x8_t vqmovun_s16(int16x8_t __a) {
   2712   return (uint8x8_t)__builtin_neon_vqmovun_v((int8x16_t)__a, 16); }
   2713 __ai uint16x4_t vqmovun_s32(int32x4_t __a) {
   2714   return (uint16x4_t)__builtin_neon_vqmovun_v((int8x16_t)__a, 17); }
   2715 __ai uint32x2_t vqmovun_s64(int64x2_t __a) {
   2716   return (uint32x2_t)__builtin_neon_vqmovun_v((int8x16_t)__a, 18); }
   2717 
   /* vqneg_<type> / vqnegq_<type>: single-operand wrappers around
    * __builtin_neon_vqneg_v and __builtin_neon_vqnegq_v, defined for signed
    * 8/16/32-bit element types only.  __a is cast to int8x8_t / int8x16_t
    * (the s8 forms pass it uncast) and the result is cast back to __a's type.
    * NOTE(review): the trailing constant (0/1/2, or 32/33/34 for q) looks
    * like a type code - confirm against the builtin definitions.  The
    * operation (presumably saturating negate, per ARM naming) is inside the
    * builtin and not visible here. */
   2718 __ai int8x8_t vqneg_s8(int8x8_t __a) {
   2719   return (int8x8_t)__builtin_neon_vqneg_v(__a, 0); }
   2720 __ai int16x4_t vqneg_s16(int16x4_t __a) {
   2721   return (int16x4_t)__builtin_neon_vqneg_v((int8x8_t)__a, 1); }
   2722 __ai int32x2_t vqneg_s32(int32x2_t __a) {
   2723   return (int32x2_t)__builtin_neon_vqneg_v((int8x8_t)__a, 2); }
   2724 __ai int8x16_t vqnegq_s8(int8x16_t __a) {
   2725   return (int8x16_t)__builtin_neon_vqnegq_v(__a, 32); }
   2726 __ai int16x8_t vqnegq_s16(int16x8_t __a) {
   2727   return (int16x8_t)__builtin_neon_vqnegq_v((int8x16_t)__a, 33); }
   2728 __ai int32x4_t vqnegq_s32(int32x4_t __a) {
   2729   return (int32x4_t)__builtin_neon_vqnegq_v((int8x16_t)__a, 34); }
   2730 
   /* vqrdmulh_<type> / vqrdmulhq_<type>: wrappers around
    * __builtin_neon_vqrdmulh_v and __builtin_neon_vqrdmulhq_v for signed 16-
    * and 32-bit element types.  Both operands and the result share one vector
    * type; operands are cast to int8x8_t / int8x16_t and the result is cast
    * back.
    * NOTE(review): the constants 1/2 (plain) and 33/34 (q) look like type
    * codes - confirm against the builtin definitions.  The arithmetic
    * (presumably saturating rounding doubling multiply returning the high
    * half, per ARM naming) is inside the builtin and not visible here. */
   2731 __ai int16x4_t vqrdmulh_s16(int16x4_t __a, int16x4_t __b) {
   2732   return (int16x4_t)__builtin_neon_vqrdmulh_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2733 __ai int32x2_t vqrdmulh_s32(int32x2_t __a, int32x2_t __b) {
   2734   return (int32x2_t)__builtin_neon_vqrdmulh_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2735 __ai int16x8_t vqrdmulhq_s16(int16x8_t __a, int16x8_t __b) {
   2736   return (int16x8_t)__builtin_neon_vqrdmulhq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   2737 __ai int32x4_t vqrdmulhq_s32(int32x4_t __a, int32x4_t __b) {
   2738   return (int32x4_t)__builtin_neon_vqrdmulhq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2739 
   /* vqrdmulh_lane_<type>(a, b, __c) / vqrdmulhq_lane_<type>(a, b, __c):
    * statement-expression macros.  a and b are each evaluated once into the
    * typed locals __a and __b.  __b is passed twice to
    * __builtin_shufflevector with the index __c repeated once per lane of
    * __a, and the shuffled vector is handed to vqrdmulh_<type> /
    * vqrdmulhq_<type> together with __a.  In the q forms __a is the wide type
    * while __b keeps the narrow (x4 / x2) type.
    * __c is pasted textually into the index list.
    * NOTE(review): presumably __c must be an integer constant expression
    * naming a valid lane of __b - confirm in the Clang docs. */
   2740 #define vqrdmulh_lane_s16(a, b, __c) __extension__ ({ \
   2741   int16x4_t __a = (a); int16x4_t __b = (b); \
   2742   vqrdmulh_s16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2743 #define vqrdmulh_lane_s32(a, b, __c) __extension__ ({ \
   2744   int32x2_t __a = (a); int32x2_t __b = (b); \
   2745   vqrdmulh_s32(__a, __builtin_shufflevector(__b, __b, __c, __c)); })
   2746 #define vqrdmulhq_lane_s16(a, b, __c) __extension__ ({ \
   2747   int16x8_t __a = (a); int16x4_t __b = (b); \
   2748   vqrdmulhq_s16(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c, __c, __c, __c, __c)); })
   2749 #define vqrdmulhq_lane_s32(a, b, __c) __extension__ ({ \
   2750   int32x4_t __a = (a); int32x2_t __b = (b); \
   2751   vqrdmulhq_s32(__a, __builtin_shufflevector(__b, __b, __c, __c, __c, __c)); })
   2752 
   /* vqrdmulh_n_<type> / vqrdmulhq_n_<type>: same builtin calls as
    * vqrdmulh_<type> / vqrdmulhq_<type>, but the second operand is built from
    * the scalar __b: a compound literal of __a's vector type with __b in
    * every lane, cast to int8x8_t / int8x16_t.  The result is cast back to
    * __a's type.
    * NOTE(review): 1/2 and 33/34 look like type codes - confirm against the
    * builtin definitions. */
   2753 __ai int16x4_t vqrdmulh_n_s16(int16x4_t __a, int16_t __b) {
   2754   return (int16x4_t)__builtin_neon_vqrdmulh_v((int8x8_t)__a, (int8x8_t)(int16x4_t){ __b, __b, __b, __b }, 1); }
   2755 __ai int32x2_t vqrdmulh_n_s32(int32x2_t __a, int32_t __b) {
   2756   return (int32x2_t)__builtin_neon_vqrdmulh_v((int8x8_t)__a, (int8x8_t)(int32x2_t){ __b, __b }, 2); }
   2757 __ai int16x8_t vqrdmulhq_n_s16(int16x8_t __a, int16_t __b) {
   2758   return (int16x8_t)__builtin_neon_vqrdmulhq_v((int8x16_t)__a, (int8x16_t)(int16x8_t){ __b, __b, __b, __b, __b, __b, __b, __b }, 33); }
   2759 __ai int32x4_t vqrdmulhq_n_s32(int32x4_t __a, int32_t __b) {
   2760   return (int32x4_t)__builtin_neon_vqrdmulhq_v((int8x16_t)__a, (int8x16_t)(int32x4_t){ __b, __b, __b, __b }, 34); }
   2761 
   /* vqrshl wrappers for the non-q vector types.  Every function forwards to
    * __builtin_neon_vqrshl_v: operands that are not already int8x8_t are cast
    * to it, and the result is cast back to the declared return type.  The
    * trailing constant is 0..3 for s8/s16/s32/s64 and 16..19 for
    * u8/u16/u32/u64.  Note that the second operand __b is a signed vector even
    * in the unsigned variants.
    * NOTE(review): the name suggests a saturating rounding shift left by a
    * per-element count -- confirm in the ARM NEON intrinsics reference. */
   2762 __ai int8x8_t vqrshl_s8(int8x8_t __a, int8x8_t __b) {
   2763   return (int8x8_t)__builtin_neon_vqrshl_v(__a, __b, 0); }
   2764 __ai int16x4_t vqrshl_s16(int16x4_t __a, int16x4_t __b) {
   2765   return (int16x4_t)__builtin_neon_vqrshl_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2766 __ai int32x2_t vqrshl_s32(int32x2_t __a, int32x2_t __b) {
   2767   return (int32x2_t)__builtin_neon_vqrshl_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2768 __ai int64x1_t vqrshl_s64(int64x1_t __a, int64x1_t __b) {
   2769   return (int64x1_t)__builtin_neon_vqrshl_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   2770 __ai uint8x8_t vqrshl_u8(uint8x8_t __a, int8x8_t __b) {
   2771   return (uint8x8_t)__builtin_neon_vqrshl_v((int8x8_t)__a, __b, 16); }
   2772 __ai uint16x4_t vqrshl_u16(uint16x4_t __a, int16x4_t __b) {
   2773   return (uint16x4_t)__builtin_neon_vqrshl_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2774 __ai uint32x2_t vqrshl_u32(uint32x2_t __a, int32x2_t __b) {
   2775   return (uint32x2_t)__builtin_neon_vqrshl_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2776 __ai uint64x1_t vqrshl_u64(uint64x1_t __a, int64x1_t __b) {
   2777   return (uint64x1_t)__builtin_neon_vqrshl_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   /* q-form vqrshl wrappers.  Same shape as the non-q functions, but operands
    * are cast to int8x16_t, the call goes to __builtin_neon_vqrshlq_v, and the
    * trailing constant is 32..35 for s8/s16/s32/s64 and 48..51 for
    * u8/u16/u32/u64.  The second operand __b is a signed vector in every
    * variant, including the unsigned ones. */
   2778 __ai int8x16_t vqrshlq_s8(int8x16_t __a, int8x16_t __b) {
   2779   return (int8x16_t)__builtin_neon_vqrshlq_v(__a, __b, 32); }
   2780 __ai int16x8_t vqrshlq_s16(int16x8_t __a, int16x8_t __b) {
   2781   return (int16x8_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   2782 __ai int32x4_t vqrshlq_s32(int32x4_t __a, int32x4_t __b) {
   2783   return (int32x4_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2784 __ai int64x2_t vqrshlq_s64(int64x2_t __a, int64x2_t __b) {
   2785   return (int64x2_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   2786 __ai uint8x16_t vqrshlq_u8(uint8x16_t __a, int8x16_t __b) {
   2787   return (uint8x16_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, __b, 48); }
   2788 __ai uint16x8_t vqrshlq_u16(uint16x8_t __a, int16x8_t __b) {
   2789   return (uint16x8_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   2790 __ai uint32x4_t vqrshlq_u32(uint32x4_t __a, int32x4_t __b) {
   2791   return (uint32x4_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   2792 __ai uint64x2_t vqrshlq_u64(uint64x2_t __a, int64x2_t __b) {
   2793   return (uint64x2_t)__builtin_neon_vqrshlq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   2794 
   /* vqrshrn_n macros.  Each one copies the vector argument into a typed local
    * (a is expanded once), casts it to int8x16_t, and calls
    * __builtin_neon_vqrshrn_n_v with __b passed through unchanged.  The result
    * is cast to a vector type with the same lane count and half the named
    * element width (e.g. int16x8_t -> int8x8_t); the trailing constant follows
    * that result type (0..2 signed, 16..18 unsigned).
    * NOTE(review): __b is presumably required to be an integer constant
    * shift amount -- confirm in the ARM NEON intrinsics reference. */
   2795 #define vqrshrn_n_s16(a, __b) __extension__ ({ \
   2796   int16x8_t __a = (a); \
   2797   (int8x8_t)__builtin_neon_vqrshrn_n_v((int8x16_t)__a, __b, 0); })
   2798 #define vqrshrn_n_s32(a, __b) __extension__ ({ \
   2799   int32x4_t __a = (a); \
   2800   (int16x4_t)__builtin_neon_vqrshrn_n_v((int8x16_t)__a, __b, 1); })
   2801 #define vqrshrn_n_s64(a, __b) __extension__ ({ \
   2802   int64x2_t __a = (a); \
   2803   (int32x2_t)__builtin_neon_vqrshrn_n_v((int8x16_t)__a, __b, 2); })
   2804 #define vqrshrn_n_u16(a, __b) __extension__ ({ \
   2805   uint16x8_t __a = (a); \
   2806   (uint8x8_t)__builtin_neon_vqrshrn_n_v((int8x16_t)__a, __b, 16); })
   2807 #define vqrshrn_n_u32(a, __b) __extension__ ({ \
   2808   uint32x4_t __a = (a); \
   2809   (uint16x4_t)__builtin_neon_vqrshrn_n_v((int8x16_t)__a, __b, 17); })
   2810 #define vqrshrn_n_u64(a, __b) __extension__ ({ \
   2811   uint64x2_t __a = (a); \
   2812   (uint32x2_t)__builtin_neon_vqrshrn_n_v((int8x16_t)__a, __b, 18); })
   2813 
   /* vqrshrun_n macros.  The argument is a signed vector, copied into a typed
    * local and cast to int8x16_t for __builtin_neon_vqrshrun_n_v; __b is
    * passed through unchanged.  The result is cast to an unsigned vector type
    * with half the named element width, and the trailing constant (16..18)
    * follows that unsigned result type. */
   2814 #define vqrshrun_n_s16(a, __b) __extension__ ({ \
   2815   int16x8_t __a = (a); \
   2816   (uint8x8_t)__builtin_neon_vqrshrun_n_v((int8x16_t)__a, __b, 16); })
   2817 #define vqrshrun_n_s32(a, __b) __extension__ ({ \
   2818   int32x4_t __a = (a); \
   2819   (uint16x4_t)__builtin_neon_vqrshrun_n_v((int8x16_t)__a, __b, 17); })
   2820 #define vqrshrun_n_s64(a, __b) __extension__ ({ \
   2821   int64x2_t __a = (a); \
   2822   (uint32x2_t)__builtin_neon_vqrshrun_n_v((int8x16_t)__a, __b, 18); })
   2823 
   /* vqshl wrappers for the non-q vector types.  Every function forwards to
    * __builtin_neon_vqshl_v: operands that are not already int8x8_t are cast
    * to it, and the result is cast back to the declared return type.  The
    * trailing constant is 0..3 for s8/s16/s32/s64 and 16..19 for
    * u8/u16/u32/u64.  The second operand __b is a signed vector even in the
    * unsigned variants.
    * NOTE(review): the name suggests a saturating shift left by a per-element
    * count -- confirm in the ARM NEON intrinsics reference. */
   2824 __ai int8x8_t vqshl_s8(int8x8_t __a, int8x8_t __b) {
   2825   return (int8x8_t)__builtin_neon_vqshl_v(__a, __b, 0); }
   2826 __ai int16x4_t vqshl_s16(int16x4_t __a, int16x4_t __b) {
   2827   return (int16x4_t)__builtin_neon_vqshl_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2828 __ai int32x2_t vqshl_s32(int32x2_t __a, int32x2_t __b) {
   2829   return (int32x2_t)__builtin_neon_vqshl_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2830 __ai int64x1_t vqshl_s64(int64x1_t __a, int64x1_t __b) {
   2831   return (int64x1_t)__builtin_neon_vqshl_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   2832 __ai uint8x8_t vqshl_u8(uint8x8_t __a, int8x8_t __b) {
   2833   return (uint8x8_t)__builtin_neon_vqshl_v((int8x8_t)__a, __b, 16); }
   2834 __ai uint16x4_t vqshl_u16(uint16x4_t __a, int16x4_t __b) {
   2835   return (uint16x4_t)__builtin_neon_vqshl_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2836 __ai uint32x2_t vqshl_u32(uint32x2_t __a, int32x2_t __b) {
   2837   return (uint32x2_t)__builtin_neon_vqshl_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2838 __ai uint64x1_t vqshl_u64(uint64x1_t __a, int64x1_t __b) {
   2839   return (uint64x1_t)__builtin_neon_vqshl_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   /* q-form vqshl wrappers.  Operands are cast to int8x16_t where needed, the
    * call goes to __builtin_neon_vqshlq_v, and the result is cast back to the
    * declared return type.  The trailing constant is 32..35 for
    * s8/s16/s32/s64 and 48..51 for u8/u16/u32/u64.  The second operand __b is
    * a signed vector in every variant. */
   2840 __ai int8x16_t vqshlq_s8(int8x16_t __a, int8x16_t __b) {
   2841   return (int8x16_t)__builtin_neon_vqshlq_v(__a, __b, 32); }
   2842 __ai int16x8_t vqshlq_s16(int16x8_t __a, int16x8_t __b) {
   2843   return (int16x8_t)__builtin_neon_vqshlq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   2844 __ai int32x4_t vqshlq_s32(int32x4_t __a, int32x4_t __b) {
   2845   return (int32x4_t)__builtin_neon_vqshlq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2846 __ai int64x2_t vqshlq_s64(int64x2_t __a, int64x2_t __b) {
   2847   return (int64x2_t)__builtin_neon_vqshlq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   2848 __ai uint8x16_t vqshlq_u8(uint8x16_t __a, int8x16_t __b) {
   2849   return (uint8x16_t)__builtin_neon_vqshlq_v((int8x16_t)__a, __b, 48); }
   2850 __ai uint16x8_t vqshlq_u16(uint16x8_t __a, int16x8_t __b) {
   2851   return (uint16x8_t)__builtin_neon_vqshlq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   2852 __ai uint32x4_t vqshlq_u32(uint32x4_t __a, int32x4_t __b) {
   2853   return (uint32x4_t)__builtin_neon_vqshlq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   2854 __ai uint64x2_t vqshlq_u64(uint64x2_t __a, int64x2_t __b) {
   2855   return (uint64x2_t)__builtin_neon_vqshlq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   2856 
   /* vqshlu_n macros for the non-q vector types.  The signed vector argument
    * is copied into a typed local (a is expanded once), cast to int8x8_t where
    * needed, and passed with __b to __builtin_neon_vqshlu_n_v.  The result is
    * cast to the unsigned vector type of the same shape, and the trailing
    * constant (16..19) follows that unsigned result type.
    * NOTE(review): __b is presumably required to be an integer constant
    * shift amount -- confirm in the ARM NEON intrinsics reference. */
   2857 #define vqshlu_n_s8(a, __b) __extension__ ({ \
   2858   int8x8_t __a = (a); \
   2859   (uint8x8_t)__builtin_neon_vqshlu_n_v(__a, __b, 16); })
   2860 #define vqshlu_n_s16(a, __b) __extension__ ({ \
   2861   int16x4_t __a = (a); \
   2862   (uint16x4_t)__builtin_neon_vqshlu_n_v((int8x8_t)__a, __b, 17); })
   2863 #define vqshlu_n_s32(a, __b) __extension__ ({ \
   2864   int32x2_t __a = (a); \
   2865   (uint32x2_t)__builtin_neon_vqshlu_n_v((int8x8_t)__a, __b, 18); })
   2866 #define vqshlu_n_s64(a, __b) __extension__ ({ \
   2867   int64x1_t __a = (a); \
   2868   (uint64x1_t)__builtin_neon_vqshlu_n_v((int8x8_t)__a, __b, 19); })
   /* q-form vqshlu_n macros.  The signed vector argument is copied into a
    * typed local, cast to int8x16_t where needed, and passed with __b to
    * __builtin_neon_vqshluq_n_v.  The result is cast to the unsigned vector
    * type of the same shape; the trailing constant is 48..51. */
   2869 #define vqshluq_n_s8(a, __b) __extension__ ({ \
   2870   int8x16_t __a = (a); \
   2871   (uint8x16_t)__builtin_neon_vqshluq_n_v(__a, __b, 48); })
   2872 #define vqshluq_n_s16(a, __b) __extension__ ({ \
   2873   int16x8_t __a = (a); \
   2874   (uint16x8_t)__builtin_neon_vqshluq_n_v((int8x16_t)__a, __b, 49); })
   2875 #define vqshluq_n_s32(a, __b) __extension__ ({ \
   2876   int32x4_t __a = (a); \
   2877   (uint32x4_t)__builtin_neon_vqshluq_n_v((int8x16_t)__a, __b, 50); })
   2878 #define vqshluq_n_s64(a, __b) __extension__ ({ \
   2879   int64x2_t __a = (a); \
   2880   (uint64x2_t)__builtin_neon_vqshluq_n_v((int8x16_t)__a, __b, 51); })
   2881 
   /* vqshl_n macros for the non-q vector types.  Each one copies the vector
    * argument into a typed local (a is expanded once), casts it to int8x8_t
    * where needed, and calls __builtin_neon_vqshl_n_v with __b passed through
    * unchanged.  The result is cast back to the argument's own vector type.
    * The trailing constant is 0..3 for s8/s16/s32/s64 and 16..19 for
    * u8/u16/u32/u64.
    * NOTE(review): __b is presumably required to be an integer constant
    * shift amount -- confirm in the ARM NEON intrinsics reference. */
   2882 #define vqshl_n_s8(a, __b) __extension__ ({ \
   2883   int8x8_t __a = (a); \
   2884   (int8x8_t)__builtin_neon_vqshl_n_v(__a, __b, 0); })
   2885 #define vqshl_n_s16(a, __b) __extension__ ({ \
   2886   int16x4_t __a = (a); \
   2887   (int16x4_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 1); })
   2888 #define vqshl_n_s32(a, __b) __extension__ ({ \
   2889   int32x2_t __a = (a); \
   2890   (int32x2_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 2); })
   2891 #define vqshl_n_s64(a, __b) __extension__ ({ \
   2892   int64x1_t __a = (a); \
   2893   (int64x1_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 3); })
   2894 #define vqshl_n_u8(a, __b) __extension__ ({ \
   2895   uint8x8_t __a = (a); \
   2896   (uint8x8_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 16); })
   2897 #define vqshl_n_u16(a, __b) __extension__ ({ \
   2898   uint16x4_t __a = (a); \
   2899   (uint16x4_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 17); })
   2900 #define vqshl_n_u32(a, __b) __extension__ ({ \
   2901   uint32x2_t __a = (a); \
   2902   (uint32x2_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 18); })
   2903 #define vqshl_n_u64(a, __b) __extension__ ({ \
   2904   uint64x1_t __a = (a); \
   2905   (uint64x1_t)__builtin_neon_vqshl_n_v((int8x8_t)__a, __b, 19); })
   /* q-form vqshl_n macros.  Each one copies the vector argument into a typed
    * local, casts it to int8x16_t where needed, and calls
    * __builtin_neon_vqshlq_n_v with __b passed through unchanged.  The result
    * is cast back to the argument's own vector type.  The trailing constant is
    * 32..35 for s8/s16/s32/s64 and 48..51 for u8/u16/u32/u64. */
   2906 #define vqshlq_n_s8(a, __b) __extension__ ({ \
   2907   int8x16_t __a = (a); \
   2908   (int8x16_t)__builtin_neon_vqshlq_n_v(__a, __b, 32); })
   2909 #define vqshlq_n_s16(a, __b) __extension__ ({ \
   2910   int16x8_t __a = (a); \
   2911   (int16x8_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 33); })
   2912 #define vqshlq_n_s32(a, __b) __extension__ ({ \
   2913   int32x4_t __a = (a); \
   2914   (int32x4_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 34); })
   2915 #define vqshlq_n_s64(a, __b) __extension__ ({ \
   2916   int64x2_t __a = (a); \
   2917   (int64x2_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 35); })
   2918 #define vqshlq_n_u8(a, __b) __extension__ ({ \
   2919   uint8x16_t __a = (a); \
   2920   (uint8x16_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 48); })
   2921 #define vqshlq_n_u16(a, __b) __extension__ ({ \
   2922   uint16x8_t __a = (a); \
   2923   (uint16x8_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 49); })
   2924 #define vqshlq_n_u32(a, __b) __extension__ ({ \
   2925   uint32x4_t __a = (a); \
   2926   (uint32x4_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 50); })
   2927 #define vqshlq_n_u64(a, __b) __extension__ ({ \
   2928   uint64x2_t __a = (a); \
   2929   (uint64x2_t)__builtin_neon_vqshlq_n_v((int8x16_t)__a, __b, 51); })
   2930 
   /* vqshrn_n macros.  Each one copies the vector argument into a typed local
    * (a is expanded once), casts it to int8x16_t, and calls
    * __builtin_neon_vqshrn_n_v with __b passed through unchanged.  The result
    * is cast to a vector type with the same lane count and half the named
    * element width (e.g. int16x8_t -> int8x8_t); the trailing constant follows
    * that result type (0..2 signed, 16..18 unsigned).
    * NOTE(review): __b is presumably required to be an integer constant
    * shift amount -- confirm in the ARM NEON intrinsics reference. */
   2931 #define vqshrn_n_s16(a, __b) __extension__ ({ \
   2932   int16x8_t __a = (a); \
   2933   (int8x8_t)__builtin_neon_vqshrn_n_v((int8x16_t)__a, __b, 0); })
   2934 #define vqshrn_n_s32(a, __b) __extension__ ({ \
   2935   int32x4_t __a = (a); \
   2936   (int16x4_t)__builtin_neon_vqshrn_n_v((int8x16_t)__a, __b, 1); })
   2937 #define vqshrn_n_s64(a, __b) __extension__ ({ \
   2938   int64x2_t __a = (a); \
   2939   (int32x2_t)__builtin_neon_vqshrn_n_v((int8x16_t)__a, __b, 2); })
   2940 #define vqshrn_n_u16(a, __b) __extension__ ({ \
   2941   uint16x8_t __a = (a); \
   2942   (uint8x8_t)__builtin_neon_vqshrn_n_v((int8x16_t)__a, __b, 16); })
   2943 #define vqshrn_n_u32(a, __b) __extension__ ({ \
   2944   uint32x4_t __a = (a); \
   2945   (uint16x4_t)__builtin_neon_vqshrn_n_v((int8x16_t)__a, __b, 17); })
   2946 #define vqshrn_n_u64(a, __b) __extension__ ({ \
   2947   uint64x2_t __a = (a); \
   2948   (uint32x2_t)__builtin_neon_vqshrn_n_v((int8x16_t)__a, __b, 18); })
   2949 
   /* vqshrun_n macros.  The argument is a signed vector, copied into a typed
    * local and cast to int8x16_t for __builtin_neon_vqshrun_n_v; __b is passed
    * through unchanged.  The result is cast to an unsigned vector type with
    * half the named element width, and the trailing constant (16..18) follows
    * that unsigned result type. */
   2950 #define vqshrun_n_s16(a, __b) __extension__ ({ \
   2951   int16x8_t __a = (a); \
   2952   (uint8x8_t)__builtin_neon_vqshrun_n_v((int8x16_t)__a, __b, 16); })
   2953 #define vqshrun_n_s32(a, __b) __extension__ ({ \
   2954   int32x4_t __a = (a); \
   2955   (uint16x4_t)__builtin_neon_vqshrun_n_v((int8x16_t)__a, __b, 17); })
   2956 #define vqshrun_n_s64(a, __b) __extension__ ({ \
   2957   int64x2_t __a = (a); \
   2958   (uint32x2_t)__builtin_neon_vqshrun_n_v((int8x16_t)__a, __b, 18); })
   2959 
   /* vqsub wrappers for the non-q vector types.  Both operands share the
    * return type; those that are not already int8x8_t are cast to it, passed
    * to __builtin_neon_vqsub_v, and the result is cast back.  The trailing
    * constant is 0..3 for s8/s16/s32/s64 and 16..19 for u8/u16/u32/u64.
    * NOTE(review): the name suggests a saturating subtract -- confirm in the
    * ARM NEON intrinsics reference. */
   2960 __ai int8x8_t vqsub_s8(int8x8_t __a, int8x8_t __b) {
   2961   return (int8x8_t)__builtin_neon_vqsub_v(__a, __b, 0); }
   2962 __ai int16x4_t vqsub_s16(int16x4_t __a, int16x4_t __b) {
   2963   return (int16x4_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   2964 __ai int32x2_t vqsub_s32(int32x2_t __a, int32x2_t __b) {
   2965   return (int32x2_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   2966 __ai int64x1_t vqsub_s64(int64x1_t __a, int64x1_t __b) {
   2967   return (int64x1_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   2968 __ai uint8x8_t vqsub_u8(uint8x8_t __a, uint8x8_t __b) {
   2969   return (uint8x8_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   2970 __ai uint16x4_t vqsub_u16(uint16x4_t __a, uint16x4_t __b) {
   2971   return (uint16x4_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   2972 __ai uint32x2_t vqsub_u32(uint32x2_t __a, uint32x2_t __b) {
   2973   return (uint32x2_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   2974 __ai uint64x1_t vqsub_u64(uint64x1_t __a, uint64x1_t __b) {
   2975   return (uint64x1_t)__builtin_neon_vqsub_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   /* q-form vqsub wrappers.  Both operands share the return type; those that
    * are not already int8x16_t are cast to it, passed to
    * __builtin_neon_vqsubq_v, and the result is cast back.  The trailing
    * constant is 32..35 for s8/s16/s32/s64 and 48..51 for u8/u16/u32/u64. */
   2976 __ai int8x16_t vqsubq_s8(int8x16_t __a, int8x16_t __b) {
   2977   return (int8x16_t)__builtin_neon_vqsubq_v(__a, __b, 32); }
   2978 __ai int16x8_t vqsubq_s16(int16x8_t __a, int16x8_t __b) {
   2979   return (int16x8_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   2980 __ai int32x4_t vqsubq_s32(int32x4_t __a, int32x4_t __b) {
   2981   return (int32x4_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   2982 __ai int64x2_t vqsubq_s64(int64x2_t __a, int64x2_t __b) {
   2983   return (int64x2_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   2984 __ai uint8x16_t vqsubq_u8(uint8x16_t __a, uint8x16_t __b) {
   2985   return (uint8x16_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   2986 __ai uint16x8_t vqsubq_u16(uint16x8_t __a, uint16x8_t __b) {
   2987   return (uint16x8_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   2988 __ai uint32x4_t vqsubq_u32(uint32x4_t __a, uint32x4_t __b) {
   2989   return (uint32x4_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   2990 __ai uint64x2_t vqsubq_u64(uint64x2_t __a, uint64x2_t __b) {
   2991   return (uint64x2_t)__builtin_neon_vqsubq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   2992 
   /* vraddhn wrappers.  Both operands are cast to int8x16_t and passed to
    * __builtin_neon_vraddhn_v; the result is cast to a vector type with the
    * same lane count and half the named element width of the operands
    * (e.g. int16x8_t operands -> int8x8_t result).  The trailing constant
    * follows the result type (0..2 signed, 16..18 unsigned).
    * NOTE(review): the name suggests a rounding add returning the high half
    * of each sum -- confirm in the ARM NEON intrinsics reference. */
   2993 __ai int8x8_t vraddhn_s16(int16x8_t __a, int16x8_t __b) {
   2994   return (int8x8_t)__builtin_neon_vraddhn_v((int8x16_t)__a, (int8x16_t)__b, 0); }
   2995 __ai int16x4_t vraddhn_s32(int32x4_t __a, int32x4_t __b) {
   2996   return (int16x4_t)__builtin_neon_vraddhn_v((int8x16_t)__a, (int8x16_t)__b, 1); }
   2997 __ai int32x2_t vraddhn_s64(int64x2_t __a, int64x2_t __b) {
   2998   return (int32x2_t)__builtin_neon_vraddhn_v((int8x16_t)__a, (int8x16_t)__b, 2); }
   2999 __ai uint8x8_t vraddhn_u16(uint16x8_t __a, uint16x8_t __b) {
   3000   return (uint8x8_t)__builtin_neon_vraddhn_v((int8x16_t)__a, (int8x16_t)__b, 16); }
   3001 __ai uint16x4_t vraddhn_u32(uint32x4_t __a, uint32x4_t __b) {
   3002   return (uint16x4_t)__builtin_neon_vraddhn_v((int8x16_t)__a, (int8x16_t)__b, 17); }
   3003 __ai uint32x2_t vraddhn_u64(uint64x2_t __a, uint64x2_t __b) {
   3004   return (uint32x2_t)__builtin_neon_vraddhn_v((int8x16_t)__a, (int8x16_t)__b, 18); }
   3005 
   /* vrecpe wrappers (one operand).  The argument is cast to int8x8_t
    * (int8x16_t for the q forms), passed to __builtin_neon_vrecpe_v /
    * __builtin_neon_vrecpeq_v, and the result is cast back to the argument's
    * type.  The trailing constant is 7 for f32 and 18 for u32, and 39 / 50
    * for the corresponding q forms.
    * NOTE(review): the name suggests a reciprocal estimate -- confirm in the
    * ARM NEON intrinsics reference. */
   3006 __ai float32x2_t vrecpe_f32(float32x2_t __a) {
   3007   return (float32x2_t)__builtin_neon_vrecpe_v((int8x8_t)__a, 7); }
   3008 __ai uint32x2_t vrecpe_u32(uint32x2_t __a) {
   3009   return (uint32x2_t)__builtin_neon_vrecpe_v((int8x8_t)__a, 18); }
   3010 __ai float32x4_t vrecpeq_f32(float32x4_t __a) {
   3011   return (float32x4_t)__builtin_neon_vrecpeq_v((int8x16_t)__a, 39); }
   3012 __ai uint32x4_t vrecpeq_u32(uint32x4_t __a) {
   3013   return (uint32x4_t)__builtin_neon_vrecpeq_v((int8x16_t)__a, 50); }
   3014 
   /* vrecps wrappers (two float operands).  Both operands are cast to int8x8_t
    * (int8x16_t for the q form), passed to __builtin_neon_vrecps_v /
    * __builtin_neon_vrecpsq_v with constant 7 / 39, and the result is cast
    * back to the float vector type.
    * NOTE(review): the name suggests a reciprocal refinement step -- confirm
    * in the ARM NEON intrinsics reference. */
   3015 __ai float32x2_t vrecps_f32(float32x2_t __a, float32x2_t __b) {
   3016   return (float32x2_t)__builtin_neon_vrecps_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   3017 __ai float32x4_t vrecpsq_f32(float32x4_t __a, float32x4_t __b) {
   3018   return (float32x4_t)__builtin_neon_vrecpsq_v((int8x16_t)__a, (int8x16_t)__b, 39); }
   3019 
   /* vreinterpret_s8_*: each function returns its argument cast to int8x8_t;
    * no builtin is called.  There is one function per source vector type
    * (s16, s32, s64, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): for these vector types the cast presumably reinterprets
    * the bits rather than converting element values -- confirm against the
    * compiler's vector-cast rules. */
   3020 __ai int8x8_t vreinterpret_s8_s16(int16x4_t __a) {
   3021   return (int8x8_t)__a; }
   3022 __ai int8x8_t vreinterpret_s8_s32(int32x2_t __a) {
   3023   return (int8x8_t)__a; }
   3024 __ai int8x8_t vreinterpret_s8_s64(int64x1_t __a) {
   3025   return (int8x8_t)__a; }
   3026 __ai int8x8_t vreinterpret_s8_u8(uint8x8_t __a) {
   3027   return (int8x8_t)__a; }
   3028 __ai int8x8_t vreinterpret_s8_u16(uint16x4_t __a) {
   3029   return (int8x8_t)__a; }
   3030 __ai int8x8_t vreinterpret_s8_u32(uint32x2_t __a) {
   3031   return (int8x8_t)__a; }
   3032 __ai int8x8_t vreinterpret_s8_u64(uint64x1_t __a) {
   3033   return (int8x8_t)__a; }
   3034 __ai int8x8_t vreinterpret_s8_f16(float16x4_t __a) {
   3035   return (int8x8_t)__a; }
   3036 __ai int8x8_t vreinterpret_s8_f32(float32x2_t __a) {
   3037   return (int8x8_t)__a; }
   3038 __ai int8x8_t vreinterpret_s8_p8(poly8x8_t __a) {
   3039   return (int8x8_t)__a; }
   3040 __ai int8x8_t vreinterpret_s8_p16(poly16x4_t __a) {
   3041   return (int8x8_t)__a; }
   /* vreinterpret_s16_*: each function returns its argument cast to
    * int16x4_t; no builtin is called.  There is one function per source
    * vector type (s8, s32, s64, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3042 __ai int16x4_t vreinterpret_s16_s8(int8x8_t __a) {
   3043   return (int16x4_t)__a; }
   3044 __ai int16x4_t vreinterpret_s16_s32(int32x2_t __a) {
   3045   return (int16x4_t)__a; }
   3046 __ai int16x4_t vreinterpret_s16_s64(int64x1_t __a) {
   3047   return (int16x4_t)__a; }
   3048 __ai int16x4_t vreinterpret_s16_u8(uint8x8_t __a) {
   3049   return (int16x4_t)__a; }
   3050 __ai int16x4_t vreinterpret_s16_u16(uint16x4_t __a) {
   3051   return (int16x4_t)__a; }
   3052 __ai int16x4_t vreinterpret_s16_u32(uint32x2_t __a) {
   3053   return (int16x4_t)__a; }
   3054 __ai int16x4_t vreinterpret_s16_u64(uint64x1_t __a) {
   3055   return (int16x4_t)__a; }
   3056 __ai int16x4_t vreinterpret_s16_f16(float16x4_t __a) {
   3057   return (int16x4_t)__a; }
   3058 __ai int16x4_t vreinterpret_s16_f32(float32x2_t __a) {
   3059   return (int16x4_t)__a; }
   3060 __ai int16x4_t vreinterpret_s16_p8(poly8x8_t __a) {
   3061   return (int16x4_t)__a; }
   3062 __ai int16x4_t vreinterpret_s16_p16(poly16x4_t __a) {
   3063   return (int16x4_t)__a; }
   /* vreinterpret_s32_*: each function returns its argument cast to
    * int32x2_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s64, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3064 __ai int32x2_t vreinterpret_s32_s8(int8x8_t __a) {
   3065   return (int32x2_t)__a; }
   3066 __ai int32x2_t vreinterpret_s32_s16(int16x4_t __a) {
   3067   return (int32x2_t)__a; }
   3068 __ai int32x2_t vreinterpret_s32_s64(int64x1_t __a) {
   3069   return (int32x2_t)__a; }
   3070 __ai int32x2_t vreinterpret_s32_u8(uint8x8_t __a) {
   3071   return (int32x2_t)__a; }
   3072 __ai int32x2_t vreinterpret_s32_u16(uint16x4_t __a) {
   3073   return (int32x2_t)__a; }
   3074 __ai int32x2_t vreinterpret_s32_u32(uint32x2_t __a) {
   3075   return (int32x2_t)__a; }
   3076 __ai int32x2_t vreinterpret_s32_u64(uint64x1_t __a) {
   3077   return (int32x2_t)__a; }
   3078 __ai int32x2_t vreinterpret_s32_f16(float16x4_t __a) {
   3079   return (int32x2_t)__a; }
   3080 __ai int32x2_t vreinterpret_s32_f32(float32x2_t __a) {
   3081   return (int32x2_t)__a; }
   3082 __ai int32x2_t vreinterpret_s32_p8(poly8x8_t __a) {
   3083   return (int32x2_t)__a; }
   3084 __ai int32x2_t vreinterpret_s32_p16(poly16x4_t __a) {
   3085   return (int32x2_t)__a; }
   /* vreinterpret_s64_*: each function returns its argument cast to
    * int64x1_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3086 __ai int64x1_t vreinterpret_s64_s8(int8x8_t __a) {
   3087   return (int64x1_t)__a; }
   3088 __ai int64x1_t vreinterpret_s64_s16(int16x4_t __a) {
   3089   return (int64x1_t)__a; }
   3090 __ai int64x1_t vreinterpret_s64_s32(int32x2_t __a) {
   3091   return (int64x1_t)__a; }
   3092 __ai int64x1_t vreinterpret_s64_u8(uint8x8_t __a) {
   3093   return (int64x1_t)__a; }
   3094 __ai int64x1_t vreinterpret_s64_u16(uint16x4_t __a) {
   3095   return (int64x1_t)__a; }
   3096 __ai int64x1_t vreinterpret_s64_u32(uint32x2_t __a) {
   3097   return (int64x1_t)__a; }
   3098 __ai int64x1_t vreinterpret_s64_u64(uint64x1_t __a) {
   3099   return (int64x1_t)__a; }
   3100 __ai int64x1_t vreinterpret_s64_f16(float16x4_t __a) {
   3101   return (int64x1_t)__a; }
   3102 __ai int64x1_t vreinterpret_s64_f32(float32x2_t __a) {
   3103   return (int64x1_t)__a; }
   3104 __ai int64x1_t vreinterpret_s64_p8(poly8x8_t __a) {
   3105   return (int64x1_t)__a; }
   3106 __ai int64x1_t vreinterpret_s64_p16(poly16x4_t __a) {
   3107   return (int64x1_t)__a; }
   /* vreinterpret_u8_*: each function returns its argument cast to uint8x8_t;
    * no builtin is called.  There is one function per source vector type
    * (s8, s16, s32, s64, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3108 __ai uint8x8_t vreinterpret_u8_s8(int8x8_t __a) {
   3109   return (uint8x8_t)__a; }
   3110 __ai uint8x8_t vreinterpret_u8_s16(int16x4_t __a) {
   3111   return (uint8x8_t)__a; }
   3112 __ai uint8x8_t vreinterpret_u8_s32(int32x2_t __a) {
   3113   return (uint8x8_t)__a; }
   3114 __ai uint8x8_t vreinterpret_u8_s64(int64x1_t __a) {
   3115   return (uint8x8_t)__a; }
   3116 __ai uint8x8_t vreinterpret_u8_u16(uint16x4_t __a) {
   3117   return (uint8x8_t)__a; }
   3118 __ai uint8x8_t vreinterpret_u8_u32(uint32x2_t __a) {
   3119   return (uint8x8_t)__a; }
   3120 __ai uint8x8_t vreinterpret_u8_u64(uint64x1_t __a) {
   3121   return (uint8x8_t)__a; }
   3122 __ai uint8x8_t vreinterpret_u8_f16(float16x4_t __a) {
   3123   return (uint8x8_t)__a; }
   3124 __ai uint8x8_t vreinterpret_u8_f32(float32x2_t __a) {
   3125   return (uint8x8_t)__a; }
   3126 __ai uint8x8_t vreinterpret_u8_p8(poly8x8_t __a) {
   3127   return (uint8x8_t)__a; }
   3128 __ai uint8x8_t vreinterpret_u8_p16(poly16x4_t __a) {
   3129   return (uint8x8_t)__a; }
   /* vreinterpret_u16_*: each function returns its argument cast to
    * uint16x4_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, s64, u8, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3130 __ai uint16x4_t vreinterpret_u16_s8(int8x8_t __a) {
   3131   return (uint16x4_t)__a; }
   3132 __ai uint16x4_t vreinterpret_u16_s16(int16x4_t __a) {
   3133   return (uint16x4_t)__a; }
   3134 __ai uint16x4_t vreinterpret_u16_s32(int32x2_t __a) {
   3135   return (uint16x4_t)__a; }
   3136 __ai uint16x4_t vreinterpret_u16_s64(int64x1_t __a) {
   3137   return (uint16x4_t)__a; }
   3138 __ai uint16x4_t vreinterpret_u16_u8(uint8x8_t __a) {
   3139   return (uint16x4_t)__a; }
   3140 __ai uint16x4_t vreinterpret_u16_u32(uint32x2_t __a) {
   3141   return (uint16x4_t)__a; }
   3142 __ai uint16x4_t vreinterpret_u16_u64(uint64x1_t __a) {
   3143   return (uint16x4_t)__a; }
   3144 __ai uint16x4_t vreinterpret_u16_f16(float16x4_t __a) {
   3145   return (uint16x4_t)__a; }
   3146 __ai uint16x4_t vreinterpret_u16_f32(float32x2_t __a) {
   3147   return (uint16x4_t)__a; }
   3148 __ai uint16x4_t vreinterpret_u16_p8(poly8x8_t __a) {
   3149   return (uint16x4_t)__a; }
   3150 __ai uint16x4_t vreinterpret_u16_p16(poly16x4_t __a) {
   3151   return (uint16x4_t)__a; }
   /* vreinterpret_u32_*: each function returns its argument cast to
    * uint32x2_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, s64, u8, u16, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3152 __ai uint32x2_t vreinterpret_u32_s8(int8x8_t __a) {
   3153   return (uint32x2_t)__a; }
   3154 __ai uint32x2_t vreinterpret_u32_s16(int16x4_t __a) {
   3155   return (uint32x2_t)__a; }
   3156 __ai uint32x2_t vreinterpret_u32_s32(int32x2_t __a) {
   3157   return (uint32x2_t)__a; }
   3158 __ai uint32x2_t vreinterpret_u32_s64(int64x1_t __a) {
   3159   return (uint32x2_t)__a; }
   3160 __ai uint32x2_t vreinterpret_u32_u8(uint8x8_t __a) {
   3161   return (uint32x2_t)__a; }
   3162 __ai uint32x2_t vreinterpret_u32_u16(uint16x4_t __a) {
   3163   return (uint32x2_t)__a; }
   3164 __ai uint32x2_t vreinterpret_u32_u64(uint64x1_t __a) {
   3165   return (uint32x2_t)__a; }
   3166 __ai uint32x2_t vreinterpret_u32_f16(float16x4_t __a) {
   3167   return (uint32x2_t)__a; }
   3168 __ai uint32x2_t vreinterpret_u32_f32(float32x2_t __a) {
   3169   return (uint32x2_t)__a; }
   3170 __ai uint32x2_t vreinterpret_u32_p8(poly8x8_t __a) {
   3171   return (uint32x2_t)__a; }
   3172 __ai uint32x2_t vreinterpret_u32_p16(poly16x4_t __a) {
   3173   return (uint32x2_t)__a; }
   /* vreinterpret_u64_*: each function returns its argument cast to
    * uint64x1_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, s64, u8, u16, u32, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3174 __ai uint64x1_t vreinterpret_u64_s8(int8x8_t __a) {
   3175   return (uint64x1_t)__a; }
   3176 __ai uint64x1_t vreinterpret_u64_s16(int16x4_t __a) {
   3177   return (uint64x1_t)__a; }
   3178 __ai uint64x1_t vreinterpret_u64_s32(int32x2_t __a) {
   3179   return (uint64x1_t)__a; }
   3180 __ai uint64x1_t vreinterpret_u64_s64(int64x1_t __a) {
   3181   return (uint64x1_t)__a; }
   3182 __ai uint64x1_t vreinterpret_u64_u8(uint8x8_t __a) {
   3183   return (uint64x1_t)__a; }
   3184 __ai uint64x1_t vreinterpret_u64_u16(uint16x4_t __a) {
   3185   return (uint64x1_t)__a; }
   3186 __ai uint64x1_t vreinterpret_u64_u32(uint32x2_t __a) {
   3187   return (uint64x1_t)__a; }
   3188 __ai uint64x1_t vreinterpret_u64_f16(float16x4_t __a) {
   3189   return (uint64x1_t)__a; }
   3190 __ai uint64x1_t vreinterpret_u64_f32(float32x2_t __a) {
   3191   return (uint64x1_t)__a; }
   3192 __ai uint64x1_t vreinterpret_u64_p8(poly8x8_t __a) {
   3193   return (uint64x1_t)__a; }
   3194 __ai uint64x1_t vreinterpret_u64_p16(poly16x4_t __a) {
   3195   return (uint64x1_t)__a; }
   /* vreinterpret_f16_*: each function returns its argument cast to
    * float16x4_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, s64, u8, u16, u32, u64, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3196 __ai float16x4_t vreinterpret_f16_s8(int8x8_t __a) {
   3197   return (float16x4_t)__a; }
   3198 __ai float16x4_t vreinterpret_f16_s16(int16x4_t __a) {
   3199   return (float16x4_t)__a; }
   3200 __ai float16x4_t vreinterpret_f16_s32(int32x2_t __a) {
   3201   return (float16x4_t)__a; }
   3202 __ai float16x4_t vreinterpret_f16_s64(int64x1_t __a) {
   3203   return (float16x4_t)__a; }
   3204 __ai float16x4_t vreinterpret_f16_u8(uint8x8_t __a) {
   3205   return (float16x4_t)__a; }
   3206 __ai float16x4_t vreinterpret_f16_u16(uint16x4_t __a) {
   3207   return (float16x4_t)__a; }
   3208 __ai float16x4_t vreinterpret_f16_u32(uint32x2_t __a) {
   3209   return (float16x4_t)__a; }
   3210 __ai float16x4_t vreinterpret_f16_u64(uint64x1_t __a) {
   3211   return (float16x4_t)__a; }
   3212 __ai float16x4_t vreinterpret_f16_f32(float32x2_t __a) {
   3213   return (float16x4_t)__a; }
   3214 __ai float16x4_t vreinterpret_f16_p8(poly8x8_t __a) {
   3215   return (float16x4_t)__a; }
   3216 __ai float16x4_t vreinterpret_f16_p16(poly16x4_t __a) {
   3217   return (float16x4_t)__a; }
   /* vreinterpret_f32_*: each function returns its argument cast to
    * float32x2_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, s64, u8, u16, u32, u64, f16, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * performing an integer-to-float value conversion -- confirm. */
   3218 __ai float32x2_t vreinterpret_f32_s8(int8x8_t __a) {
   3219   return (float32x2_t)__a; }
   3220 __ai float32x2_t vreinterpret_f32_s16(int16x4_t __a) {
   3221   return (float32x2_t)__a; }
   3222 __ai float32x2_t vreinterpret_f32_s32(int32x2_t __a) {
   3223   return (float32x2_t)__a; }
   3224 __ai float32x2_t vreinterpret_f32_s64(int64x1_t __a) {
   3225   return (float32x2_t)__a; }
   3226 __ai float32x2_t vreinterpret_f32_u8(uint8x8_t __a) {
   3227   return (float32x2_t)__a; }
   3228 __ai float32x2_t vreinterpret_f32_u16(uint16x4_t __a) {
   3229   return (float32x2_t)__a; }
   3230 __ai float32x2_t vreinterpret_f32_u32(uint32x2_t __a) {
   3231   return (float32x2_t)__a; }
   3232 __ai float32x2_t vreinterpret_f32_u64(uint64x1_t __a) {
   3233   return (float32x2_t)__a; }
   3234 __ai float32x2_t vreinterpret_f32_f16(float16x4_t __a) {
   3235   return (float32x2_t)__a; }
   3236 __ai float32x2_t vreinterpret_f32_p8(poly8x8_t __a) {
   3237   return (float32x2_t)__a; }
   3238 __ai float32x2_t vreinterpret_f32_p16(poly16x4_t __a) {
   3239   return (float32x2_t)__a; }
   /* vreinterpret_p8_*: each function returns its argument cast to poly8x8_t;
    * no builtin is called.  There is one function per source vector type
    * (s8, s16, s32, s64, u8, u16, u32, u64, f16, f32, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3240 __ai poly8x8_t vreinterpret_p8_s8(int8x8_t __a) {
   3241   return (poly8x8_t)__a; }
   3242 __ai poly8x8_t vreinterpret_p8_s16(int16x4_t __a) {
   3243   return (poly8x8_t)__a; }
   3244 __ai poly8x8_t vreinterpret_p8_s32(int32x2_t __a) {
   3245   return (poly8x8_t)__a; }
   3246 __ai poly8x8_t vreinterpret_p8_s64(int64x1_t __a) {
   3247   return (poly8x8_t)__a; }
   3248 __ai poly8x8_t vreinterpret_p8_u8(uint8x8_t __a) {
   3249   return (poly8x8_t)__a; }
   3250 __ai poly8x8_t vreinterpret_p8_u16(uint16x4_t __a) {
   3251   return (poly8x8_t)__a; }
   3252 __ai poly8x8_t vreinterpret_p8_u32(uint32x2_t __a) {
   3253   return (poly8x8_t)__a; }
   3254 __ai poly8x8_t vreinterpret_p8_u64(uint64x1_t __a) {
   3255   return (poly8x8_t)__a; }
   3256 __ai poly8x8_t vreinterpret_p8_f16(float16x4_t __a) {
   3257   return (poly8x8_t)__a; }
   3258 __ai poly8x8_t vreinterpret_p8_f32(float32x2_t __a) {
   3259   return (poly8x8_t)__a; }
   3260 __ai poly8x8_t vreinterpret_p8_p16(poly16x4_t __a) {
   3261   return (poly8x8_t)__a; }
   /* vreinterpret_p16_*: each function returns its argument cast to
    * poly16x4_t; no builtin is called.  There is one function per source
    * vector type (s8, s16, s32, s64, u8, u16, u32, u64, f16, f32, p8).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3262 __ai poly16x4_t vreinterpret_p16_s8(int8x8_t __a) {
   3263   return (poly16x4_t)__a; }
   3264 __ai poly16x4_t vreinterpret_p16_s16(int16x4_t __a) {
   3265   return (poly16x4_t)__a; }
   3266 __ai poly16x4_t vreinterpret_p16_s32(int32x2_t __a) {
   3267   return (poly16x4_t)__a; }
   3268 __ai poly16x4_t vreinterpret_p16_s64(int64x1_t __a) {
   3269   return (poly16x4_t)__a; }
   3270 __ai poly16x4_t vreinterpret_p16_u8(uint8x8_t __a) {
   3271   return (poly16x4_t)__a; }
   3272 __ai poly16x4_t vreinterpret_p16_u16(uint16x4_t __a) {
   3273   return (poly16x4_t)__a; }
   3274 __ai poly16x4_t vreinterpret_p16_u32(uint32x2_t __a) {
   3275   return (poly16x4_t)__a; }
   3276 __ai poly16x4_t vreinterpret_p16_u64(uint64x1_t __a) {
   3277   return (poly16x4_t)__a; }
   3278 __ai poly16x4_t vreinterpret_p16_f16(float16x4_t __a) {
   3279   return (poly16x4_t)__a; }
   3280 __ai poly16x4_t vreinterpret_p16_f32(float32x2_t __a) {
   3281   return (poly16x4_t)__a; }
   3282 __ai poly16x4_t vreinterpret_p16_p8(poly8x8_t __a) {
   3283   return (poly16x4_t)__a; }
   /* vreinterpretq_s8_*: each function returns its argument cast to
    * int8x16_t; no builtin is called.  There is one function per q-form
    * source vector type (s16, s32, s64, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3284 __ai int8x16_t vreinterpretq_s8_s16(int16x8_t __a) {
   3285   return (int8x16_t)__a; }
   3286 __ai int8x16_t vreinterpretq_s8_s32(int32x4_t __a) {
   3287   return (int8x16_t)__a; }
   3288 __ai int8x16_t vreinterpretq_s8_s64(int64x2_t __a) {
   3289   return (int8x16_t)__a; }
   3290 __ai int8x16_t vreinterpretq_s8_u8(uint8x16_t __a) {
   3291   return (int8x16_t)__a; }
   3292 __ai int8x16_t vreinterpretq_s8_u16(uint16x8_t __a) {
   3293   return (int8x16_t)__a; }
   3294 __ai int8x16_t vreinterpretq_s8_u32(uint32x4_t __a) {
   3295   return (int8x16_t)__a; }
   3296 __ai int8x16_t vreinterpretq_s8_u64(uint64x2_t __a) {
   3297   return (int8x16_t)__a; }
   3298 __ai int8x16_t vreinterpretq_s8_f16(float16x8_t __a) {
   3299   return (int8x16_t)__a; }
   3300 __ai int8x16_t vreinterpretq_s8_f32(float32x4_t __a) {
   3301   return (int8x16_t)__a; }
   3302 __ai int8x16_t vreinterpretq_s8_p8(poly8x16_t __a) {
   3303   return (int8x16_t)__a; }
   3304 __ai int8x16_t vreinterpretq_s8_p16(poly16x8_t __a) {
   3305   return (int8x16_t)__a; }
   /* vreinterpretq_s16_*: each function returns its argument cast to
    * int16x8_t; no builtin is called.  There is one function per q-form
    * source vector type (s8, s32, s64, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3306 __ai int16x8_t vreinterpretq_s16_s8(int8x16_t __a) {
   3307   return (int16x8_t)__a; }
   3308 __ai int16x8_t vreinterpretq_s16_s32(int32x4_t __a) {
   3309   return (int16x8_t)__a; }
   3310 __ai int16x8_t vreinterpretq_s16_s64(int64x2_t __a) {
   3311   return (int16x8_t)__a; }
   3312 __ai int16x8_t vreinterpretq_s16_u8(uint8x16_t __a) {
   3313   return (int16x8_t)__a; }
   3314 __ai int16x8_t vreinterpretq_s16_u16(uint16x8_t __a) {
   3315   return (int16x8_t)__a; }
   3316 __ai int16x8_t vreinterpretq_s16_u32(uint32x4_t __a) {
   3317   return (int16x8_t)__a; }
   3318 __ai int16x8_t vreinterpretq_s16_u64(uint64x2_t __a) {
   3319   return (int16x8_t)__a; }
   3320 __ai int16x8_t vreinterpretq_s16_f16(float16x8_t __a) {
   3321   return (int16x8_t)__a; }
   3322 __ai int16x8_t vreinterpretq_s16_f32(float32x4_t __a) {
   3323   return (int16x8_t)__a; }
   3324 __ai int16x8_t vreinterpretq_s16_p8(poly8x16_t __a) {
   3325   return (int16x8_t)__a; }
   3326 __ai int16x8_t vreinterpretq_s16_p16(poly16x8_t __a) {
   3327   return (int16x8_t)__a; }
   /* vreinterpretq_s32_*: each function returns its argument cast to
    * int32x4_t; no builtin is called.  There is one function per q-form
    * source vector type (s8, s16, s64, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3328 __ai int32x4_t vreinterpretq_s32_s8(int8x16_t __a) {
   3329   return (int32x4_t)__a; }
   3330 __ai int32x4_t vreinterpretq_s32_s16(int16x8_t __a) {
   3331   return (int32x4_t)__a; }
   3332 __ai int32x4_t vreinterpretq_s32_s64(int64x2_t __a) {
   3333   return (int32x4_t)__a; }
   3334 __ai int32x4_t vreinterpretq_s32_u8(uint8x16_t __a) {
   3335   return (int32x4_t)__a; }
   3336 __ai int32x4_t vreinterpretq_s32_u16(uint16x8_t __a) {
   3337   return (int32x4_t)__a; }
   3338 __ai int32x4_t vreinterpretq_s32_u32(uint32x4_t __a) {
   3339   return (int32x4_t)__a; }
   3340 __ai int32x4_t vreinterpretq_s32_u64(uint64x2_t __a) {
   3341   return (int32x4_t)__a; }
   3342 __ai int32x4_t vreinterpretq_s32_f16(float16x8_t __a) {
   3343   return (int32x4_t)__a; }
   3344 __ai int32x4_t vreinterpretq_s32_f32(float32x4_t __a) {
   3345   return (int32x4_t)__a; }
   3346 __ai int32x4_t vreinterpretq_s32_p8(poly8x16_t __a) {
   3347   return (int32x4_t)__a; }
   3348 __ai int32x4_t vreinterpretq_s32_p16(poly16x8_t __a) {
   3349   return (int32x4_t)__a; }
   /* vreinterpretq_s64_*: each function returns its argument cast to
    * int64x2_t; no builtin is called.  There is one function per q-form
    * source vector type (s8, s16, s32, u8, u16, u32, u64, f16, f32, p8, p16).
    * NOTE(review): the cast presumably reinterprets the bits rather than
    * converting element values -- confirm. */
   3350 __ai int64x2_t vreinterpretq_s64_s8(int8x16_t __a) {
   3351   return (int64x2_t)__a; }
   3352 __ai int64x2_t vreinterpretq_s64_s16(int16x8_t __a) {
   3353   return (int64x2_t)__a; }
   3354 __ai int64x2_t vreinterpretq_s64_s32(int32x4_t __a) {
   3355   return (int64x2_t)__a; }
   3356 __ai int64x2_t vreinterpretq_s64_u8(uint8x16_t __a) {
   3357   return (int64x2_t)__a; }
   3358 __ai int64x2_t vreinterpretq_s64_u16(uint16x8_t __a) {
   3359   return (int64x2_t)__a; }
   3360 __ai int64x2_t vreinterpretq_s64_u32(uint32x4_t __a) {
   3361   return (int64x2_t)__a; }
   3362 __ai int64x2_t vreinterpretq_s64_u64(uint64x2_t __a) {
   3363   return (int64x2_t)__a; }
   3364 __ai int64x2_t vreinterpretq_s64_f16(float16x8_t __a) {
   3365   return (int64x2_t)__a; }
   3366 __ai int64x2_t vreinterpretq_s64_f32(float32x4_t __a) {
   3367   return (int64x2_t)__a; }
   3368 __ai int64x2_t vreinterpretq_s64_p8(poly8x16_t __a) {
   3369   return (int64x2_t)__a; }
   3370 __ai int64x2_t vreinterpretq_s64_p16(poly16x8_t __a) {
   3371   return (int64x2_t)__a; }
   3372 __ai uint8x16_t vreinterpretq_u8_s8(int8x16_t __a) {
   3373   return (uint8x16_t)__a; }
   3374 __ai uint8x16_t vreinterpretq_u8_s16(int16x8_t __a) {
   3375   return (uint8x16_t)__a; }
   3376 __ai uint8x16_t vreinterpretq_u8_s32(int32x4_t __a) {
   3377   return (uint8x16_t)__a; }
   3378 __ai uint8x16_t vreinterpretq_u8_s64(int64x2_t __a) {
   3379   return (uint8x16_t)__a; }
   3380 __ai uint8x16_t vreinterpretq_u8_u16(uint16x8_t __a) {
   3381   return (uint8x16_t)__a; }
   3382 __ai uint8x16_t vreinterpretq_u8_u32(uint32x4_t __a) {
   3383   return (uint8x16_t)__a; }
   3384 __ai uint8x16_t vreinterpretq_u8_u64(uint64x2_t __a) {
   3385   return (uint8x16_t)__a; }
   3386 __ai uint8x16_t vreinterpretq_u8_f16(float16x8_t __a) {
   3387   return (uint8x16_t)__a; }
   3388 __ai uint8x16_t vreinterpretq_u8_f32(float32x4_t __a) {
   3389   return (uint8x16_t)__a; }
   3390 __ai uint8x16_t vreinterpretq_u8_p8(poly8x16_t __a) {
   3391   return (uint8x16_t)__a; }
   3392 __ai uint8x16_t vreinterpretq_u8_p16(poly16x8_t __a) {
   3393   return (uint8x16_t)__a; }
   3394 __ai uint16x8_t vreinterpretq_u16_s8(int8x16_t __a) {
   3395   return (uint16x8_t)__a; }
   3396 __ai uint16x8_t vreinterpretq_u16_s16(int16x8_t __a) {
   3397   return (uint16x8_t)__a; }
   3398 __ai uint16x8_t vreinterpretq_u16_s32(int32x4_t __a) {
   3399   return (uint16x8_t)__a; }
   3400 __ai uint16x8_t vreinterpretq_u16_s64(int64x2_t __a) {
   3401   return (uint16x8_t)__a; }
   3402 __ai uint16x8_t vreinterpretq_u16_u8(uint8x16_t __a) {
   3403   return (uint16x8_t)__a; }
   3404 __ai uint16x8_t vreinterpretq_u16_u32(uint32x4_t __a) {
   3405   return (uint16x8_t)__a; }
   3406 __ai uint16x8_t vreinterpretq_u16_u64(uint64x2_t __a) {
   3407   return (uint16x8_t)__a; }
   3408 __ai uint16x8_t vreinterpretq_u16_f16(float16x8_t __a) {
   3409   return (uint16x8_t)__a; }
   3410 __ai uint16x8_t vreinterpretq_u16_f32(float32x4_t __a) {
   3411   return (uint16x8_t)__a; }
   3412 __ai uint16x8_t vreinterpretq_u16_p8(poly8x16_t __a) {
   3413   return (uint16x8_t)__a; }
   3414 __ai uint16x8_t vreinterpretq_u16_p16(poly16x8_t __a) {
   3415   return (uint16x8_t)__a; }
   3416 __ai uint32x4_t vreinterpretq_u32_s8(int8x16_t __a) {
   3417   return (uint32x4_t)__a; }
   3418 __ai uint32x4_t vreinterpretq_u32_s16(int16x8_t __a) {
   3419   return (uint32x4_t)__a; }
   3420 __ai uint32x4_t vreinterpretq_u32_s32(int32x4_t __a) {
   3421   return (uint32x4_t)__a; }
   3422 __ai uint32x4_t vreinterpretq_u32_s64(int64x2_t __a) {
   3423   return (uint32x4_t)__a; }
   3424 __ai uint32x4_t vreinterpretq_u32_u8(uint8x16_t __a) {
   3425   return (uint32x4_t)__a; }
   3426 __ai uint32x4_t vreinterpretq_u32_u16(uint16x8_t __a) {
   3427   return (uint32x4_t)__a; }
   3428 __ai uint32x4_t vreinterpretq_u32_u64(uint64x2_t __a) {
   3429   return (uint32x4_t)__a; }
   3430 __ai uint32x4_t vreinterpretq_u32_f16(float16x8_t __a) {
   3431   return (uint32x4_t)__a; }
   3432 __ai uint32x4_t vreinterpretq_u32_f32(float32x4_t __a) {
   3433   return (uint32x4_t)__a; }
   3434 __ai uint32x4_t vreinterpretq_u32_p8(poly8x16_t __a) {
   3435   return (uint32x4_t)__a; }
   3436 __ai uint32x4_t vreinterpretq_u32_p16(poly16x8_t __a) {
   3437   return (uint32x4_t)__a; }
   3438 __ai uint64x2_t vreinterpretq_u64_s8(int8x16_t __a) {
   3439   return (uint64x2_t)__a; }
   3440 __ai uint64x2_t vreinterpretq_u64_s16(int16x8_t __a) {
   3441   return (uint64x2_t)__a; }
   3442 __ai uint64x2_t vreinterpretq_u64_s32(int32x4_t __a) {
   3443   return (uint64x2_t)__a; }
   3444 __ai uint64x2_t vreinterpretq_u64_s64(int64x2_t __a) {
   3445   return (uint64x2_t)__a; }
   3446 __ai uint64x2_t vreinterpretq_u64_u8(uint8x16_t __a) {
   3447   return (uint64x2_t)__a; }
   3448 __ai uint64x2_t vreinterpretq_u64_u16(uint16x8_t __a) {
   3449   return (uint64x2_t)__a; }
   3450 __ai uint64x2_t vreinterpretq_u64_u32(uint32x4_t __a) {
   3451   return (uint64x2_t)__a; }
   3452 __ai uint64x2_t vreinterpretq_u64_f16(float16x8_t __a) {
   3453   return (uint64x2_t)__a; }
   3454 __ai uint64x2_t vreinterpretq_u64_f32(float32x4_t __a) {
   3455   return (uint64x2_t)__a; }
   3456 __ai uint64x2_t vreinterpretq_u64_p8(poly8x16_t __a) {
   3457   return (uint64x2_t)__a; }
   3458 __ai uint64x2_t vreinterpretq_u64_p16(poly16x8_t __a) {
   3459   return (uint64x2_t)__a; }
   3460 __ai float16x8_t vreinterpretq_f16_s8(int8x16_t __a) {
   3461   return (float16x8_t)__a; }
   3462 __ai float16x8_t vreinterpretq_f16_s16(int16x8_t __a) {
   3463   return (float16x8_t)__a; }
   3464 __ai float16x8_t vreinterpretq_f16_s32(int32x4_t __a) {
   3465   return (float16x8_t)__a; }
   3466 __ai float16x8_t vreinterpretq_f16_s64(int64x2_t __a) {
   3467   return (float16x8_t)__a; }
   3468 __ai float16x8_t vreinterpretq_f16_u8(uint8x16_t __a) {
   3469   return (float16x8_t)__a; }
   3470 __ai float16x8_t vreinterpretq_f16_u16(uint16x8_t __a) {
   3471   return (float16x8_t)__a; }
   3472 __ai float16x8_t vreinterpretq_f16_u32(uint32x4_t __a) {
   3473   return (float16x8_t)__a; }
   3474 __ai float16x8_t vreinterpretq_f16_u64(uint64x2_t __a) {
   3475   return (float16x8_t)__a; }
   3476 __ai float16x8_t vreinterpretq_f16_f32(float32x4_t __a) {
   3477   return (float16x8_t)__a; }
   3478 __ai float16x8_t vreinterpretq_f16_p8(poly8x16_t __a) {
   3479   return (float16x8_t)__a; }
   3480 __ai float16x8_t vreinterpretq_f16_p16(poly16x8_t __a) {
   3481   return (float16x8_t)__a; }
   3482 __ai float32x4_t vreinterpretq_f32_s8(int8x16_t __a) {
   3483   return (float32x4_t)__a; }
   3484 __ai float32x4_t vreinterpretq_f32_s16(int16x8_t __a) {
   3485   return (float32x4_t)__a; }
   3486 __ai float32x4_t vreinterpretq_f32_s32(int32x4_t __a) {
   3487   return (float32x4_t)__a; }
   3488 __ai float32x4_t vreinterpretq_f32_s64(int64x2_t __a) {
   3489   return (float32x4_t)__a; }
   3490 __ai float32x4_t vreinterpretq_f32_u8(uint8x16_t __a) {
   3491   return (float32x4_t)__a; }
   3492 __ai float32x4_t vreinterpretq_f32_u16(uint16x8_t __a) {
   3493   return (float32x4_t)__a; }
   3494 __ai float32x4_t vreinterpretq_f32_u32(uint32x4_t __a) {
   3495   return (float32x4_t)__a; }
   3496 __ai float32x4_t vreinterpretq_f32_u64(uint64x2_t __a) {
   3497   return (float32x4_t)__a; }
   3498 __ai float32x4_t vreinterpretq_f32_f16(float16x8_t __a) {
   3499   return (float32x4_t)__a; }
   3500 __ai float32x4_t vreinterpretq_f32_p8(poly8x16_t __a) {
   3501   return (float32x4_t)__a; }
   3502 __ai float32x4_t vreinterpretq_f32_p16(poly16x8_t __a) {
   3503   return (float32x4_t)__a; }
   3504 __ai poly8x16_t vreinterpretq_p8_s8(int8x16_t __a) {
   3505   return (poly8x16_t)__a; }
   3506 __ai poly8x16_t vreinterpretq_p8_s16(int16x8_t __a) {
   3507   return (poly8x16_t)__a; }
   3508 __ai poly8x16_t vreinterpretq_p8_s32(int32x4_t __a) {
   3509   return (poly8x16_t)__a; }
   3510 __ai poly8x16_t vreinterpretq_p8_s64(int64x2_t __a) {
   3511   return (poly8x16_t)__a; }
   3512 __ai poly8x16_t vreinterpretq_p8_u8(uint8x16_t __a) {
   3513   return (poly8x16_t)__a; }
   3514 __ai poly8x16_t vreinterpretq_p8_u16(uint16x8_t __a) {
   3515   return (poly8x16_t)__a; }
   3516 __ai poly8x16_t vreinterpretq_p8_u32(uint32x4_t __a) {
   3517   return (poly8x16_t)__a; }
   3518 __ai poly8x16_t vreinterpretq_p8_u64(uint64x2_t __a) {
   3519   return (poly8x16_t)__a; }
   3520 __ai poly8x16_t vreinterpretq_p8_f16(float16x8_t __a) {
   3521   return (poly8x16_t)__a; }
   3522 __ai poly8x16_t vreinterpretq_p8_f32(float32x4_t __a) {
   3523   return (poly8x16_t)__a; }
   3524 __ai poly8x16_t vreinterpretq_p8_p16(poly16x8_t __a) {
   3525   return (poly8x16_t)__a; }
   3526 __ai poly16x8_t vreinterpretq_p16_s8(int8x16_t __a) {
   3527   return (poly16x8_t)__a; }
   3528 __ai poly16x8_t vreinterpretq_p16_s16(int16x8_t __a) {
   3529   return (poly16x8_t)__a; }
   3530 __ai poly16x8_t vreinterpretq_p16_s32(int32x4_t __a) {
   3531   return (poly16x8_t)__a; }
   3532 __ai poly16x8_t vreinterpretq_p16_s64(int64x2_t __a) {
   3533   return (poly16x8_t)__a; }
   3534 __ai poly16x8_t vreinterpretq_p16_u8(uint8x16_t __a) {
   3535   return (poly16x8_t)__a; }
   3536 __ai poly16x8_t vreinterpretq_p16_u16(uint16x8_t __a) {
   3537   return (poly16x8_t)__a; }
   3538 __ai poly16x8_t vreinterpretq_p16_u32(uint32x4_t __a) {
   3539   return (poly16x8_t)__a; }
   3540 __ai poly16x8_t vreinterpretq_p16_u64(uint64x2_t __a) {
   3541   return (poly16x8_t)__a; }
   3542 __ai poly16x8_t vreinterpretq_p16_f16(float16x8_t __a) {
   3543   return (poly16x8_t)__a; }
   3544 __ai poly16x8_t vreinterpretq_p16_f32(float32x4_t __a) {
   3545   return (poly16x8_t)__a; }
   3546 __ai poly16x8_t vreinterpretq_p16_p8(poly8x16_t __a) {
   3547   return (poly16x8_t)__a; }
   3548 
   3549 __ai int8x8_t vrev16_s8(int8x8_t __a) {
   3550   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6); }
   3551 __ai uint8x8_t vrev16_u8(uint8x8_t __a) {
   3552   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6); }
   3553 __ai poly8x8_t vrev16_p8(poly8x8_t __a) {
   3554   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6); }
   3555 __ai int8x16_t vrev16q_s8(int8x16_t __a) {
   3556   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); }
   3557 __ai uint8x16_t vrev16q_u8(uint8x16_t __a) {
   3558   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); }
   3559 __ai poly8x16_t vrev16q_p8(poly8x16_t __a) {
   3560   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); }
   3561 
   3562 __ai int8x8_t vrev32_s8(int8x8_t __a) {
   3563   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4); }
   3564 __ai int16x4_t vrev32_s16(int16x4_t __a) {
   3565   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2); }
   3566 __ai uint8x8_t vrev32_u8(uint8x8_t __a) {
   3567   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4); }
   3568 __ai uint16x4_t vrev32_u16(uint16x4_t __a) {
   3569   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2); }
   3570 __ai poly8x8_t vrev32_p8(poly8x8_t __a) {
   3571   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4); }
   3572 __ai poly16x4_t vrev32_p16(poly16x4_t __a) {
   3573   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2); }
   3574 __ai int8x16_t vrev32q_s8(int8x16_t __a) {
   3575   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); }
   3576 __ai int16x8_t vrev32q_s16(int16x8_t __a) {
   3577   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6); }
   3578 __ai uint8x16_t vrev32q_u8(uint8x16_t __a) {
   3579   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); }
   3580 __ai uint16x8_t vrev32q_u16(uint16x8_t __a) {
   3581   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6); }
   3582 __ai poly8x16_t vrev32q_p8(poly8x16_t __a) {
   3583   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12); }
   3584 __ai poly16x8_t vrev32q_p16(poly16x8_t __a) {
   3585   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2, 5, 4, 7, 6); }
   3586 
   3587 __ai int8x8_t vrev64_s8(int8x8_t __a) {
   3588   return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); }
   3589 __ai int16x4_t vrev64_s16(int16x4_t __a) {
   3590   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); }
   3591 __ai int32x2_t vrev64_s32(int32x2_t __a) {
   3592   return __builtin_shufflevector(__a, __a, 1, 0); }
   3593 __ai uint8x8_t vrev64_u8(uint8x8_t __a) {
   3594   return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); }
   3595 __ai uint16x4_t vrev64_u16(uint16x4_t __a) {
   3596   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); }
   3597 __ai uint32x2_t vrev64_u32(uint32x2_t __a) {
   3598   return __builtin_shufflevector(__a, __a, 1, 0); }
   3599 __ai poly8x8_t vrev64_p8(poly8x8_t __a) {
   3600   return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0); }
   3601 __ai poly16x4_t vrev64_p16(poly16x4_t __a) {
   3602   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0); }
   3603 __ai float32x2_t vrev64_f32(float32x2_t __a) {
   3604   return __builtin_shufflevector(__a, __a, 1, 0); }
   3605 __ai int8x16_t vrev64q_s8(int8x16_t __a) {
   3606   return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); }
   3607 __ai int16x8_t vrev64q_s16(int16x8_t __a) {
   3608   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4); }
   3609 __ai int32x4_t vrev64q_s32(int32x4_t __a) {
   3610   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2); }
   3611 __ai uint8x16_t vrev64q_u8(uint8x16_t __a) {
   3612   return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); }
   3613 __ai uint16x8_t vrev64q_u16(uint16x8_t __a) {
   3614   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4); }
   3615 __ai uint32x4_t vrev64q_u32(uint32x4_t __a) {
   3616   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2); }
   3617 __ai poly8x16_t vrev64q_p8(poly8x16_t __a) {
   3618   return __builtin_shufflevector(__a, __a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); }
   3619 __ai poly16x8_t vrev64q_p16(poly16x8_t __a) {
   3620   return __builtin_shufflevector(__a, __a, 3, 2, 1, 0, 7, 6, 5, 4); }
   3621 __ai float32x4_t vrev64q_f32(float32x4_t __a) {
   3622   return __builtin_shufflevector(__a, __a, 1, 0, 3, 2); }
   3623 
   3624 __ai int8x8_t vrhadd_s8(int8x8_t __a, int8x8_t __b) {
   3625   return (int8x8_t)__builtin_neon_vrhadd_v(__a, __b, 0); }
   3626 __ai int16x4_t vrhadd_s16(int16x4_t __a, int16x4_t __b) {
   3627   return (int16x4_t)__builtin_neon_vrhadd_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   3628 __ai int32x2_t vrhadd_s32(int32x2_t __a, int32x2_t __b) {
   3629   return (int32x2_t)__builtin_neon_vrhadd_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   3630 __ai uint8x8_t vrhadd_u8(uint8x8_t __a, uint8x8_t __b) {
   3631   return (uint8x8_t)__builtin_neon_vrhadd_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   3632 __ai uint16x4_t vrhadd_u16(uint16x4_t __a, uint16x4_t __b) {
   3633   return (uint16x4_t)__builtin_neon_vrhadd_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   3634 __ai uint32x2_t vrhadd_u32(uint32x2_t __a, uint32x2_t __b) {
   3635   return (uint32x2_t)__builtin_neon_vrhadd_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   3636 __ai int8x16_t vrhaddq_s8(int8x16_t __a, int8x16_t __b) {
   3637   return (int8x16_t)__builtin_neon_vrhaddq_v(__a, __b, 32); }
   3638 __ai int16x8_t vrhaddq_s16(int16x8_t __a, int16x8_t __b) {
   3639   return (int16x8_t)__builtin_neon_vrhaddq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   3640 __ai int32x4_t vrhaddq_s32(int32x4_t __a, int32x4_t __b) {
   3641   return (int32x4_t)__builtin_neon_vrhaddq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   3642 __ai uint8x16_t vrhaddq_u8(uint8x16_t __a, uint8x16_t __b) {
   3643   return (uint8x16_t)__builtin_neon_vrhaddq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   3644 __ai uint16x8_t vrhaddq_u16(uint16x8_t __a, uint16x8_t __b) {
   3645   return (uint16x8_t)__builtin_neon_vrhaddq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   3646 __ai uint32x4_t vrhaddq_u32(uint32x4_t __a, uint32x4_t __b) {
   3647   return (uint32x4_t)__builtin_neon_vrhaddq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   3648 
   3649 __ai int8x8_t vrshl_s8(int8x8_t __a, int8x8_t __b) {
   3650   return (int8x8_t)__builtin_neon_vrshl_v(__a, __b, 0); }
   3651 __ai int16x4_t vrshl_s16(int16x4_t __a, int16x4_t __b) {
   3652   return (int16x4_t)__builtin_neon_vrshl_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   3653 __ai int32x2_t vrshl_s32(int32x2_t __a, int32x2_t __b) {
   3654   return (int32x2_t)__builtin_neon_vrshl_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   3655 __ai int64x1_t vrshl_s64(int64x1_t __a, int64x1_t __b) {
   3656   return (int64x1_t)__builtin_neon_vrshl_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   3657 __ai uint8x8_t vrshl_u8(uint8x8_t __a, int8x8_t __b) {
   3658   return (uint8x8_t)__builtin_neon_vrshl_v((int8x8_t)__a, __b, 16); }
   3659 __ai uint16x4_t vrshl_u16(uint16x4_t __a, int16x4_t __b) {
   3660   return (uint16x4_t)__builtin_neon_vrshl_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   3661 __ai uint32x2_t vrshl_u32(uint32x2_t __a, int32x2_t __b) {
   3662   return (uint32x2_t)__builtin_neon_vrshl_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   3663 __ai uint64x1_t vrshl_u64(uint64x1_t __a, int64x1_t __b) {
   3664   return (uint64x1_t)__builtin_neon_vrshl_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   3665 __ai int8x16_t vrshlq_s8(int8x16_t __a, int8x16_t __b) {
   3666   return (int8x16_t)__builtin_neon_vrshlq_v(__a, __b, 32); }
   3667 __ai int16x8_t vrshlq_s16(int16x8_t __a, int16x8_t __b) {
   3668   return (int16x8_t)__builtin_neon_vrshlq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   3669 __ai int32x4_t vrshlq_s32(int32x4_t __a, int32x4_t __b) {
   3670   return (int32x4_t)__builtin_neon_vrshlq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   3671 __ai int64x2_t vrshlq_s64(int64x2_t __a, int64x2_t __b) {
   3672   return (int64x2_t)__builtin_neon_vrshlq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   3673 __ai uint8x16_t vrshlq_u8(uint8x16_t __a, int8x16_t __b) {
   3674   return (uint8x16_t)__builtin_neon_vrshlq_v((int8x16_t)__a, __b, 48); }
   3675 __ai uint16x8_t vrshlq_u16(uint16x8_t __a, int16x8_t __b) {
   3676   return (uint16x8_t)__builtin_neon_vrshlq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   3677 __ai uint32x4_t vrshlq_u32(uint32x4_t __a, int32x4_t __b) {
   3678   return (uint32x4_t)__builtin_neon_vrshlq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   3679 __ai uint64x2_t vrshlq_u64(uint64x2_t __a, int64x2_t __b) {
   3680   return (uint64x2_t)__builtin_neon_vrshlq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   3681 
   3682 #define vrshrn_n_s16(a, __b) __extension__ ({ \
   3683   int16x8_t __a = (a); \
   3684   (int8x8_t)__builtin_neon_vrshrn_n_v((int8x16_t)__a, __b, 0); })
   3685 #define vrshrn_n_s32(a, __b) __extension__ ({ \
   3686   int32x4_t __a = (a); \
   3687   (int16x4_t)__builtin_neon_vrshrn_n_v((int8x16_t)__a, __b, 1); })
   3688 #define vrshrn_n_s64(a, __b) __extension__ ({ \
   3689   int64x2_t __a = (a); \
   3690   (int32x2_t)__builtin_neon_vrshrn_n_v((int8x16_t)__a, __b, 2); })
   3691 #define vrshrn_n_u16(a, __b) __extension__ ({ \
   3692   uint16x8_t __a = (a); \
   3693   (uint8x8_t)__builtin_neon_vrshrn_n_v((int8x16_t)__a, __b, 16); })
   3694 #define vrshrn_n_u32(a, __b) __extension__ ({ \
   3695   uint32x4_t __a = (a); \
   3696   (uint16x4_t)__builtin_neon_vrshrn_n_v((int8x16_t)__a, __b, 17); })
   3697 #define vrshrn_n_u64(a, __b) __extension__ ({ \
   3698   uint64x2_t __a = (a); \
   3699   (uint32x2_t)__builtin_neon_vrshrn_n_v((int8x16_t)__a, __b, 18); })
   3700 
   3701 #define vrshr_n_s8(a, __b) __extension__ ({ \
   3702   int8x8_t __a = (a); \
   3703   (int8x8_t)__builtin_neon_vrshr_n_v(__a, __b, 0); })
   3704 #define vrshr_n_s16(a, __b) __extension__ ({ \
   3705   int16x4_t __a = (a); \
   3706   (int16x4_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 1); })
   3707 #define vrshr_n_s32(a, __b) __extension__ ({ \
   3708   int32x2_t __a = (a); \
   3709   (int32x2_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 2); })
   3710 #define vrshr_n_s64(a, __b) __extension__ ({ \
   3711   int64x1_t __a = (a); \
   3712   (int64x1_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 3); })
   3713 #define vrshr_n_u8(a, __b) __extension__ ({ \
   3714   uint8x8_t __a = (a); \
   3715   (uint8x8_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 16); })
   3716 #define vrshr_n_u16(a, __b) __extension__ ({ \
   3717   uint16x4_t __a = (a); \
   3718   (uint16x4_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 17); })
   3719 #define vrshr_n_u32(a, __b) __extension__ ({ \
   3720   uint32x2_t __a = (a); \
   3721   (uint32x2_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 18); })
   3722 #define vrshr_n_u64(a, __b) __extension__ ({ \
   3723   uint64x1_t __a = (a); \
   3724   (uint64x1_t)__builtin_neon_vrshr_n_v((int8x8_t)__a, __b, 19); })
   3725 #define vrshrq_n_s8(a, __b) __extension__ ({ \
   3726   int8x16_t __a = (a); \
   3727   (int8x16_t)__builtin_neon_vrshrq_n_v(__a, __b, 32); })
   3728 #define vrshrq_n_s16(a, __b) __extension__ ({ \
   3729   int16x8_t __a = (a); \
   3730   (int16x8_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 33); })
   3731 #define vrshrq_n_s32(a, __b) __extension__ ({ \
   3732   int32x4_t __a = (a); \
   3733   (int32x4_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 34); })
   3734 #define vrshrq_n_s64(a, __b) __extension__ ({ \
   3735   int64x2_t __a = (a); \
   3736   (int64x2_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 35); })
   3737 #define vrshrq_n_u8(a, __b) __extension__ ({ \
   3738   uint8x16_t __a = (a); \
   3739   (uint8x16_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 48); })
   3740 #define vrshrq_n_u16(a, __b) __extension__ ({ \
   3741   uint16x8_t __a = (a); \
   3742   (uint16x8_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 49); })
   3743 #define vrshrq_n_u32(a, __b) __extension__ ({ \
   3744   uint32x4_t __a = (a); \
   3745   (uint32x4_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 50); })
   3746 #define vrshrq_n_u64(a, __b) __extension__ ({ \
   3747   uint64x2_t __a = (a); \
   3748   (uint64x2_t)__builtin_neon_vrshrq_n_v((int8x16_t)__a, __b, 51); })
   3749 
   3750 __ai float32x2_t vrsqrte_f32(float32x2_t __a) {
   3751   return (float32x2_t)__builtin_neon_vrsqrte_v((int8x8_t)__a, 7); }
   3752 __ai uint32x2_t vrsqrte_u32(uint32x2_t __a) {
   3753   return (uint32x2_t)__builtin_neon_vrsqrte_v((int8x8_t)__a, 18); }
   3754 __ai float32x4_t vrsqrteq_f32(float32x4_t __a) {
   3755   return (float32x4_t)__builtin_neon_vrsqrteq_v((int8x16_t)__a, 39); }
   3756 __ai uint32x4_t vrsqrteq_u32(uint32x4_t __a) {
   3757   return (uint32x4_t)__builtin_neon_vrsqrteq_v((int8x16_t)__a, 50); }
   3758 
   3759 __ai float32x2_t vrsqrts_f32(float32x2_t __a, float32x2_t __b) {
   3760   return (float32x2_t)__builtin_neon_vrsqrts_v((int8x8_t)__a, (int8x8_t)__b, 7); }
   3761 __ai float32x4_t vrsqrtsq_f32(float32x4_t __a, float32x4_t __b) {
   3762   return (float32x4_t)__builtin_neon_vrsqrtsq_v((int8x16_t)__a, (int8x16_t)__b, 39); }
   3763 
   3764 #define vrsra_n_s8(a, b, __c) __extension__ ({ \
   3765   int8x8_t __a = (a); int8x8_t __b = (b); \
   3766   (int8x8_t)__builtin_neon_vrsra_n_v(__a, __b, __c, 0); })
   3767 #define vrsra_n_s16(a, b, __c) __extension__ ({ \
   3768   int16x4_t __a = (a); int16x4_t __b = (b); \
   3769   (int16x4_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 1); })
   3770 #define vrsra_n_s32(a, b, __c) __extension__ ({ \
   3771   int32x2_t __a = (a); int32x2_t __b = (b); \
   3772   (int32x2_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 2); })
   3773 #define vrsra_n_s64(a, b, __c) __extension__ ({ \
   3774   int64x1_t __a = (a); int64x1_t __b = (b); \
   3775   (int64x1_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 3); })
   3776 #define vrsra_n_u8(a, b, __c) __extension__ ({ \
   3777   uint8x8_t __a = (a); uint8x8_t __b = (b); \
   3778   (uint8x8_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 16); })
   3779 #define vrsra_n_u16(a, b, __c) __extension__ ({ \
   3780   uint16x4_t __a = (a); uint16x4_t __b = (b); \
   3781   (uint16x4_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 17); })
   3782 #define vrsra_n_u32(a, b, __c) __extension__ ({ \
   3783   uint32x2_t __a = (a); uint32x2_t __b = (b); \
   3784   (uint32x2_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 18); })
   3785 #define vrsra_n_u64(a, b, __c) __extension__ ({ \
   3786   uint64x1_t __a = (a); uint64x1_t __b = (b); \
   3787   (uint64x1_t)__builtin_neon_vrsra_n_v((int8x8_t)__a, (int8x8_t)__b, __c, 19); })
   3788 #define vrsraq_n_s8(a, b, __c) __extension__ ({ \
   3789   int8x16_t __a = (a); int8x16_t __b = (b); \
   3790   (int8x16_t)__builtin_neon_vrsraq_n_v(__a, __b, __c, 32); })
   3791 #define vrsraq_n_s16(a, b, __c) __extension__ ({ \
   3792   int16x8_t __a = (a); int16x8_t __b = (b); \
   3793   (int16x8_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 33); })
   3794 #define vrsraq_n_s32(a, b, __c) __extension__ ({ \
   3795   int32x4_t __a = (a); int32x4_t __b = (b); \
   3796   (int32x4_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 34); })
   3797 #define vrsraq_n_s64(a, b, __c) __extension__ ({ \
   3798   int64x2_t __a = (a); int64x2_t __b = (b); \
   3799   (int64x2_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 35); })
   3800 #define vrsraq_n_u8(a, b, __c) __extension__ ({ \
   3801   uint8x16_t __a = (a); uint8x16_t __b = (b); \
   3802   (uint8x16_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 48); })
   3803 #define vrsraq_n_u16(a, b, __c) __extension__ ({ \
   3804   uint16x8_t __a = (a); uint16x8_t __b = (b); \
   3805   (uint16x8_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 49); })
   3806 #define vrsraq_n_u32(a, b, __c) __extension__ ({ \
   3807   uint32x4_t __a = (a); uint32x4_t __b = (b); \
   3808   (uint32x4_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 50); })
   3809 #define vrsraq_n_u64(a, b, __c) __extension__ ({ \
   3810   uint64x2_t __a = (a); uint64x2_t __b = (b); \
   3811   (uint64x2_t)__builtin_neon_vrsraq_n_v((int8x16_t)__a, (int8x16_t)__b, __c, 51); })
   3812 
   3813 __ai int8x8_t vrsubhn_s16(int16x8_t __a, int16x8_t __b) {
   3814   return (int8x8_t)__builtin_neon_vrsubhn_v((int8x16_t)__a, (int8x16_t)__b, 0); }
   3815 __ai int16x4_t vrsubhn_s32(int32x4_t __a, int32x4_t __b) {
   3816   return (int16x4_t)__builtin_neon_vrsubhn_v((int8x16_t)__a, (int8x16_t)__b, 1); }
   3817 __ai int32x2_t vrsubhn_s64(int64x2_t __a, int64x2_t __b) {
   3818   return (int32x2_t)__builtin_neon_vrsubhn_v((int8x16_t)__a, (int8x16_t)__b, 2); }
   3819 __ai uint8x8_t vrsubhn_u16(uint16x8_t __a, uint16x8_t __b) {
   3820   return (uint8x8_t)__builtin_neon_vrsubhn_v((int8x16_t)__a, (int8x16_t)__b, 16); }
   3821 __ai uint16x4_t vrsubhn_u32(uint32x4_t __a, uint32x4_t __b) {
   3822   return (uint16x4_t)__builtin_neon_vrsubhn_v((int8x16_t)__a, (int8x16_t)__b, 17); }
   3823 __ai uint32x2_t vrsubhn_u64(uint64x2_t __a, uint64x2_t __b) {
   3824   return (uint32x2_t)__builtin_neon_vrsubhn_v((int8x16_t)__a, (int8x16_t)__b, 18); }
   3825 
   3826 #define vset_lane_u8(a, b, __c) __extension__ ({ \
   3827   uint8_t __a = (a); uint8x8_t __b = (b); \
   3828   (uint8x8_t)__builtin_neon_vset_lane_i8(__a, (int8x8_t)__b, __c); })
   3829 #define vset_lane_u16(a, b, __c) __extension__ ({ \
   3830   uint16_t __a = (a); uint16x4_t __b = (b); \
   3831   (uint16x4_t)__builtin_neon_vset_lane_i16(__a, (int16x4_t)__b, __c); })
   3832 #define vset_lane_u32(a, b, __c) __extension__ ({ \
   3833   uint32_t __a = (a); uint32x2_t __b = (b); \
   3834   (uint32x2_t)__builtin_neon_vset_lane_i32(__a, (int32x2_t)__b, __c); })
   3835 #define vset_lane_s8(a, b, __c) __extension__ ({ \
   3836   int8_t __a = (a); int8x8_t __b = (b); \
   3837   (int8x8_t)__builtin_neon_vset_lane_i8(__a, __b, __c); })
   3838 #define vset_lane_s16(a, b, __c) __extension__ ({ \
   3839   int16_t __a = (a); int16x4_t __b = (b); \
   3840   (int16x4_t)__builtin_neon_vset_lane_i16(__a, __b, __c); })
   3841 #define vset_lane_s32(a, b, __c) __extension__ ({ \
   3842   int32_t __a = (a); int32x2_t __b = (b); \
   3843   (int32x2_t)__builtin_neon_vset_lane_i32(__a, __b, __c); })
   3844 #define vset_lane_p8(a, b, __c) __extension__ ({ \
   3845   poly8_t __a = (a); poly8x8_t __b = (b); \
   3846   (poly8x8_t)__builtin_neon_vset_lane_i8(__a, (int8x8_t)__b, __c); })
   3847 #define vset_lane_p16(a, b, __c) __extension__ ({ \
   3848   poly16_t __a = (a); poly16x4_t __b = (b); \
   3849   (poly16x4_t)__builtin_neon_vset_lane_i16(__a, (int16x4_t)__b, __c); })
   3850 #define vset_lane_f32(a, b, __c) __extension__ ({ \
   3851   float32_t __a = (a); float32x2_t __b = (b); \
   3852   (float32x2_t)__builtin_neon_vset_lane_f32(__a, __b, __c); })
   3853 #define vsetq_lane_u8(a, b, __c) __extension__ ({ \
   3854   uint8_t __a = (a); uint8x16_t __b = (b); \
   3855   (uint8x16_t)__builtin_neon_vsetq_lane_i8(__a, (int8x16_t)__b, __c); })
   3856 #define vsetq_lane_u16(a, b, __c) __extension__ ({ \
   3857   uint16_t __a = (a); uint16x8_t __b = (b); \
   3858   (uint16x8_t)__builtin_neon_vsetq_lane_i16(__a, (int16x8_t)__b, __c); })
   3859 #define vsetq_lane_u32(a, b, __c) __extension__ ({ \
   3860   uint32_t __a = (a); uint32x4_t __b = (b); \
   3861   (uint32x4_t)__builtin_neon_vsetq_lane_i32(__a, (int32x4_t)__b, __c); })
   3862 #define vsetq_lane_s8(a, b, __c) __extension__ ({ \
   3863   int8_t __a = (a); int8x16_t __b = (b); \
   3864   (int8x16_t)__builtin_neon_vsetq_lane_i8(__a, __b, __c); })
   3865 #define vsetq_lane_s16(a, b, __c) __extension__ ({ \
   3866   int16_t __a = (a); int16x8_t __b = (b); \
   3867   (int16x8_t)__builtin_neon_vsetq_lane_i16(__a, __b, __c); })
   3868 #define vsetq_lane_s32(a, b, __c) __extension__ ({ \
   3869   int32_t __a = (a); int32x4_t __b = (b); \
   3870   (int32x4_t)__builtin_neon_vsetq_lane_i32(__a, __b, __c); })
   3871 #define vsetq_lane_p8(a, b, __c) __extension__ ({ \
   3872   poly8_t __a = (a); poly8x16_t __b = (b); \
   3873   (poly8x16_t)__builtin_neon_vsetq_lane_i8(__a, (int8x16_t)__b, __c); })
   3874 #define vsetq_lane_p16(a, b, __c) __extension__ ({ \
   3875   poly16_t __a = (a); poly16x8_t __b = (b); \
   3876   (poly16x8_t)__builtin_neon_vsetq_lane_i16(__a, (int16x8_t)__b, __c); })
   3877 #define vsetq_lane_f32(a, b, __c) __extension__ ({ \
   3878   float32_t __a = (a); float32x4_t __b = (b); \
   3879   (float32x4_t)__builtin_neon_vsetq_lane_f32(__a, __b, __c); })
   3880 #define vset_lane_s64(a, b, __c) __extension__ ({ \
   3881   int64_t __a = (a); int64x1_t __b = (b); \
   3882   (int64x1_t)__builtin_neon_vset_lane_i64(__a, __b, __c); })
   3883 #define vset_lane_u64(a, b, __c) __extension__ ({ \
   3884   uint64_t __a = (a); uint64x1_t __b = (b); \
   3885   (uint64x1_t)__builtin_neon_vset_lane_i64(__a, (int64x1_t)__b, __c); })
   3886 #define vsetq_lane_s64(a, b, __c) __extension__ ({ \
   3887   int64_t __a = (a); int64x2_t __b = (b); \
   3888   (int64x2_t)__builtin_neon_vsetq_lane_i64(__a, __b, __c); })
   3889 #define vsetq_lane_u64(a, b, __c) __extension__ ({ \
   3890   uint64_t __a = (a); uint64x2_t __b = (b); \
   3891   (uint64x2_t)__builtin_neon_vsetq_lane_i64(__a, (int64x2_t)__b, __c); })
   3892 
   3893 __ai int8x8_t vshl_s8(int8x8_t __a, int8x8_t __b) {
   3894   return (int8x8_t)__builtin_neon_vshl_v(__a, __b, 0); }
   3895 __ai int16x4_t vshl_s16(int16x4_t __a, int16x4_t __b) {
   3896   return (int16x4_t)__builtin_neon_vshl_v((int8x8_t)__a, (int8x8_t)__b, 1); }
   3897 __ai int32x2_t vshl_s32(int32x2_t __a, int32x2_t __b) {
   3898   return (int32x2_t)__builtin_neon_vshl_v((int8x8_t)__a, (int8x8_t)__b, 2); }
   3899 __ai int64x1_t vshl_s64(int64x1_t __a, int64x1_t __b) {
   3900   return (int64x1_t)__builtin_neon_vshl_v((int8x8_t)__a, (int8x8_t)__b, 3); }
   3901 __ai uint8x8_t vshl_u8(uint8x8_t __a, int8x8_t __b) {
   3902   return (uint8x8_t)__builtin_neon_vshl_v((int8x8_t)__a, __b, 16); }
   3903 __ai uint16x4_t vshl_u16(uint16x4_t __a, int16x4_t __b) {
   3904   return (uint16x4_t)__builtin_neon_vshl_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   3905 __ai uint32x2_t vshl_u32(uint32x2_t __a, int32x2_t __b) {
   3906   return (uint32x2_t)__builtin_neon_vshl_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   3907 __ai uint64x1_t vshl_u64(uint64x1_t __a, int64x1_t __b) {
   3908   return (uint64x1_t)__builtin_neon_vshl_v((int8x8_t)__a, (int8x8_t)__b, 19); }
   3909 __ai int8x16_t vshlq_s8(int8x16_t __a, int8x16_t __b) {
   3910   return (int8x16_t)__builtin_neon_vshlq_v(__a, __b, 32); }
   3911 __ai int16x8_t vshlq_s16(int16x8_t __a, int16x8_t __b) {
   3912   return (int16x8_t)__builtin_neon_vshlq_v((int8x16_t)__a, (int8x16_t)__b, 33); }
   3913 __ai int32x4_t vshlq_s32(int32x4_t __a, int32x4_t __b) {
   3914   return (int32x4_t)__builtin_neon_vshlq_v((int8x16_t)__a, (int8x16_t)__b, 34); }
   3915 __ai int64x2_t vshlq_s64(int64x2_t __a, int64x2_t __b) {
   3916   return (int64x2_t)__builtin_neon_vshlq_v((int8x16_t)__a, (int8x16_t)__b, 35); }
   3917 __ai uint8x16_t vshlq_u8(uint8x16_t __a, int8x16_t __b) {
   3918   return (uint8x16_t)__builtin_neon_vshlq_v((int8x16_t)__a, __b, 48); }
   3919 __ai uint16x8_t vshlq_u16(uint16x8_t __a, int16x8_t __b) {
   3920   return (uint16x8_t)__builtin_neon_vshlq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   3921 __ai uint32x4_t vshlq_u32(uint32x4_t __a, int32x4_t __b) {
   3922   return (uint32x4_t)__builtin_neon_vshlq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   3923 __ai uint64x2_t vshlq_u64(uint64x2_t __a, int64x2_t __b) {
   3924   return (uint64x2_t)__builtin_neon_vshlq_v((int8x16_t)__a, (int8x16_t)__b, 51); }
   3925 
   /*
    * vshll_n_<type>(vector, imm): d-register input, q-register result whose
    * elements are twice as wide as the input's.
    *
    * The input is copied into a typed local, reinterpreted as int8x8_t (no
    * cast needed for s8) and passed to __builtin_neon_vshll_n_v with the
    * immediate forwarded textually.  The trailing type code is that of the
    * widened RESULT type (33/34/35 signed, 49/50/51 unsigned), not of the
    * input type.
    */
   #define vshll_n_s8(__p0, __p1) __extension__ ({ \
     int8x8_t __s0 = (__p0); \
     int16x8_t __ret = (int16x8_t)__builtin_neon_vshll_n_v(__s0, __p1, 33); \
     __ret; \
   })
   #define vshll_n_s16(__p0, __p1) __extension__ ({ \
     int16x4_t __s0 = (__p0); \
     int32x4_t __ret = (int32x4_t)__builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 34); \
     __ret; \
   })
   #define vshll_n_s32(__p0, __p1) __extension__ ({ \
     int32x2_t __s0 = (__p0); \
     int64x2_t __ret = (int64x2_t)__builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 35); \
     __ret; \
   })
   #define vshll_n_u8(__p0, __p1) __extension__ ({ \
     uint8x8_t __s0 = (__p0); \
     uint16x8_t __ret = (uint16x8_t)__builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 49); \
     __ret; \
   })
   #define vshll_n_u16(__p0, __p1) __extension__ ({ \
     uint16x4_t __s0 = (__p0); \
     uint32x4_t __ret = (uint32x4_t)__builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 50); \
     __ret; \
   })
   #define vshll_n_u32(__p0, __p1) __extension__ ({ \
     uint32x2_t __s0 = (__p0); \
     uint64x2_t __ret = (uint64x2_t)__builtin_neon_vshll_n_v((int8x8_t)__s0, __p1, 51); \
     __ret; \
   })
   3944 
   /*
    * vshl_n_<type>(vector, imm) for d-register vectors.
    *
    * The vector is copied into a typed local, reinterpreted as int8x8_t (the
    * s8 variant is passed as-is) and handed to __builtin_neon_vshl_n_v along
    * with the textually forwarded immediate and the per-type code: 0..3 for
    * s8/s16/s32/s64, 16..19 for u8/u16/u32/u64.  The result is cast back to
    * the input's vector type.
    */
   #define vshl_n_s8(__p0, __p1) __extension__ ({ \
     int8x8_t __s0 = (__p0); \
     int8x8_t __ret = (int8x8_t)__builtin_neon_vshl_n_v(__s0, __p1, 0); \
     __ret; \
   })
   #define vshl_n_s16(__p0, __p1) __extension__ ({ \
     int16x4_t __s0 = (__p0); \
     int16x4_t __ret = (int16x4_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 1); \
     __ret; \
   })
   #define vshl_n_s32(__p0, __p1) __extension__ ({ \
     int32x2_t __s0 = (__p0); \
     int32x2_t __ret = (int32x2_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 2); \
     __ret; \
   })
   #define vshl_n_s64(__p0, __p1) __extension__ ({ \
     int64x1_t __s0 = (__p0); \
     int64x1_t __ret = (int64x1_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 3); \
     __ret; \
   })
   #define vshl_n_u8(__p0, __p1) __extension__ ({ \
     uint8x8_t __s0 = (__p0); \
     uint8x8_t __ret = (uint8x8_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 16); \
     __ret; \
   })
   #define vshl_n_u16(__p0, __p1) __extension__ ({ \
     uint16x4_t __s0 = (__p0); \
     uint16x4_t __ret = (uint16x4_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 17); \
     __ret; \
   })
   #define vshl_n_u32(__p0, __p1) __extension__ ({ \
     uint32x2_t __s0 = (__p0); \
     uint32x2_t __ret = (uint32x2_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 18); \
     __ret; \
   })
   #define vshl_n_u64(__p0, __p1) __extension__ ({ \
     uint64x1_t __s0 = (__p0); \
     uint64x1_t __ret = (uint64x1_t)__builtin_neon_vshl_n_v((int8x8_t)__s0, __p1, 19); \
     __ret; \
   })
   /*
    * vshlq_n_<type>(vector, imm) for q-register vectors.
    *
    * Mirrors the d-register vshl_n_* macros: the operand is copied into a
    * typed local, reinterpreted as int8x16_t (s8 is passed as-is) and given to
    * __builtin_neon_vshlq_n_v with the textually forwarded immediate and the
    * per-type code: 32..35 for s8/s16/s32/s64, 48..51 for u8/u16/u32/u64.
    */
   #define vshlq_n_s8(__p0, __p1) __extension__ ({ \
     int8x16_t __s0 = (__p0); \
     int8x16_t __ret = (int8x16_t)__builtin_neon_vshlq_n_v(__s0, __p1, 32); \
     __ret; \
   })
   #define vshlq_n_s16(__p0, __p1) __extension__ ({ \
     int16x8_t __s0 = (__p0); \
     int16x8_t __ret = (int16x8_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 33); \
     __ret; \
   })
   #define vshlq_n_s32(__p0, __p1) __extension__ ({ \
     int32x4_t __s0 = (__p0); \
     int32x4_t __ret = (int32x4_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 34); \
     __ret; \
   })
   #define vshlq_n_s64(__p0, __p1) __extension__ ({ \
     int64x2_t __s0 = (__p0); \
     int64x2_t __ret = (int64x2_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 35); \
     __ret; \
   })
   #define vshlq_n_u8(__p0, __p1) __extension__ ({ \
     uint8x16_t __s0 = (__p0); \
     uint8x16_t __ret = (uint8x16_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 48); \
     __ret; \
   })
   #define vshlq_n_u16(__p0, __p1) __extension__ ({ \
     uint16x8_t __s0 = (__p0); \
     uint16x8_t __ret = (uint16x8_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 49); \
     __ret; \
   })
   #define vshlq_n_u32(__p0, __p1) __extension__ ({ \
     uint32x4_t __s0 = (__p0); \
     uint32x4_t __ret = (uint32x4_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 50); \
     __ret; \
   })
   #define vshlq_n_u64(__p0, __p1) __extension__ ({ \
     uint64x2_t __s0 = (__p0); \
     uint64x2_t __ret = (uint64x2_t)__builtin_neon_vshlq_n_v((int8x16_t)__s0, __p1, 51); \
     __ret; \
   })
   3993 
   /*
    * vshrn_n_<type>(vector, imm): q-register input, d-register result whose
    * elements are half as wide as the input's.
    *
    * The input is copied into a typed local, reinterpreted as int8x16_t and
    * passed to __builtin_neon_vshrn_n_v with the textually forwarded
    * immediate.  The trailing type code is that of the narrowed RESULT type
    * (0/1/2 signed, 16/17/18 unsigned), not of the input type.
    */
   #define vshrn_n_s16(__p0, __p1) __extension__ ({ \
     int16x8_t __s0 = (__p0); \
     int8x8_t __ret = (int8x8_t)__builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 0); \
     __ret; \
   })
   #define vshrn_n_s32(__p0, __p1) __extension__ ({ \
     int32x4_t __s0 = (__p0); \
     int16x4_t __ret = (int16x4_t)__builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 1); \
     __ret; \
   })
   #define vshrn_n_s64(__p0, __p1) __extension__ ({ \
     int64x2_t __s0 = (__p0); \
     int32x2_t __ret = (int32x2_t)__builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 2); \
     __ret; \
   })
   #define vshrn_n_u16(__p0, __p1) __extension__ ({ \
     uint16x8_t __s0 = (__p0); \
     uint8x8_t __ret = (uint8x8_t)__builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 16); \
     __ret; \
   })
   #define vshrn_n_u32(__p0, __p1) __extension__ ({ \
     uint32x4_t __s0 = (__p0); \
     uint16x4_t __ret = (uint16x4_t)__builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 17); \
     __ret; \
   })
   #define vshrn_n_u64(__p0, __p1) __extension__ ({ \
     uint64x2_t __s0 = (__p0); \
     uint32x2_t __ret = (uint32x2_t)__builtin_neon_vshrn_n_v((int8x16_t)__s0, __p1, 18); \
     __ret; \
   })
   4012 
   /*
    * vshr_n_<type>(vector, imm) for d-register vectors.
    *
    * The vector is copied into a typed local, reinterpreted as int8x8_t (the
    * s8 variant is passed as-is) and handed to __builtin_neon_vshr_n_v with
    * the textually forwarded immediate and the per-type code: 0..3 for
    * s8/s16/s32/s64, 16..19 for u8/u16/u32/u64.  The result is cast back to
    * the input's vector type.
    */
   #define vshr_n_s8(__p0, __p1) __extension__ ({ \
     int8x8_t __s0 = (__p0); \
     int8x8_t __ret = (int8x8_t)__builtin_neon_vshr_n_v(__s0, __p1, 0); \
     __ret; \
   })
   #define vshr_n_s16(__p0, __p1) __extension__ ({ \
     int16x4_t __s0 = (__p0); \
     int16x4_t __ret = (int16x4_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 1); \
     __ret; \
   })
   #define vshr_n_s32(__p0, __p1) __extension__ ({ \
     int32x2_t __s0 = (__p0); \
     int32x2_t __ret = (int32x2_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 2); \
     __ret; \
   })
   #define vshr_n_s64(__p0, __p1) __extension__ ({ \
     int64x1_t __s0 = (__p0); \
     int64x1_t __ret = (int64x1_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 3); \
     __ret; \
   })
   #define vshr_n_u8(__p0, __p1) __extension__ ({ \
     uint8x8_t __s0 = (__p0); \
     uint8x8_t __ret = (uint8x8_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 16); \
     __ret; \
   })
   #define vshr_n_u16(__p0, __p1) __extension__ ({ \
     uint16x4_t __s0 = (__p0); \
     uint16x4_t __ret = (uint16x4_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 17); \
     __ret; \
   })
   #define vshr_n_u32(__p0, __p1) __extension__ ({ \
     uint32x2_t __s0 = (__p0); \
     uint32x2_t __ret = (uint32x2_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 18); \
     __ret; \
   })
   #define vshr_n_u64(__p0, __p1) __extension__ ({ \
     uint64x1_t __s0 = (__p0); \
     uint64x1_t __ret = (uint64x1_t)__builtin_neon_vshr_n_v((int8x8_t)__s0, __p1, 19); \
     __ret; \
   })
   /*
    * vshrq_n_<type>(vector, imm) for q-register vectors.
    *
    * Mirrors the d-register vshr_n_* macros: the operand is copied into a
    * typed local, reinterpreted as int8x16_t (s8 is passed as-is) and given to
    * __builtin_neon_vshrq_n_v with the textually forwarded immediate and the
    * per-type code: 32..35 for s8/s16/s32/s64, 48..51 for u8/u16/u32/u64.
    */
   #define vshrq_n_s8(__p0, __p1) __extension__ ({ \
     int8x16_t __s0 = (__p0); \
     int8x16_t __ret = (int8x16_t)__builtin_neon_vshrq_n_v(__s0, __p1, 32); \
     __ret; \
   })
   #define vshrq_n_s16(__p0, __p1) __extension__ ({ \
     int16x8_t __s0 = (__p0); \
     int16x8_t __ret = (int16x8_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 33); \
     __ret; \
   })
   #define vshrq_n_s32(__p0, __p1) __extension__ ({ \
     int32x4_t __s0 = (__p0); \
     int32x4_t __ret = (int32x4_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 34); \
     __ret; \
   })
   #define vshrq_n_s64(__p0, __p1) __extension__ ({ \
     int64x2_t __s0 = (__p0); \
     int64x2_t __ret = (int64x2_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 35); \
     __ret; \
   })
   #define vshrq_n_u8(__p0, __p1) __extension__ ({ \
     uint8x16_t __s0 = (__p0); \
     uint8x16_t __ret = (uint8x16_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 48); \
     __ret; \
   })
   #define vshrq_n_u16(__p0, __p1) __extension__ ({ \
     uint16x8_t __s0 = (__p0); \
     uint16x8_t __ret = (uint16x8_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 49); \
     __ret; \
   })
   #define vshrq_n_u32(__p0, __p1) __extension__ ({ \
     uint32x4_t __s0 = (__p0); \
     uint32x4_t __ret = (uint32x4_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 50); \
     __ret; \
   })
   #define vshrq_n_u64(__p0, __p1) __extension__ ({ \
     uint64x2_t __s0 = (__p0); \
     uint64x2_t __ret = (uint64x2_t)__builtin_neon_vshrq_n_v((int8x16_t)__s0, __p1, 51); \
     __ret; \
   })
   4061 
   /*
    * vsli_n_<type>(vecA, vecB, imm) for d-register vectors.
    *
    * Both vector operands are copied into typed locals (first operand first),
    * reinterpreted as int8x8_t (s8 is passed as-is) and forwarded to
    * __builtin_neon_vsli_n_v together with the textually forwarded immediate
    * and the per-type code: 0..3 signed, 16..19 unsigned, 4 for p8 and 5 for
    * p16.  The result is cast back to the operands' vector type.
    */
   #define vsli_n_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x8_t __s0 = (__p0); \
     int8x8_t __s1 = (__p1); \
     int8x8_t __ret = (int8x8_t)__builtin_neon_vsli_n_v(__s0, __s1, __p2, 0); \
     __ret; \
   })
   #define vsli_n_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x4_t __s0 = (__p0); \
     int16x4_t __s1 = (__p1); \
     int16x4_t __ret = (int16x4_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
     __ret; \
   })
   #define vsli_n_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x2_t __s0 = (__p0); \
     int32x2_t __s1 = (__p1); \
     int32x2_t __ret = (int32x2_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
     __ret; \
   })
   #define vsli_n_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x1_t __s0 = (__p0); \
     int64x1_t __s1 = (__p1); \
     int64x1_t __ret = (int64x1_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
     __ret; \
   })
   #define vsli_n_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x8_t __s0 = (__p0); \
     uint8x8_t __s1 = (__p1); \
     uint8x8_t __ret = (uint8x8_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
     __ret; \
   })
   #define vsli_n_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x4_t __s0 = (__p0); \
     uint16x4_t __s1 = (__p1); \
     uint16x4_t __ret = (uint16x4_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
     __ret; \
   })
   #define vsli_n_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x2_t __s0 = (__p0); \
     uint32x2_t __s1 = (__p1); \
     uint32x2_t __ret = (uint32x2_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
     __ret; \
   })
   #define vsli_n_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x1_t __s0 = (__p0); \
     uint64x1_t __s1 = (__p1); \
     uint64x1_t __ret = (uint64x1_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
     __ret; \
   })
   #define vsli_n_p8(__p0, __p1, __p2) __extension__ ({ \
     poly8x8_t __s0 = (__p0); \
     poly8x8_t __s1 = (__p1); \
     poly8x8_t __ret = (poly8x8_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
     __ret; \
   })
   #define vsli_n_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x4_t __s0 = (__p0); \
     poly16x4_t __s1 = (__p1); \
     poly16x4_t __ret = (poly16x4_t)__builtin_neon_vsli_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
     __ret; \
   })
   /*
    * vsliq_n_<type>(vecA, vecB, imm) for q-register vectors.
    *
    * Same shape as the d-register vsli_n_* macros, routed through
    * __builtin_neon_vsliq_n_v with operands reinterpreted as int8x16_t (s8 is
    * passed as-is).  Type codes: 32..35 signed, 48..51 unsigned, 36 for p8 and
    * 37 for p16.  The immediate is forwarded textually.
    */
   #define vsliq_n_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x16_t __s0 = (__p0); \
     int8x16_t __s1 = (__p1); \
     int8x16_t __ret = (int8x16_t)__builtin_neon_vsliq_n_v(__s0, __s1, __p2, 32); \
     __ret; \
   })
   #define vsliq_n_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x8_t __s0 = (__p0); \
     int16x8_t __s1 = (__p1); \
     int16x8_t __ret = (int16x8_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
     __ret; \
   })
   #define vsliq_n_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x4_t __s0 = (__p0); \
     int32x4_t __s1 = (__p1); \
     int32x4_t __ret = (int32x4_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
     __ret; \
   })
   #define vsliq_n_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x2_t __s0 = (__p0); \
     int64x2_t __s1 = (__p1); \
     int64x2_t __ret = (int64x2_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
     __ret; \
   })
   #define vsliq_n_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x16_t __s0 = (__p0); \
     uint8x16_t __s1 = (__p1); \
     uint8x16_t __ret = (uint8x16_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
     __ret; \
   })
   #define vsliq_n_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x8_t __s0 = (__p0); \
     uint16x8_t __s1 = (__p1); \
     uint16x8_t __ret = (uint16x8_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
     __ret; \
   })
   #define vsliq_n_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x4_t __s0 = (__p0); \
     uint32x4_t __s1 = (__p1); \
     uint32x4_t __ret = (uint32x4_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
     __ret; \
   })
   #define vsliq_n_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x2_t __s0 = (__p0); \
     uint64x2_t __s1 = (__p1); \
     uint64x2_t __ret = (uint64x2_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
     __ret; \
   })
   #define vsliq_n_p8(__p0, __p1, __p2) __extension__ ({ \
     poly8x16_t __s0 = (__p0); \
     poly8x16_t __s1 = (__p1); \
     poly8x16_t __ret = (poly8x16_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
     __ret; \
   })
   #define vsliq_n_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x8_t __s0 = (__p0); \
     poly16x8_t __s1 = (__p1); \
     poly16x8_t __ret = (poly16x8_t)__builtin_neon_vsliq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
     __ret; \
   })
   4122 
   /*
    * vsra_n_<type>(vecA, vecB, imm) for d-register vectors.
    *
    * Both vector operands are copied into typed locals, reinterpreted as
    * int8x8_t (s8 is passed as-is) and forwarded to __builtin_neon_vsra_n_v
    * with the textually forwarded immediate and the per-type code: 0..3 for
    * s8/s16/s32/s64, 16..19 for u8/u16/u32/u64.  The result is cast back to
    * the operands' vector type.
    */
   #define vsra_n_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x8_t __s0 = (__p0); \
     int8x8_t __s1 = (__p1); \
     int8x8_t __ret = (int8x8_t)__builtin_neon_vsra_n_v(__s0, __s1, __p2, 0); \
     __ret; \
   })
   #define vsra_n_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x4_t __s0 = (__p0); \
     int16x4_t __s1 = (__p1); \
     int16x4_t __ret = (int16x4_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
     __ret; \
   })
   #define vsra_n_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x2_t __s0 = (__p0); \
     int32x2_t __s1 = (__p1); \
     int32x2_t __ret = (int32x2_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
     __ret; \
   })
   #define vsra_n_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x1_t __s0 = (__p0); \
     int64x1_t __s1 = (__p1); \
     int64x1_t __ret = (int64x1_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
     __ret; \
   })
   #define vsra_n_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x8_t __s0 = (__p0); \
     uint8x8_t __s1 = (__p1); \
     uint8x8_t __ret = (uint8x8_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
     __ret; \
   })
   #define vsra_n_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x4_t __s0 = (__p0); \
     uint16x4_t __s1 = (__p1); \
     uint16x4_t __ret = (uint16x4_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
     __ret; \
   })
   #define vsra_n_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x2_t __s0 = (__p0); \
     uint32x2_t __s1 = (__p1); \
     uint32x2_t __ret = (uint32x2_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
     __ret; \
   })
   #define vsra_n_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x1_t __s0 = (__p0); \
     uint64x1_t __s1 = (__p1); \
     uint64x1_t __ret = (uint64x1_t)__builtin_neon_vsra_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
     __ret; \
   })
   /*
    * vsraq_n_<type>(vecA, vecB, imm) for q-register vectors.
    *
    * Same shape as the d-register vsra_n_* macros, routed through
    * __builtin_neon_vsraq_n_v with operands reinterpreted as int8x16_t (s8 is
    * passed as-is).  Type codes: 32..35 for s8/s16/s32/s64, 48..51 for
    * u8/u16/u32/u64.  The immediate is forwarded textually.
    */
   #define vsraq_n_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x16_t __s0 = (__p0); \
     int8x16_t __s1 = (__p1); \
     int8x16_t __ret = (int8x16_t)__builtin_neon_vsraq_n_v(__s0, __s1, __p2, 32); \
     __ret; \
   })
   #define vsraq_n_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x8_t __s0 = (__p0); \
     int16x8_t __s1 = (__p1); \
     int16x8_t __ret = (int16x8_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
     __ret; \
   })
   #define vsraq_n_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x4_t __s0 = (__p0); \
     int32x4_t __s1 = (__p1); \
     int32x4_t __ret = (int32x4_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
     __ret; \
   })
   #define vsraq_n_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x2_t __s0 = (__p0); \
     int64x2_t __s1 = (__p1); \
     int64x2_t __ret = (int64x2_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
     __ret; \
   })
   #define vsraq_n_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x16_t __s0 = (__p0); \
     uint8x16_t __s1 = (__p1); \
     uint8x16_t __ret = (uint8x16_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
     __ret; \
   })
   #define vsraq_n_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x8_t __s0 = (__p0); \
     uint16x8_t __s1 = (__p1); \
     uint16x8_t __ret = (uint16x8_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
     __ret; \
   })
   #define vsraq_n_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x4_t __s0 = (__p0); \
     uint32x4_t __s1 = (__p1); \
     uint32x4_t __ret = (uint32x4_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
     __ret; \
   })
   #define vsraq_n_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x2_t __s0 = (__p0); \
     uint64x2_t __s1 = (__p1); \
     uint64x2_t __ret = (uint64x2_t)__builtin_neon_vsraq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
     __ret; \
   })
   4171 
   /*
    * vsri_n_<type>(vecA, vecB, imm) for d-register vectors.
    *
    * Both vector operands are copied into typed locals, reinterpreted as
    * int8x8_t (s8 is passed as-is) and forwarded to __builtin_neon_vsri_n_v
    * with the textually forwarded immediate and the per-type code: 0..3
    * signed, 16..19 unsigned, 4 for p8 and 5 for p16.  The result is cast back
    * to the operands' vector type.
    */
   #define vsri_n_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x8_t __s0 = (__p0); \
     int8x8_t __s1 = (__p1); \
     int8x8_t __ret = (int8x8_t)__builtin_neon_vsri_n_v(__s0, __s1, __p2, 0); \
     __ret; \
   })
   #define vsri_n_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x4_t __s0 = (__p0); \
     int16x4_t __s1 = (__p1); \
     int16x4_t __ret = (int16x4_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 1); \
     __ret; \
   })
   #define vsri_n_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x2_t __s0 = (__p0); \
     int32x2_t __s1 = (__p1); \
     int32x2_t __ret = (int32x2_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 2); \
     __ret; \
   })
   #define vsri_n_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x1_t __s0 = (__p0); \
     int64x1_t __s1 = (__p1); \
     int64x1_t __ret = (int64x1_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 3); \
     __ret; \
   })
   #define vsri_n_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x8_t __s0 = (__p0); \
     uint8x8_t __s1 = (__p1); \
     uint8x8_t __ret = (uint8x8_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 16); \
     __ret; \
   })
   #define vsri_n_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x4_t __s0 = (__p0); \
     uint16x4_t __s1 = (__p1); \
     uint16x4_t __ret = (uint16x4_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 17); \
     __ret; \
   })
   #define vsri_n_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x2_t __s0 = (__p0); \
     uint32x2_t __s1 = (__p1); \
     uint32x2_t __ret = (uint32x2_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 18); \
     __ret; \
   })
   #define vsri_n_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x1_t __s0 = (__p0); \
     uint64x1_t __s1 = (__p1); \
     uint64x1_t __ret = (uint64x1_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 19); \
     __ret; \
   })
   #define vsri_n_p8(__p0, __p1, __p2) __extension__ ({ \
     poly8x8_t __s0 = (__p0); \
     poly8x8_t __s1 = (__p1); \
     poly8x8_t __ret = (poly8x8_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 4); \
     __ret; \
   })
   #define vsri_n_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x4_t __s0 = (__p0); \
     poly16x4_t __s1 = (__p1); \
     poly16x4_t __ret = (poly16x4_t)__builtin_neon_vsri_n_v((int8x8_t)__s0, (int8x8_t)__s1, __p2, 5); \
     __ret; \
   })
   /*
    * vsriq_n_<type>(vecA, vecB, imm) for q-register vectors.
    *
    * Same shape as the d-register vsri_n_* macros, routed through
    * __builtin_neon_vsriq_n_v with operands reinterpreted as int8x16_t (s8 is
    * passed as-is).  Type codes: 32..35 signed, 48..51 unsigned, 36 for p8 and
    * 37 for p16.  The immediate is forwarded textually.
    */
   #define vsriq_n_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x16_t __s0 = (__p0); \
     int8x16_t __s1 = (__p1); \
     int8x16_t __ret = (int8x16_t)__builtin_neon_vsriq_n_v(__s0, __s1, __p2, 32); \
     __ret; \
   })
   #define vsriq_n_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x8_t __s0 = (__p0); \
     int16x8_t __s1 = (__p1); \
     int16x8_t __ret = (int16x8_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 33); \
     __ret; \
   })
   #define vsriq_n_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x4_t __s0 = (__p0); \
     int32x4_t __s1 = (__p1); \
     int32x4_t __ret = (int32x4_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 34); \
     __ret; \
   })
   #define vsriq_n_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x2_t __s0 = (__p0); \
     int64x2_t __s1 = (__p1); \
     int64x2_t __ret = (int64x2_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 35); \
     __ret; \
   })
   #define vsriq_n_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x16_t __s0 = (__p0); \
     uint8x16_t __s1 = (__p1); \
     uint8x16_t __ret = (uint8x16_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 48); \
     __ret; \
   })
   #define vsriq_n_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x8_t __s0 = (__p0); \
     uint16x8_t __s1 = (__p1); \
     uint16x8_t __ret = (uint16x8_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 49); \
     __ret; \
   })
   #define vsriq_n_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x4_t __s0 = (__p0); \
     uint32x4_t __s1 = (__p1); \
     uint32x4_t __ret = (uint32x4_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 50); \
     __ret; \
   })
   #define vsriq_n_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x2_t __s0 = (__p0); \
     uint64x2_t __s1 = (__p1); \
     uint64x2_t __ret = (uint64x2_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 51); \
     __ret; \
   })
   #define vsriq_n_p8(__p0, __p1, __p2) __extension__ ({ \
     poly8x16_t __s0 = (__p0); \
     poly8x16_t __s1 = (__p1); \
     poly8x16_t __ret = (poly8x16_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 36); \
     __ret; \
   })
   #define vsriq_n_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x8_t __s0 = (__p0); \
     poly16x8_t __s1 = (__p1); \
     poly16x8_t __ret = (poly16x8_t)__builtin_neon_vsriq_n_v((int8x16_t)__s0, (int8x16_t)__s1, __p2, 37); \
     __ret; \
   })
   4232 
   /*
    * vst1q_<type>(ptr, vector): store wrappers for q-register vectors.
    *
    * The vector is copied into a typed local and reinterpreted as int8x16_t
    * (s8 is passed as-is); the pointer argument is forwarded textually to
    * __builtin_neon_vst1q_v.  The trailing constant is the per-type code:
    * 48..51 for u8..u64, 32..35 for s8..s64, 38 for f16, 39 for f32, 36 for
    * p8 and 37 for p16.  The statement expression ends in the builtin call,
    * so the macros yield whatever the builtin returns (presumably void).
    */
   #define vst1q_u8(__p0, __p1) __extension__ ({ \
     uint8x16_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 48); \
   })
   #define vst1q_u16(__p0, __p1) __extension__ ({ \
     uint16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 49); \
   })
   #define vst1q_u32(__p0, __p1) __extension__ ({ \
     uint32x4_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 50); \
   })
   #define vst1q_u64(__p0, __p1) __extension__ ({ \
     uint64x2_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 51); \
   })
   #define vst1q_s8(__p0, __p1) __extension__ ({ \
     int8x16_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, __s1, 32); \
   })
   #define vst1q_s16(__p0, __p1) __extension__ ({ \
     int16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 33); \
   })
   #define vst1q_s32(__p0, __p1) __extension__ ({ \
     int32x4_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 34); \
   })
   #define vst1q_s64(__p0, __p1) __extension__ ({ \
     int64x2_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 35); \
   })
   #define vst1q_f16(__p0, __p1) __extension__ ({ \
     float16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 38); \
   })
   #define vst1q_f32(__p0, __p1) __extension__ ({ \
     float32x4_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 39); \
   })
   #define vst1q_p8(__p0, __p1) __extension__ ({ \
     poly8x16_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 36); \
   })
   #define vst1q_p16(__p0, __p1) __extension__ ({ \
     poly16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_v(__p0, (int8x16_t)__s1, 37); \
   })
   /*
    * vst1_<type>(ptr, vector): store wrappers for d-register vectors.
    *
    * The vector is copied into a typed local and reinterpreted as int8x8_t
    * (s8 is passed as-is); the pointer argument is forwarded textually to
    * __builtin_neon_vst1_v.  The trailing constant is the per-type code:
    * 16..19 for u8..u64, 0..3 for s8..s64, 6 for f16, 7 for f32, 4 for p8 and
    * 5 for p16.
    */
   #define vst1_u8(__p0, __p1) __extension__ ({ \
     uint8x8_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 16); \
   })
   #define vst1_u16(__p0, __p1) __extension__ ({ \
     uint16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 17); \
   })
   #define vst1_u32(__p0, __p1) __extension__ ({ \
     uint32x2_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 18); \
   })
   #define vst1_u64(__p0, __p1) __extension__ ({ \
     uint64x1_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 19); \
   })
   #define vst1_s8(__p0, __p1) __extension__ ({ \
     int8x8_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, __s1, 0); \
   })
   #define vst1_s16(__p0, __p1) __extension__ ({ \
     int16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 1); \
   })
   #define vst1_s32(__p0, __p1) __extension__ ({ \
     int32x2_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 2); \
   })
   #define vst1_s64(__p0, __p1) __extension__ ({ \
     int64x1_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 3); \
   })
   #define vst1_f16(__p0, __p1) __extension__ ({ \
     float16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 6); \
   })
   #define vst1_f32(__p0, __p1) __extension__ ({ \
     float32x2_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 7); \
   })
   #define vst1_p8(__p0, __p1) __extension__ ({ \
     poly8x8_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 4); \
   })
   #define vst1_p16(__p0, __p1) __extension__ ({ \
     poly16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_v(__p0, (int8x8_t)__s1, 5); \
   })
   4305 
   /*
    * vst1q_lane_<type>(ptr, vector, lane): single-lane store wrappers for
    * q-register vectors.
    *
    * The vector is copied into a typed local and reinterpreted as int8x16_t
    * (s8 is passed as-is); the pointer and the lane index are forwarded
    * textually to __builtin_neon_vst1q_lane_v.  Type codes: 48..51 for
    * u8..u64, 32..35 for s8..s64, 38 for f16, 39 for f32, 36 for p8 and 37
    * for p16.
    */
   #define vst1q_lane_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x16_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 48); \
   })
   #define vst1q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 49); \
   })
   #define vst1q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x4_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 50); \
   })
   #define vst1q_lane_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x2_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 51); \
   })
   #define vst1q_lane_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x16_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, __s1, __p2, 32); \
   })
   #define vst1q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 33); \
   })
   #define vst1q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x4_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 34); \
   })
   #define vst1q_lane_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x2_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 35); \
   })
   #define vst1q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
     float16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 38); \
   })
   #define vst1q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
     float32x4_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 39); \
   })
   #define vst1q_lane_p8(__p0, __p1, __p2) __extension__ ({ \
     poly8x16_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 36); \
   })
   #define vst1q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x8_t __s1 = (__p1); \
     __builtin_neon_vst1q_lane_v(__p0, (int8x16_t)__s1, __p2, 37); \
   })
   /*
    * vst1_lane_<type>(ptr, vector, lane): single-lane store wrappers for
    * d-register vectors.
    *
    * The vector is copied into a typed local and reinterpreted as int8x8_t
    * (s8 is passed as-is); the pointer and the lane index are forwarded
    * textually to __builtin_neon_vst1_lane_v.  Type codes: 16..19 for
    * u8..u64, 0..3 for s8..s64, 6 for f16, 7 for f32, 4 for p8 and 5 for p16.
    */
   #define vst1_lane_u8(__p0, __p1, __p2) __extension__ ({ \
     uint8x8_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 16); \
   })
   #define vst1_lane_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 17); \
   })
   #define vst1_lane_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x2_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 18); \
   })
   #define vst1_lane_u64(__p0, __p1, __p2) __extension__ ({ \
     uint64x1_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 19); \
   })
   #define vst1_lane_s8(__p0, __p1, __p2) __extension__ ({ \
     int8x8_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, __s1, __p2, 0); \
   })
   #define vst1_lane_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 1); \
   })
   #define vst1_lane_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x2_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 2); \
   })
   #define vst1_lane_s64(__p0, __p1, __p2) __extension__ ({ \
     int64x1_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 3); \
   })
   #define vst1_lane_f16(__p0, __p1, __p2) __extension__ ({ \
     float16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 6); \
   })
   #define vst1_lane_f32(__p0, __p1, __p2) __extension__ ({ \
     float32x2_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 7); \
   })
   #define vst1_lane_p8(__p0, __p1, __p2) __extension__ ({ \
     poly8x8_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 4); \
   })
   #define vst1_lane_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x4_t __s1 = (__p1); \
     __builtin_neon_vst1_lane_v(__p0, (int8x8_t)__s1, __p2, 5); \
   })
   4378 
   /*
    * vst2q_<type>(ptr, pair): two-vector store wrappers for q-register
    * vectors.
    *
    * The <type>x2_t aggregate is copied into a typed local; its two member
    * vectors val[0] and val[1] are then passed individually, each
    * reinterpreted as int8x16_t (s8 members are passed as-is), to
    * __builtin_neon_vst2q_v.  The pointer is forwarded textually.  Type
    * codes: 48..50 for u8/u16/u32, 32..34 for s8/s16/s32, 38 for f16, 39 for
    * f32, 36 for p8 and 37 for p16.  No 64-bit element variants are defined
    * in this group.
    */
   #define vst2q_u8(__p0, __p1) __extension__ ({ \
     uint8x16x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 48); \
   })
   #define vst2q_u16(__p0, __p1) __extension__ ({ \
     uint16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 49); \
   })
   #define vst2q_u32(__p0, __p1) __extension__ ({ \
     uint32x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 50); \
   })
   #define vst2q_s8(__p0, __p1) __extension__ ({ \
     int8x16x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, __s1.val[0], __s1.val[1], 32); \
   })
   #define vst2q_s16(__p0, __p1) __extension__ ({ \
     int16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 33); \
   })
   #define vst2q_s32(__p0, __p1) __extension__ ({ \
     int32x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 34); \
   })
   #define vst2q_f16(__p0, __p1) __extension__ ({ \
     float16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 38); \
   })
   #define vst2q_f32(__p0, __p1) __extension__ ({ \
     float32x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 39); \
   })
   #define vst2q_p8(__p0, __p1) __extension__ ({ \
     poly8x16x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 36); \
   })
   #define vst2q_p16(__p0, __p1) __extension__ ({ \
     poly16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], 37); \
   })
   /*
    * vst2_<type>(ptr, pair): two-vector store wrappers for d-register vectors.
    *
    * The <type>x2_t aggregate is copied into a typed local; its two member
    * vectors val[0] and val[1] are then passed individually, each
    * reinterpreted as int8x8_t (s8 members are passed as-is), to
    * __builtin_neon_vst2_v.  The pointer is forwarded textually.  Type codes:
    * 16..19 for u8..u64, 0..3 for s8..s64, 6 for f16, 7 for f32, 4 for p8 and
    * 5 for p16.
    */
   #define vst2_u8(__p0, __p1) __extension__ ({ \
     uint8x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 16); \
   })
   #define vst2_u16(__p0, __p1) __extension__ ({ \
     uint16x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 17); \
   })
   #define vst2_u32(__p0, __p1) __extension__ ({ \
     uint32x2x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 18); \
   })
   #define vst2_u64(__p0, __p1) __extension__ ({ \
     uint64x1x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 19); \
   })
   #define vst2_s8(__p0, __p1) __extension__ ({ \
     int8x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, __s1.val[0], __s1.val[1], 0); \
   })
   #define vst2_s16(__p0, __p1) __extension__ ({ \
     int16x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 1); \
   })
   #define vst2_s32(__p0, __p1) __extension__ ({ \
     int32x2x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 2); \
   })
   #define vst2_s64(__p0, __p1) __extension__ ({ \
     int64x1x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 3); \
   })
   #define vst2_f16(__p0, __p1) __extension__ ({ \
     float16x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 6); \
   })
   #define vst2_f32(__p0, __p1) __extension__ ({ \
     float32x2x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 7); \
   })
   #define vst2_p8(__p0, __p1) __extension__ ({ \
     poly8x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 4); \
   })
   #define vst2_p16(__p0, __p1) __extension__ ({ \
     poly16x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2_v(__p0, (int8x8_t)__s1.val[0], (int8x8_t)__s1.val[1], 5); \
   })
   4445 
   /*
    * vst2q_lane_<type>(ptr, pair, lane): single-lane two-vector store
    * wrappers for q-register vectors.
    *
    * The <type>x2_t aggregate is copied into a typed local; val[0] and val[1]
    * are each reinterpreted as int8x16_t and passed, with the textually
    * forwarded pointer and lane index, to __builtin_neon_vst2q_lane_v.  Only
    * 16- and 32-bit element types are defined in this group.  Type codes:
    * 49/50 for u16/u32, 33/34 for s16/s32, 38 for f16, 39 for f32 and 37 for
    * p16.
    */
   #define vst2q_lane_u16(__p0, __p1, __p2) __extension__ ({ \
     uint16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 49); \
   })
   #define vst2q_lane_u32(__p0, __p1, __p2) __extension__ ({ \
     uint32x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 50); \
   })
   #define vst2q_lane_s16(__p0, __p1, __p2) __extension__ ({ \
     int16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 33); \
   })
   #define vst2q_lane_s32(__p0, __p1, __p2) __extension__ ({ \
     int32x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 34); \
   })
   #define vst2q_lane_f16(__p0, __p1, __p2) __extension__ ({ \
     float16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 38); \
   })
   #define vst2q_lane_f32(__p0, __p1, __p2) __extension__ ({ \
     float32x4x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 39); \
   })
   #define vst2q_lane_p16(__p0, __p1, __p2) __extension__ ({ \
     poly16x8x2_t __s1 = (__p1); \
     __builtin_neon_vst2q_lane_v(__p0, (int8x16_t)__s1.val[0], (int8x16_t)__s1.val[1], __p2, 37); \
   })
   4467 #define vst2_lane_u8(__a, b, __c) __extension__ ({ \
   4468   uint8x8x2_t __b = (b); \
   4469   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 16); })
   4470 #define vst2_lane_u16(__a, b, __c) __extension__ ({ \
   4471   uint16x4x2_t __b = (b); \
   4472   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 17); })
   4473 #define vst2_lane_u32(__a, b, __c) __extension__ ({ \
   4474   uint32x2x2_t __b = (b); \
   4475   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 18); })
   4476 #define vst2_lane_s8(__a, b, __c) __extension__ ({ \
   4477   int8x8x2_t __b = (b); \
   4478   __builtin_neon_vst2_lane_v(__a, __b.val[0], __b.val[1], __c, 0); })
   4479 #define vst2_lane_s16(__a, b, __c) __extension__ ({ \
   4480   int16x4x2_t __b = (b); \
   4481   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 1); })
   4482 #define vst2_lane_s32(__a, b, __c) __extension__ ({ \
   4483   int32x2x2_t __b = (b); \
   4484   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 2); })
   4485 #define vst2_lane_f16(__a, b, __c) __extension__ ({ \
   4486   float16x4x2_t __b = (b); \
   4487   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 6); })
   4488 #define vst2_lane_f32(__a, b, __c) __extension__ ({ \
   4489   float32x2x2_t __b = (b); \
   4490   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 7); })
   4491 #define vst2_lane_p8(__a, b, __c) __extension__ ({ \
   4492   poly8x8x2_t __b = (b); \
   4493   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 4); })
   4494 #define vst2_lane_p16(__a, b, __c) __extension__ ({ \
   4495   poly16x4x2_t __b = (b); \
   4496   __builtin_neon_vst2_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], __c, 5); })
   4497 
   4498 #define vst3q_u8(__a, b) __extension__ ({ \
   4499   uint8x16x3_t __b = (b); \
   4500   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 48); })
   4501 #define vst3q_u16(__a, b) __extension__ ({ \
   4502   uint16x8x3_t __b = (b); \
   4503   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 49); })
   4504 #define vst3q_u32(__a, b) __extension__ ({ \
   4505   uint32x4x3_t __b = (b); \
   4506   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 50); })
   4507 #define vst3q_s8(__a, b) __extension__ ({ \
   4508   int8x16x3_t __b = (b); \
   4509   __builtin_neon_vst3q_v(__a, __b.val[0], __b.val[1], __b.val[2], 32); })
   4510 #define vst3q_s16(__a, b) __extension__ ({ \
   4511   int16x8x3_t __b = (b); \
   4512   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 33); })
   4513 #define vst3q_s32(__a, b) __extension__ ({ \
   4514   int32x4x3_t __b = (b); \
   4515   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 34); })
   4516 #define vst3q_f16(__a, b) __extension__ ({ \
   4517   float16x8x3_t __b = (b); \
   4518   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 38); })
   4519 #define vst3q_f32(__a, b) __extension__ ({ \
   4520   float32x4x3_t __b = (b); \
   4521   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 39); })
   4522 #define vst3q_p8(__a, b) __extension__ ({ \
   4523   poly8x16x3_t __b = (b); \
   4524   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 36); })
   4525 #define vst3q_p16(__a, b) __extension__ ({ \
   4526   poly16x8x3_t __b = (b); \
   4527   __builtin_neon_vst3q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], 37); })
   4528 #define vst3_u8(__a, b) __extension__ ({ \
   4529   uint8x8x3_t __b = (b); \
   4530   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 16); })
   4531 #define vst3_u16(__a, b) __extension__ ({ \
   4532   uint16x4x3_t __b = (b); \
   4533   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 17); })
   4534 #define vst3_u32(__a, b) __extension__ ({ \
   4535   uint32x2x3_t __b = (b); \
   4536   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 18); })
   4537 #define vst3_u64(__a, b) __extension__ ({ \
   4538   uint64x1x3_t __b = (b); \
   4539   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 19); })
   4540 #define vst3_s8(__a, b) __extension__ ({ \
   4541   int8x8x3_t __b = (b); \
   4542   __builtin_neon_vst3_v(__a, __b.val[0], __b.val[1], __b.val[2], 0); })
   4543 #define vst3_s16(__a, b) __extension__ ({ \
   4544   int16x4x3_t __b = (b); \
   4545   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 1); })
   4546 #define vst3_s32(__a, b) __extension__ ({ \
   4547   int32x2x3_t __b = (b); \
   4548   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 2); })
   4549 #define vst3_s64(__a, b) __extension__ ({ \
   4550   int64x1x3_t __b = (b); \
   4551   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 3); })
   4552 #define vst3_f16(__a, b) __extension__ ({ \
   4553   float16x4x3_t __b = (b); \
   4554   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 6); })
   4555 #define vst3_f32(__a, b) __extension__ ({ \
   4556   float32x2x3_t __b = (b); \
   4557   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 7); })
   4558 #define vst3_p8(__a, b) __extension__ ({ \
   4559   poly8x8x3_t __b = (b); \
   4560   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 4); })
   4561 #define vst3_p16(__a, b) __extension__ ({ \
   4562   poly16x4x3_t __b = (b); \
   4563   __builtin_neon_vst3_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], 5); })
   4564 
   4565 #define vst3q_lane_u16(__a, b, __c) __extension__ ({ \
   4566   uint16x8x3_t __b = (b); \
   4567   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 49); })
   4568 #define vst3q_lane_u32(__a, b, __c) __extension__ ({ \
   4569   uint32x4x3_t __b = (b); \
   4570   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 50); })
   4571 #define vst3q_lane_s16(__a, b, __c) __extension__ ({ \
   4572   int16x8x3_t __b = (b); \
   4573   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 33); })
   4574 #define vst3q_lane_s32(__a, b, __c) __extension__ ({ \
   4575   int32x4x3_t __b = (b); \
   4576   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 34); })
   4577 #define vst3q_lane_f16(__a, b, __c) __extension__ ({ \
   4578   float16x8x3_t __b = (b); \
   4579   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 38); })
   4580 #define vst3q_lane_f32(__a, b, __c) __extension__ ({ \
   4581   float32x4x3_t __b = (b); \
   4582   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 39); })
   4583 #define vst3q_lane_p16(__a, b, __c) __extension__ ({ \
   4584   poly16x8x3_t __b = (b); \
   4585   __builtin_neon_vst3q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], __c, 37); })
   4586 #define vst3_lane_u8(__a, b, __c) __extension__ ({ \
   4587   uint8x8x3_t __b = (b); \
   4588   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 16); })
   4589 #define vst3_lane_u16(__a, b, __c) __extension__ ({ \
   4590   uint16x4x3_t __b = (b); \
   4591   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 17); })
   4592 #define vst3_lane_u32(__a, b, __c) __extension__ ({ \
   4593   uint32x2x3_t __b = (b); \
   4594   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 18); })
   4595 #define vst3_lane_s8(__a, b, __c) __extension__ ({ \
   4596   int8x8x3_t __b = (b); \
   4597   __builtin_neon_vst3_lane_v(__a, __b.val[0], __b.val[1], __b.val[2], __c, 0); })
   4598 #define vst3_lane_s16(__a, b, __c) __extension__ ({ \
   4599   int16x4x3_t __b = (b); \
   4600   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 1); })
   4601 #define vst3_lane_s32(__a, b, __c) __extension__ ({ \
   4602   int32x2x3_t __b = (b); \
   4603   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 2); })
   4604 #define vst3_lane_f16(__a, b, __c) __extension__ ({ \
   4605   float16x4x3_t __b = (b); \
   4606   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 6); })
   4607 #define vst3_lane_f32(__a, b, __c) __extension__ ({ \
   4608   float32x2x3_t __b = (b); \
   4609   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 7); })
   4610 #define vst3_lane_p8(__a, b, __c) __extension__ ({ \
   4611   poly8x8x3_t __b = (b); \
   4612   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 4); })
   4613 #define vst3_lane_p16(__a, b, __c) __extension__ ({ \
   4614   poly16x4x3_t __b = (b); \
   4615   __builtin_neon_vst3_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], __c, 5); })
   4616 
   4617 #define vst4q_u8(__a, b) __extension__ ({ \
   4618   uint8x16x4_t __b = (b); \
   4619   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 48); })
   4620 #define vst4q_u16(__a, b) __extension__ ({ \
   4621   uint16x8x4_t __b = (b); \
   4622   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 49); })
   4623 #define vst4q_u32(__a, b) __extension__ ({ \
   4624   uint32x4x4_t __b = (b); \
   4625   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 50); })
   4626 #define vst4q_s8(__a, b) __extension__ ({ \
   4627   int8x16x4_t __b = (b); \
   4628   __builtin_neon_vst4q_v(__a, __b.val[0], __b.val[1], __b.val[2], __b.val[3], 32); })
   4629 #define vst4q_s16(__a, b) __extension__ ({ \
   4630   int16x8x4_t __b = (b); \
   4631   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 33); })
   4632 #define vst4q_s32(__a, b) __extension__ ({ \
   4633   int32x4x4_t __b = (b); \
   4634   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 34); })
   4635 #define vst4q_f16(__a, b) __extension__ ({ \
   4636   float16x8x4_t __b = (b); \
   4637   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 38); })
   4638 #define vst4q_f32(__a, b) __extension__ ({ \
   4639   float32x4x4_t __b = (b); \
   4640   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 39); })
   4641 #define vst4q_p8(__a, b) __extension__ ({ \
   4642   poly8x16x4_t __b = (b); \
   4643   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 36); })
   4644 #define vst4q_p16(__a, b) __extension__ ({ \
   4645   poly16x8x4_t __b = (b); \
   4646   __builtin_neon_vst4q_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], 37); })
   4647 #define vst4_u8(__a, b) __extension__ ({ \
   4648   uint8x8x4_t __b = (b); \
   4649   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 16); })
   4650 #define vst4_u16(__a, b) __extension__ ({ \
   4651   uint16x4x4_t __b = (b); \
   4652   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 17); })
   4653 #define vst4_u32(__a, b) __extension__ ({ \
   4654   uint32x2x4_t __b = (b); \
   4655   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 18); })
   4656 #define vst4_u64(__a, b) __extension__ ({ \
   4657   uint64x1x4_t __b = (b); \
   4658   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 19); })
   4659 #define vst4_s8(__a, b) __extension__ ({ \
   4660   int8x8x4_t __b = (b); \
   4661   __builtin_neon_vst4_v(__a, __b.val[0], __b.val[1], __b.val[2], __b.val[3], 0); })
   4662 #define vst4_s16(__a, b) __extension__ ({ \
   4663   int16x4x4_t __b = (b); \
   4664   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 1); })
   4665 #define vst4_s32(__a, b) __extension__ ({ \
   4666   int32x2x4_t __b = (b); \
   4667   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 2); })
   4668 #define vst4_s64(__a, b) __extension__ ({ \
   4669   int64x1x4_t __b = (b); \
   4670   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 3); })
   4671 #define vst4_f16(__a, b) __extension__ ({ \
   4672   float16x4x4_t __b = (b); \
   4673   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 6); })
   4674 #define vst4_f32(__a, b) __extension__ ({ \
   4675   float32x2x4_t __b = (b); \
   4676   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 7); })
   4677 #define vst4_p8(__a, b) __extension__ ({ \
   4678   poly8x8x4_t __b = (b); \
   4679   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 4); })
   4680 #define vst4_p16(__a, b) __extension__ ({ \
   4681   poly16x4x4_t __b = (b); \
   4682   __builtin_neon_vst4_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], 5); })
   4683 
   4684 #define vst4q_lane_u16(__a, b, __c) __extension__ ({ \
   4685   uint16x8x4_t __b = (b); \
   4686   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 49); })
   4687 #define vst4q_lane_u32(__a, b, __c) __extension__ ({ \
   4688   uint32x4x4_t __b = (b); \
   4689   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 50); })
   4690 #define vst4q_lane_s16(__a, b, __c) __extension__ ({ \
   4691   int16x8x4_t __b = (b); \
   4692   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 33); })
   4693 #define vst4q_lane_s32(__a, b, __c) __extension__ ({ \
   4694   int32x4x4_t __b = (b); \
   4695   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 34); })
   4696 #define vst4q_lane_f16(__a, b, __c) __extension__ ({ \
   4697   float16x8x4_t __b = (b); \
   4698   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 38); })
   4699 #define vst4q_lane_f32(__a, b, __c) __extension__ ({ \
   4700   float32x4x4_t __b = (b); \
   4701   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 39); })
   4702 #define vst4q_lane_p16(__a, b, __c) __extension__ ({ \
   4703   poly16x8x4_t __b = (b); \
   4704   __builtin_neon_vst4q_lane_v(__a, (int8x16_t)__b.val[0], (int8x16_t)__b.val[1], (int8x16_t)__b.val[2], (int8x16_t)__b.val[3], __c, 37); })
   4705 #define vst4_lane_u8(__a, b, __c) __extension__ ({ \
   4706   uint8x8x4_t __b = (b); \
   4707   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 16); })
   4708 #define vst4_lane_u16(__a, b, __c) __extension__ ({ \
   4709   uint16x4x4_t __b = (b); \
   4710   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 17); })
   4711 #define vst4_lane_u32(__a, b, __c) __extension__ ({ \
   4712   uint32x2x4_t __b = (b); \
   4713   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 18); })
   4714 #define vst4_lane_s8(__a, b, __c) __extension__ ({ \
   4715   int8x8x4_t __b = (b); \
   4716   __builtin_neon_vst4_lane_v(__a, __b.val[0], __b.val[1], __b.val[2], __b.val[3], __c, 0); })
   4717 #define vst4_lane_s16(__a, b, __c) __extension__ ({ \
   4718   int16x4x4_t __b = (b); \
   4719   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 1); })
   4720 #define vst4_lane_s32(__a, b, __c) __extension__ ({ \
   4721   int32x2x4_t __b = (b); \
   4722   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 2); })
   4723 #define vst4_lane_f16(__a, b, __c) __extension__ ({ \
   4724   float16x4x4_t __b = (b); \
   4725   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 6); })
   4726 #define vst4_lane_f32(__a, b, __c) __extension__ ({ \
   4727   float32x2x4_t __b = (b); \
   4728   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 7); })
   4729 #define vst4_lane_p8(__a, b, __c) __extension__ ({ \
   4730   poly8x8x4_t __b = (b); \
   4731   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 4); })
   4732 #define vst4_lane_p16(__a, b, __c) __extension__ ({ \
   4733   poly16x4x4_t __b = (b); \
   4734   __builtin_neon_vst4_lane_v(__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], __c, 5); })
   4735 
   4736 __ai int8x8_t vsub_s8(int8x8_t __a, int8x8_t __b) {
   4737   return __a - __b; }
   4738 __ai int16x4_t vsub_s16(int16x4_t __a, int16x4_t __b) {
   4739   return __a - __b; }
   4740 __ai int32x2_t vsub_s32(int32x2_t __a, int32x2_t __b) {
   4741   return __a - __b; }
   4742 __ai int64x1_t vsub_s64(int64x1_t __a, int64x1_t __b) {
   4743   return __a - __b; }
   4744 __ai float32x2_t vsub_f32(float32x2_t __a, float32x2_t __b) {
   4745   return __a - __b; }
   4746 __ai uint8x8_t vsub_u8(uint8x8_t __a, uint8x8_t __b) {
   4747   return __a - __b; }
   4748 __ai uint16x4_t vsub_u16(uint16x4_t __a, uint16x4_t __b) {
   4749   return __a - __b; }
   4750 __ai uint32x2_t vsub_u32(uint32x2_t __a, uint32x2_t __b) {
   4751   return __a - __b; }
   4752 __ai uint64x1_t vsub_u64(uint64x1_t __a, uint64x1_t __b) {
   4753   return __a - __b; }
   4754 __ai int8x16_t vsubq_s8(int8x16_t __a, int8x16_t __b) {
   4755   return __a - __b; }
   4756 __ai int16x8_t vsubq_s16(int16x8_t __a, int16x8_t __b) {
   4757   return __a - __b; }
   4758 __ai int32x4_t vsubq_s32(int32x4_t __a, int32x4_t __b) {
   4759   return __a - __b; }
   4760 __ai int64x2_t vsubq_s64(int64x2_t __a, int64x2_t __b) {
   4761   return __a - __b; }
   4762 __ai float32x4_t vsubq_f32(float32x4_t __a, float32x4_t __b) {
   4763   return __a - __b; }
   4764 __ai uint8x16_t vsubq_u8(uint8x16_t __a, uint8x16_t __b) {
   4765   return __a - __b; }
   4766 __ai uint16x8_t vsubq_u16(uint16x8_t __a, uint16x8_t __b) {
   4767   return __a - __b; }
   4768 __ai uint32x4_t vsubq_u32(uint32x4_t __a, uint32x4_t __b) {
   4769   return __a - __b; }
   4770 __ai uint64x2_t vsubq_u64(uint64x2_t __a, uint64x2_t __b) {
   4771   return __a - __b; }
   4772 
   4773 __ai int8x8_t vsubhn_s16(int16x8_t __a, int16x8_t __b) {
   4774   return (int8x8_t)__builtin_neon_vsubhn_v((int8x16_t)__a, (int8x16_t)__b, 0); }
   4775 __ai int16x4_t vsubhn_s32(int32x4_t __a, int32x4_t __b) {
   4776   return (int16x4_t)__builtin_neon_vsubhn_v((int8x16_t)__a, (int8x16_t)__b, 1); }
   4777 __ai int32x2_t vsubhn_s64(int64x2_t __a, int64x2_t __b) {
   4778   return (int32x2_t)__builtin_neon_vsubhn_v((int8x16_t)__a, (int8x16_t)__b, 2); }
   4779 __ai uint8x8_t vsubhn_u16(uint16x8_t __a, uint16x8_t __b) {
   4780   return (uint8x8_t)__builtin_neon_vsubhn_v((int8x16_t)__a, (int8x16_t)__b, 16); }
   4781 __ai uint16x4_t vsubhn_u32(uint32x4_t __a, uint32x4_t __b) {
   4782   return (uint16x4_t)__builtin_neon_vsubhn_v((int8x16_t)__a, (int8x16_t)__b, 17); }
   4783 __ai uint32x2_t vsubhn_u64(uint64x2_t __a, uint64x2_t __b) {
   4784   return (uint32x2_t)__builtin_neon_vsubhn_v((int8x16_t)__a, (int8x16_t)__b, 18); }
   4785 
   4786 __ai int16x8_t vsubl_s8(int8x8_t __a, int8x8_t __b) {
   4787   return vmovl_s8(__a) - vmovl_s8(__b); }
   4788 __ai int32x4_t vsubl_s16(int16x4_t __a, int16x4_t __b) {
   4789   return vmovl_s16(__a) - vmovl_s16(__b); }
   4790 __ai int64x2_t vsubl_s32(int32x2_t __a, int32x2_t __b) {
   4791   return vmovl_s32(__a) - vmovl_s32(__b); }
   4792 __ai uint16x8_t vsubl_u8(uint8x8_t __a, uint8x8_t __b) {
   4793   return vmovl_u8(__a) - vmovl_u8(__b); }
   4794 __ai uint32x4_t vsubl_u16(uint16x4_t __a, uint16x4_t __b) {
   4795   return vmovl_u16(__a) - vmovl_u16(__b); }
   4796 __ai uint64x2_t vsubl_u32(uint32x2_t __a, uint32x2_t __b) {
   4797   return vmovl_u32(__a) - vmovl_u32(__b); }
   4798 
   4799 __ai int16x8_t vsubw_s8(int16x8_t __a, int8x8_t __b) {
   4800   return __a - vmovl_s8(__b); }
   4801 __ai int32x4_t vsubw_s16(int32x4_t __a, int16x4_t __b) {
   4802   return __a - vmovl_s16(__b); }
   4803 __ai int64x2_t vsubw_s32(int64x2_t __a, int32x2_t __b) {
   4804   return __a - vmovl_s32(__b); }
   4805 __ai uint16x8_t vsubw_u8(uint16x8_t __a, uint8x8_t __b) {
   4806   return __a - vmovl_u8(__b); }
   4807 __ai uint32x4_t vsubw_u16(uint32x4_t __a, uint16x4_t __b) {
   4808   return __a - vmovl_u16(__b); }
   4809 __ai uint64x2_t vsubw_u32(uint64x2_t __a, uint32x2_t __b) {
   4810   return __a - vmovl_u32(__b); }
   4811 
   4812 __ai uint8x8_t vtbl1_u8(uint8x8_t __a, uint8x8_t __b) {
   4813   return (uint8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   4814 __ai int8x8_t vtbl1_s8(int8x8_t __a, int8x8_t __b) {
   4815   return (int8x8_t)__builtin_neon_vtbl1_v(__a, __b, 0); }
   4816 __ai poly8x8_t vtbl1_p8(poly8x8_t __a, uint8x8_t __b) {
   4817   return (poly8x8_t)__builtin_neon_vtbl1_v((int8x8_t)__a, (int8x8_t)__b, 4); }
   4818 
   4819 __ai uint8x8_t vtbl2_u8(uint8x8x2_t __a, uint8x8_t __b) {
   4820   return (uint8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__a.val[0], (int8x8_t)__a.val[1], (int8x8_t)__b, 16); }
   4821 __ai int8x8_t vtbl2_s8(int8x8x2_t __a, int8x8_t __b) {
   4822   return (int8x8_t)__builtin_neon_vtbl2_v(__a.val[0], __a.val[1], __b, 0); }
   4823 __ai poly8x8_t vtbl2_p8(poly8x8x2_t __a, uint8x8_t __b) {
   4824   return (poly8x8_t)__builtin_neon_vtbl2_v((int8x8_t)__a.val[0], (int8x8_t)__a.val[1], (int8x8_t)__b, 4); }
   4825 
   4826 __ai uint8x8_t vtbl3_u8(uint8x8x3_t __a, uint8x8_t __b) {
   4827   return (uint8x8_t)__builtin_neon_vtbl3_v((int8x8_t)__a.val[0], (int8x8_t)__a.val[1], (int8x8_t)__a.val[2], (int8x8_t)__b, 16); }
   4828 __ai int8x8_t vtbl3_s8(int8x8x3_t __a, int8x8_t __b) {
   4829   return (int8x8_t)__builtin_neon_vtbl3_v(__a.val[0], __a.val[1], __a.val[2], __b, 0); }
   4830 __ai poly8x8_t vtbl3_p8(poly8x8x3_t __a, uint8x8_t __b) {
   4831   return (poly8x8_t)__builtin_neon_vtbl3_v((int8x8_t)__a.val[0], (int8x8_t)__a.val[1], (int8x8_t)__a.val[2], (int8x8_t)__b, 4); }
   4832 
   4833 __ai uint8x8_t vtbl4_u8(uint8x8x4_t __a, uint8x8_t __b) {
   4834   return (uint8x8_t)__builtin_neon_vtbl4_v((int8x8_t)__a.val[0], (int8x8_t)__a.val[1], (int8x8_t)__a.val[2], (int8x8_t)__a.val[3], (int8x8_t)__b, 16); }
   4835 __ai int8x8_t vtbl4_s8(int8x8x4_t __a, int8x8_t __b) {
   4836   return (int8x8_t)__builtin_neon_vtbl4_v(__a.val[0], __a.val[1], __a.val[2], __a.val[3], __b, 0); }
   4837 __ai poly8x8_t vtbl4_p8(poly8x8x4_t __a, uint8x8_t __b) {
   4838   return (poly8x8_t)__builtin_neon_vtbl4_v((int8x8_t)__a.val[0], (int8x8_t)__a.val[1], (int8x8_t)__a.val[2], (int8x8_t)__a.val[3], (int8x8_t)__b, 4); }
   4839 
   4840 __ai uint8x8_t vtbx1_u8(uint8x8_t __a, uint8x8_t __b, uint8x8_t __c) {
   4841   return (uint8x8_t)__builtin_neon_vtbx1_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 16); }
   4842 __ai int8x8_t vtbx1_s8(int8x8_t __a, int8x8_t __b, int8x8_t __c) {
   4843   return (int8x8_t)__builtin_neon_vtbx1_v(__a, __b, __c, 0); }
   4844 __ai poly8x8_t vtbx1_p8(poly8x8_t __a, poly8x8_t __b, uint8x8_t __c) {
   4845   return (poly8x8_t)__builtin_neon_vtbx1_v((int8x8_t)__a, (int8x8_t)__b, (int8x8_t)__c, 4); }
   4846 
   4847 __ai uint8x8_t vtbx2_u8(uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c) {
   4848   return (uint8x8_t)__builtin_neon_vtbx2_v((int8x8_t)__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__c, 16); }
   4849 __ai int8x8_t vtbx2_s8(int8x8_t __a, int8x8x2_t __b, int8x8_t __c) {
   4850   return (int8x8_t)__builtin_neon_vtbx2_v(__a, __b.val[0], __b.val[1], __c, 0); }
   4851 __ai poly8x8_t vtbx2_p8(poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c) {
   4852   return (poly8x8_t)__builtin_neon_vtbx2_v((int8x8_t)__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__c, 4); }
   4853 
   4854 __ai uint8x8_t vtbx3_u8(uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c) {
   4855   return (uint8x8_t)__builtin_neon_vtbx3_v((int8x8_t)__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__c, 16); }
   4856 __ai int8x8_t vtbx3_s8(int8x8_t __a, int8x8x3_t __b, int8x8_t __c) {
   4857   return (int8x8_t)__builtin_neon_vtbx3_v(__a, __b.val[0], __b.val[1], __b.val[2], __c, 0); }
   4858 __ai poly8x8_t vtbx3_p8(poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c) {
   4859   return (poly8x8_t)__builtin_neon_vtbx3_v((int8x8_t)__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__c, 4); }
   4860 
   4861 __ai uint8x8_t vtbx4_u8(uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c) {
   4862   return (uint8x8_t)__builtin_neon_vtbx4_v((int8x8_t)__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], (int8x8_t)__c, 16); }
   4863 __ai int8x8_t vtbx4_s8(int8x8_t __a, int8x8x4_t __b, int8x8_t __c) {
   4864   return (int8x8_t)__builtin_neon_vtbx4_v(__a, __b.val[0], __b.val[1], __b.val[2], __b.val[3], __c, 0); }
   4865 __ai poly8x8_t vtbx4_p8(poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c) {
   4866   return (poly8x8_t)__builtin_neon_vtbx4_v((int8x8_t)__a, (int8x8_t)__b.val[0], (int8x8_t)__b.val[1], (int8x8_t)__b.val[2], (int8x8_t)__b.val[3], (int8x8_t)__c, 4); }
   4867 
   4868 __ai int8x8x2_t vtrn_s8(int8x8_t __a, int8x8_t __b) {
   4869   int8x8x2_t r; __builtin_neon_vtrn_v(&r, __a, __b, 0); return r; }
   4870 __ai int16x4x2_t vtrn_s16(int16x4_t __a, int16x4_t __b) {
   4871   int16x4x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 1); return r; }
   4872 __ai int32x2x2_t vtrn_s32(int32x2_t __a, int32x2_t __b) {
   4873   int32x2x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 2); return r; }
   4874 __ai uint8x8x2_t vtrn_u8(uint8x8_t __a, uint8x8_t __b) {
   4875   uint8x8x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 16); return r; }
   4876 __ai uint16x4x2_t vtrn_u16(uint16x4_t __a, uint16x4_t __b) {
   4877   uint16x4x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 17); return r; }
   4878 __ai uint32x2x2_t vtrn_u32(uint32x2_t __a, uint32x2_t __b) {
   4879   uint32x2x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 18); return r; }
   4880 __ai float32x2x2_t vtrn_f32(float32x2_t __a, float32x2_t __b) {
   4881   float32x2x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 7); return r; }
   4882 __ai poly8x8x2_t vtrn_p8(poly8x8_t __a, poly8x8_t __b) {
   4883   poly8x8x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 4); return r; }
   4884 __ai poly16x4x2_t vtrn_p16(poly16x4_t __a, poly16x4_t __b) {
   4885   poly16x4x2_t r; __builtin_neon_vtrn_v(&r, (int8x8_t)__a, (int8x8_t)__b, 5); return r; }
   4886 __ai int8x16x2_t vtrnq_s8(int8x16_t __a, int8x16_t __b) {
   4887   int8x16x2_t r; __builtin_neon_vtrnq_v(&r, __a, __b, 32); return r; }
   4888 __ai int16x8x2_t vtrnq_s16(int16x8_t __a, int16x8_t __b) {
   4889   int16x8x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 33); return r; }
   4890 __ai int32x4x2_t vtrnq_s32(int32x4_t __a, int32x4_t __b) {
   4891   int32x4x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 34); return r; }
   4892 __ai uint8x16x2_t vtrnq_u8(uint8x16_t __a, uint8x16_t __b) {
   4893   uint8x16x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 48); return r; }
   4894 __ai uint16x8x2_t vtrnq_u16(uint16x8_t __a, uint16x8_t __b) {
   4895   uint16x8x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 49); return r; }
   4896 __ai uint32x4x2_t vtrnq_u32(uint32x4_t __a, uint32x4_t __b) {
   4897   uint32x4x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 50); return r; }
   4898 __ai float32x4x2_t vtrnq_f32(float32x4_t __a, float32x4_t __b) {
   4899   float32x4x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 39); return r; }
   4900 __ai poly8x16x2_t vtrnq_p8(poly8x16_t __a, poly8x16_t __b) {
   4901   poly8x16x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 36); return r; }
   4902 __ai poly16x8x2_t vtrnq_p16(poly16x8_t __a, poly16x8_t __b) {
   4903   poly16x8x2_t r; __builtin_neon_vtrnq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 37); return r; }
   4904 
   4905 __ai uint8x8_t vtst_s8(int8x8_t __a, int8x8_t __b) {
   4906   return (uint8x8_t)__builtin_neon_vtst_v(__a, __b, 16); }
   4907 __ai uint16x4_t vtst_s16(int16x4_t __a, int16x4_t __b) {
   4908   return (uint16x4_t)__builtin_neon_vtst_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   4909 __ai uint32x2_t vtst_s32(int32x2_t __a, int32x2_t __b) {
   4910   return (uint32x2_t)__builtin_neon_vtst_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   4911 __ai uint8x8_t vtst_u8(uint8x8_t __a, uint8x8_t __b) {
   4912   return (uint8x8_t)__builtin_neon_vtst_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   4913 __ai uint16x4_t vtst_u16(uint16x4_t __a, uint16x4_t __b) {
   4914   return (uint16x4_t)__builtin_neon_vtst_v((int8x8_t)__a, (int8x8_t)__b, 17); }
   4915 __ai uint32x2_t vtst_u32(uint32x2_t __a, uint32x2_t __b) {
   4916   return (uint32x2_t)__builtin_neon_vtst_v((int8x8_t)__a, (int8x8_t)__b, 18); }
   4917 __ai uint8x8_t vtst_p8(poly8x8_t __a, poly8x8_t __b) {
   4918   return (uint8x8_t)__builtin_neon_vtst_v((int8x8_t)__a, (int8x8_t)__b, 16); }
   4919 __ai uint8x16_t vtstq_s8(int8x16_t __a, int8x16_t __b) {
   4920   return (uint8x16_t)__builtin_neon_vtstq_v(__a, __b, 48); }
   4921 __ai uint16x8_t vtstq_s16(int16x8_t __a, int16x8_t __b) {
   4922   return (uint16x8_t)__builtin_neon_vtstq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   4923 __ai uint32x4_t vtstq_s32(int32x4_t __a, int32x4_t __b) {
   4924   return (uint32x4_t)__builtin_neon_vtstq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   4925 __ai uint8x16_t vtstq_u8(uint8x16_t __a, uint8x16_t __b) {
   4926   return (uint8x16_t)__builtin_neon_vtstq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   4927 __ai uint16x8_t vtstq_u16(uint16x8_t __a, uint16x8_t __b) {
   4928   return (uint16x8_t)__builtin_neon_vtstq_v((int8x16_t)__a, (int8x16_t)__b, 49); }
   4929 __ai uint32x4_t vtstq_u32(uint32x4_t __a, uint32x4_t __b) {
   4930   return (uint32x4_t)__builtin_neon_vtstq_v((int8x16_t)__a, (int8x16_t)__b, 50); }
   4931 __ai uint8x16_t vtstq_p8(poly8x16_t __a, poly8x16_t __b) {
   4932   return (uint8x16_t)__builtin_neon_vtstq_v((int8x16_t)__a, (int8x16_t)__b, 48); }
   4933 
   4934 __ai int8x8x2_t vuzp_s8(int8x8_t __a, int8x8_t __b) {
   4935   int8x8x2_t r; __builtin_neon_vuzp_v(&r, __a, __b, 0); return r; }
   4936 __ai int16x4x2_t vuzp_s16(int16x4_t __a, int16x4_t __b) {
   4937   int16x4x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 1); return r; }
   4938 __ai int32x2x2_t vuzp_s32(int32x2_t __a, int32x2_t __b) {
   4939   int32x2x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 2); return r; }
   4940 __ai uint8x8x2_t vuzp_u8(uint8x8_t __a, uint8x8_t __b) {
   4941   uint8x8x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 16); return r; }
   4942 __ai uint16x4x2_t vuzp_u16(uint16x4_t __a, uint16x4_t __b) {
   4943   uint16x4x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 17); return r; }
   4944 __ai uint32x2x2_t vuzp_u32(uint32x2_t __a, uint32x2_t __b) {
   4945   uint32x2x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 18); return r; }
   4946 __ai float32x2x2_t vuzp_f32(float32x2_t __a, float32x2_t __b) {
   4947   float32x2x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 7); return r; }
   4948 __ai poly8x8x2_t vuzp_p8(poly8x8_t __a, poly8x8_t __b) {
   4949   poly8x8x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 4); return r; }
   4950 __ai poly16x4x2_t vuzp_p16(poly16x4_t __a, poly16x4_t __b) {
   4951   poly16x4x2_t r; __builtin_neon_vuzp_v(&r, (int8x8_t)__a, (int8x8_t)__b, 5); return r; }
   4952 __ai int8x16x2_t vuzpq_s8(int8x16_t __a, int8x16_t __b) {
   4953   int8x16x2_t r; __builtin_neon_vuzpq_v(&r, __a, __b, 32); return r; }
   4954 __ai int16x8x2_t vuzpq_s16(int16x8_t __a, int16x8_t __b) {
   4955   int16x8x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 33); return r; }
   4956 __ai int32x4x2_t vuzpq_s32(int32x4_t __a, int32x4_t __b) {
   4957   int32x4x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 34); return r; }
   4958 __ai uint8x16x2_t vuzpq_u8(uint8x16_t __a, uint8x16_t __b) {
   4959   uint8x16x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 48); return r; }
   4960 __ai uint16x8x2_t vuzpq_u16(uint16x8_t __a, uint16x8_t __b) {
   4961   uint16x8x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 49); return r; }
   4962 __ai uint32x4x2_t vuzpq_u32(uint32x4_t __a, uint32x4_t __b) {
   4963   uint32x4x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 50); return r; }
   4964 __ai float32x4x2_t vuzpq_f32(float32x4_t __a, float32x4_t __b) {
   4965   float32x4x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 39); return r; }
   4966 __ai poly8x16x2_t vuzpq_p8(poly8x16_t __a, poly8x16_t __b) {
   4967   poly8x16x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 36); return r; }
   4968 __ai poly16x8x2_t vuzpq_p16(poly16x8_t __a, poly16x8_t __b) {
   4969   poly16x8x2_t r; __builtin_neon_vuzpq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 37); return r; }
   4970 
   4971 __ai int8x8x2_t vzip_s8(int8x8_t __a, int8x8_t __b) {
   4972   int8x8x2_t r; __builtin_neon_vzip_v(&r, __a, __b, 0); return r; }
   4973 __ai int16x4x2_t vzip_s16(int16x4_t __a, int16x4_t __b) {
   4974   int16x4x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 1); return r; }
   4975 __ai int32x2x2_t vzip_s32(int32x2_t __a, int32x2_t __b) {
   4976   int32x2x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 2); return r; }
   4977 __ai uint8x8x2_t vzip_u8(uint8x8_t __a, uint8x8_t __b) {
   4978   uint8x8x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 16); return r; }
   4979 __ai uint16x4x2_t vzip_u16(uint16x4_t __a, uint16x4_t __b) {
   4980   uint16x4x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 17); return r; }
   4981 __ai uint32x2x2_t vzip_u32(uint32x2_t __a, uint32x2_t __b) {
   4982   uint32x2x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 18); return r; }
   4983 __ai float32x2x2_t vzip_f32(float32x2_t __a, float32x2_t __b) {
   4984   float32x2x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 7); return r; }
   4985 __ai poly8x8x2_t vzip_p8(poly8x8_t __a, poly8x8_t __b) {
   4986   poly8x8x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 4); return r; }
   4987 __ai poly16x4x2_t vzip_p16(poly16x4_t __a, poly16x4_t __b) {
   4988   poly16x4x2_t r; __builtin_neon_vzip_v(&r, (int8x8_t)__a, (int8x8_t)__b, 5); return r; }
   4989 __ai int8x16x2_t vzipq_s8(int8x16_t __a, int8x16_t __b) {
   4990   int8x16x2_t r; __builtin_neon_vzipq_v(&r, __a, __b, 32); return r; }
   4991 __ai int16x8x2_t vzipq_s16(int16x8_t __a, int16x8_t __b) {
   4992   int16x8x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 33); return r; }
   4993 __ai int32x4x2_t vzipq_s32(int32x4_t __a, int32x4_t __b) {
   4994   int32x4x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 34); return r; }
   4995 __ai uint8x16x2_t vzipq_u8(uint8x16_t __a, uint8x16_t __b) {
   4996   uint8x16x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 48); return r; }
   4997 __ai uint16x8x2_t vzipq_u16(uint16x8_t __a, uint16x8_t __b) {
   4998   uint16x8x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 49); return r; }
   4999 __ai uint32x4x2_t vzipq_u32(uint32x4_t __a, uint32x4_t __b) {
   5000   uint32x4x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 50); return r; }
   5001 __ai float32x4x2_t vzipq_f32(float32x4_t __a, float32x4_t __b) {
   5002   float32x4x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 39); return r; }
   5003 __ai poly8x16x2_t vzipq_p8(poly8x16_t __a, poly8x16_t __b) {
   5004   poly8x16x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 36); return r; }
   5005 __ai poly16x8x2_t vzipq_p16(poly16x8_t __a, poly16x8_t __b) {
   5006   poly16x8x2_t r; __builtin_neon_vzipq_v(&r, (int8x16_t)__a, (int8x16_t)__b, 37); return r; }
   5007 
   5008 #undef __ai
   5009 
   5010 #endif /* __ARM_NEON_H */
   5011