      1 /*
      2  * ARM NEON vector operations.
      3  *
      4  * Copyright (c) 2007, 2008 CodeSourcery.
      5  * Written by Paul Brook
      6  *
      7  * This code is licensed under the GNU GPL v2.
      8  */
      9 #include <stdlib.h>
     10 #include <stdio.h>
     11 
     12 #include "cpu.h"
     13 #include "exec.h"
     14 #include "helper.h"
     15 
     16 #define SIGNBIT (uint32_t)0x80000000
     17 #define SIGNBIT64 ((uint64_t)1 << 63)
     18 
     19 #define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] = CPSR_Q
     20 
     21 #define NFS (&env->vfp.standard_fp_status)
     22 
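         /* The NEON_TYPE* macros declare structs that view a 32-bit value as
          * four 8-bit, two 16-bit or one 32-bit lane(s).  The field order is
          * flipped on big-endian hosts so that v1 always maps to the least
          * significant lane; NEON_UNPACK/NEON_PACK convert between the packed
          * uint32_t form and these structs through a union.  */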
     23 #define NEON_TYPE1(name, type) \
     24 typedef struct \
     25 { \
     26     type v1; \
     27 } neon_##name;
     28 #ifdef HOST_WORDS_BIGENDIAN
     29 #define NEON_TYPE2(name, type) \
     30 typedef struct \
     31 { \
     32     type v2; \
     33     type v1; \
     34 } neon_##name;
     35 #define NEON_TYPE4(name, type) \
     36 typedef struct \
     37 { \
     38     type v4; \
     39     type v3; \
     40     type v2; \
     41     type v1; \
     42 } neon_##name;
     43 #else
     44 #define NEON_TYPE2(name, type) \
     45 typedef struct \
     46 { \
     47     type v1; \
     48     type v2; \
     49 } neon_##name;
     50 #define NEON_TYPE4(name, type) \
     51 typedef struct \
     52 { \
     53     type v1; \
     54     type v2; \
     55     type v3; \
     56     type v4; \
     57 } neon_##name;
     58 #endif
     59 
     60 NEON_TYPE4(s8, int8_t)
     61 NEON_TYPE4(u8, uint8_t)
     62 NEON_TYPE2(s16, int16_t)
     63 NEON_TYPE2(u16, uint16_t)
     64 NEON_TYPE1(s32, int32_t)
     65 NEON_TYPE1(u32, uint32_t)
     66 #undef NEON_TYPE4
     67 #undef NEON_TYPE2
     68 #undef NEON_TYPE1
     69 
     70 /* Copy from a uint32_t to a vector structure type.  */
     71 #define NEON_UNPACK(vtype, dest, val) do { \
     72     union { \
     73         vtype v; \
     74         uint32_t i; \
     75     } conv_u; \
     76     conv_u.i = (val); \
     77     dest = conv_u.v; \
     78     } while(0)
     79 
     80 /* Copy from a vector structure type to a uint32_t.  */
     81 #define NEON_PACK(vtype, dest, val) do { \
     82     union { \
     83         vtype v; \
     84         uint32_t i; \
     85     } conv_u; \
     86     conv_u.v = (val); \
     87     dest = conv_u.i; \
     88     } while(0)
     89 
     90 #define NEON_DO1 \
     91     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
     92 #define NEON_DO2 \
     93     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
     94     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
     95 #define NEON_DO4 \
     96     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
     97     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
     98     NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
     99     NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
    100 
    101 #define NEON_VOP_BODY(vtype, n) \
    102 { \
    103     uint32_t res; \
    104     vtype vsrc1; \
    105     vtype vsrc2; \
    106     vtype vdest; \
    107     NEON_UNPACK(vtype, vsrc1, arg1); \
    108     NEON_UNPACK(vtype, vsrc2, arg2); \
    109     NEON_DO##n; \
    110     NEON_PACK(vtype, res, vdest); \
    111     return res; \
    112 }
    113 
    114 #define NEON_VOP(name, vtype, n) \
    115 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    116 NEON_VOP_BODY(vtype, n)
    117 
    118 /* Pairwise operations.  */
    119 /* For 32-bit elements each segment only contains a single element, so
    120    the elementwise and pairwise operations are the same.  */
    121 #define NEON_PDO2 \
    122     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    123     NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
    124 #define NEON_PDO4 \
    125     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    126     NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    127     NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    128     NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
    129 
    130 #define NEON_POP(name, vtype, n) \
    131 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    132 { \
    133     uint32_t res; \
    134     vtype vsrc1; \
    135     vtype vsrc2; \
    136     vtype vdest; \
    137     NEON_UNPACK(vtype, vsrc1, arg1); \
    138     NEON_UNPACK(vtype, vsrc2, arg2); \
    139     NEON_PDO##n; \
    140     NEON_PACK(vtype, res, vdest); \
    141     return res; \
    142 }
    143 
    144 /* Unary operators.  */
    145 #define NEON_VOP1(name, vtype, n) \
    146 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
    147 { \
    148     vtype vsrc1; \
    149     vtype vdest; \
    150     NEON_UNPACK(vtype, vsrc1, arg); \
    151     NEON_DO##n; \
    152     NEON_PACK(vtype, arg, vdest); \
    153     return arg; \
    154 }
    155 
    156 
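         /* Saturating arithmetic: a result that does not fit the element type
          * sets the cumulative saturation flag QC and is clamped to the type's
          * limit, e.g. qadd_u8 on lanes 0xff and 0x01 yields 0xff with QC
          * set.  */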
    157 #define NEON_USAT(dest, src1, src2, type) do { \
    158     uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    159     if (tmp != (type)tmp) { \
    160         SET_QC(); \
    161         dest = ~0; \
    162     } else { \
    163         dest = tmp; \
    164     }} while(0)
    165 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
    166 NEON_VOP(qadd_u8, neon_u8, 4)
    167 #undef NEON_FN
    168 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
    169 NEON_VOP(qadd_u16, neon_u16, 2)
    170 #undef NEON_FN
    171 #undef NEON_USAT
    172 
    173 uint32_t HELPER(neon_qadd_u32)(uint32_t a, uint32_t b)
    174 {
    175     uint32_t res = a + b;
    176     if (res < a) {
    177         SET_QC();
    178         res = ~0;
    179     }
    180     return res;
    181 }
    182 
    183 uint64_t HELPER(neon_qadd_u64)(uint64_t src1, uint64_t src2)
    184 {
    185     uint64_t res;
    186 
    187     res = src1 + src2;
    188     if (res < src1) {
    189         SET_QC();
    190         res = ~(uint64_t)0;
    191     }
    192     return res;
    193 }
    194 
    195 #define NEON_SSAT(dest, src1, src2, type) do { \
    196     int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    197     if (tmp != (type)tmp) { \
    198         SET_QC(); \
    199         if (src2 > 0) { \
    200             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
    201         } else { \
    202             tmp = 1 << (sizeof(type) * 8 - 1); \
    203         } \
    204     } \
    205     dest = tmp; \
    206     } while(0)
    207 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
    208 NEON_VOP(qadd_s8, neon_s8, 4)
    209 #undef NEON_FN
    210 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
    211 NEON_VOP(qadd_s16, neon_s16, 2)
    212 #undef NEON_FN
    213 #undef NEON_SSAT
    214 
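         /* Signed addition overflows iff both operands have the same sign and
          * the result's sign differs; on overflow we saturate to INT32_MAX or
          * INT32_MIN (INT64_MAX/INT64_MIN below) according to the sign of the
          * first operand.  */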
    215 uint32_t HELPER(neon_qadd_s32)(uint32_t a, uint32_t b)
    216 {
    217     uint32_t res = a + b;
    218     if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
    219         SET_QC();
    220         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    221     }
    222     return res;
    223 }
    224 
    225 uint64_t HELPER(neon_qadd_s64)(uint64_t src1, uint64_t src2)
    226 {
    227     uint64_t res;
    228 
    229     res = src1 + src2;
    230     if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
    231         SET_QC();
    232         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    233     }
    234     return res;
    235 }
    236 
    237 #define NEON_USAT(dest, src1, src2, type) do { \
    238     uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    239     if (tmp != (type)tmp) { \
    240         SET_QC(); \
    241         dest = 0; \
    242     } else { \
    243         dest = tmp; \
    244     }} while(0)
    245 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
    246 NEON_VOP(qsub_u8, neon_u8, 4)
    247 #undef NEON_FN
    248 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
    249 NEON_VOP(qsub_u16, neon_u16, 2)
    250 #undef NEON_FN
    251 #undef NEON_USAT
    252 
    253 uint32_t HELPER(neon_qsub_u32)(uint32_t a, uint32_t b)
    254 {
    255     uint32_t res = a - b;
    256     if (res > a) {
    257         SET_QC();
    258         res = 0;
    259     }
    260     return res;
    261 }
    262 
    263 uint64_t HELPER(neon_qsub_u64)(uint64_t src1, uint64_t src2)
    264 {
    265     uint64_t res;
    266 
    267     if (src1 < src2) {
    268         SET_QC();
    269         res = 0;
    270     } else {
    271         res = src1 - src2;
    272     }
    273     return res;
    274 }
    275 
    276 #define NEON_SSAT(dest, src1, src2, type) do { \
    277     int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    278     if (tmp != (type)tmp) { \
    279         SET_QC(); \
    280         if (src2 < 0) { \
    281             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
    282         } else { \
    283             tmp = 1 << (sizeof(type) * 8 - 1); \
    284         } \
    285     } \
    286     dest = tmp; \
    287     } while(0)
    288 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
    289 NEON_VOP(qsub_s8, neon_s8, 4)
    290 #undef NEON_FN
    291 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
    292 NEON_VOP(qsub_s16, neon_s16, 2)
    293 #undef NEON_FN
    294 #undef NEON_SSAT
    295 
    296 uint32_t HELPER(neon_qsub_s32)(uint32_t a, uint32_t b)
    297 {
    298     uint32_t res = a - b;
    299     if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
    300         SET_QC();
    301         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    302     }
    303     return res;
    304 }
    305 
    306 uint64_t HELPER(neon_qsub_s64)(uint64_t src1, uint64_t src2)
    307 {
    308     uint64_t res;
    309 
    310     res = src1 - src2;
    311     if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
    312         SET_QC();
    313         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    314     }
    315     return res;
    316 }
    317 
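         /* Halving add: (src1 + src2) >> 1 without losing the carry out of the
          * element.  The 32-bit versions shift each operand before adding and
          * put back the bit that is lost only when both low bits are set.  */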
    318 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
    319 NEON_VOP(hadd_s8, neon_s8, 4)
    320 NEON_VOP(hadd_u8, neon_u8, 4)
    321 NEON_VOP(hadd_s16, neon_s16, 2)
    322 NEON_VOP(hadd_u16, neon_u16, 2)
    323 #undef NEON_FN
    324 
    325 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
    326 {
    327     int32_t dest;
    328 
    329     dest = (src1 >> 1) + (src2 >> 1);
    330     if (src1 & src2 & 1)
    331         dest++;
    332     return dest;
    333 }
    334 
    335 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
    336 {
    337     uint32_t dest;
    338 
    339     dest = (src1 >> 1) + (src2 >> 1);
    340     if (src1 & src2 & 1)
    341         dest++;
    342     return dest;
    343 }
    344 
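         /* Rounding halving add: (src1 + src2 + 1) >> 1, so the discarded half
          * bit rounds up; hence the (src1 | src2) & 1 correction in the 32-bit
          * versions.  */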
    345 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
    346 NEON_VOP(rhadd_s8, neon_s8, 4)
    347 NEON_VOP(rhadd_u8, neon_u8, 4)
    348 NEON_VOP(rhadd_s16, neon_s16, 2)
    349 NEON_VOP(rhadd_u16, neon_u16, 2)
    350 #undef NEON_FN
    351 
    352 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
    353 {
    354     int32_t dest;
    355 
    356     dest = (src1 >> 1) + (src2 >> 1);
    357     if ((src1 | src2) & 1)
    358         dest++;
    359     return dest;
    360 }
    361 
    362 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
    363 {
    364     uint32_t dest;
    365 
    366     dest = (src1 >> 1) + (src2 >> 1);
    367     if ((src1 | src2) & 1)
    368         dest++;
    369     return dest;
    370 }
    371 
    372 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
    373 NEON_VOP(hsub_s8, neon_s8, 4)
    374 NEON_VOP(hsub_u8, neon_u8, 4)
    375 NEON_VOP(hsub_s16, neon_s16, 2)
    376 NEON_VOP(hsub_u16, neon_u16, 2)
    377 #undef NEON_FN
    378 
    379 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
    380 {
    381     int32_t dest;
    382 
    383     dest = (src1 >> 1) - (src2 >> 1);
    384     if ((~src1) & src2 & 1)
    385         dest--;
    386     return dest;
    387 }
    388 
    389 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
    390 {
    391     uint32_t dest;
    392 
    393     dest = (src1 >> 1) - (src2 >> 1);
    394     if ((~src1) & src2 & 1)
    395         dest--;
    396     return dest;
    397 }
    398 
    399 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
    400 NEON_VOP(cgt_s8, neon_s8, 4)
    401 NEON_VOP(cgt_u8, neon_u8, 4)
    402 NEON_VOP(cgt_s16, neon_s16, 2)
    403 NEON_VOP(cgt_u16, neon_u16, 2)
    404 NEON_VOP(cgt_s32, neon_s32, 1)
    405 NEON_VOP(cgt_u32, neon_u32, 1)
    406 #undef NEON_FN
    407 
    408 #define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
    409 NEON_VOP(cge_s8, neon_s8, 4)
    410 NEON_VOP(cge_u8, neon_u8, 4)
    411 NEON_VOP(cge_s16, neon_s16, 2)
    412 NEON_VOP(cge_u16, neon_u16, 2)
    413 NEON_VOP(cge_s32, neon_s32, 1)
    414 NEON_VOP(cge_u32, neon_u32, 1)
    415 #undef NEON_FN
    416 
    417 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
    418 NEON_VOP(min_s8, neon_s8, 4)
    419 NEON_VOP(min_u8, neon_u8, 4)
    420 NEON_VOP(min_s16, neon_s16, 2)
    421 NEON_VOP(min_u16, neon_u16, 2)
    422 NEON_VOP(min_s32, neon_s32, 1)
    423 NEON_VOP(min_u32, neon_u32, 1)
    424 NEON_POP(pmin_s8, neon_s8, 4)
    425 NEON_POP(pmin_u8, neon_u8, 4)
    426 NEON_POP(pmin_s16, neon_s16, 2)
    427 NEON_POP(pmin_u16, neon_u16, 2)
    428 #undef NEON_FN
    429 
    430 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
    431 NEON_VOP(max_s8, neon_s8, 4)
    432 NEON_VOP(max_u8, neon_u8, 4)
    433 NEON_VOP(max_s16, neon_s16, 2)
    434 NEON_VOP(max_u16, neon_u16, 2)
    435 NEON_VOP(max_s32, neon_s32, 1)
    436 NEON_VOP(max_u32, neon_u32, 1)
    437 NEON_POP(pmax_s8, neon_s8, 4)
    438 NEON_POP(pmax_u8, neon_u8, 4)
    439 NEON_POP(pmax_s16, neon_s16, 2)
    440 NEON_POP(pmax_u16, neon_u16, 2)
    441 #undef NEON_FN
    442 
    443 #define NEON_FN(dest, src1, src2) \
    444     dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
    445 NEON_VOP(abd_s8, neon_s8, 4)
    446 NEON_VOP(abd_u8, neon_u8, 4)
    447 NEON_VOP(abd_s16, neon_s16, 2)
    448 NEON_VOP(abd_u16, neon_u16, 2)
    449 NEON_VOP(abd_s32, neon_s32, 1)
    450 NEON_VOP(abd_u32, neon_u32, 1)
    451 #undef NEON_FN
    452 
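         /* Variable shifts take the count from the low byte of the second
          * operand as a signed value: positive counts shift left, negative
          * counts shift right, and counts of the element width or more in
          * either direction produce 0 in the unsigned case.  */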
    453 #define NEON_FN(dest, src1, src2) do { \
    454     int8_t tmp; \
    455     tmp = (int8_t)src2; \
    456     if (tmp >= (ssize_t)sizeof(src1) * 8 || \
    457         tmp <= -(ssize_t)sizeof(src1) * 8) { \
    458         dest = 0; \
    459     } else if (tmp < 0) { \
    460         dest = src1 >> -tmp; \
    461     } else { \
    462         dest = src1 << tmp; \
    463     }} while (0)
    464 NEON_VOP(shl_u8, neon_u8, 4)
    465 NEON_VOP(shl_u16, neon_u16, 2)
    466 NEON_VOP(shl_u32, neon_u32, 1)
    467 #undef NEON_FN
    468 
    469 uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
    470 {
    471     int8_t shift = (int8_t)shiftop;
    472     if (shift >= 64 || shift <= -64) {
    473         val = 0;
    474     } else if (shift < 0) {
    475         val >>= -shift;
    476     } else {
    477         val <<= shift;
    478     }
    479     return val;
    480 }
    481 
    482 #define NEON_FN(dest, src1, src2) do { \
    483     int8_t tmp; \
    484     tmp = (int8_t)src2; \
    485     if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    486         dest = 0; \
    487     } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
    488         dest = src1 >> (sizeof(src1) * 8 - 1); \
    489     } else if (tmp < 0) { \
    490         dest = src1 >> -tmp; \
    491     } else { \
    492         dest = src1 << tmp; \
    493     }} while (0)
    494 NEON_VOP(shl_s8, neon_s8, 4)
    495 NEON_VOP(shl_s16, neon_s16, 2)
    496 NEON_VOP(shl_s32, neon_s32, 1)
    497 #undef NEON_FN
    498 
    499 uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
    500 {
    501     int8_t shift = (int8_t)shiftop;
    502     int64_t val = valop;
    503     if (shift >= 64) {
    504         val = 0;
    505     } else if (shift <= -64) {
    506         val >>= 63;
    507     } else if (shift < 0) {
    508         val >>= -shift;
    509     } else {
    510         val <<= shift;
    511     }
    512     return val;
    513 }
    514 
    515 #define NEON_FN(dest, src1, src2) do { \
    516     int8_t tmp; \
    517     tmp = (int8_t)src2; \
    518     if ((tmp >= (ssize_t)sizeof(src1) * 8) \
    519         || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
    520         dest = 0; \
    521     } else if (tmp < 0) { \
    522         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    523     } else { \
    524         dest = src1 << tmp; \
    525     }} while (0)
    526 NEON_VOP(rshl_s8, neon_s8, 4)
    527 NEON_VOP(rshl_s16, neon_s16, 2)
    528 #undef NEON_FN
    529 
     530 /* The addition of the rounding constant may overflow, so we use an
     531  * intermediate 64-bit accumulator.  */
    532 uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
    533 {
    534     int32_t dest;
    535     int32_t val = (int32_t)valop;
    536     int8_t shift = (int8_t)shiftop;
    537     if ((shift >= 32) || (shift <= -32)) {
    538         dest = 0;
    539     } else if (shift < 0) {
    540         int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
    541         dest = big_dest >> -shift;
    542     } else {
    543         dest = val << shift;
    544     }
    545     return dest;
    546 }
    547 
     548 /* Handling addition overflow with 64-bit input values is trickier
     549  * than with 32-bit values.  */
    550 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
    551 {
    552     int8_t shift = (int8_t)shiftop;
    553     int64_t val = valop;
    554     if ((shift >= 64) || (shift <= -64)) {
    555         val = 0;
    556     } else if (shift < 0) {
    557         val >>= (-shift - 1);
    558         if (val == INT64_MAX) {
    559             /* In this case, it means that the rounding constant is 1,
    560              * and the addition would overflow. Return the actual
    561              * result directly.  */
    562             val = 0x4000000000000000LL;
    563         } else {
    564             val++;
    565             val >>= 1;
    566         }
    567     } else {
    568         val <<= shift;
    569     }
    570     return val;
    571 }
    572 
    573 #define NEON_FN(dest, src1, src2) do { \
    574     int8_t tmp; \
    575     tmp = (int8_t)src2; \
    576     if (tmp >= (ssize_t)sizeof(src1) * 8 || \
    577         tmp < -(ssize_t)sizeof(src1) * 8) { \
    578         dest = 0; \
    579     } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
    580         dest = src1 >> (-tmp - 1); \
    581     } else if (tmp < 0) { \
    582         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    583     } else { \
    584         dest = src1 << tmp; \
    585     }} while (0)
    586 NEON_VOP(rshl_u8, neon_u8, 4)
    587 NEON_VOP(rshl_u16, neon_u16, 2)
    588 #undef NEON_FN
    589 
     590 /* The addition of the rounding constant may overflow, so we use an
     591  * intermediate 64-bit accumulator.  */
    592 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
    593 {
    594     uint32_t dest;
    595     int8_t shift = (int8_t)shiftop;
    596     if (shift >= 32 || shift < -32) {
    597         dest = 0;
    598     } else if (shift == -32) {
    599         dest = val >> 31;
    600     } else if (shift < 0) {
    601         uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
    602         dest = big_dest >> -shift;
    603     } else {
    604         dest = val << shift;
    605     }
    606     return dest;
    607 }
    608 
     609 /* Handling addition overflow with 64-bit input values is trickier
     610  * than with 32-bit values.  */
    611 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
    612 {
    613     int8_t shift = (uint8_t)shiftop;
    614     if (shift >= 64 || shift < -64) {
    615         val = 0;
    616     } else if (shift == -64) {
    617         /* Rounding a 1-bit result just preserves that bit.  */
    618         val >>= 63;
    619     } else if (shift < 0) {
    620         val >>= (-shift - 1);
    621         if (val == UINT64_MAX) {
    622             /* In this case, it means that the rounding constant is 1,
    623              * and the addition would overflow. Return the actual
    624              * result directly.  */
    625             val = 0x8000000000000000ULL;
    626         } else {
    627             val++;
    628             val >>= 1;
    629         }
    630     } else {
    631         val <<= shift;
    632     }
    633     return val;
    634 }
    635 
    636 #define NEON_FN(dest, src1, src2) do { \
    637     int8_t tmp; \
    638     tmp = (int8_t)src2; \
    639     if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    640         if (src1) { \
    641             SET_QC(); \
    642             dest = ~0; \
    643         } else { \
    644             dest = 0; \
    645         } \
    646     } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
    647         dest = 0; \
    648     } else if (tmp < 0) { \
    649         dest = src1 >> -tmp; \
    650     } else { \
    651         dest = src1 << tmp; \
    652         if ((dest >> tmp) != src1) { \
    653             SET_QC(); \
    654             dest = ~0; \
    655         } \
    656     }} while (0)
    657 NEON_VOP(qshl_u8, neon_u8, 4)
    658 NEON_VOP(qshl_u16, neon_u16, 2)
    659 NEON_VOP(qshl_u32, neon_u32, 1)
    660 #undef NEON_FN
    661 
    662 uint64_t HELPER(neon_qshl_u64)(uint64_t val, uint64_t shiftop)
    663 {
    664     int8_t shift = (int8_t)shiftop;
    665     if (shift >= 64) {
    666         if (val) {
    667             val = ~(uint64_t)0;
    668             SET_QC();
    669         }
    670     } else if (shift <= -64) {
    671         val = 0;
    672     } else if (shift < 0) {
    673         val >>= -shift;
    674     } else {
    675         uint64_t tmp = val;
    676         val <<= shift;
    677         if ((val >> shift) != tmp) {
    678             SET_QC();
    679             val = ~(uint64_t)0;
    680         }
    681     }
    682     return val;
    683 }
    684 
    685 #define NEON_FN(dest, src1, src2) do { \
    686     int8_t tmp; \
    687     tmp = (int8_t)src2; \
    688     if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    689         if (src1) { \
    690             SET_QC(); \
    691             dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
    692             if (src1 > 0) { \
    693                 dest--; \
    694             } \
    695         } else { \
    696             dest = src1; \
    697         } \
    698     } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
    699         dest = src1 >> 31; \
    700     } else if (tmp < 0) { \
    701         dest = src1 >> -tmp; \
    702     } else { \
    703         dest = src1 << tmp; \
    704         if ((dest >> tmp) != src1) { \
    705             SET_QC(); \
    706             dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
    707             if (src1 > 0) { \
    708                 dest--; \
    709             } \
    710         } \
    711     }} while (0)
    712 NEON_VOP(qshl_s8, neon_s8, 4)
    713 NEON_VOP(qshl_s16, neon_s16, 2)
    714 NEON_VOP(qshl_s32, neon_s32, 1)
    715 #undef NEON_FN
    716 
    717 uint64_t HELPER(neon_qshl_s64)(uint64_t valop, uint64_t shiftop)
    718 {
    719     int8_t shift = (uint8_t)shiftop;
    720     int64_t val = valop;
    721     if (shift >= 64) {
    722         if (val) {
    723             SET_QC();
    724             val = (val >> 63) ^ ~SIGNBIT64;
    725         }
    726     } else if (shift <= -64) {
    727         val >>= 63;
    728     } else if (shift < 0) {
    729         val >>= -shift;
    730     } else {
    731         int64_t tmp = val;
    732         val <<= shift;
    733         if ((val >> shift) != tmp) {
    734             SET_QC();
    735             val = (tmp >> 63) ^ ~SIGNBIT64;
    736         }
    737     }
    738     return val;
    739 }
    740 
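         /* Signed-to-unsigned saturating shift left (VQSHLU): negative inputs
          * saturate to 0 and set QC; non-negative inputs are shifted like the
          * unsigned saturating shift above.  */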
    741 #define NEON_FN(dest, src1, src2) do { \
    742     if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
    743         SET_QC(); \
    744         dest = 0; \
    745     } else { \
    746         int8_t tmp; \
    747         tmp = (int8_t)src2; \
    748         if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    749             if (src1) { \
    750                 SET_QC(); \
    751                 dest = ~0; \
    752             } else { \
    753                 dest = 0; \
    754             } \
    755         } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
    756             dest = 0; \
    757         } else if (tmp < 0) { \
    758             dest = src1 >> -tmp; \
    759         } else { \
    760             dest = src1 << tmp; \
    761             if ((dest >> tmp) != src1) { \
    762                 SET_QC(); \
    763                 dest = ~0; \
    764             } \
    765         } \
    766     }} while (0)
    767 NEON_VOP(qshlu_s8, neon_u8, 4)
    768 NEON_VOP(qshlu_s16, neon_u16, 2)
    769 #undef NEON_FN
    770 
    771 uint32_t HELPER(neon_qshlu_s32)(uint32_t valop, uint32_t shiftop)
    772 {
    773     if ((int32_t)valop < 0) {
    774         SET_QC();
    775         return 0;
    776     }
    777     return helper_neon_qshl_u32(valop, shiftop);
    778 }
    779 
    780 uint64_t HELPER(neon_qshlu_s64)(uint64_t valop, uint64_t shiftop)
    781 {
    782     if ((int64_t)valop < 0) {
    783         SET_QC();
    784         return 0;
    785     }
    786     return helper_neon_qshl_u64(valop, shiftop);
    787 }
    788 
    789 /* FIXME: This is wrong.  */
    790 #define NEON_FN(dest, src1, src2) do { \
    791     int8_t tmp; \
    792     tmp = (int8_t)src2; \
    793     if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    794         if (src1) { \
    795             SET_QC(); \
    796             dest = ~0; \
    797         } else { \
    798             dest = 0; \
    799         } \
    800     } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
    801         dest = 0; \
    802     } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
    803         dest = src1 >> (sizeof(src1) * 8 - 1); \
    804     } else if (tmp < 0) { \
    805         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    806     } else { \
    807         dest = src1 << tmp; \
    808         if ((dest >> tmp) != src1) { \
    809             SET_QC(); \
    810             dest = ~0; \
    811         } \
    812     }} while (0)
    813 NEON_VOP(qrshl_u8, neon_u8, 4)
    814 NEON_VOP(qrshl_u16, neon_u16, 2)
    815 #undef NEON_FN
    816 
     817 /* The addition of the rounding constant may overflow, so we use an
     818  * intermediate 64-bit accumulator.  */
    819 uint32_t HELPER(neon_qrshl_u32)(uint32_t val, uint32_t shiftop)
    820 {
    821     uint32_t dest;
    822     int8_t shift = (int8_t)shiftop;
    823     if (shift >= 32) {
    824         if (val) {
    825             SET_QC();
    826             dest = ~0;
    827         } else {
    828             dest = 0;
    829         }
    830     } else if (shift < -32) {
    831         dest = 0;
    832     } else if (shift == -32) {
    833         dest = val >> 31;
    834     } else if (shift < 0) {
    835         uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
    836         dest = big_dest >> -shift;
    837     } else {
    838         dest = val << shift;
    839         if ((dest >> shift) != val) {
    840             SET_QC();
    841             dest = ~0;
    842         }
    843     }
    844     return dest;
    845 }
    846 
     847 /* Handling addition overflow with 64-bit input values is trickier
     848  * than with 32-bit values.  */
    849 uint64_t HELPER(neon_qrshl_u64)(uint64_t val, uint64_t shiftop)
    850 {
    851     int8_t shift = (int8_t)shiftop;
    852     if (shift >= 64) {
    853         if (val) {
    854             SET_QC();
    855             val = ~0;
    856         }
    857     } else if (shift < -64) {
    858         val = 0;
    859     } else if (shift == -64) {
    860         val >>= 63;
    861     } else if (shift < 0) {
    862         val >>= (-shift - 1);
    863         if (val == UINT64_MAX) {
    864             /* In this case, it means that the rounding constant is 1,
    865              * and the addition would overflow. Return the actual
    866              * result directly.  */
    867             val = 0x8000000000000000ULL;
    868         } else {
    869             val++;
    870             val >>= 1;
    871         }
    872     } else {
    873         uint64_t tmp = val;
    874         val <<= shift;
    875         if ((val >> shift) != tmp) {
    876             SET_QC();
    877             val = ~0;
    878         }
    879     }
    880     return val;
    881 }
    882 
    883 #define NEON_FN(dest, src1, src2) do { \
    884     int8_t tmp; \
    885     tmp = (int8_t)src2; \
    886     if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    887         if (src1) { \
    888             SET_QC(); \
    889             dest = (1 << (sizeof(src1) * 8 - 1)); \
    890             if (src1 > 0) { \
    891                 dest--; \
    892             } \
    893         } else { \
    894             dest = 0; \
    895         } \
    896     } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
    897         dest = 0; \
    898     } else if (tmp < 0) { \
    899         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    900     } else { \
    901         dest = src1 << tmp; \
    902         if ((dest >> tmp) != src1) { \
    903             SET_QC(); \
    904             dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
    905             if (src1 > 0) { \
    906                 dest--; \
    907             } \
    908         } \
    909     }} while (0)
    910 NEON_VOP(qrshl_s8, neon_s8, 4)
    911 NEON_VOP(qrshl_s16, neon_s16, 2)
    912 #undef NEON_FN
    913 
     914 /* The addition of the rounding constant may overflow, so we use an
     915  * intermediate 64-bit accumulator.  */
    916 uint32_t HELPER(neon_qrshl_s32)(uint32_t valop, uint32_t shiftop)
    917 {
    918     int32_t dest;
    919     int32_t val = (int32_t)valop;
    920     int8_t shift = (int8_t)shiftop;
    921     if (shift >= 32) {
    922         if (val) {
    923             SET_QC();
    924             dest = (val >> 31) ^ ~SIGNBIT;
    925         } else {
    926             dest = 0;
    927         }
    928     } else if (shift <= -32) {
    929         dest = 0;
    930     } else if (shift < 0) {
    931         int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
    932         dest = big_dest >> -shift;
    933     } else {
    934         dest = val << shift;
    935         if ((dest >> shift) != val) {
    936             SET_QC();
    937             dest = (val >> 31) ^ ~SIGNBIT;
    938         }
    939     }
    940     return dest;
    941 }
    942 
     943 /* Handling addition overflow with 64-bit input values is trickier
     944  * than with 32-bit values.  */
    945 uint64_t HELPER(neon_qrshl_s64)(uint64_t valop, uint64_t shiftop)
    946 {
    947     int8_t shift = (uint8_t)shiftop;
    948     int64_t val = valop;
    949 
    950     if (shift >= 64) {
    951         if (val) {
    952             SET_QC();
    953             val = (val >> 63) ^ ~SIGNBIT64;
    954         }
    955     } else if (shift <= -64) {
    956         val = 0;
    957     } else if (shift < 0) {
    958         val >>= (-shift - 1);
    959         if (val == INT64_MAX) {
    960             /* In this case, it means that the rounding constant is 1,
    961              * and the addition would overflow. Return the actual
    962              * result directly.  */
    963             val = 0x4000000000000000ULL;
    964         } else {
    965             val++;
    966             val >>= 1;
    967         }
    968     } else {
    969         int64_t tmp = val;
    970         val <<= shift;
    971         if ((val >> shift) != tmp) {
    972             SET_QC();
    973             val = (tmp >> 63) ^ ~SIGNBIT64;
    974         }
    975     }
    976     return val;
    977 }
    978 
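         /* Lane-wise add on packed 8/16-bit elements: clearing the top bit of
          * each lane stops carries propagating across lane boundaries, and the
          * XOR with the saved top bits restores each lane's correct MSB.  */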
    979 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
    980 {
    981     uint32_t mask;
    982     mask = (a ^ b) & 0x80808080u;
    983     a &= ~0x80808080u;
    984     b &= ~0x80808080u;
    985     return (a + b) ^ mask;
    986 }
    987 
    988 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
    989 {
    990     uint32_t mask;
    991     mask = (a ^ b) & 0x80008000u;
    992     a &= ~0x80008000u;
    993     b &= ~0x80008000u;
    994     return (a + b) ^ mask;
    995 }
    996 
    997 #define NEON_FN(dest, src1, src2) dest = src1 + src2
    998 NEON_POP(padd_u8, neon_u8, 4)
    999 NEON_POP(padd_u16, neon_u16, 2)
   1000 #undef NEON_FN
   1001 
   1002 #define NEON_FN(dest, src1, src2) dest = src1 - src2
   1003 NEON_VOP(sub_u8, neon_u8, 4)
   1004 NEON_VOP(sub_u16, neon_u16, 2)
   1005 #undef NEON_FN
   1006 
   1007 #define NEON_FN(dest, src1, src2) dest = src1 * src2
   1008 NEON_VOP(mul_u8, neon_u8, 4)
   1009 NEON_VOP(mul_u16, neon_u16, 2)
   1010 #undef NEON_FN
   1011 
   1012 /* Polynomial multiplication is like integer multiplication except the
   1013    partial products are XORed, not added.  */
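         /* The loop below is a shift-and-XOR multiply over GF(2) performed on
          * all four byte lanes at once; the masks applied after each shift
          * keep bits from crossing lane boundaries.  */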
   1014 uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
   1015 {
   1016     uint32_t mask;
   1017     uint32_t result;
   1018     result = 0;
   1019     while (op1) {
   1020         mask = 0;
   1021         if (op1 & 1)
   1022             mask |= 0xff;
   1023         if (op1 & (1 << 8))
   1024             mask |= (0xff << 8);
   1025         if (op1 & (1 << 16))
   1026             mask |= (0xff << 16);
   1027         if (op1 & (1 << 24))
   1028             mask |= (0xff << 24);
   1029         result ^= op2 & mask;
   1030         op1 = (op1 >> 1) & 0x7f7f7f7f;
   1031         op2 = (op2 << 1) & 0xfefefefe;
   1032     }
   1033     return result;
   1034 }
   1035 
   1036 uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
   1037 {
   1038     uint64_t result = 0;
   1039     uint64_t mask;
   1040     uint64_t op2ex = op2;
   1041     op2ex = (op2ex & 0xff) |
   1042         ((op2ex & 0xff00) << 8) |
   1043         ((op2ex & 0xff0000) << 16) |
   1044         ((op2ex & 0xff000000) << 24);
   1045     while (op1) {
   1046         mask = 0;
   1047         if (op1 & 1) {
   1048             mask |= 0xffff;
   1049         }
   1050         if (op1 & (1 << 8)) {
   1051             mask |= (0xffffU << 16);
   1052         }
   1053         if (op1 & (1 << 16)) {
   1054             mask |= (0xffffULL << 32);
   1055         }
   1056         if (op1 & (1 << 24)) {
   1057             mask |= (0xffffULL << 48);
   1058         }
   1059         result ^= op2ex & mask;
   1060         op1 = (op1 >> 1) & 0x7f7f7f7f;
   1061         op2ex <<= 1;
   1062     }
   1063     return result;
   1064 }
   1065 
   1066 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
   1067 NEON_VOP(tst_u8, neon_u8, 4)
   1068 NEON_VOP(tst_u16, neon_u16, 2)
   1069 NEON_VOP(tst_u32, neon_u32, 1)
   1070 #undef NEON_FN
   1071 
   1072 #define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
   1073 NEON_VOP(ceq_u8, neon_u8, 4)
   1074 NEON_VOP(ceq_u16, neon_u16, 2)
   1075 NEON_VOP(ceq_u32, neon_u32, 1)
   1076 #undef NEON_FN
   1077 
   1078 #define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
   1079 NEON_VOP1(abs_s8, neon_s8, 4)
   1080 NEON_VOP1(abs_s16, neon_s16, 2)
   1081 #undef NEON_FN
   1082 
   1083 /* Count Leading Sign/Zero Bits.  */
   1084 static inline int do_clz8(uint8_t x)
   1085 {
   1086     int n;
   1087     for (n = 8; x; n--)
   1088         x >>= 1;
   1089     return n;
   1090 }
   1091 
   1092 static inline int do_clz16(uint16_t x)
   1093 {
   1094     int n;
   1095     for (n = 16; x; n--)
   1096         x >>= 1;
   1097     return n;
   1098 }
   1099 
   1100 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
   1101 NEON_VOP1(clz_u8, neon_u8, 4)
   1102 #undef NEON_FN
   1103 
   1104 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
   1105 NEON_VOP1(clz_u16, neon_u16, 2)
   1106 #undef NEON_FN
   1107 
   1108 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
   1109 NEON_VOP1(cls_s8, neon_s8, 4)
   1110 #undef NEON_FN
   1111 
   1112 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
   1113 NEON_VOP1(cls_s16, neon_s16, 2)
   1114 #undef NEON_FN
   1115 
   1116 uint32_t HELPER(neon_cls_s32)(uint32_t x)
   1117 {
   1118     int count;
   1119     if ((int32_t)x < 0)
   1120         x = ~x;
   1121     for (count = 32; x; count--)
   1122         x = x >> 1;
   1123     return count - 1;
   1124 }
   1125 
   1126 /* Bit count.  */
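         /* Classic parallel population count: sum adjacent 1-, 2- and 4-bit
          * fields so that each byte ends up holding its own bit count, as
          * VCNT requires.  */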
   1127 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
   1128 {
   1129     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
   1130     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
   1131     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
   1132     return x;
   1133 }
   1134 
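         /* Saturating (rounding) doubling multiply returning the high half
          * (VQDMULH/VQRDMULH).  Doubling only overflows when squaring the most
          * negative value, which saturates and sets QC; the rounding increment
          * can also overflow and is saturated separately.  */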
   1135 #define NEON_QDMULH16(dest, src1, src2, round) do { \
   1136     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
   1137     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
   1138         SET_QC(); \
   1139         tmp = (tmp >> 31) ^ ~SIGNBIT; \
   1140     } else { \
   1141         tmp <<= 1; \
   1142     } \
   1143     if (round) { \
   1144         int32_t old = tmp; \
   1145         tmp += 1 << 15; \
   1146         if ((int32_t)tmp < old) { \
   1147             SET_QC(); \
   1148             tmp = SIGNBIT - 1; \
   1149         } \
   1150     } \
   1151     dest = tmp >> 16; \
   1152     } while(0)
   1153 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
   1154 NEON_VOP(qdmulh_s16, neon_s16, 2)
   1155 #undef NEON_FN
   1156 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
   1157 NEON_VOP(qrdmulh_s16, neon_s16, 2)
   1158 #undef NEON_FN
   1159 #undef NEON_QDMULH16
   1160 
   1161 #define NEON_QDMULH32(dest, src1, src2, round) do { \
   1162     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
   1163     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
   1164         SET_QC(); \
   1165         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
   1166     } else { \
   1167         tmp <<= 1; \
   1168     } \
   1169     if (round) { \
   1170         int64_t old = tmp; \
   1171         tmp += (int64_t)1 << 31; \
   1172         if ((int64_t)tmp < old) { \
   1173             SET_QC(); \
   1174             tmp = SIGNBIT64 - 1; \
   1175         } \
   1176     } \
   1177     dest = tmp >> 32; \
   1178     } while(0)
   1179 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
   1180 NEON_VOP(qdmulh_s32, neon_s32, 1)
   1181 #undef NEON_FN
   1182 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
   1183 NEON_VOP(qrdmulh_s32, neon_s32, 1)
   1184 #undef NEON_FN
   1185 #undef NEON_QDMULH32
   1186 
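         /* Narrowing: pack the low half (or, for the _high_ variants, the high
          * half) of each element of a 64-bit vector into a 32-bit result.  The
          * _round_ variants add half an output LSB first; the _sat_ variants
          * clamp out-of-range values and set QC.  */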
   1187 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
   1188 {
   1189     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
   1190            | ((x >> 24) & 0xff000000u);
   1191 }
   1192 
   1193 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
   1194 {
   1195     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
   1196 }
   1197 
   1198 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
   1199 {
   1200     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
   1201             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
   1202 }
   1203 
   1204 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
   1205 {
   1206     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
   1207 }
   1208 
   1209 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
   1210 {
   1211     x &= 0xff80ff80ff80ff80ull;
   1212     x += 0x0080008000800080ull;
   1213     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
   1214             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
   1215 }
   1216 
   1217 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
   1218 {
   1219     x &= 0xffff8000ffff8000ull;
   1220     x += 0x0000800000008000ull;
   1221     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
   1222 }
   1223 
   1224 uint32_t HELPER(neon_unarrow_sat8)(uint64_t x)
   1225 {
   1226     uint16_t s;
   1227     uint8_t d;
   1228     uint32_t res = 0;
   1229 #define SAT8(n) \
   1230     s = x >> n; \
   1231     if (s & 0x8000) { \
   1232         SET_QC(); \
   1233     } else { \
   1234         if (s > 0xff) { \
   1235             d = 0xff; \
   1236             SET_QC(); \
   1237         } else  { \
   1238             d = s; \
   1239         } \
   1240         res |= (uint32_t)d << (n / 2); \
   1241     }
   1242 
   1243     SAT8(0);
   1244     SAT8(16);
   1245     SAT8(32);
   1246     SAT8(48);
   1247 #undef SAT8
   1248     return res;
   1249 }
   1250 
   1251 uint32_t HELPER(neon_narrow_sat_u8)(uint64_t x)
   1252 {
   1253     uint16_t s;
   1254     uint8_t d;
   1255     uint32_t res = 0;
   1256 #define SAT8(n) \
   1257     s = x >> n; \
   1258     if (s > 0xff) { \
   1259         d = 0xff; \
   1260         SET_QC(); \
   1261     } else  { \
   1262         d = s; \
   1263     } \
   1264     res |= (uint32_t)d << (n / 2);
   1265 
   1266     SAT8(0);
   1267     SAT8(16);
   1268     SAT8(32);
   1269     SAT8(48);
   1270 #undef SAT8
   1271     return res;
   1272 }
   1273 
   1274 uint32_t HELPER(neon_narrow_sat_s8)(uint64_t x)
   1275 {
   1276     int16_t s;
   1277     uint8_t d;
   1278     uint32_t res = 0;
   1279 #define SAT8(n) \
   1280     s = x >> n; \
   1281     if (s != (int8_t)s) { \
   1282         d = (s >> 15) ^ 0x7f; \
   1283         SET_QC(); \
   1284     } else  { \
   1285         d = s; \
   1286     } \
   1287     res |= (uint32_t)d << (n / 2);
   1288 
   1289     SAT8(0);
   1290     SAT8(16);
   1291     SAT8(32);
   1292     SAT8(48);
   1293 #undef SAT8
   1294     return res;
   1295 }
   1296 
   1297 uint32_t HELPER(neon_unarrow_sat16)(uint64_t x)
   1298 {
   1299     uint32_t high;
   1300     uint32_t low;
   1301     low = x;
   1302     if (low & 0x80000000) {
   1303         low = 0;
   1304         SET_QC();
   1305     } else if (low > 0xffff) {
   1306         low = 0xffff;
   1307         SET_QC();
   1308     }
   1309     high = x >> 32;
   1310     if (high & 0x80000000) {
   1311         high = 0;
   1312         SET_QC();
   1313     } else if (high > 0xffff) {
   1314         high = 0xffff;
   1315         SET_QC();
   1316     }
   1317     return low | (high << 16);
   1318 }
   1319 
   1320 uint32_t HELPER(neon_narrow_sat_u16)(uint64_t x)
   1321 {
   1322     uint32_t high;
   1323     uint32_t low;
   1324     low = x;
   1325     if (low > 0xffff) {
   1326         low = 0xffff;
   1327         SET_QC();
   1328     }
   1329     high = x >> 32;
   1330     if (high > 0xffff) {
   1331         high = 0xffff;
   1332         SET_QC();
   1333     }
   1334     return low | (high << 16);
   1335 }
   1336 
   1337 uint32_t HELPER(neon_narrow_sat_s16)(uint64_t x)
   1338 {
   1339     int32_t low;
   1340     int32_t high;
   1341     low = x;
   1342     if (low != (int16_t)low) {
   1343         low = (low >> 31) ^ 0x7fff;
   1344         SET_QC();
   1345     }
   1346     high = x >> 32;
   1347     if (high != (int16_t)high) {
   1348         high = (high >> 31) ^ 0x7fff;
   1349         SET_QC();
   1350     }
   1351     return (uint16_t)low | (high << 16);
   1352 }
   1353 
   1354 uint32_t HELPER(neon_unarrow_sat32)(uint64_t x)
   1355 {
   1356     if (x & 0x8000000000000000ull) {
   1357         SET_QC();
   1358         return 0;
   1359     }
   1360     if (x > 0xffffffffu) {
   1361         SET_QC();
   1362         return 0xffffffffu;
   1363     }
   1364     return x;
   1365 }
   1366 
   1367 uint32_t HELPER(neon_narrow_sat_u32)(uint64_t x)
   1368 {
   1369     if (x > 0xffffffffu) {
   1370         SET_QC();
   1371         return 0xffffffffu;
   1372     }
   1373     return x;
   1374 }
   1375 
   1376 uint32_t HELPER(neon_narrow_sat_s32)(uint64_t x)
   1377 {
   1378     if ((int64_t)x != (int32_t)x) {
   1379         SET_QC();
   1380         return ((int64_t)x >> 63) ^ 0x7fffffff;
   1381     }
   1382     return x;
   1383 }
   1384 
   1385 uint64_t HELPER(neon_widen_u8)(uint32_t x)
   1386 {
   1387     uint64_t tmp;
   1388     uint64_t ret;
   1389     ret = (uint8_t)x;
   1390     tmp = (uint8_t)(x >> 8);
   1391     ret |= tmp << 16;
   1392     tmp = (uint8_t)(x >> 16);
   1393     ret |= tmp << 32;
   1394     tmp = (uint8_t)(x >> 24);
   1395     ret |= tmp << 48;
   1396     return ret;
   1397 }
   1398 
   1399 uint64_t HELPER(neon_widen_s8)(uint32_t x)
   1400 {
   1401     uint64_t tmp;
   1402     uint64_t ret;
   1403     ret = (uint16_t)(int8_t)x;
   1404     tmp = (uint16_t)(int8_t)(x >> 8);
   1405     ret |= tmp << 16;
   1406     tmp = (uint16_t)(int8_t)(x >> 16);
   1407     ret |= tmp << 32;
   1408     tmp = (uint16_t)(int8_t)(x >> 24);
   1409     ret |= tmp << 48;
   1410     return ret;
   1411 }
   1412 
   1413 uint64_t HELPER(neon_widen_u16)(uint32_t x)
   1414 {
   1415     uint64_t high = (uint16_t)(x >> 16);
   1416     return ((uint16_t)x) | (high << 32);
   1417 }
   1418 
   1419 uint64_t HELPER(neon_widen_s16)(uint32_t x)
   1420 {
   1421     uint64_t high = (int16_t)(x >> 16);
   1422     return ((uint32_t)(int16_t)x) | (high << 32);
   1423 }
   1424 
   1425 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
   1426 {
   1427     uint64_t mask;
   1428     mask = (a ^ b) & 0x8000800080008000ull;
   1429     a &= ~0x8000800080008000ull;
   1430     b &= ~0x8000800080008000ull;
   1431     return (a + b) ^ mask;
   1432 }
   1433 
   1434 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
   1435 {
   1436     uint64_t mask;
   1437     mask = (a ^ b) & 0x8000000080000000ull;
   1438     a &= ~0x8000000080000000ull;
   1439     b &= ~0x8000000080000000ull;
   1440     return (a + b) ^ mask;
   1441 }
   1442 
   1443 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
   1444 {
   1445     uint64_t tmp;
   1446     uint64_t tmp2;
   1447 
   1448     tmp = a & 0x0000ffff0000ffffull;
   1449     tmp += (a >> 16) & 0x0000ffff0000ffffull;
   1450     tmp2 = b & 0xffff0000ffff0000ull;
   1451     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
   1452     return    ( tmp         & 0xffff)
   1453             | ((tmp  >> 16) & 0xffff0000ull)
   1454             | ((tmp2 << 16) & 0xffff00000000ull)
   1455             | ( tmp2        & 0xffff000000000000ull);
   1456 }
   1457 
   1458 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
   1459 {
   1460     uint32_t low = a + (a >> 32);
   1461     uint32_t high = b + (b >> 32);
   1462     return low + ((uint64_t)high << 32);
   1463 }
   1464 
   1465 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
   1466 {
   1467     uint64_t mask;
   1468     mask = (a ^ ~b) & 0x8000800080008000ull;
   1469     a |= 0x8000800080008000ull;
   1470     b &= ~0x8000800080008000ull;
   1471     return (a - b) ^ mask;
   1472 }
   1473 
   1474 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
   1475 {
   1476     uint64_t mask;
   1477     mask = (a ^ ~b) & 0x8000000080000000ull;
   1478     a |= 0x8000000080000000ull;
   1479     b &= ~0x8000000080000000ull;
   1480     return (a - b) ^ mask;
   1481 }
   1482 
   1483 uint64_t HELPER(neon_addl_saturate_s32)(uint64_t a, uint64_t b)
   1484 {
   1485     uint32_t x, y;
   1486     uint32_t low, high;
   1487 
   1488     x = a;
   1489     y = b;
   1490     low = x + y;
   1491     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1492         SET_QC();
   1493         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1494     }
   1495     x = a >> 32;
   1496     y = b >> 32;
   1497     high = x + y;
   1498     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1499         SET_QC();
   1500         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1501     }
   1502     return low | ((uint64_t)high << 32);
   1503 }
   1504 
   1505 uint64_t HELPER(neon_addl_saturate_s64)(uint64_t a, uint64_t b)
   1506 {
   1507     uint64_t result;
   1508 
   1509     result = a + b;
   1510     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
   1511         SET_QC();
   1512         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
   1513     }
   1514     return result;
   1515 }
   1516 
   1517 /* We have to do the arithmetic in a larger type than
   1518  * the input type, because, for example, with a signed 32-bit
   1519  * op the absolute difference can overflow a signed 32-bit value.
   1520  */
   1521 #define DO_ABD(dest, x, y, intype, arithtype) do {            \
   1522     arithtype tmp_x = (intype)(x);                            \
   1523     arithtype tmp_y = (intype)(y);                            \
   1524     dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
   1525     } while(0)
   1526 
   1527 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
   1528 {
   1529     uint64_t tmp;
   1530     uint64_t result;
   1531     DO_ABD(result, a, b, uint8_t, uint32_t);
   1532     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
   1533     result |= tmp << 16;
   1534     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
   1535     result |= tmp << 32;
   1536     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
   1537     result |= tmp << 48;
   1538     return result;
   1539 }
   1540 
   1541 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
   1542 {
   1543     uint64_t tmp;
   1544     uint64_t result;
   1545     DO_ABD(result, a, b, int8_t, int32_t);
   1546     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
   1547     result |= tmp << 16;
   1548     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
   1549     result |= tmp << 32;
   1550     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
   1551     result |= tmp << 48;
   1552     return result;
   1553 }
   1554 
   1555 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
   1556 {
   1557     uint64_t tmp;
   1558     uint64_t result;
   1559     DO_ABD(result, a, b, uint16_t, uint32_t);
   1560     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1561     return result | (tmp << 32);
   1562 }
   1563 
   1564 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
   1565 {
   1566     uint64_t tmp;
   1567     uint64_t result;
   1568     DO_ABD(result, a, b, int16_t, int32_t);
   1569     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
   1570     return result | (tmp << 32);
   1571 }
   1572 
   1573 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
   1574 {
   1575     uint64_t result;
   1576     DO_ABD(result, a, b, uint32_t, uint64_t);
   1577     return result;
   1578 }
   1579 
   1580 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
   1581 {
   1582     uint64_t result;
   1583     DO_ABD(result, a, b, int32_t, int64_t);
   1584     return result;
   1585 }
   1586 #undef DO_ABD
   1587 
   1588 /* Widening multiply. Named type is the source type.  */
   1589 #define DO_MULL(dest, x, y, type1, type2) do { \
   1590     type1 tmp_x = x; \
   1591     type1 tmp_y = y; \
   1592     dest = (type2)((type2)tmp_x * (type2)tmp_y); \
   1593     } while(0)
   1594 
   1595 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
   1596 {
   1597     uint64_t tmp;
   1598     uint64_t result;
   1599 
   1600     DO_MULL(result, a, b, uint8_t, uint16_t);
   1601     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
   1602     result |= tmp << 16;
   1603     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
   1604     result |= tmp << 32;
   1605     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
   1606     result |= tmp << 48;
   1607     return result;
   1608 }
   1609 
   1610 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
   1611 {
   1612     uint64_t tmp;
   1613     uint64_t result;
   1614 
   1615     DO_MULL(result, a, b, int8_t, uint16_t);
   1616     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
   1617     result |= tmp << 16;
   1618     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
   1619     result |= tmp << 32;
   1620     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
   1621     result |= tmp << 48;
   1622     return result;
   1623 }
   1624 
   1625 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
   1626 {
   1627     uint64_t tmp;
   1628     uint64_t result;
   1629 
   1630     DO_MULL(result, a, b, uint16_t, uint32_t);
   1631     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1632     return result | (tmp << 32);
   1633 }
   1634 
   1635 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
   1636 {
   1637     uint64_t tmp;
   1638     uint64_t result;
   1639 
   1640     DO_MULL(result, a, b, int16_t, uint32_t);
   1641     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
   1642     return result | (tmp << 32);
   1643 }
   1644 
   1645 uint64_t HELPER(neon_negl_u16)(uint64_t x)
   1646 {
   1647     uint16_t tmp;
   1648     uint64_t result;
   1649     result = (uint16_t)-x;
   1650     tmp = -(x >> 16);
   1651     result |= (uint64_t)tmp << 16;
   1652     tmp = -(x >> 32);
   1653     result |= (uint64_t)tmp << 32;
   1654     tmp = -(x >> 48);
   1655     result |= (uint64_t)tmp << 48;
   1656     return result;
   1657 }
   1658 
   1659 uint64_t HELPER(neon_negl_u32)(uint64_t x)
   1660 {
   1661     uint32_t low = -x;
   1662     uint32_t high = -(x >> 32);
   1663     return low | ((uint64_t)high << 32);
   1664 }
   1665 
   1666 /* FIXME:  There should be a native op for this.  */
   1667 uint64_t HELPER(neon_negl_u64)(uint64_t x)
   1668 {
   1669     return -x;
   1670 }
   1671 
   1672 /* Saturating sign manipulation.  */
   1673 /* ??? Make these use NEON_VOP1 */
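         /* The most negative value has no positive counterpart, so QABS/QNEG
          * saturate it to the most positive value and set QC.  */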
   1674 #define DO_QABS8(x) do { \
   1675     if (x == (int8_t)0x80) { \
   1676         x = 0x7f; \
   1677         SET_QC(); \
   1678     } else if (x < 0) { \
   1679         x = -x; \
   1680     }} while (0)
   1681 uint32_t HELPER(neon_qabs_s8)(uint32_t x)
   1682 {
   1683     neon_s8 vec;
   1684     NEON_UNPACK(neon_s8, vec, x);
   1685     DO_QABS8(vec.v1);
   1686     DO_QABS8(vec.v2);
   1687     DO_QABS8(vec.v3);
   1688     DO_QABS8(vec.v4);
   1689     NEON_PACK(neon_s8, x, vec);
   1690     return x;
   1691 }
   1692 #undef DO_QABS8
   1693 
   1694 #define DO_QNEG8(x) do { \
   1695     if (x == (int8_t)0x80) { \
   1696         x = 0x7f; \
   1697         SET_QC(); \
   1698     } else { \
   1699         x = -x; \
   1700     }} while (0)
   1701 uint32_t HELPER(neon_qneg_s8)(uint32_t x)
   1702 {
   1703     neon_s8 vec;
   1704     NEON_UNPACK(neon_s8, vec, x);
   1705     DO_QNEG8(vec.v1);
   1706     DO_QNEG8(vec.v2);
   1707     DO_QNEG8(vec.v3);
   1708     DO_QNEG8(vec.v4);
   1709     NEON_PACK(neon_s8, x, vec);
   1710     return x;
   1711 }
   1712 #undef DO_QNEG8
   1713 
   1714 #define DO_QABS16(x) do { \
   1715     if (x == (int16_t)0x8000) { \
   1716         x = 0x7fff; \
   1717         SET_QC(); \
   1718     } else if (x < 0) { \
   1719         x = -x; \
   1720     }} while (0)
   1721 uint32_t HELPER(neon_qabs_s16)(uint32_t x)
   1722 {
   1723     neon_s16 vec;
   1724     NEON_UNPACK(neon_s16, vec, x);
   1725     DO_QABS16(vec.v1);
   1726     DO_QABS16(vec.v2);
   1727     NEON_PACK(neon_s16, x, vec);
   1728     return x;
   1729 }
   1730 #undef DO_QABS16
   1731 
   1732 #define DO_QNEG16(x) do { \
   1733     if (x == (int16_t)0x8000) { \
   1734         x = 0x7fff; \
   1735         SET_QC(); \
   1736     } else { \
   1737         x = -x; \
   1738     }} while (0)
   1739 uint32_t HELPER(neon_qneg_s16)(uint32_t x)
   1740 {
   1741     neon_s16 vec;
   1742     NEON_UNPACK(neon_s16, vec, x);
   1743     DO_QNEG16(vec.v1);
   1744     DO_QNEG16(vec.v2);
   1745     NEON_PACK(neon_s16, x, vec);
   1746     return x;
   1747 }
   1748 #undef DO_QNEG16
   1749 
   1750 uint32_t HELPER(neon_qabs_s32)(uint32_t x)
   1751 {
   1752     if (x == SIGNBIT) {
   1753         SET_QC();
   1754         x = ~SIGNBIT;
   1755     } else if ((int32_t)x < 0) {
   1756         x = -x;
   1757     }
   1758     return x;
   1759 }
   1760 
   1761 uint32_t HELPER(neon_qneg_s32)(uint32_t x)
   1762 {
   1763     if (x == SIGNBIT) {
   1764         SET_QC();
   1765         x = ~SIGNBIT;
   1766     } else {
   1767         x = -x;
   1768     }
   1769     return x;
   1770 }
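
/* For the 32-bit forms the whole word is a single lane, so only the value
 * 0x80000000 saturates: both neon_qabs_s32 and neon_qneg_s32 map it to
 * 0x7fffffff and set QC; every other input is a plain abs/negate. */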
   1771 
   1772 /* NEON Float helpers.  */
   1773 uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
   1774 {
   1775     return float32_val(float32_min(make_float32(a), make_float32(b), NFS));
   1776 }
   1777 
   1778 uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
   1779 {
   1780     return float32_val(float32_max(make_float32(a), make_float32(b), NFS));
   1781 }
   1782 
   1783 uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
   1784 {
   1785     float32 f0 = make_float32(a);
   1786     float32 f1 = make_float32(b);
   1787     return float32_val(float32_abs(float32_sub(f0, f1, NFS)));
   1788 }
   1789 
   1790 uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
   1791 {
   1792     return float32_val(float32_add(make_float32(a), make_float32(b), NFS));
   1793 }
   1794 
   1795 uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
   1796 {
   1797     return float32_val(float32_sub(make_float32(a), make_float32(b), NFS));
   1798 }
   1799 
   1800 uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
   1801 {
   1802     return float32_val(float32_mul(make_float32(a), make_float32(b), NFS));
   1803 }
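
/* Illustrative sketch (hypothetical, not part of the original helpers): a
 * VMLA.F32-style multiply-accumulate built by chaining the helpers above.
 * As on NEON, the multiply and the add are rounded separately; both steps
 * use the standard FP status word (NFS). */
static inline uint32_t illus_mla_f32(uint32_t acc, uint32_t a, uint32_t b)
{
    float32 prod = float32_mul(make_float32(a), make_float32(b), NFS);
    return float32_val(float32_add(make_float32(acc), prod, NFS));
}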
   1804 
   1805 /* Floating point comparisons produce an integer result.
   1806  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
   1807  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
   1808  */
   1809 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b)
   1810 {
   1811     return -float32_eq_quiet(make_float32(a), make_float32(b), NFS);
   1812 }
   1813 
   1814 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b)
   1815 {
   1816     return -float32_le(make_float32(b), make_float32(a), NFS);
   1817 }
   1818 
   1819 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b)
   1820 {
   1821     return -float32_lt(make_float32(b), make_float32(a), NFS);
   1822 }
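
/* Sketch of the 0/1 -> 0/-1 conversion used above (hypothetical name): the
 * unary minus turns softfloat's boolean result into the all-zeroes or
 * all-ones lane mask that NEON comparisons are defined to produce. */
static inline uint32_t illus_cmp_mask(int softfloat_result)
{
    return -(uint32_t)softfloat_result;  /* 0 -> 0x00000000, 1 -> 0xffffffff */
}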
   1823 
   1824 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
   1825 {
   1826     float32 f0 = float32_abs(make_float32(a));
   1827     float32 f1 = float32_abs(make_float32(b));
   1828     return -float32_le(f1, f0, NFS);
   1829 }
   1830 
   1831 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
   1832 {
   1833     float32 f0 = float32_abs(make_float32(a));
   1834     float32 f1 = float32_abs(make_float32(b));
   1835     return -float32_lt(f1, f0, NFS);
   1836 }
   1837 
   1838 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
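/* For example, ELEM(v, 1, 16) extracts 16-bit lane 1 of v:
 * ELEM(0x0123456789abcdefULL, 1, 16) == (v >> 16) & 0xffff == 0x89ab. */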
   1839 
   1840 void HELPER(neon_qunzip8)(uint32_t rd, uint32_t rm)
   1841 {
   1842     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1843     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1844     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1845     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1846     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
   1847         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
   1848         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
   1849         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
   1850     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
   1851         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
   1852         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1853         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
   1854     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
   1855         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
   1856         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
   1857         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
   1858     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
   1859         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
   1860         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
   1861         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1862     env->vfp.regs[rm] = make_float64(m0);
   1863     env->vfp.regs[rm + 1] = make_float64(m1);
   1864     env->vfp.regs[rd] = make_float64(d0);
   1865     env->vfp.regs[rd + 1] = make_float64(d1);
   1866 }
   1867 
   1868 void HELPER(neon_qunzip16)(uint32_t rd, uint32_t rm)
   1869 {
   1870     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1871     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1872     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1873     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1874     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
   1875         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
   1876     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
   1877         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
   1878     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
   1879         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
   1880     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
   1881         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1882     env->vfp.regs[rm] = make_float64(m0);
   1883     env->vfp.regs[rm + 1] = make_float64(m1);
   1884     env->vfp.regs[rd] = make_float64(d0);
   1885     env->vfp.regs[rd + 1] = make_float64(d1);
   1886 }
   1887 
   1888 void HELPER(neon_qunzip32)(uint32_t rd, uint32_t rm)
   1889 {
   1890     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1891     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1892     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1893     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1894     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
   1895     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1896     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
   1897     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1898     env->vfp.regs[rm] = make_float64(m0);
   1899     env->vfp.regs[rm + 1] = make_float64(m1);
   1900     env->vfp.regs[rd] = make_float64(d0);
   1901     env->vfp.regs[rd + 1] = make_float64(d1);
   1902 }
   1903 
   1904 void HELPER(neon_unzip8)(uint32_t rd, uint32_t rm)
   1905 {
   1906     uint64_t zm = float64_val(env->vfp.regs[rm]);
   1907     uint64_t zd = float64_val(env->vfp.regs[rd]);
   1908     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
   1909         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
   1910         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1911         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
   1912     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
   1913         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
   1914         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
   1915         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   1916     env->vfp.regs[rm] = make_float64(m0);
   1917     env->vfp.regs[rd] = make_float64(d0);
   1918 }
   1919 
   1920 void HELPER(neon_unzip16)(uint32_t rd, uint32_t rm)
   1921 {
   1922     uint64_t zm = float64_val(env->vfp.regs[rm]);
   1923     uint64_t zd = float64_val(env->vfp.regs[rd]);
   1924     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
   1925         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
   1926     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
   1927         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   1928     env->vfp.regs[rm] = make_float64(m0);
   1929     env->vfp.regs[rd] = make_float64(d0);
   1930 }
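
/* Summary of the unzip (VUZP) helpers above, shown for 16-bit lanes of two
 * D registers (lane 0 is least significant):
 *   before: d = [d0 d1 d2 d3]   m = [m0 m1 m2 m3]
 *   after:  d = [d0 d2 m0 m2]   m = [d1 d3 m1 m3]
 * The 8-bit and quad-register (q*) variants apply the same even/odd split
 * across more lanes and register pairs. */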
   1931 
   1932 void HELPER(neon_qzip8)(uint32_t rd, uint32_t rm)
   1933 {
   1934     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1935     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1936     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1937     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1938     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
   1939         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
   1940         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
   1941         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
   1942     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
   1943         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
   1944         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
   1945         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
   1946     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
   1947         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
   1948         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1949         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
   1950     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
   1951         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
   1952         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
   1953         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1954     env->vfp.regs[rm] = make_float64(m0);
   1955     env->vfp.regs[rm + 1] = make_float64(m1);
   1956     env->vfp.regs[rd] = make_float64(d0);
   1957     env->vfp.regs[rd + 1] = make_float64(d1);
   1958 }
   1959 
   1960 void HELPER(neon_qzip16)(uint32_t rd, uint32_t rm)
   1961 {
   1962     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1963     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1964     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1965     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1966     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
   1967         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
   1968     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
   1969         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
   1970     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
   1971         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
   1972     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
   1973         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1974     env->vfp.regs[rm] = make_float64(m0);
   1975     env->vfp.regs[rm + 1] = make_float64(m1);
   1976     env->vfp.regs[rd] = make_float64(d0);
   1977     env->vfp.regs[rd + 1] = make_float64(d1);
   1978 }
   1979 
   1980 void HELPER(neon_qzip32)(uint32_t rd, uint32_t rm)
   1981 {
   1982     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1983     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1984     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1985     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1986     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
   1987     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
   1988     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1989     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1990     env->vfp.regs[rm] = make_float64(m0);
   1991     env->vfp.regs[rm + 1] = make_float64(m1);
   1992     env->vfp.regs[rd] = make_float64(d0);
   1993     env->vfp.regs[rd + 1] = make_float64(d1);
   1994 }
   1995 
   1996 void HELPER(neon_zip8)(uint32_t rd, uint32_t rm)
   1997 {
   1998     uint64_t zm = float64_val(env->vfp.regs[rm]);
   1999     uint64_t zd = float64_val(env->vfp.regs[rd]);
   2000     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
   2001         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
   2002         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   2003         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
   2004     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
   2005         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
   2006         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
   2007         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   2008     env->vfp.regs[rm] = make_float64(m0);
   2009     env->vfp.regs[rd] = make_float64(d0);
   2010 }
   2011 
   2012 void HELPER(neon_zip16)(uint32_t rd, uint32_t rm)
   2013 {
   2014     uint64_t zm = float64_val(env->vfp.regs[rm]);
   2015     uint64_t zd = float64_val(env->vfp.regs[rd]);
   2016     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
   2017         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
   2018     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
   2019         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   2020     env->vfp.regs[rm] = make_float64(m0);
   2021     env->vfp.regs[rd] = make_float64(d0);
   2022 }
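
/* Summary of the zip (VZIP) helpers, again for 16-bit lanes of two D
 * registers (lane 0 is least significant):
 *   before: d = [d0 d1 d2 d3]   m = [m0 m1 m2 m3]
 *   after:  d = [d0 m0 d1 m1]   m = [d2 m2 d3 m3]
 * The 8-bit and quad-register (q*) variants interleave in the same way. */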
   2023