Home | History | Annotate | Download | only in target-arm
      1 /*
      2  * ARM NEON vector operations.
      3  *
      4  * Copyright (c) 2007, 2008 CodeSourcery.
      5  * Written by Paul Brook
      6  *
      7  * This code is licensed under the GNU GPL v2.
      8  */
      9 #include <stdlib.h>
     10 #include <stdio.h>
     11 
     12 #include "cpu.h"
     13 #include "exec/exec-all.h"
     14 #include "helper.h"
     15 
     16 #define SIGNBIT (uint32_t)0x80000000
     17 #define SIGNBIT64 ((uint64_t)1 << 63)
     18 
     19 #define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
     20 
/* Lane-structure generators.  NEON_TYPEn(name, type) declares a struct
   typedef neon_<name> holding n members of the given element type, named
   v1..vn.  */
#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
/* On big-endian hosts the members are declared in reverse order, so v1 is
   the last member instead of the first.  The structs are overlaid on a
   uint32_t through a union (see NEON_UNPACK/NEON_PACK).  */
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

/* Concrete lane layouts: four 8-bit, two 16-bit or one 32-bit element,
   each in a signed and an unsigned flavour.  The generator macros are
   only needed for these six typedefs and are removed afterwards.  */
NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1
     67 
/* Copy from a uint32_t to a vector structure type.
   vtype is the lane struct type, dest an lvalue of that type and val the
   packed 32-bit value.  The conversion goes through a local union, so the
   bits are reinterpreted rather than converted.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.
   Inverse of NEON_UNPACK: val is a lane struct of type vtype and dest
   receives its bit pattern as a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)
     87 
/* Element-wise drivers.  NEON_DOn applies the currently defined NEON_FN
   macro to each of the n lanes, pairing lane k of vsrc1 with lane k of
   vsrc2 and storing into lane k of vdest.  The three variable names are
   fixed; the expanding scope has to provide them.  */
#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
     98 
/* Body shared by the binary element-wise helpers: unpack arg1 and arg2
   into lane structs of type vtype, run NEON_DO<n> (and therefore the
   current NEON_FN) over the lanes, then repack and return the result.  */
#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Define helper neon_<name> taking two packed 32-bit operands.  */
#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Same as NEON_VOP, but the helper also receives env, which makes SET_QC()
   usable inside NEON_FN.  */
#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
    119 
    120 /* Pairwise operations.  */
    121 /* For 32-bit elements each segment only contains a single element, so
    122    the elementwise and pairwise operations are the same.  */
    123 #define NEON_PDO2 \
    124     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    125     NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
    126 #define NEON_PDO4 \
    127     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    128     NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    129     NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    130     NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
    131 
/* Define pairwise helper neon_<name>: same unpack / operate / repack
   sequence as the element-wise helpers, but driven by NEON_PDO<n> so that
   NEON_FN combines neighbouring lanes of one operand.  */
#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
    145 
/* Unary operators.  */
/* Define helper neon_<name> taking a single packed operand; the result is
   written back into arg and returned.  NEON_DO<n> still passes vsrc2.vK as
   third argument to NEON_FN although no vsrc2 is declared here, so any
   NEON_FN used with this macro must not expand its src2 parameter.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}
    157 
    158 
/* Unsigned saturating add for 8- and 16-bit lanes.  The sum is formed in
   32 bits; if it does not survive truncation to the lane type it
   overflowed, in which case QC is set and the lane becomes all ones.  */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
    174 
    175 uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    176 {
    177     uint32_t res = a + b;
    178     if (res < a) {
    179         SET_QC();
    180         res = ~0;
    181     }
    182     return res;
    183 }
    184 
    185 uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    186 {
    187     uint64_t res;
    188 
    189     res = src1 + src2;
    190     if (res < src1) {
    191         SET_QC();
    192         res = ~(uint64_t)0;
    193     }
    194     return res;
    195 }
    196 
/* Signed saturating add for 8- and 16-bit lanes.  The sum is formed in
   32 bits; if it does not fit the lane type QC is set and the result is
   clamped: to the largest lane value when src2 is positive, otherwise to
   the bit pattern with only the lane's sign bit set.  */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
    216 
    217 uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    218 {
    219     uint32_t res = a + b;
    220     if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
    221         SET_QC();
    222         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    223     }
    224     return res;
    225 }
    226 
    227 uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    228 {
    229     uint64_t res;
    230 
    231     res = src1 + src2;
    232     if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
    233         SET_QC();
    234         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    235     }
    236     return res;
    237 }
    238 
/* Unsigned saturating subtract for 8- and 16-bit lanes.  The difference is
   formed in 32 bits; a borrow leaves high bits set, so the value no longer
   matches its truncation to the lane type.  In that case QC is set and
   the lane becomes zero.  */
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT
    254 
    255 uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
    256 {
    257     uint32_t res = a - b;
    258     if (res > a) {
    259         SET_QC();
    260         res = 0;
    261     }
    262     return res;
    263 }
    264 
    265 uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    266 {
    267     uint64_t res;
    268 
    269     if (src1 < src2) {
    270         SET_QC();
    271         res = 0;
    272     } else {
    273         res = src1 - src2;
    274     }
    275     return res;
    276 }
    277 
/* Signed saturating subtract for 8- and 16-bit lanes.  The difference is
   formed in 32 bits; if it does not fit the lane type QC is set and the
   result is clamped: to the largest lane value when src2 is negative,
   otherwise to the bit pattern with only the lane's sign bit set.  */
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT
    297 
    298 uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
    299 {
    300     uint32_t res = a - b;
    301     if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
    302         SET_QC();
    303         res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    304     }
    305     return res;
    306 }
    307 
    308 uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
    309 {
    310     uint64_t res;
    311 
    312     res = src1 - src2;
    313     if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
    314         SET_QC();
    315         res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    316     }
    317     return res;
    318 }
    319 
/* Halving add for 8- and 16-bit lanes: (src1 + src2) >> 1.  The lane
   values are promoted to int before the addition, so the intermediate sum
   keeps its carry bit.  */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN
    326 
    327 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
    328 {
    329     int32_t dest;
    330 
    331     dest = (src1 >> 1) + (src2 >> 1);
    332     if (src1 & src2 & 1)
    333         dest++;
    334     return dest;
    335 }
    336 
    337 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
    338 {
    339     uint32_t dest;
    340 
    341     dest = (src1 >> 1) + (src2 >> 1);
    342     if (src1 & src2 & 1)
    343         dest++;
    344     return dest;
    345 }
    346 
/* Rounding halving add for 8- and 16-bit lanes: (src1 + src2 + 1) >> 1,
   evaluated after promotion to int so the carry bit is kept.  */
#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN
    353 
    354 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
    355 {
    356     int32_t dest;
    357 
    358     dest = (src1 >> 1) + (src2 >> 1);
    359     if ((src1 | src2) & 1)
    360         dest++;
    361     return dest;
    362 }
    363 
    364 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
    365 {
    366     uint32_t dest;
    367 
    368     dest = (src1 >> 1) + (src2 >> 1);
    369     if ((src1 | src2) & 1)
    370         dest++;
    371     return dest;
    372 }
    373 
/* Halving subtract for 8- and 16-bit lanes: (src1 - src2) >> 1, evaluated
   after promotion to int.  */
#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN
    380 
    381 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
    382 {
    383     int32_t dest;
    384 
    385     dest = (src1 >> 1) - (src2 >> 1);
    386     if ((~src1) & src2 & 1)
    387         dest--;
    388     return dest;
    389 }
    390 
    391 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
    392 {
    393     uint32_t dest;
    394 
    395     dest = (src1 >> 1) - (src2 >> 1);
    396     if ((~src1) & src2 & 1)
    397         dest--;
    398     return dest;
    399 }
    400 
/* Compare greater-than: a lane becomes all ones when src1 > src2 and zero
   otherwise.  Signedness of the comparison follows the lane type.  */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN
    409 
/* Compare greater-or-equal: a lane becomes all ones when src1 >= src2 and
   zero otherwise.  Signedness of the comparison follows the lane type.  */
#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN
    418 
/* Minimum.  The same NEON_FN serves the element-wise helpers (min_*) and
   the pairwise helpers (pmin_*); pairwise variants exist only for 8- and
   16-bit lanes.  */
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN
    431 
/* Maximum.  The same NEON_FN serves the element-wise helpers (max_*) and
   the pairwise helpers (pmax_*); pairwise variants exist only for 8- and
   16-bit lanes.  */
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN
    444 
/* Absolute difference: the larger operand minus the smaller one.
   NOTE(review): for the int32_t instantiation the subtraction is done in
   int, so operands far apart (e.g. INT32_MAX and INT32_MIN) exceed the
   signed range; confirm the build relies on wrapping semantics here.  */
#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN
    454 
/* Unsigned variable shift.  The shift count is src2 truncated to a signed
   byte: positive counts shift left, negative counts shift right.  A count
   whose magnitude reaches the lane width gives zero.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_u8, neon_u8, 4)
NEON_VOP(shl_u16, neon_u16, 2)
NEON_VOP(shl_u32, neon_u32, 1)
#undef NEON_FN
    470 
    471 uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
    472 {
    473     int8_t shift = (int8_t)shiftop;
    474     if (shift >= 64 || shift <= -64) {
    475         val = 0;
    476     } else if (shift < 0) {
    477         val >>= -shift;
    478     } else {
    479         val <<= shift;
    480     }
    481     return val;
    482 }
    483 
/* Signed variable shift.  The shift count is src2 truncated to a signed
   byte.  Left shifts by the lane width or more give zero; right shifts by
   the lane width or more are replaced by a shift of width - 1, leaving
   only copies of the sign bit; other negative counts shift right and
   non-negative counts shift left.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_s8, neon_s8, 4)
NEON_VOP(shl_s16, neon_s16, 2)
NEON_VOP(shl_s32, neon_s32, 1)
#undef NEON_FN
    500 
    501 uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
    502 {
    503     int8_t shift = (int8_t)shiftop;
    504     int64_t val = valop;
    505     if (shift >= 64) {
    506         val = 0;
    507     } else if (shift <= -64) {
    508         val >>= 63;
    509     } else if (shift < 0) {
    510         val >>= -shift;
    511     } else {
    512         val <<= shift;
    513     }
    514     return val;
    515 }
    516 
/* Signed rounding variable shift for 8- and 16-bit lanes.  The shift count
   is src2 truncated to a signed byte.  A count whose magnitude reaches the
   lane width gives zero.  For right shifts the constant 1 << (-1 - tmp),
   i.e. half of the final unit, is added before shifting; the lane value is
   promoted to int for that addition.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN
    531 
    532 /* The addition of the rounding constant may overflow, so we use an
    533  * intermediate 64 bit accumulator.  */
    534 uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
    535 {
    536     int32_t dest;
    537     int32_t val = (int32_t)valop;
    538     int8_t shift = (int8_t)shiftop;
    539     if ((shift >= 32) || (shift <= -32)) {
    540         dest = 0;
    541     } else if (shift < 0) {
    542         int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
    543         dest = big_dest >> -shift;
    544     } else {
    545         dest = val << shift;
    546     }
    547     return dest;
    548 }
    549 
    550 /* Handling addition overflow with 64 bit input values is more
    551  * tricky than with 32 bit values.  */
    552 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
    553 {
    554     int8_t shift = (int8_t)shiftop;
    555     int64_t val = valop;
    556     if ((shift >= 64) || (shift <= -64)) {
    557         val = 0;
    558     } else if (shift < 0) {
    559         val >>= (-shift - 1);
    560         if (val == INT64_MAX) {
    561             /* In this case, it means that the rounding constant is 1,
    562              * and the addition would overflow. Return the actual
    563              * result directly.  */
    564             val = 0x4000000000000000LL;
    565         } else {
    566             val++;
    567             val >>= 1;
    568         }
    569     } else {
    570         val <<= shift;
    571     }
    572     return val;
    573 }
    574 
/* Unsigned rounding variable shift for 8- and 16-bit lanes.  The shift
   count is src2 truncated to a signed byte.  Left shifts by the lane width
   or more and right shifts by more than the lane width give zero.  A right
   shift by exactly the lane width returns the lane's top bit, which is
   what rounding leaves.  Other right shifts add 1 << (-1 - tmp) before
   shifting.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (-tmp - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN
    591 
    592 /* The addition of the rounding constant may overflow, so we use an
    593  * intermediate 64 bit accumulator.  */
    594 uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
    595 {
    596     uint32_t dest;
    597     int8_t shift = (int8_t)shiftop;
    598     if (shift >= 32 || shift < -32) {
    599         dest = 0;
    600     } else if (shift == -32) {
    601         dest = val >> 31;
    602     } else if (shift < 0) {
    603         uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
    604         dest = big_dest >> -shift;
    605     } else {
    606         dest = val << shift;
    607     }
    608     return dest;
    609 }
    610 
    611 /* Handling addition overflow with 64 bit input values is more
    612  * tricky than with 32 bit values.  */
    613 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
    614 {
    615     int8_t shift = (uint8_t)shiftop;
    616     if (shift >= 64 || shift < -64) {
    617         val = 0;
    618     } else if (shift == -64) {
    619         /* Rounding a 1-bit result just preserves that bit.  */
    620         val >>= 63;
    621     } else if (shift < 0) {
    622         val >>= (-shift - 1);
    623         if (val == UINT64_MAX) {
    624             /* In this case, it means that the rounding constant is 1,
    625              * and the addition would overflow. Return the actual
    626              * result directly.  */
    627             val = 0x8000000000000000ULL;
    628         } else {
    629             val++;
    630             val >>= 1;
    631         }
    632     } else {
    633         val <<= shift;
    634     }
    635     return val;
    636 }
    637 
/* Unsigned saturating variable shift.  The shift count is src2 truncated
   to a signed byte.  A left shift by the lane width or more saturates any
   non-zero input to all ones and sets QC.  Right shifts by the lane width
   or more give zero, other negative counts shift right.  Ordinary left
   shifts are verified by shifting the result back: if that does not
   reproduce src1, bits were lost, so the lane saturates and QC is set.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
#undef NEON_FN
    663 
    664 uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
    665 {
    666     int8_t shift = (int8_t)shiftop;
    667     if (shift >= 64) {
    668         if (val) {
    669             val = ~(uint64_t)0;
    670             SET_QC();
    671         }
    672     } else if (shift <= -64) {
    673         val = 0;
    674     } else if (shift < 0) {
    675         val >>= -shift;
    676     } else {
    677         uint64_t tmp = val;
    678         val <<= shift;
    679         if ((val >> shift) != tmp) {
    680             SET_QC();
    681             val = ~(uint64_t)0;
    682         }
    683     }
    684     return val;
    685 }
    686 
    687 #define NEON_FN(dest, src1, src2) do { \
    688     int8_t tmp; \
    689     tmp = (int8_t)src2; \
    690     if (tmp >= (ssize_t)sizeof(src1) * 8) { \
    691         if (src1) { \
    692             SET_QC(); \
    693             dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
    694             if (src1 > 0) { \
    695                 dest--; \
    696             } \
    697         } else { \
    698             dest = src1; \
    699         } \
    700     } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
    701         dest = src1 >> 31; \
    702     } else if (tmp < 0) { \
    703         dest = src1 >> -tmp; \
    704     } else { \
    705         dest = src1 << tmp; \
    706         if ((dest >> tmp) != src1) { \
    707             SET_QC(); \
    708             dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
    709             if (src1 > 0) { \
    710                 dest--; \
    711             } \
    712         } \
    713     }} while (0)
    714 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
    715 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
    716 NEON_VOP_ENV(qshl_s32, neon_s32, 1)
    717 #undef NEON_FN
    718 
    719 uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
    720 {
    721     int8_t shift = (uint8_t)shiftop;
    722     int64_t val = valop;
    723     if (shift >= 64) {
    724         if (val) {
    725             SET_QC();
    726             val = (val >> 63) ^ ~SIGNBIT64;
    727         }
    728     } else if (shift <= -64) {
    729         val >>= 63;
    730     } else if (shift < 0) {
    731         val >>= -shift;
    732     } else {
    733         int64_t tmp = val;
    734         val <<= shift;
    735         if ((val >> shift) != tmp) {
    736             SET_QC();
    737             val = (tmp >> 63) ^ ~SIGNBIT64;
    738         }
    739     }
    740     return val;
    741 }
    742 
/* Saturating shift of a signed input to an unsigned result, for 8- and
   16-bit lanes.  The helpers are instantiated with the unsigned lane
   types, so a negative input is recognised by its top bit; it yields zero
   with QC set.  Non-negative inputs follow the unsigned saturating shift
   rules: left shifts that lose bits saturate to all ones and set QC,
   right shifts by the lane width or more give zero.  */
#define NEON_FN(dest, src1, src2) do { \
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        int8_t tmp; \
        tmp = (int8_t)src2; \
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
            if (src1) { \
                SET_QC(); \
                dest = ~0; \
            } else { \
                dest = 0; \
            } \
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
            dest = 0; \
        } else if (tmp < 0) { \
            dest = src1 >> -tmp; \
        } else { \
            dest = src1 << tmp; \
            if ((dest >> tmp) != src1) { \
                SET_QC(); \
                dest = ~0; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
#undef NEON_FN
    772 
    773 uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
    774 {
    775     if ((int32_t)valop < 0) {
    776         SET_QC();
    777         return 0;
    778     }
    779     return helper_neon_qshl_u32(env, valop, shiftop);
    780 }
    781 
    782 uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
    783 {
    784     if ((int64_t)valop < 0) {
    785         SET_QC();
    786         return 0;
    787     }
    788     return helper_neon_qshl_u64(env, valop, shiftop);
    789 }
    790 
/* Unsigned saturating rounding variable shift for 8- and 16-bit lanes.
   The shift count is src2 truncated to a signed byte.  A left shift by
   the lane width or more saturates any non-zero input to all ones and
   sets QC.  A right shift by more than the lane width gives zero; by
   exactly the lane width it returns the lane's top bit.  Other right
   shifts add 1 << (-1 - tmp) before shifting.  Ordinary left shifts
   saturate, with QC set, when shifting back does not reproduce src1.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN
    817 
    818 /* The addition of the rounding constant may overflow, so we use an
    819  * intermediate 64 bit accumulator.  */
    820 uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
    821 {
    822     uint32_t dest;
    823     int8_t shift = (int8_t)shiftop;
    824     if (shift >= 32) {
    825         if (val) {
    826             SET_QC();
    827             dest = ~0;
    828         } else {
    829             dest = 0;
    830         }
    831     } else if (shift < -32) {
    832         dest = 0;
    833     } else if (shift == -32) {
    834         dest = val >> 31;
    835     } else if (shift < 0) {
    836         uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
    837         dest = big_dest >> -shift;
    838     } else {
    839         dest = val << shift;
    840         if ((dest >> shift) != val) {
    841             SET_QC();
    842             dest = ~0;
    843         }
    844     }
    845     return dest;
    846 }
    847 
    848 /* Handling addition overflow with 64 bit input values is more
    849  * tricky than with 32 bit values.  */
    850 uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
    851 {
    852     int8_t shift = (int8_t)shiftop;
    853     if (shift >= 64) {
    854         if (val) {
    855             SET_QC();
    856             val = ~0;
    857         }
    858     } else if (shift < -64) {
    859         val = 0;
    860     } else if (shift == -64) {
    861         val >>= 63;
    862     } else if (shift < 0) {
    863         val >>= (-shift - 1);
    864         if (val == UINT64_MAX) {
    865             /* In this case, it means that the rounding constant is 1,
    866              * and the addition would overflow. Return the actual
    867              * result directly.  */
    868             val = 0x8000000000000000ULL;
    869         } else {
    870             val++;
    871             val >>= 1;
    872         }
    873     } else { \
    874         uint64_t tmp = val;
    875         val <<= shift;
    876         if ((val >> shift) != tmp) {
    877             SET_QC();
    878             val = ~0;
    879         }
    880     }
    881     return val;
    882 }
    883 
/* Signed saturating rounding variable shift for 8- and 16-bit lanes.  The
   shift count is src2 truncated to a signed byte.  A left shift by the
   lane width or more saturates any non-zero input and sets QC.  A right
   shift by the lane width or more gives zero.  Other right shifts add
   1 << (-1 - tmp) before shifting.  Ordinary left shifts saturate, with
   QC set, when shifting back does not reproduce src1.  The saturated
   value starts as the pattern with only the lane's sign bit set and is
   decremented to the lane maximum for positive src1.
   NOTE(review): the two saturation sites spell the sign-bit constant
   differently (with and without a uint32_t cast); for the 8- and 16-bit
   lanes instantiated here both give the same value.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN
    914 
    915 /* The addition of the rounding constant may overflow, so we use an
    916  * intermediate 64 bit accumulator.  */
    917 uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
    918 {
    919     int32_t dest;
    920     int32_t val = (int32_t)valop;
    921     int8_t shift = (int8_t)shiftop;
    922     if (shift >= 32) {
    923         if (val) {
    924             SET_QC();
    925             dest = (val >> 31) ^ ~SIGNBIT;
    926         } else {
    927             dest = 0;
    928         }
    929     } else if (shift <= -32) {
    930         dest = 0;
    931     } else if (shift < 0) {
    932         int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
    933         dest = big_dest >> -shift;
    934     } else {
    935         dest = val << shift;
    936         if ((dest >> shift) != val) {
    937             SET_QC();
    938             dest = (val >> 31) ^ ~SIGNBIT;
    939         }
    940     }
    941     return dest;
    942 }
    943 
    944 /* Handling addition overflow with 64 bit input values is more
    945  * tricky than with 32 bit values.  */
    946 uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
    947 {
    948     int8_t shift = (uint8_t)shiftop;
    949     int64_t val = valop;
    950 
    951     if (shift >= 64) {
    952         if (val) {
    953             SET_QC();
    954             val = (val >> 63) ^ ~SIGNBIT64;
    955         }
    956     } else if (shift <= -64) {
    957         val = 0;
    958     } else if (shift < 0) {
    959         val >>= (-shift - 1);
    960         if (val == INT64_MAX) {
    961             /* In this case, it means that the rounding constant is 1,
    962              * and the addition would overflow. Return the actual
    963              * result directly.  */
    964             val = 0x4000000000000000ULL;
    965         } else {
    966             val++;
    967             val >>= 1;
    968         }
    969     } else {
    970         int64_t tmp = val;
    971         val <<= shift;
    972         if ((val >> shift) != tmp) {
    973             SET_QC();
    974             val = (tmp >> 63) ^ ~SIGNBIT64;
    975         }
    976     }
    977     return val;
    978 }
    979 
    980 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
    981 {
    982     uint32_t mask;
    983     mask = (a ^ b) & 0x80808080u;
    984     a &= ~0x80808080u;
    985     b &= ~0x80808080u;
    986     return (a + b) ^ mask;
    987 }
    988 
    989 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
    990 {
    991     uint32_t mask;
    992     mask = (a ^ b) & 0x80008000u;
    993     a &= ~0x80008000u;
    994     b &= ~0x80008000u;
    995     return (a + b) ^ mask;
    996 }
    997 
    998 #define NEON_FN(dest, src1, src2) dest = src1 + src2
    999 NEON_POP(padd_u8, neon_u8, 4)
   1000 NEON_POP(padd_u16, neon_u16, 2)
   1001 #undef NEON_FN
   1002 
   1003 #define NEON_FN(dest, src1, src2) dest = src1 - src2
   1004 NEON_VOP(sub_u8, neon_u8, 4)
   1005 NEON_VOP(sub_u16, neon_u16, 2)
   1006 #undef NEON_FN
   1007 
   1008 #define NEON_FN(dest, src1, src2) dest = src1 * src2
   1009 NEON_VOP(mul_u8, neon_u8, 4)
   1010 NEON_VOP(mul_u16, neon_u16, 2)
   1011 #undef NEON_FN
   1012 
   1013 /* Polynomial multiplication is like integer multiplication except the
   1014    partial products are XORed, not added.  */
   1015 uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
   1016 {
   1017     uint32_t mask;
   1018     uint32_t result;
   1019     result = 0;
   1020     while (op1) {
   1021         mask = 0;
   1022         if (op1 & 1)
   1023             mask |= 0xff;
   1024         if (op1 & (1 << 8))
   1025             mask |= (0xff << 8);
   1026         if (op1 & (1 << 16))
   1027             mask |= (0xff << 16);
   1028         if (op1 & (1 << 24))
   1029             mask |= (0xff << 24);
   1030         result ^= op2 & mask;
   1031         op1 = (op1 >> 1) & 0x7f7f7f7f;
   1032         op2 = (op2 << 1) & 0xfefefefe;
   1033     }
   1034     return result;
   1035 }
   1036 
   1037 uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
   1038 {
   1039     uint64_t result = 0;
   1040     uint64_t mask;
   1041     uint64_t op2ex = op2;
   1042     op2ex = (op2ex & 0xff) |
   1043         ((op2ex & 0xff00) << 8) |
   1044         ((op2ex & 0xff0000) << 16) |
   1045         ((op2ex & 0xff000000) << 24);
   1046     while (op1) {
   1047         mask = 0;
   1048         if (op1 & 1) {
   1049             mask |= 0xffff;
   1050         }
   1051         if (op1 & (1 << 8)) {
   1052             mask |= (0xffffU << 16);
   1053         }
   1054         if (op1 & (1 << 16)) {
   1055             mask |= (0xffffULL << 32);
   1056         }
   1057         if (op1 & (1 << 24)) {
   1058             mask |= (0xffffULL << 48);
   1059         }
   1060         result ^= op2ex & mask;
   1061         op1 = (op1 >> 1) & 0x7f7f7f7f;
   1062         op2ex <<= 1;
   1063     }
   1064     return result;
   1065 }
   1066 
   1067 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
   1068 NEON_VOP(tst_u8, neon_u8, 4)
   1069 NEON_VOP(tst_u16, neon_u16, 2)
   1070 NEON_VOP(tst_u32, neon_u32, 1)
   1071 #undef NEON_FN
   1072 
   1073 #define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
   1074 NEON_VOP(ceq_u8, neon_u8, 4)
   1075 NEON_VOP(ceq_u16, neon_u16, 2)
   1076 NEON_VOP(ceq_u32, neon_u32, 1)
   1077 #undef NEON_FN
   1078 
   1079 #define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
   1080 NEON_VOP1(abs_s8, neon_s8, 4)
   1081 NEON_VOP1(abs_s16, neon_s16, 2)
   1082 #undef NEON_FN
   1083 
   1084 /* Count Leading Sign/Zero Bits.  */
   1085 static inline int do_clz8(uint8_t x)
   1086 {
   1087     int n;
   1088     for (n = 8; x; n--)
   1089         x >>= 1;
   1090     return n;
   1091 }
   1092 
   1093 static inline int do_clz16(uint16_t x)
   1094 {
   1095     int n;
   1096     for (n = 16; x; n--)
   1097         x >>= 1;
   1098     return n;
   1099 }
   1100 
   1101 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
   1102 NEON_VOP1(clz_u8, neon_u8, 4)
   1103 #undef NEON_FN
   1104 
   1105 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
   1106 NEON_VOP1(clz_u16, neon_u16, 2)
   1107 #undef NEON_FN
   1108 
   1109 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
   1110 NEON_VOP1(cls_s8, neon_s8, 4)
   1111 #undef NEON_FN
   1112 
   1113 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
   1114 NEON_VOP1(cls_s16, neon_s16, 2)
   1115 #undef NEON_FN
   1116 
   1117 uint32_t HELPER(neon_cls_s32)(uint32_t x)
   1118 {
   1119     int count;
   1120     if ((int32_t)x < 0)
   1121         x = ~x;
   1122     for (count = 32; x; count--)
   1123         x = x >> 1;
   1124     return count - 1;
   1125 }
   1126 
   1127 /* Bit count.  */
   1128 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
   1129 {
   1130     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
   1131     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
   1132     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
   1133     return x;
   1134 }
   1135 
   1136 #define NEON_QDMULH16(dest, src1, src2, round) do { \
   1137     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
   1138     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
   1139         SET_QC(); \
   1140         tmp = (tmp >> 31) ^ ~SIGNBIT; \
   1141     } else { \
   1142         tmp <<= 1; \
   1143     } \
   1144     if (round) { \
   1145         int32_t old = tmp; \
   1146         tmp += 1 << 15; \
   1147         if ((int32_t)tmp < old) { \
   1148             SET_QC(); \
   1149             tmp = SIGNBIT - 1; \
   1150         } \
   1151     } \
   1152     dest = tmp >> 16; \
   1153     } while(0)
   1154 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
   1155 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
   1156 #undef NEON_FN
   1157 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
   1158 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
   1159 #undef NEON_FN
   1160 #undef NEON_QDMULH16
   1161 
   1162 #define NEON_QDMULH32(dest, src1, src2, round) do { \
   1163     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
   1164     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
   1165         SET_QC(); \
   1166         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
   1167     } else { \
   1168         tmp <<= 1; \
   1169     } \
   1170     if (round) { \
   1171         int64_t old = tmp; \
   1172         tmp += (int64_t)1 << 31; \
   1173         if ((int64_t)tmp < old) { \
   1174             SET_QC(); \
   1175             tmp = SIGNBIT64 - 1; \
   1176         } \
   1177     } \
   1178     dest = tmp >> 32; \
   1179     } while(0)
   1180 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
   1181 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
   1182 #undef NEON_FN
   1183 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
   1184 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
   1185 #undef NEON_FN
   1186 #undef NEON_QDMULH32
   1187 
   1188 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
   1189 {
   1190     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
   1191            | ((x >> 24) & 0xff000000u);
   1192 }
   1193 
   1194 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
   1195 {
   1196     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
   1197 }
   1198 
   1199 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
   1200 {
   1201     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
   1202             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
   1203 }
   1204 
   1205 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
   1206 {
   1207     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
   1208 }
   1209 
   1210 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
   1211 {
   1212     x &= 0xff80ff80ff80ff80ull;
   1213     x += 0x0080008000800080ull;
   1214     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
   1215             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
   1216 }
   1217 
   1218 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
   1219 {
   1220     x &= 0xffff8000ffff8000ull;
   1221     x += 0x0000800000008000ull;
   1222     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
   1223 }
   1224 
   1225 uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
   1226 {
   1227     uint16_t s;
   1228     uint8_t d;
   1229     uint32_t res = 0;
   1230 #define SAT8(n) \
   1231     s = x >> n; \
   1232     if (s & 0x8000) { \
   1233         SET_QC(); \
   1234     } else { \
   1235         if (s > 0xff) { \
   1236             d = 0xff; \
   1237             SET_QC(); \
   1238         } else  { \
   1239             d = s; \
   1240         } \
   1241         res |= (uint32_t)d << (n / 2); \
   1242     }
   1243 
   1244     SAT8(0);
   1245     SAT8(16);
   1246     SAT8(32);
   1247     SAT8(48);
   1248 #undef SAT8
   1249     return res;
   1250 }
   1251 
   1252 uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
   1253 {
   1254     uint16_t s;
   1255     uint8_t d;
   1256     uint32_t res = 0;
   1257 #define SAT8(n) \
   1258     s = x >> n; \
   1259     if (s > 0xff) { \
   1260         d = 0xff; \
   1261         SET_QC(); \
   1262     } else  { \
   1263         d = s; \
   1264     } \
   1265     res |= (uint32_t)d << (n / 2);
   1266 
   1267     SAT8(0);
   1268     SAT8(16);
   1269     SAT8(32);
   1270     SAT8(48);
   1271 #undef SAT8
   1272     return res;
   1273 }
   1274 
   1275 uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
   1276 {
   1277     int16_t s;
   1278     uint8_t d;
   1279     uint32_t res = 0;
   1280 #define SAT8(n) \
   1281     s = x >> n; \
   1282     if (s != (int8_t)s) { \
   1283         d = (s >> 15) ^ 0x7f; \
   1284         SET_QC(); \
   1285     } else  { \
   1286         d = s; \
   1287     } \
   1288     res |= (uint32_t)d << (n / 2);
   1289 
   1290     SAT8(0);
   1291     SAT8(16);
   1292     SAT8(32);
   1293     SAT8(48);
   1294 #undef SAT8
   1295     return res;
   1296 }
   1297 
   1298 uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
   1299 {
   1300     uint32_t high;
   1301     uint32_t low;
   1302     low = x;
   1303     if (low & 0x80000000) {
   1304         low = 0;
   1305         SET_QC();
   1306     } else if (low > 0xffff) {
   1307         low = 0xffff;
   1308         SET_QC();
   1309     }
   1310     high = x >> 32;
   1311     if (high & 0x80000000) {
   1312         high = 0;
   1313         SET_QC();
   1314     } else if (high > 0xffff) {
   1315         high = 0xffff;
   1316         SET_QC();
   1317     }
   1318     return low | (high << 16);
   1319 }
   1320 
   1321 uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
   1322 {
   1323     uint32_t high;
   1324     uint32_t low;
   1325     low = x;
   1326     if (low > 0xffff) {
   1327         low = 0xffff;
   1328         SET_QC();
   1329     }
   1330     high = x >> 32;
   1331     if (high > 0xffff) {
   1332         high = 0xffff;
   1333         SET_QC();
   1334     }
   1335     return low | (high << 16);
   1336 }
   1337 
   1338 uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
   1339 {
   1340     int32_t low;
   1341     int32_t high;
   1342     low = x;
   1343     if (low != (int16_t)low) {
   1344         low = (low >> 31) ^ 0x7fff;
   1345         SET_QC();
   1346     }
   1347     high = x >> 32;
   1348     if (high != (int16_t)high) {
   1349         high = (high >> 31) ^ 0x7fff;
   1350         SET_QC();
   1351     }
   1352     return (uint16_t)low | (high << 16);
   1353 }
   1354 
   1355 uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
   1356 {
   1357     if (x & 0x8000000000000000ull) {
   1358         SET_QC();
   1359         return 0;
   1360     }
   1361     if (x > 0xffffffffu) {
   1362         SET_QC();
   1363         return 0xffffffffu;
   1364     }
   1365     return x;
   1366 }
   1367 
   1368 uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
   1369 {
   1370     if (x > 0xffffffffu) {
   1371         SET_QC();
   1372         return 0xffffffffu;
   1373     }
   1374     return x;
   1375 }
   1376 
   1377 uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
   1378 {
   1379     if ((int64_t)x != (int32_t)x) {
   1380         SET_QC();
   1381         return ((int64_t)x >> 63) ^ 0x7fffffff;
   1382     }
   1383     return x;
   1384 }
   1385 
   1386 uint64_t HELPER(neon_widen_u8)(uint32_t x)
   1387 {
   1388     uint64_t tmp;
   1389     uint64_t ret;
   1390     ret = (uint8_t)x;
   1391     tmp = (uint8_t)(x >> 8);
   1392     ret |= tmp << 16;
   1393     tmp = (uint8_t)(x >> 16);
   1394     ret |= tmp << 32;
   1395     tmp = (uint8_t)(x >> 24);
   1396     ret |= tmp << 48;
   1397     return ret;
   1398 }
   1399 
   1400 uint64_t HELPER(neon_widen_s8)(uint32_t x)
   1401 {
   1402     uint64_t tmp;
   1403     uint64_t ret;
   1404     ret = (uint16_t)(int8_t)x;
   1405     tmp = (uint16_t)(int8_t)(x >> 8);
   1406     ret |= tmp << 16;
   1407     tmp = (uint16_t)(int8_t)(x >> 16);
   1408     ret |= tmp << 32;
   1409     tmp = (uint16_t)(int8_t)(x >> 24);
   1410     ret |= tmp << 48;
   1411     return ret;
   1412 }
   1413 
   1414 uint64_t HELPER(neon_widen_u16)(uint32_t x)
   1415 {
   1416     uint64_t high = (uint16_t)(x >> 16);
   1417     return ((uint16_t)x) | (high << 32);
   1418 }
   1419 
   1420 uint64_t HELPER(neon_widen_s16)(uint32_t x)
   1421 {
   1422     uint64_t high = (int16_t)(x >> 16);
   1423     return ((uint32_t)(int16_t)x) | (high << 32);
   1424 }
   1425 
   1426 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
   1427 {
   1428     uint64_t mask;
   1429     mask = (a ^ b) & 0x8000800080008000ull;
   1430     a &= ~0x8000800080008000ull;
   1431     b &= ~0x8000800080008000ull;
   1432     return (a + b) ^ mask;
   1433 }
   1434 
   1435 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
   1436 {
   1437     uint64_t mask;
   1438     mask = (a ^ b) & 0x8000000080000000ull;
   1439     a &= ~0x8000000080000000ull;
   1440     b &= ~0x8000000080000000ull;
   1441     return (a + b) ^ mask;
   1442 }
   1443 
   1444 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
   1445 {
   1446     uint64_t tmp;
   1447     uint64_t tmp2;
   1448 
   1449     tmp = a & 0x0000ffff0000ffffull;
   1450     tmp += (a >> 16) & 0x0000ffff0000ffffull;
   1451     tmp2 = b & 0xffff0000ffff0000ull;
   1452     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
   1453     return    ( tmp         & 0xffff)
   1454             | ((tmp  >> 16) & 0xffff0000ull)
   1455             | ((tmp2 << 16) & 0xffff00000000ull)
   1456             | ( tmp2        & 0xffff000000000000ull);
   1457 }
   1458 
   1459 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
   1460 {
   1461     uint32_t low = a + (a >> 32);
   1462     uint32_t high = b + (b >> 32);
   1463     return low + ((uint64_t)high << 32);
   1464 }
   1465 
   1466 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
   1467 {
   1468     uint64_t mask;
   1469     mask = (a ^ ~b) & 0x8000800080008000ull;
   1470     a |= 0x8000800080008000ull;
   1471     b &= ~0x8000800080008000ull;
   1472     return (a - b) ^ mask;
   1473 }
   1474 
   1475 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
   1476 {
   1477     uint64_t mask;
   1478     mask = (a ^ ~b) & 0x8000000080000000ull;
   1479     a |= 0x8000000080000000ull;
   1480     b &= ~0x8000000080000000ull;
   1481     return (a - b) ^ mask;
   1482 }
   1483 
   1484 uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
   1485 {
   1486     uint32_t x, y;
   1487     uint32_t low, high;
   1488 
   1489     x = a;
   1490     y = b;
   1491     low = x + y;
   1492     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1493         SET_QC();
   1494         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1495     }
   1496     x = a >> 32;
   1497     y = b >> 32;
   1498     high = x + y;
   1499     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1500         SET_QC();
   1501         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1502     }
   1503     return low | ((uint64_t)high << 32);
   1504 }
   1505 
   1506 uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
   1507 {
   1508     uint64_t result;
   1509 
   1510     result = a + b;
   1511     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
   1512         SET_QC();
   1513         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
   1514     }
   1515     return result;
   1516 }
   1517 
   1518 /* We have to do the arithmetic in a larger type than
   1519  * the input type, because for example with a signed 32 bit
   1520  * op the absolute difference can overflow a signed 32 bit value.
   1521  */
   1522 #define DO_ABD(dest, x, y, intype, arithtype) do {            \
   1523     arithtype tmp_x = (intype)(x);                            \
   1524     arithtype tmp_y = (intype)(y);                            \
   1525     dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
   1526     } while(0)
   1527 
   1528 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
   1529 {
   1530     uint64_t tmp;
   1531     uint64_t result;
   1532     DO_ABD(result, a, b, uint8_t, uint32_t);
   1533     DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
   1534     result |= tmp << 16;
   1535     DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
   1536     result |= tmp << 32;
   1537     DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
   1538     result |= tmp << 48;
   1539     return result;
   1540 }
   1541 
   1542 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
   1543 {
   1544     uint64_t tmp;
   1545     uint64_t result;
   1546     DO_ABD(result, a, b, int8_t, int32_t);
   1547     DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
   1548     result |= tmp << 16;
   1549     DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
   1550     result |= tmp << 32;
   1551     DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
   1552     result |= tmp << 48;
   1553     return result;
   1554 }
   1555 
   1556 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
   1557 {
   1558     uint64_t tmp;
   1559     uint64_t result;
   1560     DO_ABD(result, a, b, uint16_t, uint32_t);
   1561     DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1562     return result | (tmp << 32);
   1563 }
   1564 
   1565 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
   1566 {
   1567     uint64_t tmp;
   1568     uint64_t result;
   1569     DO_ABD(result, a, b, int16_t, int32_t);
   1570     DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
   1571     return result | (tmp << 32);
   1572 }
   1573 
   1574 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
   1575 {
   1576     uint64_t result;
   1577     DO_ABD(result, a, b, uint32_t, uint64_t);
   1578     return result;
   1579 }
   1580 
   1581 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
   1582 {
   1583     uint64_t result;
   1584     DO_ABD(result, a, b, int32_t, int64_t);
   1585     return result;
   1586 }
   1587 #undef DO_ABD
   1588 
   1589 /* Widening multiply. Named type is the source type.  */
   1590 #define DO_MULL(dest, x, y, type1, type2) do { \
   1591     type1 tmp_x = x; \
   1592     type1 tmp_y = y; \
   1593     dest = (type2)((type2)tmp_x * (type2)tmp_y); \
   1594     } while(0)
   1595 
   1596 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
   1597 {
   1598     uint64_t tmp;
   1599     uint64_t result;
   1600 
   1601     DO_MULL(result, a, b, uint8_t, uint16_t);
   1602     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
   1603     result |= tmp << 16;
   1604     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
   1605     result |= tmp << 32;
   1606     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
   1607     result |= tmp << 48;
   1608     return result;
   1609 }
   1610 
   1611 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
   1612 {
   1613     uint64_t tmp;
   1614     uint64_t result;
   1615 
   1616     DO_MULL(result, a, b, int8_t, uint16_t);
   1617     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
   1618     result |= tmp << 16;
   1619     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
   1620     result |= tmp << 32;
   1621     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
   1622     result |= tmp << 48;
   1623     return result;
   1624 }
   1625 
   1626 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
   1627 {
   1628     uint64_t tmp;
   1629     uint64_t result;
   1630 
   1631     DO_MULL(result, a, b, uint16_t, uint32_t);
   1632     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1633     return result | (tmp << 32);
   1634 }
   1635 
   1636 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
   1637 {
   1638     uint64_t tmp;
   1639     uint64_t result;
   1640 
   1641     DO_MULL(result, a, b, int16_t, uint32_t);
   1642     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
   1643     return result | (tmp << 32);
   1644 }
   1645 
   1646 uint64_t HELPER(neon_negl_u16)(uint64_t x)
   1647 {
   1648     uint16_t tmp;
   1649     uint64_t result;
   1650     result = (uint16_t)-x;
   1651     tmp = -(x >> 16);
   1652     result |= (uint64_t)tmp << 16;
   1653     tmp = -(x >> 32);
   1654     result |= (uint64_t)tmp << 32;
   1655     tmp = -(x >> 48);
   1656     result |= (uint64_t)tmp << 48;
   1657     return result;
   1658 }
   1659 
   1660 uint64_t HELPER(neon_negl_u32)(uint64_t x)
   1661 {
   1662     uint32_t low = -x;
   1663     uint32_t high = -(x >> 32);
   1664     return low | ((uint64_t)high << 32);
   1665 }
   1666 
   1667 /* FIXME:  There should be a native op for this.  */
   1668 uint64_t HELPER(neon_negl_u64)(uint64_t x)
   1669 {
   1670     return -x;
   1671 }
   1672 
   1673 /* Saturating sign manipulation.  */
   1674 /* ??? Make these use NEON_VOP1 */
   1675 #define DO_QABS8(x) do { \
   1676     if (x == (int8_t)0x80) { \
   1677         x = 0x7f; \
   1678         SET_QC(); \
   1679     } else if (x < 0) { \
   1680         x = -x; \
   1681     }} while (0)
   1682 uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
   1683 {
   1684     neon_s8 vec;
   1685     NEON_UNPACK(neon_s8, vec, x);
   1686     DO_QABS8(vec.v1);
   1687     DO_QABS8(vec.v2);
   1688     DO_QABS8(vec.v3);
   1689     DO_QABS8(vec.v4);
   1690     NEON_PACK(neon_s8, x, vec);
   1691     return x;
   1692 }
   1693 #undef DO_QABS8
   1694 
   1695 #define DO_QNEG8(x) do { \
   1696     if (x == (int8_t)0x80) { \
   1697         x = 0x7f; \
   1698         SET_QC(); \
   1699     } else { \
   1700         x = -x; \
   1701     }} while (0)
   1702 uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
   1703 {
   1704     neon_s8 vec;
   1705     NEON_UNPACK(neon_s8, vec, x);
   1706     DO_QNEG8(vec.v1);
   1707     DO_QNEG8(vec.v2);
   1708     DO_QNEG8(vec.v3);
   1709     DO_QNEG8(vec.v4);
   1710     NEON_PACK(neon_s8, x, vec);
   1711     return x;
   1712 }
   1713 #undef DO_QNEG8
   1714 
   1715 #define DO_QABS16(x) do { \
   1716     if (x == (int16_t)0x8000) { \
   1717         x = 0x7fff; \
   1718         SET_QC(); \
   1719     } else if (x < 0) { \
   1720         x = -x; \
   1721     }} while (0)
   1722 uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
   1723 {
   1724     neon_s16 vec;
   1725     NEON_UNPACK(neon_s16, vec, x);
   1726     DO_QABS16(vec.v1);
   1727     DO_QABS16(vec.v2);
   1728     NEON_PACK(neon_s16, x, vec);
   1729     return x;
   1730 }
   1731 #undef DO_QABS16
   1732 
   1733 #define DO_QNEG16(x) do { \
   1734     if (x == (int16_t)0x8000) { \
   1735         x = 0x7fff; \
   1736         SET_QC(); \
   1737     } else { \
   1738         x = -x; \
   1739     }} while (0)
   1740 uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
   1741 {
   1742     neon_s16 vec;
   1743     NEON_UNPACK(neon_s16, vec, x);
   1744     DO_QNEG16(vec.v1);
   1745     DO_QNEG16(vec.v2);
   1746     NEON_PACK(neon_s16, x, vec);
   1747     return x;
   1748 }
   1749 #undef DO_QNEG16
   1750 
   1751 uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
   1752 {
   1753     if (x == SIGNBIT) {
   1754         SET_QC();
   1755         x = ~SIGNBIT;
   1756     } else if ((int32_t)x < 0) {
   1757         x = -x;
   1758     }
   1759     return x;
   1760 }
   1761 
   1762 uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
   1763 {
   1764     if (x == SIGNBIT) {
   1765         SET_QC();
   1766         x = ~SIGNBIT;
   1767     } else {
   1768         x = -x;
   1769     }
   1770     return x;
   1771 }
   1772 
   1773 /* NEON Float helpers.  */
   1774 uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b, void *fpstp)
   1775 {
   1776     float_status *fpst = fpstp;
   1777     return float32_val(float32_min(make_float32(a), make_float32(b), fpst));
   1778 }
   1779 
   1780 uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b, void *fpstp)
   1781 {
   1782     float_status *fpst = fpstp;
   1783     return float32_val(float32_max(make_float32(a), make_float32(b), fpst));
   1784 }
   1785 
   1786 uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp)
   1787 {
   1788     float_status *fpst = fpstp;
   1789     float32 f0 = make_float32(a);
   1790     float32 f1 = make_float32(b);
   1791     return float32_val(float32_abs(float32_sub(f0, f1, fpst)));
   1792 }
   1793 
   1794 /* Floating point comparisons produce an integer result.
   1795  * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
   1796  * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
   1797  */
   1798 uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
   1799 {
   1800     float_status *fpst = fpstp;
   1801     return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
   1802 }
   1803 
   1804 uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
   1805 {
   1806     float_status *fpst = fpstp;
   1807     return -float32_le(make_float32(b), make_float32(a), fpst);
   1808 }
   1809 
   1810 uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
   1811 {
   1812     float_status *fpst = fpstp;
   1813     return -float32_lt(make_float32(b), make_float32(a), fpst);
   1814 }
   1815 
   1816 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
   1817 {
   1818     float_status *fpst = fpstp;
   1819     float32 f0 = float32_abs(make_float32(a));
   1820     float32 f1 = float32_abs(make_float32(b));
   1821     return -float32_le(f1, f0, fpst);
   1822 }
   1823 
   1824 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
   1825 {
   1826     float_status *fpst = fpstp;
   1827     float32 f0 = float32_abs(make_float32(a));
   1828     float32 f1 = float32_abs(make_float32(b));
   1829     return -float32_lt(f1, f0, fpst);
   1830 }
   1831 
   1832 #define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))
   1833 
   1834 void HELPER(neon_qunzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1835 {
   1836     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1837     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1838     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1839     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1840     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
   1841         | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
   1842         | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
   1843         | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
   1844     uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
   1845         | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
   1846         | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1847         | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
   1848     uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
   1849         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
   1850         | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
   1851         | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
   1852     uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
   1853         | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
   1854         | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
   1855         | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1856     env->vfp.regs[rm] = make_float64(m0);
   1857     env->vfp.regs[rm + 1] = make_float64(m1);
   1858     env->vfp.regs[rd] = make_float64(d0);
   1859     env->vfp.regs[rd + 1] = make_float64(d1);
   1860 }
   1861 
   1862 void HELPER(neon_qunzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1863 {
   1864     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1865     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1866     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1867     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1868     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
   1869         | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
   1870     uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
   1871         | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
   1872     uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
   1873         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
   1874     uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
   1875         | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1876     env->vfp.regs[rm] = make_float64(m0);
   1877     env->vfp.regs[rm + 1] = make_float64(m1);
   1878     env->vfp.regs[rd] = make_float64(d0);
   1879     env->vfp.regs[rd + 1] = make_float64(d1);
   1880 }
   1881 
   1882 void HELPER(neon_qunzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1883 {
   1884     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1885     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1886     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1887     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1888     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
   1889     uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1890     uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
   1891     uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1892     env->vfp.regs[rm] = make_float64(m0);
   1893     env->vfp.regs[rm + 1] = make_float64(m1);
   1894     env->vfp.regs[rd] = make_float64(d0);
   1895     env->vfp.regs[rd + 1] = make_float64(d1);
   1896 }
   1897 
   1898 void HELPER(neon_unzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1899 {
   1900     uint64_t zm = float64_val(env->vfp.regs[rm]);
   1901     uint64_t zd = float64_val(env->vfp.regs[rd]);
   1902     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
   1903         | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
   1904         | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1905         | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
   1906     uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
   1907         | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
   1908         | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
   1909         | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   1910     env->vfp.regs[rm] = make_float64(m0);
   1911     env->vfp.regs[rd] = make_float64(d0);
   1912 }
   1913 
   1914 void HELPER(neon_unzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1915 {
   1916     uint64_t zm = float64_val(env->vfp.regs[rm]);
   1917     uint64_t zd = float64_val(env->vfp.regs[rd]);
   1918     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
   1919         | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
   1920     uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
   1921         | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   1922     env->vfp.regs[rm] = make_float64(m0);
   1923     env->vfp.regs[rd] = make_float64(d0);
   1924 }
   1925 
   1926 void HELPER(neon_qzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1927 {
   1928     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1929     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1930     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1931     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1932     uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
   1933         | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
   1934         | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
   1935         | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
   1936     uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
   1937         | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
   1938         | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
   1939         | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
   1940     uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
   1941         | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
   1942         | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
   1943         | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
   1944     uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
   1945         | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
   1946         | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
   1947         | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
   1948     env->vfp.regs[rm] = make_float64(m0);
   1949     env->vfp.regs[rm + 1] = make_float64(m1);
   1950     env->vfp.regs[rd] = make_float64(d0);
   1951     env->vfp.regs[rd + 1] = make_float64(d1);
   1952 }
   1953 
   1954 void HELPER(neon_qzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1955 {
   1956     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1957     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1958     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1959     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1960     uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
   1961         | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
   1962     uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
   1963         | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
   1964     uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
   1965         | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
   1966     uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
   1967         | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
   1968     env->vfp.regs[rm] = make_float64(m0);
   1969     env->vfp.regs[rm + 1] = make_float64(m1);
   1970     env->vfp.regs[rd] = make_float64(d0);
   1971     env->vfp.regs[rd + 1] = make_float64(d1);
   1972 }
   1973 
   1974 void HELPER(neon_qzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1975 {
   1976     uint64_t zm0 = float64_val(env->vfp.regs[rm]);
   1977     uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
   1978     uint64_t zd0 = float64_val(env->vfp.regs[rd]);
   1979     uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
   1980     uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
   1981     uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
   1982     uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
   1983     uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
   1984     env->vfp.regs[rm] = make_float64(m0);
   1985     env->vfp.regs[rm + 1] = make_float64(m1);
   1986     env->vfp.regs[rd] = make_float64(d0);
   1987     env->vfp.regs[rd + 1] = make_float64(d1);
   1988 }
   1989 
   1990 void HELPER(neon_zip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
   1991 {
   1992     uint64_t zm = float64_val(env->vfp.regs[rm]);
   1993     uint64_t zd = float64_val(env->vfp.regs[rd]);
   1994     uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
   1995         | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
   1996         | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
   1997         | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
   1998     uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
   1999         | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
   2000         | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
   2001         | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
   2002     env->vfp.regs[rm] = make_float64(m0);
   2003     env->vfp.regs[rd] = make_float64(d0);
   2004 }
   2005 
   2006 void HELPER(neon_zip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
   2007 {
   2008     uint64_t zm = float64_val(env->vfp.regs[rm]);
   2009     uint64_t zd = float64_val(env->vfp.regs[rd]);
   2010     uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
   2011         | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
   2012     uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
   2013         | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
   2014     env->vfp.regs[rm] = make_float64(m0);
   2015     env->vfp.regs[rd] = make_float64(d0);
   2016 }
   2017