Home | History | Annotate | Download | only in target-arm
      1 /*
      2  * ARM NEON vector operations.
      3  *
      4  * Copyright (c) 2007, 2008 CodeSourcery.
      5  * Written by Paul Brook
      6  *
      7  * This code is licenced under the GNU GPL v2.
      8  */
      9 #include <stdlib.h>
     10 #include <stdio.h>
     11 
     12 #include "cpu.h"
     13 #include "exec-all.h"
     14 #include "helpers.h"
     15 
     16 #define SIGNBIT (uint32_t)0x80000000
     17 #define SIGNBIT64 ((uint64_t)1 << 63)
     18 
     19 #define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] = CPSR_Q
     20 
     21 static float_status neon_float_status;
     22 #define NFS &neon_float_status
     23 
     24 /* Helper routines to perform bitwise copies between float and int.  */
     25 static inline float32 vfp_itos(uint32_t i)
     26 {
     27     union {
     28         uint32_t i;
     29         float32 s;
     30     } v;
     31 
     32     v.i = i;
     33     return v.s;
     34 }
     35 
     36 static inline uint32_t vfp_stoi(float32 s)
     37 {
     38     union {
     39         uint32_t i;
     40         float32 s;
     41     } v;
     42 
     43     v.s = s;
     44     return v.i;
     45 }
     46 
     47 #define NEON_TYPE1(name, type) \
     48 typedef struct \
     49 { \
     50     type v1; \
     51 } neon_##name;
     52 #ifdef HOST_WORDS_BIGENDIAN
     53 #define NEON_TYPE2(name, type) \
     54 typedef struct \
     55 { \
     56     type v2; \
     57     type v1; \
     58 } neon_##name;
     59 #define NEON_TYPE4(name, type) \
     60 typedef struct \
     61 { \
     62     type v4; \
     63     type v3; \
     64     type v2; \
     65     type v1; \
     66 } neon_##name;
     67 #else
     68 #define NEON_TYPE2(name, type) \
     69 typedef struct \
     70 { \
     71     type v1; \
     72     type v2; \
     73 } neon_##name;
     74 #define NEON_TYPE4(name, type) \
     75 typedef struct \
     76 { \
     77     type v1; \
     78     type v2; \
     79     type v3; \
     80     type v4; \
     81 } neon_##name;
     82 #endif
     83 
     84 NEON_TYPE4(s8, int8_t)
     85 NEON_TYPE4(u8, uint8_t)
     86 NEON_TYPE2(s16, int16_t)
     87 NEON_TYPE2(u16, uint16_t)
     88 NEON_TYPE1(s32, int32_t)
     89 NEON_TYPE1(u32, uint32_t)
     90 #undef NEON_TYPE4
     91 #undef NEON_TYPE2
     92 #undef NEON_TYPE1
     93 
     94 /* Copy from a uint32_t to a vector structure type.  */
     95 #define NEON_UNPACK(vtype, dest, val) do { \
     96     union { \
     97         vtype v; \
     98         uint32_t i; \
     99     } conv_u; \
    100     conv_u.i = (val); \
    101     dest = conv_u.v; \
    102     } while(0)
    103 
    104 /* Copy from a vector structure type to a uint32_t.  */
    105 #define NEON_PACK(vtype, dest, val) do { \
    106     union { \
    107         vtype v; \
    108         uint32_t i; \
    109     } conv_u; \
    110     conv_u.v = (val); \
    111     dest = conv_u.i; \
    112     } while(0)
    113 
    114 #define NEON_DO1 \
    115     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
    116 #define NEON_DO2 \
    117     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    118     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
    119 #define NEON_DO4 \
    120     NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    121     NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    122     NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    123     NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);
    124 
    125 #define NEON_VOP_BODY(vtype, n) \
    126 { \
    127     uint32_t res; \
    128     vtype vsrc1; \
    129     vtype vsrc2; \
    130     vtype vdest; \
    131     NEON_UNPACK(vtype, vsrc1, arg1); \
    132     NEON_UNPACK(vtype, vsrc2, arg2); \
    133     NEON_DO##n; \
    134     NEON_PACK(vtype, res, vdest); \
    135     return res; \
    136 }
    137 
    138 #define NEON_VOP(name, vtype, n) \
    139 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    140 NEON_VOP_BODY(vtype, n)
    141 
    142 #define NEON_VOP_ENV(name, vtype, n) \
    143 uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \
    144 NEON_VOP_BODY(vtype, n)
    145 
    146 /* Pairwise operations.  */
    147 /* For 32-bit elements each segment only contains a single element, so
    148    the elementwise and pairwise operations are the same.  */
    149 #define NEON_PDO2 \
    150     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    151     NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
    152 #define NEON_PDO4 \
    153     NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    154     NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    155     NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    156     NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \
    157 
    158 #define NEON_POP(name, vtype, n) \
    159 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
    160 { \
    161     uint32_t res; \
    162     vtype vsrc1; \
    163     vtype vsrc2; \
    164     vtype vdest; \
    165     NEON_UNPACK(vtype, vsrc1, arg1); \
    166     NEON_UNPACK(vtype, vsrc2, arg2); \
    167     NEON_PDO##n; \
    168     NEON_PACK(vtype, res, vdest); \
    169     return res; \
    170 }
    171 
    172 /* Unary operators.  */
    173 #define NEON_VOP1(name, vtype, n) \
    174 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
    175 { \
    176     vtype vsrc1; \
    177     vtype vdest; \
    178     NEON_UNPACK(vtype, vsrc1, arg); \
    179     NEON_DO##n; \
    180     NEON_PACK(vtype, arg, vdest); \
    181     return arg; \
    182 }
    183 
    184 
    185 #define NEON_USAT(dest, src1, src2, type) do { \
    186     uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    187     if (tmp != (type)tmp) { \
    188         SET_QC(); \
    189         dest = ~0; \
    190     } else { \
    191         dest = tmp; \
    192     }} while(0)
    193 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
    194 NEON_VOP_ENV(qadd_u8, neon_u8, 4)
    195 #undef NEON_FN
    196 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
    197 NEON_VOP_ENV(qadd_u16, neon_u16, 2)
    198 #undef NEON_FN
    199 #undef NEON_USAT
    200 
    201 #define NEON_SSAT(dest, src1, src2, type) do { \
    202     int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    203     if (tmp != (type)tmp) { \
    204         SET_QC(); \
    205         if (src2 > 0) { \
    206             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
    207         } else { \
    208             tmp = 1 << (sizeof(type) * 8 - 1); \
    209         } \
    210     } \
    211     dest = tmp; \
    212     } while(0)
    213 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
    214 NEON_VOP_ENV(qadd_s8, neon_s8, 4)
    215 #undef NEON_FN
    216 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
    217 NEON_VOP_ENV(qadd_s16, neon_s16, 2)
    218 #undef NEON_FN
    219 #undef NEON_SSAT
    220 
    221 #define NEON_USAT(dest, src1, src2, type) do { \
    222     uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    223     if (tmp != (type)tmp) { \
    224         SET_QC(); \
    225         dest = 0; \
    226     } else { \
    227         dest = tmp; \
    228     }} while(0)
    229 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
    230 NEON_VOP_ENV(qsub_u8, neon_u8, 4)
    231 #undef NEON_FN
    232 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
    233 NEON_VOP_ENV(qsub_u16, neon_u16, 2)
    234 #undef NEON_FN
    235 #undef NEON_USAT
    236 
    237 #define NEON_SSAT(dest, src1, src2, type) do { \
    238     int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    239     if (tmp != (type)tmp) { \
    240         SET_QC(); \
    241         if (src2 < 0) { \
    242             tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
    243         } else { \
    244             tmp = 1 << (sizeof(type) * 8 - 1); \
    245         } \
    246     } \
    247     dest = tmp; \
    248     } while(0)
    249 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
    250 NEON_VOP_ENV(qsub_s8, neon_s8, 4)
    251 #undef NEON_FN
    252 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
    253 NEON_VOP_ENV(qsub_s16, neon_s16, 2)
    254 #undef NEON_FN
    255 #undef NEON_SSAT
    256 
    257 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
    258 NEON_VOP(hadd_s8, neon_s8, 4)
    259 NEON_VOP(hadd_u8, neon_u8, 4)
    260 NEON_VOP(hadd_s16, neon_s16, 2)
    261 NEON_VOP(hadd_u16, neon_u16, 2)
    262 #undef NEON_FN
    263 
    264 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
    265 {
    266     int32_t dest;
    267 
    268     dest = (src1 >> 1) + (src2 >> 1);
    269     if (src1 & src2 & 1)
    270         dest++;
    271     return dest;
    272 }
    273 
    274 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
    275 {
    276     uint32_t dest;
    277 
    278     dest = (src1 >> 1) + (src2 >> 1);
    279     if (src1 & src2 & 1)
    280         dest++;
    281     return dest;
    282 }
    283 
    284 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
    285 NEON_VOP(rhadd_s8, neon_s8, 4)
    286 NEON_VOP(rhadd_u8, neon_u8, 4)
    287 NEON_VOP(rhadd_s16, neon_s16, 2)
    288 NEON_VOP(rhadd_u16, neon_u16, 2)
    289 #undef NEON_FN
    290 
    291 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
    292 {
    293     int32_t dest;
    294 
    295     dest = (src1 >> 1) + (src2 >> 1);
    296     if ((src1 | src2) & 1)
    297         dest++;
    298     return dest;
    299 }
    300 
    301 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
    302 {
    303     uint32_t dest;
    304 
    305     dest = (src1 >> 1) + (src2 >> 1);
    306     if ((src1 | src2) & 1)
    307         dest++;
    308     return dest;
    309 }
    310 
    311 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
    312 NEON_VOP(hsub_s8, neon_s8, 4)
    313 NEON_VOP(hsub_u8, neon_u8, 4)
    314 NEON_VOP(hsub_s16, neon_s16, 2)
    315 NEON_VOP(hsub_u16, neon_u16, 2)
    316 #undef NEON_FN
    317 
    318 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
    319 {
    320     int32_t dest;
    321 
    322     dest = (src1 >> 1) - (src2 >> 1);
    323     if ((~src1) & src2 & 1)
    324         dest--;
    325     return dest;
    326 }
    327 
    328 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
    329 {
    330     uint32_t dest;
    331 
    332     dest = (src1 >> 1) - (src2 >> 1);
    333     if ((~src1) & src2 & 1)
    334         dest--;
    335     return dest;
    336 }
    337 
    338 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
    339 NEON_VOP(cgt_s8, neon_s8, 4)
    340 NEON_VOP(cgt_u8, neon_u8, 4)
    341 NEON_VOP(cgt_s16, neon_s16, 2)
    342 NEON_VOP(cgt_u16, neon_u16, 2)
    343 NEON_VOP(cgt_s32, neon_s32, 1)
    344 NEON_VOP(cgt_u32, neon_u32, 1)
    345 #undef NEON_FN
    346 
    347 #define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
    348 NEON_VOP(cge_s8, neon_s8, 4)
    349 NEON_VOP(cge_u8, neon_u8, 4)
    350 NEON_VOP(cge_s16, neon_s16, 2)
    351 NEON_VOP(cge_u16, neon_u16, 2)
    352 NEON_VOP(cge_s32, neon_s32, 1)
    353 NEON_VOP(cge_u32, neon_u32, 1)
    354 #undef NEON_FN
    355 
    356 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
    357 NEON_VOP(min_s8, neon_s8, 4)
    358 NEON_VOP(min_u8, neon_u8, 4)
    359 NEON_VOP(min_s16, neon_s16, 2)
    360 NEON_VOP(min_u16, neon_u16, 2)
    361 NEON_VOP(min_s32, neon_s32, 1)
    362 NEON_VOP(min_u32, neon_u32, 1)
    363 NEON_POP(pmin_s8, neon_s8, 4)
    364 NEON_POP(pmin_u8, neon_u8, 4)
    365 NEON_POP(pmin_s16, neon_s16, 2)
    366 NEON_POP(pmin_u16, neon_u16, 2)
    367 #undef NEON_FN
    368 
    369 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
    370 NEON_VOP(max_s8, neon_s8, 4)
    371 NEON_VOP(max_u8, neon_u8, 4)
    372 NEON_VOP(max_s16, neon_s16, 2)
    373 NEON_VOP(max_u16, neon_u16, 2)
    374 NEON_VOP(max_s32, neon_s32, 1)
    375 NEON_VOP(max_u32, neon_u32, 1)
    376 NEON_POP(pmax_s8, neon_s8, 4)
    377 NEON_POP(pmax_u8, neon_u8, 4)
    378 NEON_POP(pmax_s16, neon_s16, 2)
    379 NEON_POP(pmax_u16, neon_u16, 2)
    380 #undef NEON_FN
    381 
    382 #define NEON_FN(dest, src1, src2) \
    383     dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
    384 NEON_VOP(abd_s8, neon_s8, 4)
    385 NEON_VOP(abd_u8, neon_u8, 4)
    386 NEON_VOP(abd_s16, neon_s16, 2)
    387 NEON_VOP(abd_u16, neon_u16, 2)
    388 NEON_VOP(abd_s32, neon_s32, 1)
    389 NEON_VOP(abd_u32, neon_u32, 1)
    390 #undef NEON_FN
    391 
    392 #define NEON_FN(dest, src1, src2) do { \
    393     int8_t tmp; \
    394     tmp = (int8_t)src2; \
    395     if (tmp >= sizeof(src1) * 8 || tmp <= -sizeof(src1) * 8) { \
    396         dest = 0; \
    397     } else if (tmp < 0) { \
    398         dest = src1 >> -tmp; \
    399     } else { \
    400         dest = src1 << tmp; \
    401     }} while (0)
    402 NEON_VOP(shl_u8, neon_u8, 4)
    403 NEON_VOP(shl_u16, neon_u16, 2)
    404 NEON_VOP(shl_u32, neon_u32, 1)
    405 #undef NEON_FN
    406 
    407 uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
    408 {
    409     int8_t shift = (int8_t)shiftop;
    410     if (shift >= 64 || shift <= -64) {
    411         val = 0;
    412     } else if (shift < 0) {
    413         val >>= -shift;
    414     } else {
    415         val <<= shift;
    416     }
    417     return val;
    418 }
    419 
    420 #define NEON_FN(dest, src1, src2) do { \
    421     int8_t tmp; \
    422     tmp = (int8_t)src2; \
    423     if (tmp >= sizeof(src1) * 8) { \
    424         dest = 0; \
    425     } else if (tmp <= -sizeof(src1) * 8) { \
    426         dest = src1 >> (sizeof(src1) * 8 - 1); \
    427     } else if (tmp < 0) { \
    428         dest = src1 >> -tmp; \
    429     } else { \
    430         dest = src1 << tmp; \
    431     }} while (0)
    432 NEON_VOP(shl_s8, neon_s8, 4)
    433 NEON_VOP(shl_s16, neon_s16, 2)
    434 NEON_VOP(shl_s32, neon_s32, 1)
    435 #undef NEON_FN
    436 
    437 uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
    438 {
    439     int8_t shift = (int8_t)shiftop;
    440     int64_t val = valop;
    441     if (shift >= 64) {
    442         val = 0;
    443     } else if (shift <= -64) {
    444         val >>= 63;
    445     } else if (shift < 0) {
    446         val >>= -shift;
    447     } else {
    448         val <<= shift;
    449     }
    450     return val;
    451 }
    452 
    453 #define NEON_FN(dest, src1, src2) do { \
    454     int8_t tmp; \
    455     tmp = (int8_t)src2; \
    456     if (tmp >= sizeof(src1) * 8) { \
    457         dest = 0; \
    458     } else if (tmp < -sizeof(src1) * 8) { \
    459         dest = src1 >> (sizeof(src1) * 8 - 1); \
    460     } else if (tmp == -sizeof(src1) * 8) { \
    461         dest = src1 >> (tmp - 1); \
    462         dest++; \
    463         dest >>= 1; \
    464     } else if (tmp < 0) { \
    465         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    466     } else { \
    467         dest = src1 << tmp; \
    468     }} while (0)
    469 NEON_VOP(rshl_s8, neon_s8, 4)
    470 NEON_VOP(rshl_s16, neon_s16, 2)
    471 NEON_VOP(rshl_s32, neon_s32, 1)
    472 #undef NEON_FN
    473 
    474 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
    475 {
    476     int8_t shift = (int8_t)shiftop;
    477     int64_t val = valop;
    478     if (shift >= 64) {
    479         val = 0;
    480     } else if (shift < -64) {
    481         val >>= 63;
    482     } else if (shift == -63) {
    483         val >>= 63;
    484         val++;
    485         val >>= 1;
    486     } else if (shift < 0) {
    487         val = (val + ((int64_t)1 << (-1 - shift))) >> -shift;
    488     } else {
    489         val <<= shift;
    490     }
    491     return val;
    492 }
    493 
    494 #define NEON_FN(dest, src1, src2) do { \
    495     int8_t tmp; \
    496     tmp = (int8_t)src2; \
    497     if (tmp >= sizeof(src1) * 8 || tmp < -sizeof(src1) * 8) { \
    498         dest = 0; \
    499     } else if (tmp == -sizeof(src1) * 8) { \
    500         dest = src1 >> (tmp - 1); \
    501     } else if (tmp < 0) { \
    502         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    503     } else { \
    504         dest = src1 << tmp; \
    505     }} while (0)
    506 NEON_VOP(rshl_u8, neon_u8, 4)
    507 NEON_VOP(rshl_u16, neon_u16, 2)
    508 NEON_VOP(rshl_u32, neon_u32, 1)
    509 #undef NEON_FN
    510 
    511 uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
    512 {
    513     int8_t shift = (uint8_t)shiftop;
    514     if (shift >= 64 || shift < 64) {
    515         val = 0;
    516     } else if (shift == -64) {
    517         /* Rounding a 1-bit result just preserves that bit.  */
    518         val >>= 63;
    519     } if (shift < 0) {
    520         val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
    521         val >>= -shift;
    522     } else {
    523         val <<= shift;
    524     }
    525     return val;
    526 }
    527 
    528 #define NEON_FN(dest, src1, src2) do { \
    529     int8_t tmp; \
    530     tmp = (int8_t)src2; \
    531     if (tmp >= sizeof(src1) * 8) { \
    532         if (src1) { \
    533             SET_QC(); \
    534             dest = ~0; \
    535         } else { \
    536             dest = 0; \
    537         } \
    538     } else if (tmp <= -sizeof(src1) * 8) { \
    539         dest = 0; \
    540     } else if (tmp < 0) { \
    541         dest = src1 >> -tmp; \
    542     } else { \
    543         dest = src1 << tmp; \
    544         if ((dest >> tmp) != src1) { \
    545             SET_QC(); \
    546             dest = ~0; \
    547         } \
    548     }} while (0)
    549 NEON_VOP_ENV(qshl_u8, neon_u8, 4)
    550 NEON_VOP_ENV(qshl_u16, neon_u16, 2)
    551 NEON_VOP_ENV(qshl_u32, neon_u32, 1)
    552 #undef NEON_FN
    553 
    554 uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
    555 {
    556     int8_t shift = (int8_t)shiftop;
    557     if (shift >= 64) {
    558         if (val) {
    559             val = ~(uint64_t)0;
    560             SET_QC();
    561         } else {
    562             val = 0;
    563         }
    564     } else if (shift <= -64) {
    565         val = 0;
    566     } else if (shift < 0) {
    567         val >>= -shift;
    568     } else {
    569         uint64_t tmp = val;
    570         val <<= shift;
    571         if ((val >> shift) != tmp) {
    572             SET_QC();
    573             val = ~(uint64_t)0;
    574         }
    575     }
    576     return val;
    577 }
    578 
    579 #define NEON_FN(dest, src1, src2) do { \
    580     int8_t tmp; \
    581     tmp = (int8_t)src2; \
    582     if (tmp >= sizeof(src1) * 8) { \
    583         if (src1) \
    584             SET_QC(); \
    585         dest = src1 >> 31; \
    586     } else if (tmp <= -sizeof(src1) * 8) { \
    587         dest = src1 >> 31; \
    588     } else if (tmp < 0) { \
    589         dest = src1 >> -tmp; \
    590     } else { \
    591         dest = src1 << tmp; \
    592         if ((dest >> tmp) != src1) { \
    593             SET_QC(); \
    594             dest = src2 >> 31; \
    595         } \
    596     }} while (0)
    597 NEON_VOP_ENV(qshl_s8, neon_s8, 4)
    598 NEON_VOP_ENV(qshl_s16, neon_s16, 2)
    599 NEON_VOP_ENV(qshl_s32, neon_s32, 1)
    600 #undef NEON_FN
    601 
    602 uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
    603 {
    604     int8_t shift = (uint8_t)shiftop;
    605     int64_t val = valop;
    606     if (shift >= 64) {
    607         if (val) {
    608             SET_QC();
    609             val = (val >> 63) & ~SIGNBIT64;
    610         }
    611     } else if (shift <= 64) {
    612         val >>= 63;
    613     } else if (shift < 0) {
    614         val >>= -shift;
    615     } else {
    616         int64_t tmp = val;
    617         val <<= shift;
    618         if ((val >> shift) != tmp) {
    619             SET_QC();
    620             val = (tmp >> 63) ^ ~SIGNBIT64;
    621         }
    622     }
    623     return val;
    624 }
    625 
    626 
    627 /* FIXME: This is wrong.  */
    628 #define NEON_FN(dest, src1, src2) do { \
    629     int8_t tmp; \
    630     tmp = (int8_t)src2; \
    631     if (tmp < 0) { \
    632         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    633     } else { \
    634         dest = src1 << tmp; \
    635         if ((dest >> tmp) != src1) { \
    636             SET_QC(); \
    637             dest = ~0; \
    638         } \
    639     }} while (0)
    640 NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
    641 NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
    642 NEON_VOP_ENV(qrshl_u32, neon_u32, 1)
    643 #undef NEON_FN
    644 
    645 uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
    646 {
    647     int8_t shift = (int8_t)shiftop;
    648     if (shift < 0) {
    649         val = (val + (1 << (-1 - shift))) >> -shift;
    650     } else { \
    651         uint64_t tmp = val;
    652         val <<= shift;
    653         if ((val >> shift) != tmp) {
    654             SET_QC();
    655             val = ~0;
    656         }
    657     }
    658     return val;
    659 }
    660 
    661 #define NEON_FN(dest, src1, src2) do { \
    662     int8_t tmp; \
    663     tmp = (int8_t)src2; \
    664     if (tmp < 0) { \
    665         dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    666     } else { \
    667         dest = src1 << tmp; \
    668         if ((dest >> tmp) != src1) { \
    669             SET_QC(); \
    670             dest = src1 >> 31; \
    671         } \
    672     }} while (0)
    673 NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
    674 NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
    675 NEON_VOP_ENV(qrshl_s32, neon_s32, 1)
    676 #undef NEON_FN
    677 
    678 uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
    679 {
    680     int8_t shift = (uint8_t)shiftop;
    681     int64_t val = valop;
    682 
    683     if (shift < 0) {
    684         val = (val + (1 << (-1 - shift))) >> -shift;
    685     } else {
    686         int64_t tmp = val;;
    687         val <<= shift;
    688         if ((val >> shift) != tmp) {
    689             SET_QC();
    690             val = tmp >> 31;
    691         }
    692     }
    693     return val;
    694 }
    695 
    696 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
    697 {
    698     uint32_t mask;
    699     mask = (a ^ b) & 0x80808080u;
    700     a &= ~0x80808080u;
    701     b &= ~0x80808080u;
    702     return (a + b) ^ mask;
    703 }
    704 
    705 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
    706 {
    707     uint32_t mask;
    708     mask = (a ^ b) & 0x80008000u;
    709     a &= ~0x80008000u;
    710     b &= ~0x80008000u;
    711     return (a + b) ^ mask;
    712 }
    713 
    714 #define NEON_FN(dest, src1, src2) dest = src1 + src2
    715 NEON_POP(padd_u8, neon_u8, 4)
    716 NEON_POP(padd_u16, neon_u16, 2)
    717 #undef NEON_FN
    718 
    719 #define NEON_FN(dest, src1, src2) dest = src1 - src2
    720 NEON_VOP(sub_u8, neon_u8, 4)
    721 NEON_VOP(sub_u16, neon_u16, 2)
    722 #undef NEON_FN
    723 
    724 #define NEON_FN(dest, src1, src2) dest = src1 * src2
    725 NEON_VOP(mul_u8, neon_u8, 4)
    726 NEON_VOP(mul_u16, neon_u16, 2)
    727 #undef NEON_FN
    728 
    729 /* Polynomial multiplication is like integer multiplication except the
    730    partial products are XORed, not added.  */
    731 uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
    732 {
    733     uint32_t mask;
    734     uint32_t result;
    735     result = 0;
    736     while (op1) {
    737         mask = 0;
    738         if (op1 & 1)
    739             mask |= 0xff;
    740         if (op1 & (1 << 8))
    741             mask |= (0xff << 8);
    742         if (op1 & (1 << 16))
    743             mask |= (0xff << 16);
    744         if (op1 & (1 << 24))
    745             mask |= (0xff << 24);
    746         result ^= op2 & mask;
    747         op1 = (op1 >> 1) & 0x7f7f7f7f;
    748         op2 = (op2 << 1) & 0xfefefefe;
    749     }
    750     return result;
    751 }
    752 
    753 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
    754 NEON_VOP(tst_u8, neon_u8, 4)
    755 NEON_VOP(tst_u16, neon_u16, 2)
    756 NEON_VOP(tst_u32, neon_u32, 1)
    757 #undef NEON_FN
    758 
    759 #define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
    760 NEON_VOP(ceq_u8, neon_u8, 4)
    761 NEON_VOP(ceq_u16, neon_u16, 2)
    762 NEON_VOP(ceq_u32, neon_u32, 1)
    763 #undef NEON_FN
    764 
    765 #define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
    766 NEON_VOP1(abs_s8, neon_s8, 4)
    767 NEON_VOP1(abs_s16, neon_s16, 2)
    768 #undef NEON_FN
    769 
    770 /* Count Leading Sign/Zero Bits.  */
    771 static inline int do_clz8(uint8_t x)
    772 {
    773     int n;
    774     for (n = 8; x; n--)
    775         x >>= 1;
    776     return n;
    777 }
    778 
    779 static inline int do_clz16(uint16_t x)
    780 {
    781     int n;
    782     for (n = 16; x; n--)
    783         x >>= 1;
    784     return n;
    785 }
    786 
    787 #define NEON_FN(dest, src, dummy) dest = do_clz8(src)
    788 NEON_VOP1(clz_u8, neon_u8, 4)
    789 #undef NEON_FN
    790 
    791 #define NEON_FN(dest, src, dummy) dest = do_clz16(src)
    792 NEON_VOP1(clz_u16, neon_u16, 2)
    793 #undef NEON_FN
    794 
    795 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
    796 NEON_VOP1(cls_s8, neon_s8, 4)
    797 #undef NEON_FN
    798 
    799 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
    800 NEON_VOP1(cls_s16, neon_s16, 2)
    801 #undef NEON_FN
    802 
    803 uint32_t HELPER(neon_cls_s32)(uint32_t x)
    804 {
    805     int count;
    806     if ((int32_t)x < 0)
    807         x = ~x;
    808     for (count = 32; x; count--)
    809         x = x >> 1;
    810     return count - 1;
    811 }
    812 
    813 /* Bit count.  */
    814 uint32_t HELPER(neon_cnt_u8)(uint32_t x)
    815 {
    816     x = (x & 0x55555555) + ((x >>  1) & 0x55555555);
    817     x = (x & 0x33333333) + ((x >>  2) & 0x33333333);
    818     x = (x & 0x0f0f0f0f) + ((x >>  4) & 0x0f0f0f0f);
    819     return x;
    820 }
    821 
    822 #define NEON_QDMULH16(dest, src1, src2, round) do { \
    823     uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    824     if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
    825         SET_QC(); \
    826         tmp = (tmp >> 31) ^ ~SIGNBIT; \
    827     } \
    828     tmp <<= 1; \
    829     if (round) { \
    830         int32_t old = tmp; \
    831         tmp += 1 << 15; \
    832         if ((int32_t)tmp < old) { \
    833             SET_QC(); \
    834             tmp = SIGNBIT - 1; \
    835         } \
    836     } \
    837     dest = tmp >> 16; \
    838     } while(0)
    839 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
    840 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
    841 #undef NEON_FN
    842 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
    843 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
    844 #undef NEON_FN
    845 #undef NEON_QDMULH16
    846 
    847 #define NEON_QDMULH32(dest, src1, src2, round) do { \
    848     uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    849     if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
    850         SET_QC(); \
    851         tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    852     } else { \
    853         tmp <<= 1; \
    854     } \
    855     if (round) { \
    856         int64_t old = tmp; \
    857         tmp += (int64_t)1 << 31; \
    858         if ((int64_t)tmp < old) { \
    859             SET_QC(); \
    860             tmp = SIGNBIT64 - 1; \
    861         } \
    862     } \
    863     dest = tmp >> 32; \
    864     } while(0)
    865 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
    866 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
    867 #undef NEON_FN
    868 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
    869 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
    870 #undef NEON_FN
    871 #undef NEON_QDMULH32
    872 
    873 uint32_t HELPER(neon_narrow_u8)(uint64_t x)
    874 {
    875     return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
    876            | ((x >> 24) & 0xff000000u);
    877 }
    878 
    879 uint32_t HELPER(neon_narrow_u16)(uint64_t x)
    880 {
    881     return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
    882 }
    883 
    884 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
    885 {
    886     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
    887             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
    888 }
    889 
    890 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
    891 {
    892     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
    893 }
    894 
    895 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
    896 {
    897     x &= 0xff80ff80ff80ff80ull;
    898     x += 0x0080008000800080ull;
    899     return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
    900             | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
    901 }
    902 
    903 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
    904 {
    905     x &= 0xffff8000ffff8000ull;
    906     x += 0x0000800000008000ull;
    907     return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
    908 }
    909 
    910 uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x)
    911 {
    912     uint16_t s;
    913     uint8_t d;
    914     uint32_t res = 0;
    915 #define SAT8(n) \
    916     s = x >> n; \
    917     if (s > 0xff) { \
    918         d = 0xff; \
    919         SET_QC(); \
    920     } else  { \
    921         d = s; \
    922     } \
    923     res |= (uint32_t)d << (n / 2);
    924 
    925     SAT8(0);
    926     SAT8(16);
    927     SAT8(32);
    928     SAT8(48);
    929 #undef SAT8
    930     return res;
    931 }
    932 
    933 uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x)
    934 {
    935     int16_t s;
    936     uint8_t d;
    937     uint32_t res = 0;
    938 #define SAT8(n) \
    939     s = x >> n; \
    940     if (s != (int8_t)s) { \
    941         d = (s >> 15) ^ 0x7f; \
    942         SET_QC(); \
    943     } else  { \
    944         d = s; \
    945     } \
    946     res |= (uint32_t)d << (n / 2);
    947 
    948     SAT8(0);
    949     SAT8(16);
    950     SAT8(32);
    951     SAT8(48);
    952 #undef SAT8
    953     return res;
    954 }
    955 
    956 uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x)
    957 {
    958     uint32_t high;
    959     uint32_t low;
    960     low = x;
    961     if (low > 0xffff) {
    962         low = 0xffff;
    963         SET_QC();
    964     }
    965     high = x >> 32;
    966     if (high > 0xffff) {
    967         high = 0xffff;
    968         SET_QC();
    969     }
    970     return low | (high << 16);
    971 }
    972 
    973 uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x)
    974 {
    975     int32_t low;
    976     int32_t high;
    977     low = x;
    978     if (low != (int16_t)low) {
    979         low = (low >> 31) ^ 0x7fff;
    980         SET_QC();
    981     }
    982     high = x >> 32;
    983     if (high != (int16_t)high) {
    984         high = (high >> 31) ^ 0x7fff;
    985         SET_QC();
    986     }
    987     return (uint16_t)low | (high << 16);
    988 }
    989 
    990 uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x)
    991 {
    992     if (x > 0xffffffffu) {
    993         SET_QC();
    994         return 0xffffffffu;
    995     }
    996     return x;
    997 }
    998 
    999 uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x)
   1000 {
   1001     if ((int64_t)x != (int32_t)x) {
   1002         SET_QC();
   1003         return (x >> 63) ^ 0x7fffffff;
   1004     }
   1005     return x;
   1006 }
   1007 
   1008 uint64_t HELPER(neon_widen_u8)(uint32_t x)
   1009 {
   1010     uint64_t tmp;
   1011     uint64_t ret;
   1012     ret = (uint8_t)x;
   1013     tmp = (uint8_t)(x >> 8);
   1014     ret |= tmp << 16;
   1015     tmp = (uint8_t)(x >> 16);
   1016     ret |= tmp << 32;
   1017     tmp = (uint8_t)(x >> 24);
   1018     ret |= tmp << 48;
   1019     return ret;
   1020 }
   1021 
   1022 uint64_t HELPER(neon_widen_s8)(uint32_t x)
   1023 {
   1024     uint64_t tmp;
   1025     uint64_t ret;
   1026     ret = (uint16_t)(int8_t)x;
   1027     tmp = (uint16_t)(int8_t)(x >> 8);
   1028     ret |= tmp << 16;
   1029     tmp = (uint16_t)(int8_t)(x >> 16);
   1030     ret |= tmp << 32;
   1031     tmp = (uint16_t)(int8_t)(x >> 24);
   1032     ret |= tmp << 48;
   1033     return ret;
   1034 }
   1035 
   1036 uint64_t HELPER(neon_widen_u16)(uint32_t x)
   1037 {
   1038     uint64_t high = (uint16_t)(x >> 16);
   1039     return ((uint16_t)x) | (high << 32);
   1040 }
   1041 
   1042 uint64_t HELPER(neon_widen_s16)(uint32_t x)
   1043 {
   1044     uint64_t high = (int16_t)(x >> 16);
   1045     return ((uint32_t)(int16_t)x) | (high << 32);
   1046 }
   1047 
   1048 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
   1049 {
   1050     uint64_t mask;
   1051     mask = (a ^ b) & 0x8000800080008000ull;
   1052     a &= ~0x8000800080008000ull;
   1053     b &= ~0x8000800080008000ull;
   1054     return (a + b) ^ mask;
   1055 }
   1056 
   1057 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
   1058 {
   1059     uint64_t mask;
   1060     mask = (a ^ b) & 0x8000000080000000ull;
   1061     a &= ~0x8000000080000000ull;
   1062     b &= ~0x8000000080000000ull;
   1063     return (a + b) ^ mask;
   1064 }
   1065 
   1066 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
   1067 {
   1068     uint64_t tmp;
   1069     uint64_t tmp2;
   1070 
   1071     tmp = a & 0x0000ffff0000ffffull;
   1072     tmp += (a >> 16) & 0x0000ffff0000ffffull;
   1073     tmp2 = b & 0xffff0000ffff0000ull;
   1074     tmp2 += (b << 16) & 0xffff0000ffff0000ull;
   1075     return    ( tmp         & 0xffff)
   1076             | ((tmp  >> 16) & 0xffff0000ull)
   1077             | ((tmp2 << 16) & 0xffff00000000ull)
   1078             | ( tmp2        & 0xffff000000000000ull);
   1079 }
   1080 
   1081 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
   1082 {
   1083     uint32_t low = a + (a >> 32);
   1084     uint32_t high = b + (b >> 32);
   1085     return low + ((uint64_t)high << 32);
   1086 }
   1087 
   1088 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
   1089 {
   1090     uint64_t mask;
   1091     mask = (a ^ ~b) & 0x8000800080008000ull;
   1092     a |= 0x8000800080008000ull;
   1093     b &= ~0x8000800080008000ull;
   1094     return (a - b) ^ mask;
   1095 }
   1096 
   1097 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
   1098 {
   1099     uint64_t mask;
   1100     mask = (a ^ ~b) & 0x8000000080000000ull;
   1101     a |= 0x8000000080000000ull;
   1102     b &= ~0x8000000080000000ull;
   1103     return (a - b) ^ mask;
   1104 }
   1105 
   1106 uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b)
   1107 {
   1108     uint32_t x, y;
   1109     uint32_t low, high;
   1110 
   1111     x = a;
   1112     y = b;
   1113     low = x + y;
   1114     if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1115         SET_QC();
   1116         low = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1117     }
   1118     x = a >> 32;
   1119     y = b >> 32;
   1120     high = x + y;
   1121     if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
   1122         SET_QC();
   1123         high = ((int32_t)x >> 31) ^ ~SIGNBIT;
   1124     }
   1125     return low | ((uint64_t)high << 32);
   1126 }
   1127 
   1128 uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b)
   1129 {
   1130     uint64_t result;
   1131 
   1132     result = a + b;
   1133     if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
   1134         SET_QC();
   1135         result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
   1136     }
   1137     return result;
   1138 }
   1139 
   1140 #define DO_ABD(dest, x, y, type) do { \
   1141     type tmp_x = x; \
   1142     type tmp_y = y; \
   1143     dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
   1144     } while(0)
   1145 
   1146 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
   1147 {
   1148     uint64_t tmp;
   1149     uint64_t result;
   1150     DO_ABD(result, a, b, uint8_t);
   1151     DO_ABD(tmp, a >> 8, b >> 8, uint8_t);
   1152     result |= tmp << 16;
   1153     DO_ABD(tmp, a >> 16, b >> 16, uint8_t);
   1154     result |= tmp << 32;
   1155     DO_ABD(tmp, a >> 24, b >> 24, uint8_t);
   1156     result |= tmp << 48;
   1157     return result;
   1158 }
   1159 
   1160 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
   1161 {
   1162     uint64_t tmp;
   1163     uint64_t result;
   1164     DO_ABD(result, a, b, int8_t);
   1165     DO_ABD(tmp, a >> 8, b >> 8, int8_t);
   1166     result |= tmp << 16;
   1167     DO_ABD(tmp, a >> 16, b >> 16, int8_t);
   1168     result |= tmp << 32;
   1169     DO_ABD(tmp, a >> 24, b >> 24, int8_t);
   1170     result |= tmp << 48;
   1171     return result;
   1172 }
   1173 
   1174 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
   1175 {
   1176     uint64_t tmp;
   1177     uint64_t result;
   1178     DO_ABD(result, a, b, uint16_t);
   1179     DO_ABD(tmp, a >> 16, b >> 16, uint16_t);
   1180     return result | (tmp << 32);
   1181 }
   1182 
   1183 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
   1184 {
   1185     uint64_t tmp;
   1186     uint64_t result;
   1187     DO_ABD(result, a, b, int16_t);
   1188     DO_ABD(tmp, a >> 16, b >> 16, int16_t);
   1189     return result | (tmp << 32);
   1190 }
   1191 
   1192 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
   1193 {
   1194     uint64_t result;
   1195     DO_ABD(result, a, b, uint32_t);
   1196     return result;
   1197 }
   1198 
   1199 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
   1200 {
   1201     uint64_t result;
   1202     DO_ABD(result, a, b, int32_t);
   1203     return result;
   1204 }
   1205 #undef DO_ABD
   1206 
   1207 /* Widening multiply. Named type is the source type.  */
   1208 #define DO_MULL(dest, x, y, type1, type2) do { \
   1209     type1 tmp_x = x; \
   1210     type1 tmp_y = y; \
   1211     dest = (type2)((type2)tmp_x * (type2)tmp_y); \
   1212     } while(0)
   1213 
   1214 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
   1215 {
   1216     uint64_t tmp;
   1217     uint64_t result;
   1218 
   1219     DO_MULL(result, a, b, uint8_t, uint16_t);
   1220     DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
   1221     result |= tmp << 16;
   1222     DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
   1223     result |= tmp << 32;
   1224     DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
   1225     result |= tmp << 48;
   1226     return result;
   1227 }
   1228 
   1229 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
   1230 {
   1231     uint64_t tmp;
   1232     uint64_t result;
   1233 
   1234     DO_MULL(result, a, b, int8_t, uint16_t);
   1235     DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
   1236     result |= tmp << 16;
   1237     DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
   1238     result |= tmp << 32;
   1239     DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
   1240     result |= tmp << 48;
   1241     return result;
   1242 }
   1243 
   1244 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
   1245 {
   1246     uint64_t tmp;
   1247     uint64_t result;
   1248 
   1249     DO_MULL(result, a, b, uint16_t, uint32_t);
   1250     DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
   1251     return result | (tmp << 32);
   1252 }
   1253 
   1254 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
   1255 {
   1256     uint64_t tmp;
   1257     uint64_t result;
   1258 
   1259     DO_MULL(result, a, b, int16_t, uint32_t);
   1260     DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
   1261     return result | (tmp << 32);
   1262 }
   1263 
   1264 uint64_t HELPER(neon_negl_u16)(uint64_t x)
   1265 {
   1266     uint16_t tmp;
   1267     uint64_t result;
   1268     result = (uint16_t)-x;
   1269     tmp = -(x >> 16);
   1270     result |= (uint64_t)tmp << 16;
   1271     tmp = -(x >> 32);
   1272     result |= (uint64_t)tmp << 32;
   1273     tmp = -(x >> 48);
   1274     result |= (uint64_t)tmp << 48;
   1275     return result;
   1276 }
   1277 
   1278 #include <stdio.h>
   1279 uint64_t HELPER(neon_negl_u32)(uint64_t x)
   1280 {
   1281     uint32_t low = -x;
   1282     uint32_t high = -(x >> 32);
   1283     return low | ((uint64_t)high << 32);
   1284 }
   1285 
   1286 /* FIXME:  There should be a native op for this.  */
   1287 uint64_t HELPER(neon_negl_u64)(uint64_t x)
   1288 {
   1289     return -x;
   1290 }
   1291 
/* Saturating sign manipulation.  */
   1293 /* ??? Make these use NEON_VOP1 */
   1294 #define DO_QABS8(x) do { \
   1295     if (x == (int8_t)0x80) { \
   1296         x = 0x7f; \
   1297         SET_QC(); \
   1298     } else if (x < 0) { \
   1299         x = -x; \
   1300     }} while (0)
   1301 uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x)
   1302 {
   1303     neon_s8 vec;
   1304     NEON_UNPACK(neon_s8, vec, x);
   1305     DO_QABS8(vec.v1);
   1306     DO_QABS8(vec.v2);
   1307     DO_QABS8(vec.v3);
   1308     DO_QABS8(vec.v4);
   1309     NEON_PACK(neon_s8, x, vec);
   1310     return x;
   1311 }
   1312 #undef DO_QABS8
   1313 
   1314 #define DO_QNEG8(x) do { \
   1315     if (x == (int8_t)0x80) { \
   1316         x = 0x7f; \
   1317         SET_QC(); \
   1318     } else { \
   1319         x = -x; \
   1320     }} while (0)
   1321 uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x)
   1322 {
   1323     neon_s8 vec;
   1324     NEON_UNPACK(neon_s8, vec, x);
   1325     DO_QNEG8(vec.v1);
   1326     DO_QNEG8(vec.v2);
   1327     DO_QNEG8(vec.v3);
   1328     DO_QNEG8(vec.v4);
   1329     NEON_PACK(neon_s8, x, vec);
   1330     return x;
   1331 }
   1332 #undef DO_QNEG8
   1333 
   1334 #define DO_QABS16(x) do { \
   1335     if (x == (int16_t)0x8000) { \
   1336         x = 0x7fff; \
   1337         SET_QC(); \
   1338     } else if (x < 0) { \
   1339         x = -x; \
   1340     }} while (0)
   1341 uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x)
   1342 {
   1343     neon_s16 vec;
   1344     NEON_UNPACK(neon_s16, vec, x);
   1345     DO_QABS16(vec.v1);
   1346     DO_QABS16(vec.v2);
   1347     NEON_PACK(neon_s16, x, vec);
   1348     return x;
   1349 }
   1350 #undef DO_QABS16
   1351 
   1352 #define DO_QNEG16(x) do { \
   1353     if (x == (int16_t)0x8000) { \
   1354         x = 0x7fff; \
   1355         SET_QC(); \
   1356     } else { \
   1357         x = -x; \
   1358     }} while (0)
   1359 uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x)
   1360 {
   1361     neon_s16 vec;
   1362     NEON_UNPACK(neon_s16, vec, x);
   1363     DO_QNEG16(vec.v1);
   1364     DO_QNEG16(vec.v2);
   1365     NEON_PACK(neon_s16, x, vec);
   1366     return x;
   1367 }
   1368 #undef DO_QNEG16
   1369 
   1370 uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x)
   1371 {
   1372     if (x == SIGNBIT) {
   1373         SET_QC();
   1374         x = ~SIGNBIT;
   1375     } else if ((int32_t)x < 0) {
   1376         x = -x;
   1377     }
   1378     return x;
   1379 }
   1380 
   1381 uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x)
   1382 {
   1383     if (x == SIGNBIT) {
   1384         SET_QC();
   1385         x = ~SIGNBIT;
   1386     } else {
   1387         x = -x;
   1388     }
   1389     return x;
   1390 }
   1391 
   1392 /* NEON Float helpers.  */
   1393 uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
   1394 {
   1395     float32 f0 = vfp_itos(a);
   1396     float32 f1 = vfp_itos(b);
   1397     return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b;
   1398 }
   1399 
   1400 uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
   1401 {
   1402     float32 f0 = vfp_itos(a);
   1403     float32 f1 = vfp_itos(b);
   1404     return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b;
   1405 }
   1406 
   1407 uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
   1408 {
   1409     float32 f0 = vfp_itos(a);
   1410     float32 f1 = vfp_itos(b);
   1411     return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1)
   1412                     ? float32_sub(f0, f1, NFS)
   1413                     : float32_sub(f1, f0, NFS));
   1414 }
   1415 
   1416 uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
   1417 {
   1418     return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS));
   1419 }
   1420 
   1421 uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
   1422 {
   1423     return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS));
   1424 }
   1425 
   1426 uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
   1427 {
   1428     return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS));
   1429 }
   1430 
   1431 /* Floating point comparisons produce an integer result.  */
   1432 #define NEON_VOP_FCMP(name, cmp) \
   1433 uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \
   1434 { \
   1435     if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \
   1436         return ~0; \
   1437     else \
   1438         return 0; \
   1439 }
   1440 
   1441 NEON_VOP_FCMP(ceq_f32, ==)
   1442 NEON_VOP_FCMP(cge_f32, >=)
   1443 NEON_VOP_FCMP(cgt_f32, >)
   1444 
   1445 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
   1446 {
   1447     float32 f0 = float32_abs(vfp_itos(a));
   1448     float32 f1 = float32_abs(vfp_itos(b));
   1449     return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0;
   1450 }
   1451 
   1452 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
   1453 {
   1454     float32 f0 = float32_abs(vfp_itos(a));
   1455     float32 f1 = float32_abs(vfp_itos(b));
   1456     return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0;
   1457 }
   1458