/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include <stdlib.h>
#include <stdio.h>

#include "cpu.h"
#include "exec/exec-all.h"
#include "helper.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type. */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t. */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

#define NEON_VOP_ENV(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(CPUARMState *env, uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)
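/*
 * Illustrative sketch (not part of the original file): the lane layout the
 * NEON_UNPACK/NEON_PACK union trick relies on.  On a little-endian host,
 * v1 is the least significant lane of the packed uint32_t.  Guarded out of
 * the QEMU build; assumes a standalone little-endian host compile.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    union {
        struct { uint8_t v1, v2, v3, v4; } v;
        uint32_t i;
    } conv_u;

    conv_u.i = 0x44332211;          /* pack four byte lanes */
    assert(conv_u.v.v1 == 0x11);    /* lowest lane = lowest byte */
    assert(conv_u.v.v4 == 0x44);    /* highest lane = highest byte */
    return 0;
}
#endif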
/* Pairwise operations. */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same. */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

/* Unary operators. */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qadd_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

uint64_t HELPER(neon_qadd_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qadd_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qadd_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}
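/*
 * Worked example (illustrative, not part of the original file): the
 * overflow test used by neon_qadd_s32/s64 above.  Signed overflow can only
 * happen when both operands have the same sign and the result's sign
 * differs from theirs; the saturated value is then INT_MAX or INT_MIN
 * depending on the first operand's sign.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t a = 0x7fffffffu, b = 1u;
    uint32_t res = a + b;                        /* wraps to 0x80000000 */
    assert(((res ^ a) & 0x80000000u) && !((a ^ b) & 0x80000000u));
    res = ~(((int32_t)a >> 31) ^ 0x80000000u);   /* saturate */
    assert(res == 0x7fffffffu);                  /* INT32_MAX */
    return 0;
}
#endif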
#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP_ENV(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP_ENV(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qsub_u32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

uint64_t HELPER(neon_qsub_u64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP_ENV(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP_ENV(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qsub_s32)(CPUARMState *env, uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qsub_s64)(CPUARMState *env, uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}
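/*
 * Worked example (illustrative, not part of the original file): the
 * halving-add identity used by the 32-bit helpers above.  (a + b) >> 1 is
 * computed without a wider type as (a >> 1) + (b >> 1) plus a carry-in of
 * 1 iff both low bits are set (either low bit for the rounding VRHADD).
 * Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t a = 3, b = 4;
    uint32_t hadd  = (a >> 1) + (b >> 1) + (a & b & 1);   /* (3+4) >> 1   */
    uint32_t rhadd = (a >> 1) + (b >> 1) + ((a | b) & 1); /* (3+4+1) >> 1 */
    assert(hadd == 3 && rhadd == 4);
    return 0;
}
#endif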
#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_u8, neon_u8, 4)
NEON_VOP(shl_u16, neon_u16, 2)
NEON_VOP(shl_u32, neon_u32, 1)
#undef NEON_FN

uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64 || shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_s8, neon_s8, 4)
NEON_VOP(shl_s16, neon_s16, 2)
NEON_VOP(shl_s32, neon_s32, 1)
#undef NEON_FN

uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        val = 0;
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}
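/*
 * Worked example (illustrative, not part of the original file): NEON
 * register-shift semantics as implemented above.  Only the bottom byte of
 * the shift operand is used, as a signed count; a negative count shifts
 * right, and out-of-range counts flush to zero.  Standalone sketch,
 * guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t demo_shl_u64(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;        /* only the low byte matters */
    if (shift >= 64 || shift <= -64) {
        return 0;
    }
    return shift < 0 ? val >> -shift : val << shift;
}

int main(void)
{
    assert(demo_shl_u64(1, 3) == 8);            /* plain left shift      */
    assert(demo_shl_u64(0x10, 0xfc) == 0x1);    /* low byte 0xfc = -4    */
    assert(demo_shl_u64(0x10, 0x140) == 0);     /* low byte 0x40 = 64    */
    return 0;
}
#endif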
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator. */
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if ((shift >= 32) || (shift <= -32)) {
        dest = 0;
    } else if (shift < 0) {
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values. */
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if ((shift >= 64) || (shift <= -64)) {
        val = 0;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly. */
            val = 0x4000000000000000LL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}
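/*
 * Worked example (illustrative, not part of the original file): rounding
 * right shift.  VRSHL with a negative count adds the rounding constant
 * 1 << (n - 1) before shifting by n, so 7 rounded-shift-right by 1 gives
 * 4, not 3.  neon_rshl_s64 above gets the same result without a wider
 * accumulator by shifting by n - 1 first, then adding 1 and halving.
 * Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    int64_t val = 7;
    int n = 1;
    /* wide-accumulator form, as in neon_rshl_s32 */
    int64_t wide = (val + (1 << (n - 1))) >> n;
    /* overflow-safe form, as in neon_rshl_s64 */
    int64_t safe = ((val >> (n - 1)) + 1) >> 1;
    assert(wide == 4 && safe == 4);
    return 0;
}
#endif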
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (-tmp - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator. */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
{
    uint32_t dest;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32 || shift < -32) {
        dest = 0;
    } else if (shift == -32) {
        dest = val >> 31;
    } else if (shift < 0) {
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values. */
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    if (shift >= 64 || shift < -64) {
        val = 0;
    } else if (shift == -64) {
        /* Rounding a 1-bit result just preserves that bit. */
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly. */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qshl_u8, neon_u8, 4)
NEON_VOP_ENV(qshl_u16, neon_u16, 2)
NEON_VOP_ENV(qshl_u32, neon_u32, 1)
#undef NEON_FN

uint64_t HELPER(neon_qshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            val = ~(uint64_t)0;
            SET_QC();
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = src1; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> 31; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qshl_s8, neon_s8, 4)
NEON_VOP_ENV(qshl_s16, neon_s16, 2)
NEON_VOP_ENV(qshl_s32, neon_s32, 1)
#undef NEON_FN

uint64_t HELPER(neon_qshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}
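/*
 * Worked example (illustrative, not part of the original file): the
 * lost-bits check used by the saturating shifts above and below.  After
 * dest = src1 << n, shifting back must reproduce src1; if it does not,
 * significant bits were shifted out, so the result saturates and QC is
 * set.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t src1 = 0x40000000u;
    int n = 2;
    uint32_t dest = src1 << n;      /* wraps to 0: the set bit is lost */
    assert((dest >> n) != src1);    /* so VQSHL would saturate to ~0   */
    return 0;
}
#endif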
#define NEON_FN(dest, src1, src2) do { \
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        int8_t tmp; \
        tmp = (int8_t)src2; \
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
            if (src1) { \
                SET_QC(); \
                dest = ~0; \
            } else { \
                dest = 0; \
            } \
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
            dest = 0; \
        } else if (tmp < 0) { \
            dest = src1 >> -tmp; \
        } else { \
            dest = src1 << tmp; \
            if ((dest >> tmp) != src1) { \
                SET_QC(); \
                dest = ~0; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qshlu_s8, neon_u8, 4)
NEON_VOP_ENV(qshlu_s16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u32(env, valop, shiftop);
}

uint64_t HELPER(neon_qshlu_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u64(env, valop, shiftop);
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_u8, neon_u8, 4)
NEON_VOP_ENV(qrshl_u16, neon_u16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator. */
uint32_t HELPER(neon_qrshl_u32)(CPUARMState *env, uint32_t val, uint32_t shiftop)
{
    uint32_t dest;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32) {
        if (val) {
            SET_QC();
            dest = ~0;
        } else {
            dest = 0;
        }
    } else if (shift < -32) {
        dest = 0;
    } else if (shift == -32) {
        dest = val >> 31;
    } else if (shift < 0) {
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
        if ((dest >> shift) != val) {
            SET_QC();
            dest = ~0;
        }
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values. */
uint64_t HELPER(neon_qrshl_u64)(CPUARMState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = ~0;
        }
    } else if (shift < -64) {
        val = 0;
    } else if (shift == -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly. */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~0;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP_ENV(qrshl_s8, neon_s8, 4)
NEON_VOP_ENV(qrshl_s16, neon_s16, 2)
#undef NEON_FN
/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64 bit accumulator. */
uint32_t HELPER(neon_qrshl_s32)(CPUARMState *env, uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32) {
        if (val) {
            SET_QC();
            dest = (val >> 31) ^ ~SIGNBIT;
        } else {
            dest = 0;
        }
    } else if (shift <= -32) {
        dest = 0;
    } else if (shift < 0) {
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
        if ((dest >> shift) != val) {
            SET_QC();
            dest = (val >> 31) ^ ~SIGNBIT;
        }
    }
    return dest;
}

/* Handling addition overflow with 64 bit input values is more
 * tricky than with 32 bit values. */
uint64_t HELPER(neon_qrshl_s64)(CPUARMState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;

    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case, it means that the rounding constant is 1,
             * and the addition would overflow. Return the actual
             * result directly. */
            val = 0x4000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}

uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN

/* Polynomial multiplication is like integer multiplication except the
   partial products are XORed, not added. */
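/*
 * Worked example (illustrative, not part of the original file) for the
 * helpers below: in carryless (GF(2)) arithmetic 0x03 * 0x03 = 0x05,
 * since (x + 1)(x + 1) = x^2 + 2x + 1 and the middle term cancels under
 * XOR.  The sketch computes the low byte of the product; it is guarded
 * out of the build.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint8_t demo_pmul8(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    while (a) {
        if (a & 1) {
            r ^= b;     /* XOR the partial product, no carries */
        }
        a >>= 1;
        b <<= 1;
    }
    return r;
}

int main(void)
{
    assert(demo_pmul8(0x03, 0x03) == 0x05);
    return 0;
}
#endif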
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
{
    uint32_t mask;
    uint32_t result;
    result = 0;
    while (op1) {
        mask = 0;
        if (op1 & 1)
            mask |= 0xff;
        if (op1 & (1 << 8))
            mask |= (0xff << 8);
        if (op1 & (1 << 16))
            mask |= (0xff << 16);
        if (op1 & (1 << 24))
            mask |= (0xff << 24);
        result ^= op2 & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2 = (op2 << 1) & 0xfefefefe;
    }
    return result;
}

uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
{
    uint64_t result = 0;
    uint64_t mask;
    uint64_t op2ex = op2;
    op2ex = (op2ex & 0xff) |
        ((op2ex & 0xff00) << 8) |
        ((op2ex & 0xff0000) << 16) |
        ((op2ex & 0xff000000) << 24);
    while (op1) {
        mask = 0;
        if (op1 & 1) {
            mask |= 0xffff;
        }
        if (op1 & (1 << 8)) {
            mask |= (0xffffU << 16);
        }
        if (op1 & (1 << 16)) {
            mask |= (0xffffULL << 32);
        }
        if (op1 & (1 << 24)) {
            mask |= (0xffffULL << 48);
        }
        result ^= op2ex & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2ex <<= 1;
    }
    return result;
}

#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN

/* Count Leading Sign/Zero Bits. */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}
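/*
 * Worked example (illustrative, not part of the original file): the loop
 * above counts leading zeros, and CLS is derived from it as
 * clz(x < 0 ? ~x : x) - 1, i.e. the number of bits below the sign bit
 * that equal it.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static int demo_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--) {
        x >>= 1;
    }
    return n;
}

int main(void)
{
    assert(demo_clz8(0x0f) == 4);
    int8_t s = (int8_t)0xf0;                     /* 1111 0000 */
    assert(demo_clz8(s < 0 ? ~s : s) - 1 == 3);  /* three copies of the sign */
    return 0;
}
#endif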
/* Bit count. */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}

#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP_ENV(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
           | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}
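/*
 * Worked example (illustrative, not part of the original file): "narrow
 * high" keeps the top half of each wide element, and the rounding form
 * adds half an output LSB first, so a 16-bit element 0x1280 narrows to
 * 0x12 but round-narrows to 0x13.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint16_t e = 0x1280;
    assert((uint8_t)(e >> 8) == 0x12);           /* VSHRN-style  */
    assert((uint8_t)((e + 0x80) >> 8) == 0x13);  /* VRSHRN-style */
    return 0;
}
#endif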
uint32_t HELPER(neon_unarrow_sat8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(CPUARMState *env, uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(CPUARMState *env, uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(CPUARMState *env, uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(CPUARMState *env, uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(CPUARMState *env, uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(CPUARMState *env, uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(CPUARMState *env, uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}

uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}
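/*
 * Worked example (illustrative, not part of the original file): widening
 * moves each narrow lane into a lane of twice the width, with zero- or
 * sign-extension.  For the byte 0x80, unsigned widening yields 0x0080,
 * signed widening 0xff80.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t x = 0x00000080u;                /* one interesting byte lane */
    assert((uint16_t)(uint8_t)x == 0x0080);  /* zero-extend */
    assert((uint16_t)(int8_t)x == 0xff80);   /* sign-extend */
    return 0;
}
#endif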
uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return (tmp & 0xffff)
           | ((tmp >> 16) & 0xffff0000ull)
           | ((tmp2 << 16) & 0xffff00000000ull)
           | (tmp2 & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_addl_saturate_s32)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(CPUARMState *env, uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}
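/*
 * Worked example (illustrative, not part of the original file): the
 * top-bit masking trick used by neon_addl_u16/u32 above.  Clearing each
 * lane's top bit before the add means no carry can cross a lane boundary;
 * XORing the top bits back afterwards restores the correct per-lane sum.
 * Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

static uint64_t demo_addl_u16(uint64_t a, uint64_t b)
{
    uint64_t mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

int main(void)
{
    /* lane 0 wraps (0xffff + 1 -> 0); lane 1 must stay untouched */
    assert(demo_addl_u16(0x0000ffffull, 0x00000001ull) == 0);
    return 0;
}
#endif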
/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
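/*
 * Worked example (illustrative, not part of the original file): why
 * DO_ABD widens before subtracting.  With int8_t inputs -128 and 127 the
 * absolute difference is 255, which does not fit in the input type;
 * computed in int32_t it is exact.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    int32_t x = (int8_t)0x80;   /* -128 */
    int32_t y = (int8_t)0x7f;   /*  127 */
    int32_t d = x > y ? x - y : y - x;
    assert(d == 255);
    return 0;
}
#endif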
/* Widening multiply. Named type is the source type. */
#define DO_MULL(dest, x, y, type1, type2) do { \
    type1 tmp_x = x; \
    type1 tmp_y = y; \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* FIXME: There should be a native op for this. */
uint64_t HELPER(neon_negl_u64)(uint64_t x)
{
    return -x;
}
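/*
 * Worked example (illustrative, not part of the original file): the full
 * product of two 8-bit lanes needs 16 bits (0xff * 0xff = 0xfe01), which
 * is why DO_MULL above casts to the source type first and multiplies in
 * the doubled type.  Standalone sketch, guarded out.
 */
#if 0
#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint32_t a = 0xff, b = 0xff;
    uint16_t p = (uint16_t)((uint8_t)a * (uint8_t)b);
    assert(p == 0xfe01);
    return 0;
}
#endif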
/* Saturating sign manipulation. */
/* ??? Make these use NEON_VOP1 */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(CPUARMState *env, uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(CPUARMState *env, uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(CPUARMState *env, uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}

/* NEON Float helpers. */
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return float32_val(float32_min(make_float32(a), make_float32(b), fpst));
}

uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return float32_val(float32_max(make_float32(a), make_float32(b), fpst));
}

uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = make_float32(a);
    float32 f1 = make_float32(b);
    return float32_val(float32_abs(float32_sub(f0, f1, fpst)));
}

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_eq_quiet(make_float32(a), make_float32(b), fpst);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_le(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    return -float32_lt(make_float32(b), make_float32(a), fpst);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, fpst);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b, void *fpstp)
{
    float_status *fpst = fpstp;
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, fpst);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

void HELPER(neon_qunzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qunzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
void HELPER(neon_qunzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_unzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_unzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_qzip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
void HELPER(neon_qzip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qzip32)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_zip8)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_zip16)(CPUARMState *env, uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}
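/*
 * Worked example (illustrative, not part of the original file): the
 * ELEM() extraction used by the zip/unzip helpers.  ELEM(v, n, size)
 * isolates element n of width `size` bits, and VZIP interleaves the
 * elements of Dd and Dm.  Standalone sketch, guarded out; DEMO_ELEM
 * mirrors the ELEM macro above.
 */
#if 0
#include <assert.h>
#include <stdint.h>

#define DEMO_ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

int main(void)
{
    uint64_t zd = 0x0706050403020100ull;    /* bytes 00..07 */
    uint64_t zm = 0x0f0e0d0c0b0a0908ull;    /* bytes 08..0f */
    /* first output of an 8-bit zip: bytes 00 08 01 09 02 0a 03 0b */
    uint64_t d0 = DEMO_ELEM(zd, 0, 8) | (DEMO_ELEM(zm, 0, 8) << 8)
        | (DEMO_ELEM(zd, 1, 8) << 16) | (DEMO_ELEM(zm, 1, 8) << 24)
        | (DEMO_ELEM(zd, 2, 8) << 32) | (DEMO_ELEM(zm, 2, 8) << 40)
        | (DEMO_ELEM(zd, 3, 8) << 48) | (DEMO_ELEM(zm, 3, 8) << 56);
    assert(d0 == 0x0b030a0209010800ull);
    return 0;
}
#endif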