/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licensed under the GNU GPL v2.
 */
#include <stdlib.h>
#include <stdio.h>

#include "cpu.h"
#include "exec.h"
#include "helper.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q

#define NFS (&env->vfp.standard_fp_status)

#define NEON_TYPE1(name, type) \
typedef struct \
{ \
    type v1; \
} neon_##name;
#ifdef HOST_WORDS_BIGENDIAN
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v2; \
    type v1; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v4; \
    type v3; \
    type v2; \
    type v1; \
} neon_##name;
#else
#define NEON_TYPE2(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
} neon_##name;
#define NEON_TYPE4(name, type) \
typedef struct \
{ \
    type v1; \
    type v2; \
    type v3; \
    type v4; \
} neon_##name;
#endif

NEON_TYPE4(s8, int8_t)
NEON_TYPE4(u8, uint8_t)
NEON_TYPE2(s16, int16_t)
NEON_TYPE2(u16, uint16_t)
NEON_TYPE1(s32, int32_t)
NEON_TYPE1(u32, uint32_t)
#undef NEON_TYPE4
#undef NEON_TYPE2
#undef NEON_TYPE1

/* Copy from a uint32_t to a vector structure type.  */
#define NEON_UNPACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.i = (val); \
    dest = conv_u.v; \
    } while(0)

/* Copy from a vector structure type to a uint32_t.  */
#define NEON_PACK(vtype, dest, val) do { \
    union { \
        vtype v; \
        uint32_t i; \
    } conv_u; \
    conv_u.v = (val); \
    dest = conv_u.i; \
    } while(0)

#define NEON_DO1 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1);
#define NEON_DO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2);
#define NEON_DO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \
    NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \
    NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \
    NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4);

#define NEON_VOP_BODY(vtype, n) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_DO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}

#define NEON_VOP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
NEON_VOP_BODY(vtype, n)

/* Pairwise operations.  */
/* For 32-bit elements each segment only contains a single element, so
   the elementwise and pairwise operations are the same.  */
#define NEON_PDO2 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2);
#define NEON_PDO4 \
    NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \
    NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \
    NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \
    NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4);

#define NEON_POP(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \
{ \
    uint32_t res; \
    vtype vsrc1; \
    vtype vsrc2; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg1); \
    NEON_UNPACK(vtype, vsrc2, arg2); \
    NEON_PDO##n; \
    NEON_PACK(vtype, res, vdest); \
    return res; \
}
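/* A minimal illustration of the machinery above: NEON_UNPACK and NEON_PACK
 * reinterpret the same 32 bits through a union, so an unpack/repack pair is
 * a round trip, and v1 always maps to the least significant lane on either
 * endianness.  The function below is a hypothetical sketch, not used by any
 * helper. */
static inline uint32_t neon_example_u8_roundtrip(uint32_t val)
{
    neon_u8 vec;
    NEON_UNPACK(neon_u8, vec, val); /* vec.v1 holds the low byte */
    NEON_PACK(neon_u8, val, vec);   /* repacking restores the original word */
    return val;
}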
/* Unary operators.  */
#define NEON_VOP1(name, vtype, n) \
uint32_t HELPER(glue(neon_,name))(uint32_t arg) \
{ \
    vtype vsrc1; \
    vtype vdest; \
    NEON_UNPACK(vtype, vsrc1, arg); \
    NEON_DO##n; \
    NEON_PACK(vtype, arg, vdest); \
    return arg; \
}


#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = ~0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP(qadd_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP(qadd_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qadd_u32)(uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (res < a) {
        SET_QC();
        res = ~0;
    }
    return res;
}

uint64_t HELPER(neon_qadd_u64)(uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (res < src1) {
        SET_QC();
        res = ~(uint64_t)0;
    }
    return res;
}

#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 > 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP(qadd_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP(qadd_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qadd_s32)(uint32_t a, uint32_t b)
{
    uint32_t res = a + b;
    if (((res ^ a) & SIGNBIT) && !((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qadd_s64)(uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 + src2;
    if (((res ^ src1) & SIGNBIT64) && !((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_USAT(dest, src1, src2, type) do { \
    uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        dest = tmp; \
    }} while(0)
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t)
NEON_VOP(qsub_u8, neon_u8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t)
NEON_VOP(qsub_u16, neon_u16, 2)
#undef NEON_FN
#undef NEON_USAT

uint32_t HELPER(neon_qsub_u32)(uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (res > a) {
        SET_QC();
        res = 0;
    }
    return res;
}

uint64_t HELPER(neon_qsub_u64)(uint64_t src1, uint64_t src2)
{
    uint64_t res;

    if (src1 < src2) {
        SET_QC();
        res = 0;
    } else {
        res = src1 - src2;
    }
    return res;
}
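/* Worked example of the saturating patterns above, assuming little-endian
 * lane order: qadd_u8(0x01ff00fe, 0x010101ff) processes the byte lanes
 * fe+ff, 00+01, ff+01 and 01+01; the first and third overflow 8 bits, so
 * those lanes saturate to 0xff and QC is set, giving 0x02ff01ff.  The
 * signed helpers instead use the sign-flip test: overflow occurred iff the
 * operands agree in sign and the result does not, e.g.
 * qadd_s32(0x40000000, 0x40000000) saturates to 0x7fffffff. */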
#define NEON_SSAT(dest, src1, src2, type) do { \
    int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \
    if (tmp != (type)tmp) { \
        SET_QC(); \
        if (src2 < 0) { \
            tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \
        } else { \
            tmp = 1 << (sizeof(type) * 8 - 1); \
        } \
    } \
    dest = tmp; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t)
NEON_VOP(qsub_s8, neon_s8, 4)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t)
NEON_VOP(qsub_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_SSAT

uint32_t HELPER(neon_qsub_s32)(uint32_t a, uint32_t b)
{
    uint32_t res = a - b;
    if (((res ^ a) & SIGNBIT) && ((a ^ b) & SIGNBIT)) {
        SET_QC();
        res = ~(((int32_t)a >> 31) ^ SIGNBIT);
    }
    return res;
}

uint64_t HELPER(neon_qsub_s64)(uint64_t src1, uint64_t src2)
{
    uint64_t res;

    res = src1 - src2;
    if (((res ^ src1) & SIGNBIT64) && ((src1 ^ src2) & SIGNBIT64)) {
        SET_QC();
        res = ((int64_t)src1 >> 63) ^ ~SIGNBIT64;
    }
    return res;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
NEON_VOP(hadd_s8, neon_s8, 4)
NEON_VOP(hadd_u8, neon_u8, 4)
NEON_VOP(hadd_s16, neon_s16, 2)
NEON_VOP(hadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if (src1 & src2 & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1
NEON_VOP(rhadd_s8, neon_s8, 4)
NEON_VOP(rhadd_u8, neon_u8, 4)
NEON_VOP(rhadd_s16, neon_s16, 2)
NEON_VOP(rhadd_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) + (src2 >> 1);
    if ((src1 | src2) & 1)
        dest++;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1
NEON_VOP(hsub_s8, neon_s8, 4)
NEON_VOP(hsub_u8, neon_u8, 4)
NEON_VOP(hsub_s16, neon_s16, 2)
NEON_VOP(hsub_u16, neon_u16, 2)
#undef NEON_FN

int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2)
{
    int32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2)
{
    uint32_t dest;

    dest = (src1 >> 1) - (src2 >> 1);
    if ((~src1) & src2 & 1)
        dest--;
    return dest;
}

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0
NEON_VOP(cgt_s8, neon_s8, 4)
NEON_VOP(cgt_u8, neon_u8, 4)
NEON_VOP(cgt_s16, neon_s16, 2)
NEON_VOP(cgt_u16, neon_u16, 2)
NEON_VOP(cgt_s32, neon_s32, 1)
NEON_VOP(cgt_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0
NEON_VOP(cge_s8, neon_s8, 4)
NEON_VOP(cge_u8, neon_u8, 4)
NEON_VOP(cge_s16, neon_s16, 2)
NEON_VOP(cge_u16, neon_u16, 2)
NEON_VOP(cge_s32, neon_s32, 1)
NEON_VOP(cge_u32, neon_u32, 1)
#undef NEON_FN
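/* The 32-bit halving helpers above avoid computing src1 + src2 in full
 * width: they add the top 31 bits and then re-apply the carry the two low
 * bits would have produced.  A minimal 64-bit reference sketch for
 * comparison (the name is hypothetical, for illustration only):
 */
static inline uint32_t neon_example_hadd_u32_ref(uint32_t a, uint32_t b)
{
    /* e.g. a = b = 0xffffffff gives 0xffffffff, where a 32-bit a + b
     * would have wrapped before the shift. */
    return (uint64_t)((uint64_t)a + b) >> 1;
}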
#define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2
NEON_VOP(min_s8, neon_s8, 4)
NEON_VOP(min_u8, neon_u8, 4)
NEON_VOP(min_s16, neon_s16, 2)
NEON_VOP(min_u16, neon_u16, 2)
NEON_VOP(min_s32, neon_s32, 1)
NEON_VOP(min_u32, neon_u32, 1)
NEON_POP(pmin_s8, neon_s8, 4)
NEON_POP(pmin_u8, neon_u8, 4)
NEON_POP(pmin_s16, neon_s16, 2)
NEON_POP(pmin_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2
NEON_VOP(max_s8, neon_s8, 4)
NEON_VOP(max_u8, neon_u8, 4)
NEON_VOP(max_s16, neon_s16, 2)
NEON_VOP(max_u16, neon_u16, 2)
NEON_VOP(max_s32, neon_s32, 1)
NEON_VOP(max_u32, neon_u32, 1)
NEON_POP(pmax_s8, neon_s8, 4)
NEON_POP(pmax_u8, neon_u8, 4)
NEON_POP(pmax_s16, neon_s16, 2)
NEON_POP(pmax_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) \
    dest = (src1 > src2) ? (src1 - src2) : (src2 - src1)
NEON_VOP(abd_s8, neon_s8, 4)
NEON_VOP(abd_u8, neon_u8, 4)
NEON_VOP(abd_s16, neon_s16, 2)
NEON_VOP(abd_u16, neon_u16, 2)
NEON_VOP(abd_s32, neon_s32, 1)
NEON_VOP(abd_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_u8, neon_u8, 4)
NEON_VOP(shl_u16, neon_u16, 2)
NEON_VOP(shl_u32, neon_u32, 1)
#undef NEON_FN

uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64 || shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(shl_s8, neon_s8, 4)
NEON_VOP(shl_s16, neon_s16, 2)
NEON_VOP(shl_s32, neon_s32, 1)
#undef NEON_FN

uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        val = 0;
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if ((tmp >= (ssize_t)sizeof(src1) * 8) \
        || (tmp <= -(ssize_t)sizeof(src1) * 8)) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_s8, neon_s8, 4)
NEON_VOP(rshl_s16, neon_s16, 2)
#undef NEON_FN
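/* In all of these shift helpers the shift count is the signed bottom byte
 * of the second operand: positive counts shift left, negative counts shift
 * right.  The rounding variants add half of the final ulp before the right
 * shift, e.g. rshl_s8(5, -1) computes (5 + 1) >> 1 = 3 rather than
 * 5 >> 1 = 2. */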
/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64-bit accumulator.  */
uint32_t HELPER(neon_rshl_s32)(uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if ((shift >= 32) || (shift <= -32)) {
        dest = 0;
    } else if (shift < 0) {
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}

/* Handling addition overflow with 64-bit input values is trickier
 * than with 32-bit values.  */
uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    int64_t val = valop;
    if ((shift >= 64) || (shift <= -64)) {
        val = 0;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case the rounding constant is 1, and the addition
             * would overflow.  Return the actual result directly.  */
            val = 0x4000000000000000LL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8 || \
        tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (-tmp - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
    }} while (0)
NEON_VOP(rshl_u8, neon_u8, 4)
NEON_VOP(rshl_u16, neon_u16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64-bit accumulator.  */
uint32_t HELPER(neon_rshl_u32)(uint32_t val, uint32_t shiftop)
{
    uint32_t dest;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32 || shift < -32) {
        dest = 0;
    } else if (shift == -32) {
        dest = val >> 31;
    } else if (shift < 0) {
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
    }
    return dest;
}
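/* The 64-bit rounding shifts cannot widen any further, so they rely on the
 * identity (val + (1 << (n - 1))) >> n == ((val >> (n - 1)) + 1) >> 1:
 * shift by n - 1 first, then add the rounding bit and shift once more.
 * The INT64_MAX check above and the UINT64_MAX check below catch the single
 * input where even that final increment would overflow. */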
/* Handling addition overflow with 64-bit input values is trickier
 * than with 32-bit values.  */
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    if (shift >= 64 || shift < -64) {
        val = 0;
    } else if (shift == -64) {
        /* Rounding a 1-bit result just preserves that bit.  */
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case the rounding constant is 1, and the addition
             * would overflow.  Return the actual result directly.  */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        val <<= shift;
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP(qshl_u8, neon_u8, 4)
NEON_VOP(qshl_u16, neon_u16, 2)
NEON_VOP(qshl_u32, neon_u32, 1)
#undef NEON_FN

uint64_t HELPER(neon_qshl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            val = ~(uint64_t)0;
            SET_QC();
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~(uint64_t)0;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = src1; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> 31; \
    } else if (tmp < 0) { \
        dest = src1 >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP(qshl_s8, neon_s8, 4)
NEON_VOP(qshl_s16, neon_s16, 2)
NEON_VOP(qshl_s32, neon_s32, 1)
#undef NEON_FN

uint64_t HELPER(neon_qshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    if (src1 & (1 << (sizeof(src1) * 8 - 1))) { \
        SET_QC(); \
        dest = 0; \
    } else { \
        int8_t tmp; \
        tmp = (int8_t)src2; \
        if (tmp >= (ssize_t)sizeof(src1) * 8) { \
            if (src1) { \
                SET_QC(); \
                dest = ~0; \
            } else { \
                dest = 0; \
            } \
        } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
            dest = 0; \
        } else if (tmp < 0) { \
            dest = src1 >> -tmp; \
        } else { \
            dest = src1 << tmp; \
            if ((dest >> tmp) != src1) { \
                SET_QC(); \
                dest = ~0; \
            } \
        } \
    }} while (0)
NEON_VOP(qshlu_s8, neon_u8, 4)
NEON_VOP(qshlu_s16, neon_u16, 2)
#undef NEON_FN

uint32_t HELPER(neon_qshlu_s32)(uint32_t valop, uint32_t shiftop)
{
    if ((int32_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u32(valop, shiftop);
}

uint64_t HELPER(neon_qshlu_s64)(uint64_t valop, uint64_t shiftop)
{
    if ((int64_t)valop < 0) {
        SET_QC();
        return 0;
    }
    return helper_neon_qshl_u64(valop, shiftop);
}
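/* VQSHLU takes a signed input but produces an unsigned result: negative
 * inputs saturate to zero, everything else behaves like the unsigned
 * saturating shift.  For example, qshlu_s32(-1, 0) sets QC and returns 0,
 * while a byte lane of 0x40 shifted left by 2 no longer fits in 8 bits,
 * so qshlu_s8 saturates that lane to 0xff and sets QC. */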
/* FIXME: This is wrong.  */
#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = ~0; \
        } else { \
            dest = 0; \
        } \
    } else if (tmp < -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp == -(ssize_t)sizeof(src1) * 8) { \
        dest = src1 >> (sizeof(src1) * 8 - 1); \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = ~0; \
        } \
    }} while (0)
NEON_VOP(qrshl_u8, neon_u8, 4)
NEON_VOP(qrshl_u16, neon_u16, 2)
#undef NEON_FN

/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64-bit accumulator.  */
uint32_t HELPER(neon_qrshl_u32)(uint32_t val, uint32_t shiftop)
{
    uint32_t dest;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32) {
        if (val) {
            SET_QC();
            dest = ~0;
        } else {
            dest = 0;
        }
    } else if (shift < -32) {
        dest = 0;
    } else if (shift == -32) {
        dest = val >> 31;
    } else if (shift < 0) {
        uint64_t big_dest = ((uint64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
        if ((dest >> shift) != val) {
            SET_QC();
            dest = ~0;
        }
    }
    return dest;
}

/* Handling addition overflow with 64-bit input values is trickier
 * than with 32-bit values.  */
uint64_t HELPER(neon_qrshl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = ~0;
        }
    } else if (shift < -64) {
        val = 0;
    } else if (shift == -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == UINT64_MAX) {
            /* In this case the rounding constant is 1, and the addition
             * would overflow.  Return the actual result directly.  */
            val = 0x8000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~0;
        }
    }
    return val;
}

#define NEON_FN(dest, src1, src2) do { \
    int8_t tmp; \
    tmp = (int8_t)src2; \
    if (tmp >= (ssize_t)sizeof(src1) * 8) { \
        if (src1) { \
            SET_QC(); \
            dest = (1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } else { \
            dest = 0; \
        } \
    } else if (tmp <= -(ssize_t)sizeof(src1) * 8) { \
        dest = 0; \
    } else if (tmp < 0) { \
        dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \
    } else { \
        dest = src1 << tmp; \
        if ((dest >> tmp) != src1) { \
            SET_QC(); \
            dest = (uint32_t)(1 << (sizeof(src1) * 8 - 1)); \
            if (src1 > 0) { \
                dest--; \
            } \
        } \
    }} while (0)
NEON_VOP(qrshl_s8, neon_s8, 4)
NEON_VOP(qrshl_s16, neon_s16, 2)
#undef NEON_FN
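/* The saturation test used throughout these helpers is the shift-back
 * check: after dest = src1 << tmp, (dest >> tmp) != src1 iff significant
 * bits were shifted out.  E.g. qrshl_u32(0xffffffff, 1) computes
 * 0xfffffffe, shifts it back to 0x7fffffff != 0xffffffff, and therefore
 * sets QC and saturates to 0xffffffff. */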
/* The addition of the rounding constant may overflow, so we use an
 * intermediate 64-bit accumulator.  */
uint32_t HELPER(neon_qrshl_s32)(uint32_t valop, uint32_t shiftop)
{
    int32_t dest;
    int32_t val = (int32_t)valop;
    int8_t shift = (int8_t)shiftop;
    if (shift >= 32) {
        if (val) {
            SET_QC();
            dest = (val >> 31) ^ ~SIGNBIT;
        } else {
            dest = 0;
        }
    } else if (shift <= -32) {
        dest = 0;
    } else if (shift < 0) {
        int64_t big_dest = ((int64_t)val + (1 << (-1 - shift)));
        dest = big_dest >> -shift;
    } else {
        dest = val << shift;
        if ((dest >> shift) != val) {
            SET_QC();
            dest = (val >> 31) ^ ~SIGNBIT;
        }
    }
    return dest;
}

/* Handling addition overflow with 64-bit input values is trickier
 * than with 32-bit values.  */
uint64_t HELPER(neon_qrshl_s64)(uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;

    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val = 0;
    } else if (shift < 0) {
        val >>= (-shift - 1);
        if (val == INT64_MAX) {
            /* In this case the rounding constant is 1, and the addition
             * would overflow.  Return the actual result directly.  */
            val = 0x4000000000000000ULL;
        } else {
            val++;
            val >>= 1;
        }
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}

uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80808080u;
    a &= ~0x80808080u;
    b &= ~0x80808080u;
    return (a + b) ^ mask;
}

uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b)
{
    uint32_t mask;
    mask = (a ^ b) & 0x80008000u;
    a &= ~0x80008000u;
    b &= ~0x80008000u;
    return (a + b) ^ mask;
}

#define NEON_FN(dest, src1, src2) dest = src1 + src2
NEON_POP(padd_u8, neon_u8, 4)
NEON_POP(padd_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 - src2
NEON_VOP(sub_u8, neon_u8, 4)
NEON_VOP(sub_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = src1 * src2
NEON_VOP(mul_u8, neon_u8, 4)
NEON_VOP(mul_u16, neon_u16, 2)
#undef NEON_FN
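/* HELPER(neon_add_u8) above adds four byte lanes at once by clearing each
 * lane's top bit, adding (so carries cannot cross lane boundaries), and
 * XORing the top bits back in.  A lane-by-lane reference sketch for
 * comparison (the function name is hypothetical, not part of the helper
 * API):
 */
static inline uint32_t neon_example_add_u8_ref(uint32_t a, uint32_t b)
{
    uint32_t res = 0;
    int i;

    for (i = 0; i < 32; i += 8) {
        uint32_t lane = ((a >> i) & 0xff) + ((b >> i) & 0xff);
        res |= (lane & 0xff) << i;
    }
    return res;
}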
/* Polynomial multiplication is like integer multiplication except the
   partial products are XORed, not added.  */
uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2)
{
    uint32_t mask;
    uint32_t result;
    result = 0;
    while (op1) {
        mask = 0;
        if (op1 & 1)
            mask |= 0xff;
        if (op1 & (1 << 8))
            mask |= (0xff << 8);
        if (op1 & (1 << 16))
            mask |= (0xff << 16);
        if (op1 & (1 << 24))
            mask |= (0xff << 24);
        result ^= op2 & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2 = (op2 << 1) & 0xfefefefe;
    }
    return result;
}

uint64_t HELPER(neon_mull_p8)(uint32_t op1, uint32_t op2)
{
    uint64_t result = 0;
    uint64_t mask;
    uint64_t op2ex = op2;
    op2ex = (op2ex & 0xff) |
        ((op2ex & 0xff00) << 8) |
        ((op2ex & 0xff0000) << 16) |
        ((op2ex & 0xff000000) << 24);
    while (op1) {
        mask = 0;
        if (op1 & 1) {
            mask |= 0xffff;
        }
        if (op1 & (1 << 8)) {
            mask |= (0xffffU << 16);
        }
        if (op1 & (1 << 16)) {
            mask |= (0xffffULL << 32);
        }
        if (op1 & (1 << 24)) {
            mask |= (0xffffULL << 48);
        }
        result ^= op2ex & mask;
        op1 = (op1 >> 1) & 0x7f7f7f7f;
        op2ex <<= 1;
    }
    return result;
}
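/* Worked example of the carry-less multiply: in one byte lane,
 * 0x03 * 0x03 accumulates the partial products 0x03 and 0x03 << 1 = 0x06
 * with XOR, giving 0x05 rather than the integer product 0x09.  Over GF(2)
 * this is (x + 1) * (x + 1) = x^2 + 1. */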
#define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0
NEON_VOP(tst_u8, neon_u8, 4)
NEON_VOP(tst_u16, neon_u16, 2)
NEON_VOP(tst_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0
NEON_VOP(ceq_u8, neon_u8, 4)
NEON_VOP(ceq_u16, neon_u16, 2)
NEON_VOP(ceq_u32, neon_u32, 1)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src
NEON_VOP1(abs_s8, neon_s8, 4)
NEON_VOP1(abs_s16, neon_s16, 2)
#undef NEON_FN

/* Count Leading Sign/Zero Bits.  */
static inline int do_clz8(uint8_t x)
{
    int n;
    for (n = 8; x; n--)
        x >>= 1;
    return n;
}

static inline int do_clz16(uint16_t x)
{
    int n;
    for (n = 16; x; n--)
        x >>= 1;
    return n;
}

#define NEON_FN(dest, src, dummy) dest = do_clz8(src)
NEON_VOP1(clz_u8, neon_u8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16(src)
NEON_VOP1(clz_u16, neon_u16, 2)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s8, neon_s8, 4)
#undef NEON_FN

#define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1
NEON_VOP1(cls_s16, neon_s16, 2)
#undef NEON_FN

uint32_t HELPER(neon_cls_s32)(uint32_t x)
{
    int count;
    if ((int32_t)x < 0)
        x = ~x;
    for (count = 32; x; count--)
        x = x >> 1;
    return count - 1;
}

/* Bit count.  */
uint32_t HELPER(neon_cnt_u8)(uint32_t x)
{
    x = (x & 0x55555555) + ((x >> 1) & 0x55555555);
    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
    x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f);
    return x;
}
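/* Worked example of the parallel bit count for one byte, x = 0xb7
 * (0b10110111): the first step leaves the 2-bit pair counts 01 10 01 10
 * (0x66), the second the nibble counts 0011 0011 (0x33), and the third
 * the byte count 0x06, i.e. popcount(0xb7) = 6.  All four byte lanes are
 * counted at once. */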
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0)
NEON_VOP(qdmulh_s16, neon_s16, 2)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s16, neon_s16, 2)
#undef NEON_FN
#undef NEON_QDMULH16

#define NEON_QDMULH32(dest, src1, src2, round) do { \
    uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \
        SET_QC(); \
        tmp = (tmp >> 63) ^ ~SIGNBIT64; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int64_t old = tmp; \
        tmp += (int64_t)1 << 31; \
        if ((int64_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT64 - 1; \
        } \
    } \
    dest = tmp >> 32; \
    } while(0)
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0)
NEON_VOP(qdmulh_s32, neon_s32, 1)
#undef NEON_FN
#define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1)
NEON_VOP(qrdmulh_s32, neon_s32, 1)
#undef NEON_FN
#undef NEON_QDMULH32

uint32_t HELPER(neon_narrow_u8)(uint64_t x)
{
    return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u)
           | ((x >> 24) & 0xff000000u);
}

uint32_t HELPER(neon_narrow_u16)(uint64_t x)
{
    return (x & 0xffffu) | ((x >> 16) & 0xffff0000u);
}

uint32_t HELPER(neon_narrow_high_u8)(uint64_t x)
{
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_high_u16)(uint64_t x)
{
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x)
{
    x &= 0xff80ff80ff80ff80ull;
    x += 0x0080008000800080ull;
    return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00)
            | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000);
}

uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x)
{
    x &= 0xffff8000ffff8000ull;
    x += 0x0000800000008000ull;
    return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000);
}

uint32_t HELPER(neon_unarrow_sat8)(uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s & 0x8000) { \
        SET_QC(); \
    } else { \
        if (s > 0xff) { \
            d = 0xff; \
            SET_QC(); \
        } else { \
            d = s; \
        } \
        res |= (uint32_t)d << (n / 2); \
    }

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_u8)(uint64_t x)
{
    uint16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s > 0xff) { \
        d = 0xff; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_narrow_sat_s8)(uint64_t x)
{
    int16_t s;
    uint8_t d;
    uint32_t res = 0;
#define SAT8(n) \
    s = x >> n; \
    if (s != (int8_t)s) { \
        d = (s >> 15) ^ 0x7f; \
        SET_QC(); \
    } else { \
        d = s; \
    } \
    res |= (uint32_t)d << (n / 2);

    SAT8(0);
    SAT8(16);
    SAT8(32);
    SAT8(48);
#undef SAT8
    return res;
}

uint32_t HELPER(neon_unarrow_sat16)(uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low & 0x80000000) {
        low = 0;
        SET_QC();
    } else if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high & 0x80000000) {
        high = 0;
        SET_QC();
    } else if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_u16)(uint64_t x)
{
    uint32_t high;
    uint32_t low;
    low = x;
    if (low > 0xffff) {
        low = 0xffff;
        SET_QC();
    }
    high = x >> 32;
    if (high > 0xffff) {
        high = 0xffff;
        SET_QC();
    }
    return low | (high << 16);
}

uint32_t HELPER(neon_narrow_sat_s16)(uint64_t x)
{
    int32_t low;
    int32_t high;
    low = x;
    if (low != (int16_t)low) {
        low = (low >> 31) ^ 0x7fff;
        SET_QC();
    }
    high = x >> 32;
    if (high != (int16_t)high) {
        high = (high >> 31) ^ 0x7fff;
        SET_QC();
    }
    return (uint16_t)low | (high << 16);
}

uint32_t HELPER(neon_unarrow_sat32)(uint64_t x)
{
    if (x & 0x8000000000000000ull) {
        SET_QC();
        return 0;
    }
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_u32)(uint64_t x)
{
    if (x > 0xffffffffu) {
        SET_QC();
        return 0xffffffffu;
    }
    return x;
}

uint32_t HELPER(neon_narrow_sat_s32)(uint64_t x)
{
    if ((int64_t)x != (int32_t)x) {
        SET_QC();
        return ((int64_t)x >> 63) ^ 0x7fffffff;
    }
    return x;
}
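/* Worked example of the saturating narrows: narrow_sat_u16 on the 64-bit
 * input 0x0001234500008000 keeps the low lane 0x8000 but saturates the
 * high lane 0x00012345 to 0xffff and sets QC, producing 0xffff8000.  The
 * "unarrow" variants take signed input to an unsigned result, so negative
 * lanes additionally clamp to zero. */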
uint64_t HELPER(neon_widen_u8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint8_t)x;
    tmp = (uint8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_s8)(uint32_t x)
{
    uint64_t tmp;
    uint64_t ret;
    ret = (uint16_t)(int8_t)x;
    tmp = (uint16_t)(int8_t)(x >> 8);
    ret |= tmp << 16;
    tmp = (uint16_t)(int8_t)(x >> 16);
    ret |= tmp << 32;
    tmp = (uint16_t)(int8_t)(x >> 24);
    ret |= tmp << 48;
    return ret;
}

uint64_t HELPER(neon_widen_u16)(uint32_t x)
{
    uint64_t high = (uint16_t)(x >> 16);
    return ((uint16_t)x) | (high << 32);
}

uint64_t HELPER(neon_widen_s16)(uint32_t x)
{
    uint64_t high = (int16_t)(x >> 16);
    return ((uint32_t)(int16_t)x) | (high << 32);
}

uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000800080008000ull;
    a &= ~0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ b) & 0x8000000080000000ull;
    a &= ~0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a + b) ^ mask;
}

uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b)
{
    uint64_t tmp;
    uint64_t tmp2;

    tmp = a & 0x0000ffff0000ffffull;
    tmp += (a >> 16) & 0x0000ffff0000ffffull;
    tmp2 = b & 0xffff0000ffff0000ull;
    tmp2 += (b << 16) & 0xffff0000ffff0000ull;
    return    ( tmp         & 0xffff)
            | ((tmp  >> 16) & 0xffff0000ull)
            | ((tmp2 << 16) & 0xffff00000000ull)
            | ( tmp2        & 0xffff000000000000ull);
}

uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b)
{
    uint32_t low = a + (a >> 32);
    uint32_t high = b + (b >> 32);
    return low + ((uint64_t)high << 32);
}

uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000800080008000ull;
    a |= 0x8000800080008000ull;
    b &= ~0x8000800080008000ull;
    return (a - b) ^ mask;
}

uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b)
{
    uint64_t mask;
    mask = (a ^ ~b) & 0x8000000080000000ull;
    a |= 0x8000000080000000ull;
    b &= ~0x8000000080000000ull;
    return (a - b) ^ mask;
}
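/* The subl helpers above use the same guard-bit trick as
 * HELPER(neon_add_u8), adapted for subtraction: the top bit of each lane
 * in a is forced to 1 so no borrow can propagate out of the lane, b's top
 * bit is cleared, and the mask (a ^ ~b) records which lanes need their
 * top bit flipped back afterwards. */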
uint64_t HELPER(neon_addl_saturate_s32)(uint64_t a, uint64_t b)
{
    uint32_t x, y;
    uint32_t low, high;

    x = a;
    y = b;
    low = x + y;
    if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        low = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    x = a >> 32;
    y = b >> 32;
    high = x + y;
    if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) {
        SET_QC();
        high = ((int32_t)x >> 31) ^ ~SIGNBIT;
    }
    return low | ((uint64_t)high << 32);
}

uint64_t HELPER(neon_addl_saturate_s64)(uint64_t a, uint64_t b)
{
    uint64_t result;

    result = a + b;
    if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) {
        SET_QC();
        result = ((int64_t)a >> 63) ^ ~SIGNBIT64;
    }
    return result;
}

/* We have to do the arithmetic in a larger type than
 * the input type, because for example with a signed 32 bit
 * op the absolute difference can overflow a signed 32 bit value.
 */
#define DO_ABD(dest, x, y, intype, arithtype) do {            \
    arithtype tmp_x = (intype)(x);                            \
    arithtype tmp_y = (intype)(y);                            \
    dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \
    } while(0)

uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint8_t, uint32_t);
    DO_ABD(tmp, a >> 8, b >> 8, uint8_t, uint32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, uint8_t, uint32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, uint8_t, uint32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int8_t, int32_t);
    DO_ABD(tmp, a >> 8, b >> 8, int8_t, int32_t);
    result |= tmp << 16;
    DO_ABD(tmp, a >> 16, b >> 16, int8_t, int32_t);
    result |= tmp << 32;
    DO_ABD(tmp, a >> 24, b >> 24, int8_t, int32_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, uint16_t, uint32_t);
    DO_ABD(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;
    DO_ABD(result, a, b, int16_t, int32_t);
    DO_ABD(tmp, a >> 16, b >> 16, int16_t, int32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, uint32_t, uint64_t);
    return result;
}

uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b)
{
    uint64_t result;
    DO_ABD(result, a, b, int32_t, int64_t);
    return result;
}
#undef DO_ABD
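/* Concrete case of why DO_ABD widens before subtracting: for
 * abdl_s64(0x7fffffff, 0x80000000) the difference is
 * INT32_MAX - INT32_MIN = 0xffffffff, which only fits in the 64-bit
 * arithmetic type. */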
/* Widening multiply.  Named type is the source type.  */
#define DO_MULL(dest, x, y, type1, type2) do {   \
    type1 tmp_x = x;                             \
    type1 tmp_y = y;                             \
    dest = (type2)((type2)tmp_x * (type2)tmp_y); \
    } while(0)

uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int8_t, uint16_t);
    DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t);
    result |= tmp << 16;
    DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t);
    result |= tmp << 32;
    DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t);
    result |= tmp << 48;
    return result;
}

uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, uint16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t);
    return result | (tmp << 32);
}

uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b)
{
    uint64_t tmp;
    uint64_t result;

    DO_MULL(result, a, b, int16_t, uint32_t);
    DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t);
    return result | (tmp << 32);
}
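/* DO_MULL performs the widened multiply in the unsigned result type;
 * sign-extending into type2 before the multiply keeps the two's-complement
 * result exact.  E.g. a mull_s16 lane of (-2) * 3 is computed as
 * 0xfffffffe * 0x00000003 mod 2^32 = 0xfffffffa, i.e. -6. */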
uint64_t HELPER(neon_negl_u16)(uint64_t x)
{
    uint16_t tmp;
    uint64_t result;
    result = (uint16_t)-x;
    tmp = -(x >> 16);
    result |= (uint64_t)tmp << 16;
    tmp = -(x >> 32);
    result |= (uint64_t)tmp << 32;
    tmp = -(x >> 48);
    result |= (uint64_t)tmp << 48;
    return result;
}

uint64_t HELPER(neon_negl_u32)(uint64_t x)
{
    uint32_t low = -x;
    uint32_t high = -(x >> 32);
    return low | ((uint64_t)high << 32);
}

/* FIXME: There should be a native op for this.  */
uint64_t HELPER(neon_negl_u64)(uint64_t x)
{
    return -x;
}

/* Saturating sign manipulation.  */
/* ??? Make these use NEON_VOP1.  */
#define DO_QABS8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s8)(uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QABS8(vec.v1);
    DO_QABS8(vec.v2);
    DO_QABS8(vec.v3);
    DO_QABS8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QABS8

#define DO_QNEG8(x) do { \
    if (x == (int8_t)0x80) { \
        x = 0x7f; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s8)(uint32_t x)
{
    neon_s8 vec;
    NEON_UNPACK(neon_s8, vec, x);
    DO_QNEG8(vec.v1);
    DO_QNEG8(vec.v2);
    DO_QNEG8(vec.v3);
    DO_QNEG8(vec.v4);
    NEON_PACK(neon_s8, x, vec);
    return x;
}
#undef DO_QNEG8

#define DO_QABS16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else if (x < 0) { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qabs_s16)(uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QABS16(vec.v1);
    DO_QABS16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QABS16

#define DO_QNEG16(x) do { \
    if (x == (int16_t)0x8000) { \
        x = 0x7fff; \
        SET_QC(); \
    } else { \
        x = -x; \
    }} while (0)
uint32_t HELPER(neon_qneg_s16)(uint32_t x)
{
    neon_s16 vec;
    NEON_UNPACK(neon_s16, vec, x);
    DO_QNEG16(vec.v1);
    DO_QNEG16(vec.v2);
    NEON_PACK(neon_s16, x, vec);
    return x;
}
#undef DO_QNEG16

uint32_t HELPER(neon_qabs_s32)(uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else if ((int32_t)x < 0) {
        x = -x;
    }
    return x;
}

uint32_t HELPER(neon_qneg_s32)(uint32_t x)
{
    if (x == SIGNBIT) {
        SET_QC();
        x = ~SIGNBIT;
    } else {
        x = -x;
    }
    return x;
}
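/* INT_MIN is the one value whose negation or absolute value does not fit
 * in the same type, so the saturating variants pin it to INT_MAX and set
 * QC: e.g. a qneg_s8 lane of 0x80 yields 0x7f, and qneg_s32(0x80000000)
 * yields 0x7fffffff. */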
/* NEON Float helpers.  */
uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b)
{
    return float32_val(float32_min(make_float32(a), make_float32(b), NFS));
}

uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b)
{
    return float32_val(float32_max(make_float32(a), make_float32(b), NFS));
}

uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = make_float32(a);
    float32 f1 = make_float32(b);
    return float32_val(float32_abs(float32_sub(f0, f1, NFS)));
}

uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b)
{
    return float32_val(float32_add(make_float32(a), make_float32(b), NFS));
}

uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b)
{
    return float32_val(float32_sub(make_float32(a), make_float32(b), NFS));
}

uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b)
{
    return float32_val(float32_mul(make_float32(a), make_float32(b), NFS));
}

/* Floating point comparisons produce an integer result.
 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
 */
uint32_t HELPER(neon_ceq_f32)(uint32_t a, uint32_t b)
{
    return -float32_eq_quiet(make_float32(a), make_float32(b), NFS);
}

uint32_t HELPER(neon_cge_f32)(uint32_t a, uint32_t b)
{
    return -float32_le(make_float32(b), make_float32(a), NFS);
}

uint32_t HELPER(neon_cgt_f32)(uint32_t a, uint32_t b)
{
    return -float32_lt(make_float32(b), make_float32(a), NFS);
}

uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_le(f1, f0, NFS);
}

uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b)
{
    float32 f0 = float32_abs(make_float32(a));
    float32 f1 = float32_abs(make_float32(b));
    return -float32_lt(f1, f0, NFS);
}

#define ELEM(V, N, SIZE) (((V) >> ((N) * (SIZE))) & ((1ull << (SIZE)) - 1))

void HELPER(neon_qunzip8)(uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zd0, 2, 8) << 8)
        | (ELEM(zd0, 4, 8) << 16) | (ELEM(zd0, 6, 8) << 24)
        | (ELEM(zd1, 0, 8) << 32) | (ELEM(zd1, 2, 8) << 40)
        | (ELEM(zd1, 4, 8) << 48) | (ELEM(zd1, 6, 8) << 56);
    uint64_t d1 = ELEM(zm0, 0, 8) | (ELEM(zm0, 2, 8) << 8)
        | (ELEM(zm0, 4, 8) << 16) | (ELEM(zm0, 6, 8) << 24)
        | (ELEM(zm1, 0, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zm1, 4, 8) << 48) | (ELEM(zm1, 6, 8) << 56);
    uint64_t m0 = ELEM(zd0, 1, 8) | (ELEM(zd0, 3, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zd0, 7, 8) << 24)
        | (ELEM(zd1, 1, 8) << 32) | (ELEM(zd1, 3, 8) << 40)
        | (ELEM(zd1, 5, 8) << 48) | (ELEM(zd1, 7, 8) << 56);
    uint64_t m1 = ELEM(zm0, 1, 8) | (ELEM(zm0, 3, 8) << 8)
        | (ELEM(zm0, 5, 8) << 16) | (ELEM(zm0, 7, 8) << 24)
        | (ELEM(zm1, 1, 8) << 32) | (ELEM(zm1, 3, 8) << 40)
        | (ELEM(zm1, 5, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qunzip16)(uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zd0, 2, 16) << 16)
        | (ELEM(zd1, 0, 16) << 32) | (ELEM(zd1, 2, 16) << 48);
    uint64_t d1 = ELEM(zm0, 0, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zm1, 0, 16) << 32) | (ELEM(zm1, 2, 16) << 48);
    uint64_t m0 = ELEM(zd0, 1, 16) | (ELEM(zd0, 3, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zd1, 3, 16) << 48);
    uint64_t m1 = ELEM(zm0, 1, 16) | (ELEM(zm0, 3, 16) << 16)
        | (ELEM(zm1, 1, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
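/* ELEM(V, N, SIZE) extracts element N of width SIZE bits from V, e.g.
 * ELEM(0x0123456789abcdef, 1, 16) = 0x89ab.  The (q)unzip helpers use it
 * to de-interleave: even-indexed elements of the d:m pair are gathered
 * into d and odd-indexed elements into m. */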
void HELPER(neon_qunzip32)(uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zd1, 0, 32) << 32);
    uint64_t d1 = ELEM(zm0, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m0 = ELEM(zd0, 1, 32) | (ELEM(zd1, 1, 32) << 32);
    uint64_t m1 = ELEM(zm0, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_unzip8)(uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zd, 2, 8) << 8)
        | (ELEM(zd, 4, 8) << 16) | (ELEM(zd, 6, 8) << 24)
        | (ELEM(zm, 0, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zm, 4, 8) << 48) | (ELEM(zm, 6, 8) << 56);
    uint64_t m0 = ELEM(zd, 1, 8) | (ELEM(zd, 3, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zd, 7, 8) << 24)
        | (ELEM(zm, 1, 8) << 32) | (ELEM(zm, 3, 8) << 40)
        | (ELEM(zm, 5, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_unzip16)(uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zd, 2, 16) << 16)
        | (ELEM(zm, 0, 16) << 32) | (ELEM(zm, 2, 16) << 48);
    uint64_t m0 = ELEM(zd, 1, 16) | (ELEM(zd, 3, 16) << 16)
        | (ELEM(zm, 1, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_qzip8)(uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 8) | (ELEM(zm0, 0, 8) << 8)
        | (ELEM(zd0, 1, 8) << 16) | (ELEM(zm0, 1, 8) << 24)
        | (ELEM(zd0, 2, 8) << 32) | (ELEM(zm0, 2, 8) << 40)
        | (ELEM(zd0, 3, 8) << 48) | (ELEM(zm0, 3, 8) << 56);
    uint64_t d1 = ELEM(zd0, 4, 8) | (ELEM(zm0, 4, 8) << 8)
        | (ELEM(zd0, 5, 8) << 16) | (ELEM(zm0, 5, 8) << 24)
        | (ELEM(zd0, 6, 8) << 32) | (ELEM(zm0, 6, 8) << 40)
        | (ELEM(zd0, 7, 8) << 48) | (ELEM(zm0, 7, 8) << 56);
    uint64_t m0 = ELEM(zd1, 0, 8) | (ELEM(zm1, 0, 8) << 8)
        | (ELEM(zd1, 1, 8) << 16) | (ELEM(zm1, 1, 8) << 24)
        | (ELEM(zd1, 2, 8) << 32) | (ELEM(zm1, 2, 8) << 40)
        | (ELEM(zd1, 3, 8) << 48) | (ELEM(zm1, 3, 8) << 56);
    uint64_t m1 = ELEM(zd1, 4, 8) | (ELEM(zm1, 4, 8) << 8)
        | (ELEM(zd1, 5, 8) << 16) | (ELEM(zm1, 5, 8) << 24)
        | (ELEM(zd1, 6, 8) << 32) | (ELEM(zm1, 6, 8) << 40)
        | (ELEM(zd1, 7, 8) << 48) | (ELEM(zm1, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}
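/* The (q)zip helpers interleave where unzip de-interleaves: e.g. zip16
 * with d = [d0, d1, d2, d3] and m = [m0, m1, m2, m3] (low lane first)
 * leaves d = [d0, m0, d1, m1] and m = [d2, m2, d3, m3], so a zip applied
 * after an unzip restores the original lane order. */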
void HELPER(neon_qzip16)(uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 16) | (ELEM(zm0, 0, 16) << 16)
        | (ELEM(zd0, 1, 16) << 32) | (ELEM(zm0, 1, 16) << 48);
    uint64_t d1 = ELEM(zd0, 2, 16) | (ELEM(zm0, 2, 16) << 16)
        | (ELEM(zd0, 3, 16) << 32) | (ELEM(zm0, 3, 16) << 48);
    uint64_t m0 = ELEM(zd1, 0, 16) | (ELEM(zm1, 0, 16) << 16)
        | (ELEM(zd1, 1, 16) << 32) | (ELEM(zm1, 1, 16) << 48);
    uint64_t m1 = ELEM(zd1, 2, 16) | (ELEM(zm1, 2, 16) << 16)
        | (ELEM(zd1, 3, 16) << 32) | (ELEM(zm1, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_qzip32)(uint32_t rd, uint32_t rm)
{
    uint64_t zm0 = float64_val(env->vfp.regs[rm]);
    uint64_t zm1 = float64_val(env->vfp.regs[rm + 1]);
    uint64_t zd0 = float64_val(env->vfp.regs[rd]);
    uint64_t zd1 = float64_val(env->vfp.regs[rd + 1]);
    uint64_t d0 = ELEM(zd0, 0, 32) | (ELEM(zm0, 0, 32) << 32);
    uint64_t d1 = ELEM(zd0, 1, 32) | (ELEM(zm0, 1, 32) << 32);
    uint64_t m0 = ELEM(zd1, 0, 32) | (ELEM(zm1, 0, 32) << 32);
    uint64_t m1 = ELEM(zd1, 1, 32) | (ELEM(zm1, 1, 32) << 32);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rm + 1] = make_float64(m1);
    env->vfp.regs[rd] = make_float64(d0);
    env->vfp.regs[rd + 1] = make_float64(d1);
}

void HELPER(neon_zip8)(uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 8) | (ELEM(zm, 0, 8) << 8)
        | (ELEM(zd, 1, 8) << 16) | (ELEM(zm, 1, 8) << 24)
        | (ELEM(zd, 2, 8) << 32) | (ELEM(zm, 2, 8) << 40)
        | (ELEM(zd, 3, 8) << 48) | (ELEM(zm, 3, 8) << 56);
    uint64_t m0 = ELEM(zd, 4, 8) | (ELEM(zm, 4, 8) << 8)
        | (ELEM(zd, 5, 8) << 16) | (ELEM(zm, 5, 8) << 24)
        | (ELEM(zd, 6, 8) << 32) | (ELEM(zm, 6, 8) << 40)
        | (ELEM(zd, 7, 8) << 48) | (ELEM(zm, 7, 8) << 56);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}

void HELPER(neon_zip16)(uint32_t rd, uint32_t rm)
{
    uint64_t zm = float64_val(env->vfp.regs[rm]);
    uint64_t zd = float64_val(env->vfp.regs[rd]);
    uint64_t d0 = ELEM(zd, 0, 16) | (ELEM(zm, 0, 16) << 16)
        | (ELEM(zd, 1, 16) << 32) | (ELEM(zm, 1, 16) << 48);
    uint64_t m0 = ELEM(zd, 2, 16) | (ELEM(zm, 2, 16) << 16)
        | (ELEM(zd, 3, 16) << 32) | (ELEM(zm, 3, 16) << 48);
    env->vfp.regs[rm] = make_float64(m0);
    env->vfp.regs[rd] = make_float64(d0);
}