1 2 /*---------------------------------------------------------------*/ 3 /*--- begin host_generic_simd64.c ---*/ 4 /*---------------------------------------------------------------*/ 5 6 /* 7 This file is part of Valgrind, a dynamic binary instrumentation 8 framework. 9 10 Copyright (C) 2004-2013 OpenWorks LLP 11 info (at) open-works.net 12 13 This program is free software; you can redistribute it and/or 14 modify it under the terms of the GNU General Public License as 15 published by the Free Software Foundation; either version 2 of the 16 License, or (at your option) any later version. 17 18 This program is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of 20 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 21 General Public License for more details. 22 23 You should have received a copy of the GNU General Public License 24 along with this program; if not, write to the Free Software 25 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 26 02110-1301, USA. 27 28 The GNU General Public License is contained in the file COPYING. 29 30 Neither the names of the U.S. Department of Energy nor the 31 University of California nor the names of its contributors may be 32 used to endorse or promote products derived from this software 33 without prior written permission. 34 */ 35 36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases 37 where the instruction selectors cannot generate code in-line. 38 These are purely back-end entities and cannot be seen/referenced 39 from IR. There are also helpers for 32-bit arithmetic in here. */ 40 41 #include "libvex_basictypes.h" 42 #include "main_util.h" // LIKELY, UNLIKELY 43 #include "host_generic_simd64.h" 44 45 46 47 /* Tuple/select functions for 32x2 vectors. */ 48 49 static inline ULong mk32x2 ( UInt w1, UInt w0 ) { 50 return (((ULong)w1) << 32) | ((ULong)w0); 51 } 52 53 static inline UInt sel32x2_1 ( ULong w64 ) { 54 return 0xFFFFFFFF & toUInt(w64 >> 32); 55 } 56 static inline UInt sel32x2_0 ( ULong w64 ) { 57 return 0xFFFFFFFF & toUInt(w64); 58 } 59 60 61 /* Tuple/select functions for 16x4 vectors. gcc is pretty hopeless 62 with 64-bit shifts so we give it a hand. */ 63 64 static inline ULong mk16x4 ( UShort w3, UShort w2, 65 UShort w1, UShort w0 ) { 66 UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2); 67 UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0); 68 return mk32x2(hi32, lo32); 69 } 70 71 static inline UShort sel16x4_3 ( ULong w64 ) { 72 UInt hi32 = toUInt(w64 >> 32); 73 return toUShort(0xFFFF & (hi32 >> 16)); 74 } 75 static inline UShort sel16x4_2 ( ULong w64 ) { 76 UInt hi32 = toUInt(w64 >> 32); 77 return toUShort(0xFFFF & hi32); 78 } 79 static inline UShort sel16x4_1 ( ULong w64 ) { 80 UInt lo32 = (UInt)w64; 81 return toUShort(0xFFFF & (lo32 >> 16)); 82 } 83 static inline UShort sel16x4_0 ( ULong w64 ) { 84 UInt lo32 = (UInt)w64; 85 return toUShort(0xFFFF & lo32); 86 } 87 88 89 /* Tuple/select functions for 8x8 vectors. */ 90 91 static inline ULong mk8x8 ( UChar w7, UChar w6, 92 UChar w5, UChar w4, 93 UChar w3, UChar w2, 94 UChar w1, UChar w0 ) { 95 UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16) 96 | (((UInt)w5) << 8) | (((UInt)w4) << 0); 97 UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16) 98 | (((UInt)w1) << 8) | (((UInt)w0) << 0); 99 return mk32x2(hi32, lo32); 100 } 101 102 static inline UChar sel8x8_7 ( ULong w64 ) { 103 UInt hi32 = toUInt(w64 >> 32); 104 return toUChar(0xFF & (hi32 >> 24)); 105 } 106 static inline UChar sel8x8_6 ( ULong w64 ) { 107 UInt hi32 = toUInt(w64 >> 32); 108 return toUChar(0xFF & (hi32 >> 16)); 109 } 110 static inline UChar sel8x8_5 ( ULong w64 ) { 111 UInt hi32 = toUInt(w64 >> 32); 112 return toUChar(0xFF & (hi32 >> 8)); 113 } 114 static inline UChar sel8x8_4 ( ULong w64 ) { 115 UInt hi32 = toUInt(w64 >> 32); 116 return toUChar(0xFF & (hi32 >> 0)); 117 } 118 static inline UChar sel8x8_3 ( ULong w64 ) { 119 UInt lo32 = (UInt)w64; 120 return toUChar(0xFF & (lo32 >> 24)); 121 } 122 static inline UChar sel8x8_2 ( ULong w64 ) { 123 UInt lo32 = (UInt)w64; 124 return toUChar(0xFF & (lo32 >> 16)); 125 } 126 static inline UChar sel8x8_1 ( ULong w64 ) { 127 UInt lo32 = (UInt)w64; 128 return toUChar(0xFF & (lo32 >> 8)); 129 } 130 static inline UChar sel8x8_0 ( ULong w64 ) { 131 UInt lo32 = (UInt)w64; 132 return toUChar(0xFF & (lo32 >> 0)); 133 } 134 135 static inline UChar index8x8 ( ULong w64, UChar ix ) { 136 ix &= 7; 137 return toUChar((w64 >> (8*ix)) & 0xFF); 138 } 139 140 141 /* Scalar helpers. */ 142 143 static inline Int qadd32S ( Int xx, Int yy ) 144 { 145 Long t = ((Long)xx) + ((Long)yy); 146 const Long loLim = -0x80000000LL; 147 const Long hiLim = 0x7FFFFFFFLL; 148 if (t < loLim) t = loLim; 149 if (t > hiLim) t = hiLim; 150 return (Int)t; 151 } 152 153 static inline Short qadd16S ( Short xx, Short yy ) 154 { 155 Int t = ((Int)xx) + ((Int)yy); 156 if (t < -32768) t = -32768; 157 if (t > 32767) t = 32767; 158 return (Short)t; 159 } 160 161 static inline Char qadd8S ( Char xx, Char yy ) 162 { 163 Int t = ((Int)xx) + ((Int)yy); 164 if (t < -128) t = -128; 165 if (t > 127) t = 127; 166 return (Char)t; 167 } 168 169 static inline UShort qadd16U ( UShort xx, UShort yy ) 170 { 171 UInt t = ((UInt)xx) + ((UInt)yy); 172 if (t > 0xFFFF) t = 0xFFFF; 173 return (UShort)t; 174 } 175 176 static inline UChar qadd8U ( UChar xx, UChar yy ) 177 { 178 UInt t = ((UInt)xx) + ((UInt)yy); 179 if (t > 0xFF) t = 0xFF; 180 return (UChar)t; 181 } 182 183 static inline Int qsub32S ( Int xx, Int yy ) 184 { 185 Long t = ((Long)xx) - ((Long)yy); 186 const Long loLim = -0x80000000LL; 187 const Long hiLim = 0x7FFFFFFFLL; 188 if (t < loLim) t = loLim; 189 if (t > hiLim) t = hiLim; 190 return (Int)t; 191 } 192 193 static inline Short qsub16S ( Short xx, Short yy ) 194 { 195 Int t = ((Int)xx) - ((Int)yy); 196 if (t < -32768) t = -32768; 197 if (t > 32767) t = 32767; 198 return (Short)t; 199 } 200 201 static inline Char qsub8S ( Char xx, Char yy ) 202 { 203 Int t = ((Int)xx) - ((Int)yy); 204 if (t < -128) t = -128; 205 if (t > 127) t = 127; 206 return (Char)t; 207 } 208 209 static inline UShort qsub16U ( UShort xx, UShort yy ) 210 { 211 Int t = ((Int)xx) - ((Int)yy); 212 if (t < 0) t = 0; 213 if (t > 0xFFFF) t = 0xFFFF; 214 return (UShort)t; 215 } 216 217 static inline UChar qsub8U ( UChar xx, UChar yy ) 218 { 219 Int t = ((Int)xx) - ((Int)yy); 220 if (t < 0) t = 0; 221 if (t > 0xFF) t = 0xFF; 222 return (UChar)t; 223 } 224 225 static inline Short mul16 ( Short xx, Short yy ) 226 { 227 Int t = ((Int)xx) * ((Int)yy); 228 return (Short)t; 229 } 230 231 static inline Int mul32 ( Int xx, Int yy ) 232 { 233 Int t = ((Int)xx) * ((Int)yy); 234 return (Int)t; 235 } 236 237 static inline Short mulhi16S ( Short xx, Short yy ) 238 { 239 Int t = ((Int)xx) * ((Int)yy); 240 t >>=/*s*/ 16; 241 return (Short)t; 242 } 243 244 static inline UShort mulhi16U ( UShort xx, UShort yy ) 245 { 246 UInt t = ((UInt)xx) * ((UInt)yy); 247 t >>=/*u*/ 16; 248 return (UShort)t; 249 } 250 251 static inline UInt cmpeq32 ( UInt xx, UInt yy ) 252 { 253 return xx==yy ? 0xFFFFFFFF : 0; 254 } 255 256 static inline UShort cmpeq16 ( UShort xx, UShort yy ) 257 { 258 return toUShort(xx==yy ? 0xFFFF : 0); 259 } 260 261 static inline UChar cmpeq8 ( UChar xx, UChar yy ) 262 { 263 return toUChar(xx==yy ? 0xFF : 0); 264 } 265 266 static inline UInt cmpgt32S ( Int xx, Int yy ) 267 { 268 return xx>yy ? 0xFFFFFFFF : 0; 269 } 270 271 static inline UShort cmpgt16S ( Short xx, Short yy ) 272 { 273 return toUShort(xx>yy ? 0xFFFF : 0); 274 } 275 276 static inline UChar cmpgt8S ( Char xx, Char yy ) 277 { 278 return toUChar(xx>yy ? 0xFF : 0); 279 } 280 281 static inline UInt cmpnez32 ( UInt xx ) 282 { 283 return xx==0 ? 0 : 0xFFFFFFFF; 284 } 285 286 static inline UShort cmpnez16 ( UShort xx ) 287 { 288 return toUShort(xx==0 ? 0 : 0xFFFF); 289 } 290 291 static inline UChar cmpnez8 ( UChar xx ) 292 { 293 return toUChar(xx==0 ? 0 : 0xFF); 294 } 295 296 static inline Short qnarrow32Sto16S ( UInt xx0 ) 297 { 298 Int xx = (Int)xx0; 299 if (xx < -32768) xx = -32768; 300 if (xx > 32767) xx = 32767; 301 return (Short)xx; 302 } 303 304 static inline Char qnarrow16Sto8S ( UShort xx0 ) 305 { 306 Short xx = (Short)xx0; 307 if (xx < -128) xx = -128; 308 if (xx > 127) xx = 127; 309 return (Char)xx; 310 } 311 312 static inline UChar qnarrow16Sto8U ( UShort xx0 ) 313 { 314 Short xx = (Short)xx0; 315 if (xx < 0) xx = 0; 316 if (xx > 255) xx = 255; 317 return (UChar)xx; 318 } 319 320 static inline UShort narrow32to16 ( UInt xx ) 321 { 322 return (UShort)xx; 323 } 324 325 static inline UChar narrow16to8 ( UShort xx ) 326 { 327 return (UChar)xx; 328 } 329 330 /* shifts: we don't care about out-of-range ones, since 331 that is dealt with at a higher level. */ 332 333 static inline UChar shl8 ( UChar v, UInt n ) 334 { 335 return toUChar(v << n); 336 } 337 338 static inline UChar sar8 ( UChar v, UInt n ) 339 { 340 return toUChar(((Char)v) >> n); 341 } 342 343 static inline UShort shl16 ( UShort v, UInt n ) 344 { 345 return toUShort(v << n); 346 } 347 348 static inline UShort shr16 ( UShort v, UInt n ) 349 { 350 return toUShort((((UShort)v) >> n)); 351 } 352 353 static inline UShort sar16 ( UShort v, UInt n ) 354 { 355 return toUShort(((Short)v) >> n); 356 } 357 358 static inline UInt shl32 ( UInt v, UInt n ) 359 { 360 return v << n; 361 } 362 363 static inline UInt shr32 ( UInt v, UInt n ) 364 { 365 return (((UInt)v) >> n); 366 } 367 368 static inline UInt sar32 ( UInt v, UInt n ) 369 { 370 return ((Int)v) >> n; 371 } 372 373 static inline UChar avg8U ( UChar xx, UChar yy ) 374 { 375 UInt xxi = (UInt)xx; 376 UInt yyi = (UInt)yy; 377 UInt r = (xxi + yyi + 1) >> 1; 378 return (UChar)r; 379 } 380 381 static inline UShort avg16U ( UShort xx, UShort yy ) 382 { 383 UInt xxi = (UInt)xx; 384 UInt yyi = (UInt)yy; 385 UInt r = (xxi + yyi + 1) >> 1; 386 return (UShort)r; 387 } 388 389 static inline Short max16S ( Short xx, Short yy ) 390 { 391 return toUShort((xx > yy) ? xx : yy); 392 } 393 394 static inline UChar max8U ( UChar xx, UChar yy ) 395 { 396 return toUChar((xx > yy) ? xx : yy); 397 } 398 399 static inline Short min16S ( Short xx, Short yy ) 400 { 401 return toUShort((xx < yy) ? xx : yy); 402 } 403 404 static inline UChar min8U ( UChar xx, UChar yy ) 405 { 406 return toUChar((xx < yy) ? xx : yy); 407 } 408 409 static inline UShort hadd16U ( UShort xx, UShort yy ) 410 { 411 UInt xxi = (UInt)xx; 412 UInt yyi = (UInt)yy; 413 UInt r = (xxi + yyi) >> 1; 414 return (UShort)r; 415 } 416 417 static inline Short hadd16S ( Short xx, Short yy ) 418 { 419 Int xxi = (Int)xx; 420 Int yyi = (Int)yy; 421 Int r = (xxi + yyi) >> 1; 422 return (Short)r; 423 } 424 425 static inline UShort hsub16U ( UShort xx, UShort yy ) 426 { 427 UInt xxi = (UInt)xx; 428 UInt yyi = (UInt)yy; 429 UInt r = (xxi - yyi) >> 1; 430 return (UShort)r; 431 } 432 433 static inline Short hsub16S ( Short xx, Short yy ) 434 { 435 Int xxi = (Int)xx; 436 Int yyi = (Int)yy; 437 Int r = (xxi - yyi) >> 1; 438 return (Short)r; 439 } 440 441 static inline UChar hadd8U ( UChar xx, UChar yy ) 442 { 443 UInt xxi = (UInt)xx; 444 UInt yyi = (UInt)yy; 445 UInt r = (xxi + yyi) >> 1; 446 return (UChar)r; 447 } 448 449 static inline Char hadd8S ( Char xx, Char yy ) 450 { 451 Int xxi = (Int)xx; 452 Int yyi = (Int)yy; 453 Int r = (xxi + yyi) >> 1; 454 return (Char)r; 455 } 456 457 static inline UChar hsub8U ( UChar xx, UChar yy ) 458 { 459 UInt xxi = (UInt)xx; 460 UInt yyi = (UInt)yy; 461 UInt r = (xxi - yyi) >> 1; 462 return (UChar)r; 463 } 464 465 static inline Char hsub8S ( Char xx, Char yy ) 466 { 467 Int xxi = (Int)xx; 468 Int yyi = (Int)yy; 469 Int r = (xxi - yyi) >> 1; 470 return (Char)r; 471 } 472 473 static inline UInt absdiff8U ( UChar xx, UChar yy ) 474 { 475 UInt xxu = (UChar)xx; 476 UInt yyu = (UChar)yy; 477 return xxu >= yyu ? xxu - yyu : yyu - xxu; 478 } 479 480 /* ----------------------------------------------------- */ 481 /* Start of the externally visible functions. These simply 482 implement the corresponding IR primops. */ 483 /* ----------------------------------------------------- */ 484 485 /* ------------ Normal addition ------------ */ 486 487 ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy ) 488 { 489 return mk32x2( 490 sel32x2_1(xx) + sel32x2_1(yy), 491 sel32x2_0(xx) + sel32x2_0(yy) 492 ); 493 } 494 495 ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy ) 496 { 497 return mk16x4( 498 toUShort( sel16x4_3(xx) + sel16x4_3(yy) ), 499 toUShort( sel16x4_2(xx) + sel16x4_2(yy) ), 500 toUShort( sel16x4_1(xx) + sel16x4_1(yy) ), 501 toUShort( sel16x4_0(xx) + sel16x4_0(yy) ) 502 ); 503 } 504 505 ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy ) 506 { 507 return mk8x8( 508 toUChar( sel8x8_7(xx) + sel8x8_7(yy) ), 509 toUChar( sel8x8_6(xx) + sel8x8_6(yy) ), 510 toUChar( sel8x8_5(xx) + sel8x8_5(yy) ), 511 toUChar( sel8x8_4(xx) + sel8x8_4(yy) ), 512 toUChar( sel8x8_3(xx) + sel8x8_3(yy) ), 513 toUChar( sel8x8_2(xx) + sel8x8_2(yy) ), 514 toUChar( sel8x8_1(xx) + sel8x8_1(yy) ), 515 toUChar( sel8x8_0(xx) + sel8x8_0(yy) ) 516 ); 517 } 518 519 /* ------------ Saturating addition ------------ */ 520 521 ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy ) 522 { 523 return mk16x4( 524 qadd16S( sel16x4_3(xx), sel16x4_3(yy) ), 525 qadd16S( sel16x4_2(xx), sel16x4_2(yy) ), 526 qadd16S( sel16x4_1(xx), sel16x4_1(yy) ), 527 qadd16S( sel16x4_0(xx), sel16x4_0(yy) ) 528 ); 529 } 530 531 ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy ) 532 { 533 return mk8x8( 534 qadd8S( sel8x8_7(xx), sel8x8_7(yy) ), 535 qadd8S( sel8x8_6(xx), sel8x8_6(yy) ), 536 qadd8S( sel8x8_5(xx), sel8x8_5(yy) ), 537 qadd8S( sel8x8_4(xx), sel8x8_4(yy) ), 538 qadd8S( sel8x8_3(xx), sel8x8_3(yy) ), 539 qadd8S( sel8x8_2(xx), sel8x8_2(yy) ), 540 qadd8S( sel8x8_1(xx), sel8x8_1(yy) ), 541 qadd8S( sel8x8_0(xx), sel8x8_0(yy) ) 542 ); 543 } 544 545 ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy ) 546 { 547 return mk16x4( 548 qadd16U( sel16x4_3(xx), sel16x4_3(yy) ), 549 qadd16U( sel16x4_2(xx), sel16x4_2(yy) ), 550 qadd16U( sel16x4_1(xx), sel16x4_1(yy) ), 551 qadd16U( sel16x4_0(xx), sel16x4_0(yy) ) 552 ); 553 } 554 555 ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy ) 556 { 557 return mk8x8( 558 qadd8U( sel8x8_7(xx), sel8x8_7(yy) ), 559 qadd8U( sel8x8_6(xx), sel8x8_6(yy) ), 560 qadd8U( sel8x8_5(xx), sel8x8_5(yy) ), 561 qadd8U( sel8x8_4(xx), sel8x8_4(yy) ), 562 qadd8U( sel8x8_3(xx), sel8x8_3(yy) ), 563 qadd8U( sel8x8_2(xx), sel8x8_2(yy) ), 564 qadd8U( sel8x8_1(xx), sel8x8_1(yy) ), 565 qadd8U( sel8x8_0(xx), sel8x8_0(yy) ) 566 ); 567 } 568 569 /* ------------ Normal subtraction ------------ */ 570 571 ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy ) 572 { 573 return mk32x2( 574 sel32x2_1(xx) - sel32x2_1(yy), 575 sel32x2_0(xx) - sel32x2_0(yy) 576 ); 577 } 578 579 ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy ) 580 { 581 return mk16x4( 582 toUShort( sel16x4_3(xx) - sel16x4_3(yy) ), 583 toUShort( sel16x4_2(xx) - sel16x4_2(yy) ), 584 toUShort( sel16x4_1(xx) - sel16x4_1(yy) ), 585 toUShort( sel16x4_0(xx) - sel16x4_0(yy) ) 586 ); 587 } 588 589 ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy ) 590 { 591 return mk8x8( 592 toUChar( sel8x8_7(xx) - sel8x8_7(yy) ), 593 toUChar( sel8x8_6(xx) - sel8x8_6(yy) ), 594 toUChar( sel8x8_5(xx) - sel8x8_5(yy) ), 595 toUChar( sel8x8_4(xx) - sel8x8_4(yy) ), 596 toUChar( sel8x8_3(xx) - sel8x8_3(yy) ), 597 toUChar( sel8x8_2(xx) - sel8x8_2(yy) ), 598 toUChar( sel8x8_1(xx) - sel8x8_1(yy) ), 599 toUChar( sel8x8_0(xx) - sel8x8_0(yy) ) 600 ); 601 } 602 603 /* ------------ Saturating subtraction ------------ */ 604 605 ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy ) 606 { 607 return mk16x4( 608 qsub16S( sel16x4_3(xx), sel16x4_3(yy) ), 609 qsub16S( sel16x4_2(xx), sel16x4_2(yy) ), 610 qsub16S( sel16x4_1(xx), sel16x4_1(yy) ), 611 qsub16S( sel16x4_0(xx), sel16x4_0(yy) ) 612 ); 613 } 614 615 ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy ) 616 { 617 return mk8x8( 618 qsub8S( sel8x8_7(xx), sel8x8_7(yy) ), 619 qsub8S( sel8x8_6(xx), sel8x8_6(yy) ), 620 qsub8S( sel8x8_5(xx), sel8x8_5(yy) ), 621 qsub8S( sel8x8_4(xx), sel8x8_4(yy) ), 622 qsub8S( sel8x8_3(xx), sel8x8_3(yy) ), 623 qsub8S( sel8x8_2(xx), sel8x8_2(yy) ), 624 qsub8S( sel8x8_1(xx), sel8x8_1(yy) ), 625 qsub8S( sel8x8_0(xx), sel8x8_0(yy) ) 626 ); 627 } 628 629 ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy ) 630 { 631 return mk16x4( 632 qsub16U( sel16x4_3(xx), sel16x4_3(yy) ), 633 qsub16U( sel16x4_2(xx), sel16x4_2(yy) ), 634 qsub16U( sel16x4_1(xx), sel16x4_1(yy) ), 635 qsub16U( sel16x4_0(xx), sel16x4_0(yy) ) 636 ); 637 } 638 639 ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy ) 640 { 641 return mk8x8( 642 qsub8U( sel8x8_7(xx), sel8x8_7(yy) ), 643 qsub8U( sel8x8_6(xx), sel8x8_6(yy) ), 644 qsub8U( sel8x8_5(xx), sel8x8_5(yy) ), 645 qsub8U( sel8x8_4(xx), sel8x8_4(yy) ), 646 qsub8U( sel8x8_3(xx), sel8x8_3(yy) ), 647 qsub8U( sel8x8_2(xx), sel8x8_2(yy) ), 648 qsub8U( sel8x8_1(xx), sel8x8_1(yy) ), 649 qsub8U( sel8x8_0(xx), sel8x8_0(yy) ) 650 ); 651 } 652 653 /* ------------ Multiplication ------------ */ 654 655 ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy ) 656 { 657 return mk16x4( 658 mul16( sel16x4_3(xx), sel16x4_3(yy) ), 659 mul16( sel16x4_2(xx), sel16x4_2(yy) ), 660 mul16( sel16x4_1(xx), sel16x4_1(yy) ), 661 mul16( sel16x4_0(xx), sel16x4_0(yy) ) 662 ); 663 } 664 665 ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy ) 666 { 667 return mk32x2( 668 mul32( sel32x2_1(xx), sel32x2_1(yy) ), 669 mul32( sel32x2_0(xx), sel32x2_0(yy) ) 670 ); 671 } 672 673 ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy ) 674 { 675 return mk16x4( 676 mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ), 677 mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ), 678 mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ), 679 mulhi16S( sel16x4_0(xx), sel16x4_0(yy) ) 680 ); 681 } 682 683 ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy ) 684 { 685 return mk16x4( 686 mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ), 687 mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ), 688 mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ), 689 mulhi16U( sel16x4_0(xx), sel16x4_0(yy) ) 690 ); 691 } 692 693 /* ------------ Comparison ------------ */ 694 695 ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy ) 696 { 697 return mk32x2( 698 cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ), 699 cmpeq32( sel32x2_0(xx), sel32x2_0(yy) ) 700 ); 701 } 702 703 ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy ) 704 { 705 return mk16x4( 706 cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ), 707 cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ), 708 cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ), 709 cmpeq16( sel16x4_0(xx), sel16x4_0(yy) ) 710 ); 711 } 712 713 ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy ) 714 { 715 return mk8x8( 716 cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ), 717 cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ), 718 cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ), 719 cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ), 720 cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ), 721 cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ), 722 cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ), 723 cmpeq8( sel8x8_0(xx), sel8x8_0(yy) ) 724 ); 725 } 726 727 ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy ) 728 { 729 return mk32x2( 730 cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ), 731 cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) ) 732 ); 733 } 734 735 ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy ) 736 { 737 return mk16x4( 738 cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ), 739 cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ), 740 cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ), 741 cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) ) 742 ); 743 } 744 745 ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy ) 746 { 747 return mk8x8( 748 cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ), 749 cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ), 750 cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ), 751 cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ), 752 cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ), 753 cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ), 754 cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ), 755 cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) ) 756 ); 757 } 758 759 ULong h_generic_calc_CmpNEZ32x2 ( ULong xx ) 760 { 761 return mk32x2( 762 cmpnez32( sel32x2_1(xx) ), 763 cmpnez32( sel32x2_0(xx) ) 764 ); 765 } 766 767 ULong h_generic_calc_CmpNEZ16x4 ( ULong xx ) 768 { 769 return mk16x4( 770 cmpnez16( sel16x4_3(xx) ), 771 cmpnez16( sel16x4_2(xx) ), 772 cmpnez16( sel16x4_1(xx) ), 773 cmpnez16( sel16x4_0(xx) ) 774 ); 775 } 776 777 ULong h_generic_calc_CmpNEZ8x8 ( ULong xx ) 778 { 779 return mk8x8( 780 cmpnez8( sel8x8_7(xx) ), 781 cmpnez8( sel8x8_6(xx) ), 782 cmpnez8( sel8x8_5(xx) ), 783 cmpnez8( sel8x8_4(xx) ), 784 cmpnez8( sel8x8_3(xx) ), 785 cmpnez8( sel8x8_2(xx) ), 786 cmpnez8( sel8x8_1(xx) ), 787 cmpnez8( sel8x8_0(xx) ) 788 ); 789 } 790 791 /* ------------ Saturating narrowing ------------ */ 792 793 ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb ) 794 { 795 UInt d = sel32x2_1(aa); 796 UInt c = sel32x2_0(aa); 797 UInt b = sel32x2_1(bb); 798 UInt a = sel32x2_0(bb); 799 return mk16x4( 800 qnarrow32Sto16S(d), 801 qnarrow32Sto16S(c), 802 qnarrow32Sto16S(b), 803 qnarrow32Sto16S(a) 804 ); 805 } 806 807 ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb ) 808 { 809 UShort h = sel16x4_3(aa); 810 UShort g = sel16x4_2(aa); 811 UShort f = sel16x4_1(aa); 812 UShort e = sel16x4_0(aa); 813 UShort d = sel16x4_3(bb); 814 UShort c = sel16x4_2(bb); 815 UShort b = sel16x4_1(bb); 816 UShort a = sel16x4_0(bb); 817 return mk8x8( 818 qnarrow16Sto8S(h), 819 qnarrow16Sto8S(g), 820 qnarrow16Sto8S(f), 821 qnarrow16Sto8S(e), 822 qnarrow16Sto8S(d), 823 qnarrow16Sto8S(c), 824 qnarrow16Sto8S(b), 825 qnarrow16Sto8S(a) 826 ); 827 } 828 829 ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb ) 830 { 831 UShort h = sel16x4_3(aa); 832 UShort g = sel16x4_2(aa); 833 UShort f = sel16x4_1(aa); 834 UShort e = sel16x4_0(aa); 835 UShort d = sel16x4_3(bb); 836 UShort c = sel16x4_2(bb); 837 UShort b = sel16x4_1(bb); 838 UShort a = sel16x4_0(bb); 839 return mk8x8( 840 qnarrow16Sto8U(h), 841 qnarrow16Sto8U(g), 842 qnarrow16Sto8U(f), 843 qnarrow16Sto8U(e), 844 qnarrow16Sto8U(d), 845 qnarrow16Sto8U(c), 846 qnarrow16Sto8U(b), 847 qnarrow16Sto8U(a) 848 ); 849 } 850 851 /* ------------ Truncating narrowing ------------ */ 852 853 ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb ) 854 { 855 UInt d = sel32x2_1(aa); 856 UInt c = sel32x2_0(aa); 857 UInt b = sel32x2_1(bb); 858 UInt a = sel32x2_0(bb); 859 return mk16x4( 860 narrow32to16(d), 861 narrow32to16(c), 862 narrow32to16(b), 863 narrow32to16(a) 864 ); 865 } 866 867 ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb ) 868 { 869 UShort h = sel16x4_3(aa); 870 UShort g = sel16x4_2(aa); 871 UShort f = sel16x4_1(aa); 872 UShort e = sel16x4_0(aa); 873 UShort d = sel16x4_3(bb); 874 UShort c = sel16x4_2(bb); 875 UShort b = sel16x4_1(bb); 876 UShort a = sel16x4_0(bb); 877 return mk8x8( 878 narrow16to8(h), 879 narrow16to8(g), 880 narrow16to8(f), 881 narrow16to8(e), 882 narrow16to8(d), 883 narrow16to8(c), 884 narrow16to8(b), 885 narrow16to8(a) 886 ); 887 } 888 889 /* ------------ Interleaving ------------ */ 890 891 ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb ) 892 { 893 return mk8x8( 894 sel8x8_7(aa), 895 sel8x8_7(bb), 896 sel8x8_6(aa), 897 sel8x8_6(bb), 898 sel8x8_5(aa), 899 sel8x8_5(bb), 900 sel8x8_4(aa), 901 sel8x8_4(bb) 902 ); 903 } 904 905 ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb ) 906 { 907 return mk8x8( 908 sel8x8_3(aa), 909 sel8x8_3(bb), 910 sel8x8_2(aa), 911 sel8x8_2(bb), 912 sel8x8_1(aa), 913 sel8x8_1(bb), 914 sel8x8_0(aa), 915 sel8x8_0(bb) 916 ); 917 } 918 919 ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb ) 920 { 921 return mk16x4( 922 sel16x4_3(aa), 923 sel16x4_3(bb), 924 sel16x4_2(aa), 925 sel16x4_2(bb) 926 ); 927 } 928 929 ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb ) 930 { 931 return mk16x4( 932 sel16x4_1(aa), 933 sel16x4_1(bb), 934 sel16x4_0(aa), 935 sel16x4_0(bb) 936 ); 937 } 938 939 ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb ) 940 { 941 return mk32x2( 942 sel32x2_1(aa), 943 sel32x2_1(bb) 944 ); 945 } 946 947 ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb ) 948 { 949 return mk32x2( 950 sel32x2_0(aa), 951 sel32x2_0(bb) 952 ); 953 } 954 955 /* ------------ Concatenation ------------ */ 956 957 ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb ) 958 { 959 return mk16x4( 960 sel16x4_3(aa), 961 sel16x4_1(aa), 962 sel16x4_3(bb), 963 sel16x4_1(bb) 964 ); 965 } 966 967 ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb ) 968 { 969 return mk16x4( 970 sel16x4_2(aa), 971 sel16x4_0(aa), 972 sel16x4_2(bb), 973 sel16x4_0(bb) 974 ); 975 } 976 977 /* misc hack looking for a proper home */ 978 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb ) 979 { 980 return mk8x8( 981 index8x8(aa, sel8x8_7(bb)), 982 index8x8(aa, sel8x8_6(bb)), 983 index8x8(aa, sel8x8_5(bb)), 984 index8x8(aa, sel8x8_4(bb)), 985 index8x8(aa, sel8x8_3(bb)), 986 index8x8(aa, sel8x8_2(bb)), 987 index8x8(aa, sel8x8_1(bb)), 988 index8x8(aa, sel8x8_0(bb)) 989 ); 990 } 991 992 /* ------------ Shifting ------------ */ 993 /* Note that because these primops are undefined if the shift amount 994 equals or exceeds the lane width, the shift amount is masked so 995 that the scalar shifts are always in range. In fact, given the 996 semantics of these primops (ShlN16x4, etc) it is an error if in 997 fact we are ever given an out-of-range shift amount. 998 */ 999 ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn ) 1000 { 1001 /* vassert(nn < 32); */ 1002 nn &= 31; 1003 return mk32x2( 1004 shl32( sel32x2_1(xx), nn ), 1005 shl32( sel32x2_0(xx), nn ) 1006 ); 1007 } 1008 1009 ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn ) 1010 { 1011 /* vassert(nn < 16); */ 1012 nn &= 15; 1013 return mk16x4( 1014 shl16( sel16x4_3(xx), nn ), 1015 shl16( sel16x4_2(xx), nn ), 1016 shl16( sel16x4_1(xx), nn ), 1017 shl16( sel16x4_0(xx), nn ) 1018 ); 1019 } 1020 1021 ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn ) 1022 { 1023 /* vassert(nn < 8); */ 1024 nn &= 7; 1025 return mk8x8( 1026 shl8( sel8x8_7(xx), nn ), 1027 shl8( sel8x8_6(xx), nn ), 1028 shl8( sel8x8_5(xx), nn ), 1029 shl8( sel8x8_4(xx), nn ), 1030 shl8( sel8x8_3(xx), nn ), 1031 shl8( sel8x8_2(xx), nn ), 1032 shl8( sel8x8_1(xx), nn ), 1033 shl8( sel8x8_0(xx), nn ) 1034 ); 1035 } 1036 1037 ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn ) 1038 { 1039 /* vassert(nn < 32); */ 1040 nn &= 31; 1041 return mk32x2( 1042 shr32( sel32x2_1(xx), nn ), 1043 shr32( sel32x2_0(xx), nn ) 1044 ); 1045 } 1046 1047 ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn ) 1048 { 1049 /* vassert(nn < 16); */ 1050 nn &= 15; 1051 return mk16x4( 1052 shr16( sel16x4_3(xx), nn ), 1053 shr16( sel16x4_2(xx), nn ), 1054 shr16( sel16x4_1(xx), nn ), 1055 shr16( sel16x4_0(xx), nn ) 1056 ); 1057 } 1058 1059 ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn ) 1060 { 1061 /* vassert(nn < 32); */ 1062 nn &= 31; 1063 return mk32x2( 1064 sar32( sel32x2_1(xx), nn ), 1065 sar32( sel32x2_0(xx), nn ) 1066 ); 1067 } 1068 1069 ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn ) 1070 { 1071 /* vassert(nn < 16); */ 1072 nn &= 15; 1073 return mk16x4( 1074 sar16( sel16x4_3(xx), nn ), 1075 sar16( sel16x4_2(xx), nn ), 1076 sar16( sel16x4_1(xx), nn ), 1077 sar16( sel16x4_0(xx), nn ) 1078 ); 1079 } 1080 1081 ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn ) 1082 { 1083 /* vassert(nn < 8); */ 1084 nn &= 7; 1085 return mk8x8( 1086 sar8( sel8x8_7(xx), nn ), 1087 sar8( sel8x8_6(xx), nn ), 1088 sar8( sel8x8_5(xx), nn ), 1089 sar8( sel8x8_4(xx), nn ), 1090 sar8( sel8x8_3(xx), nn ), 1091 sar8( sel8x8_2(xx), nn ), 1092 sar8( sel8x8_1(xx), nn ), 1093 sar8( sel8x8_0(xx), nn ) 1094 ); 1095 } 1096 1097 /* ------------ Averaging ------------ */ 1098 1099 ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy ) 1100 { 1101 return mk8x8( 1102 avg8U( sel8x8_7(xx), sel8x8_7(yy) ), 1103 avg8U( sel8x8_6(xx), sel8x8_6(yy) ), 1104 avg8U( sel8x8_5(xx), sel8x8_5(yy) ), 1105 avg8U( sel8x8_4(xx), sel8x8_4(yy) ), 1106 avg8U( sel8x8_3(xx), sel8x8_3(yy) ), 1107 avg8U( sel8x8_2(xx), sel8x8_2(yy) ), 1108 avg8U( sel8x8_1(xx), sel8x8_1(yy) ), 1109 avg8U( sel8x8_0(xx), sel8x8_0(yy) ) 1110 ); 1111 } 1112 1113 ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy ) 1114 { 1115 return mk16x4( 1116 avg16U( sel16x4_3(xx), sel16x4_3(yy) ), 1117 avg16U( sel16x4_2(xx), sel16x4_2(yy) ), 1118 avg16U( sel16x4_1(xx), sel16x4_1(yy) ), 1119 avg16U( sel16x4_0(xx), sel16x4_0(yy) ) 1120 ); 1121 } 1122 1123 /* ------------ max/min ------------ */ 1124 1125 ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy ) 1126 { 1127 return mk16x4( 1128 max16S( sel16x4_3(xx), sel16x4_3(yy) ), 1129 max16S( sel16x4_2(xx), sel16x4_2(yy) ), 1130 max16S( sel16x4_1(xx), sel16x4_1(yy) ), 1131 max16S( sel16x4_0(xx), sel16x4_0(yy) ) 1132 ); 1133 } 1134 1135 ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy ) 1136 { 1137 return mk8x8( 1138 max8U( sel8x8_7(xx), sel8x8_7(yy) ), 1139 max8U( sel8x8_6(xx), sel8x8_6(yy) ), 1140 max8U( sel8x8_5(xx), sel8x8_5(yy) ), 1141 max8U( sel8x8_4(xx), sel8x8_4(yy) ), 1142 max8U( sel8x8_3(xx), sel8x8_3(yy) ), 1143 max8U( sel8x8_2(xx), sel8x8_2(yy) ), 1144 max8U( sel8x8_1(xx), sel8x8_1(yy) ), 1145 max8U( sel8x8_0(xx), sel8x8_0(yy) ) 1146 ); 1147 } 1148 1149 ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy ) 1150 { 1151 return mk16x4( 1152 min16S( sel16x4_3(xx), sel16x4_3(yy) ), 1153 min16S( sel16x4_2(xx), sel16x4_2(yy) ), 1154 min16S( sel16x4_1(xx), sel16x4_1(yy) ), 1155 min16S( sel16x4_0(xx), sel16x4_0(yy) ) 1156 ); 1157 } 1158 1159 ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy ) 1160 { 1161 return mk8x8( 1162 min8U( sel8x8_7(xx), sel8x8_7(yy) ), 1163 min8U( sel8x8_6(xx), sel8x8_6(yy) ), 1164 min8U( sel8x8_5(xx), sel8x8_5(yy) ), 1165 min8U( sel8x8_4(xx), sel8x8_4(yy) ), 1166 min8U( sel8x8_3(xx), sel8x8_3(yy) ), 1167 min8U( sel8x8_2(xx), sel8x8_2(yy) ), 1168 min8U( sel8x8_1(xx), sel8x8_1(yy) ), 1169 min8U( sel8x8_0(xx), sel8x8_0(yy) ) 1170 ); 1171 } 1172 1173 UInt h_generic_calc_GetMSBs8x8 ( ULong xx ) 1174 { 1175 UInt r = 0; 1176 if (xx & (1ULL << (64-1))) r |= (1<<7); 1177 if (xx & (1ULL << (56-1))) r |= (1<<6); 1178 if (xx & (1ULL << (48-1))) r |= (1<<5); 1179 if (xx & (1ULL << (40-1))) r |= (1<<4); 1180 if (xx & (1ULL << (32-1))) r |= (1<<3); 1181 if (xx & (1ULL << (24-1))) r |= (1<<2); 1182 if (xx & (1ULL << (16-1))) r |= (1<<1); 1183 if (xx & (1ULL << ( 8-1))) r |= (1<<0); 1184 return r; 1185 } 1186 1187 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */ 1188 1189 /* Tuple/select functions for 16x2 vectors. */ 1190 static inline UInt mk16x2 ( UShort w1, UShort w2 ) { 1191 return (((UInt)w1) << 16) | ((UInt)w2); 1192 } 1193 1194 static inline UShort sel16x2_1 ( UInt w32 ) { 1195 return 0xFFFF & (UShort)(w32 >> 16); 1196 } 1197 static inline UShort sel16x2_0 ( UInt w32 ) { 1198 return 0xFFFF & (UShort)(w32); 1199 } 1200 1201 static inline UInt mk8x4 ( UChar w3, UChar w2, 1202 UChar w1, UChar w0 ) { 1203 UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16) 1204 | (((UInt)w1) << 8) | (((UInt)w0) << 0); 1205 return w32; 1206 } 1207 1208 static inline UChar sel8x4_3 ( UInt w32 ) { 1209 return toUChar(0xFF & (w32 >> 24)); 1210 } 1211 static inline UChar sel8x4_2 ( UInt w32 ) { 1212 return toUChar(0xFF & (w32 >> 16)); 1213 } 1214 static inline UChar sel8x4_1 ( UInt w32 ) { 1215 return toUChar(0xFF & (w32 >> 8)); 1216 } 1217 static inline UChar sel8x4_0 ( UInt w32 ) { 1218 return toUChar(0xFF & (w32 >> 0)); 1219 } 1220 1221 1222 /* ----------------------------------------------------- */ 1223 /* More externally visible functions. These simply 1224 implement the corresponding IR primops. */ 1225 /* ----------------------------------------------------- */ 1226 1227 /* ------ 16x2 ------ */ 1228 1229 UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy ) 1230 { 1231 return mk16x2( sel16x2_1(xx) + sel16x2_1(yy), 1232 sel16x2_0(xx) + sel16x2_0(yy) ); 1233 } 1234 1235 UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy ) 1236 { 1237 return mk16x2( sel16x2_1(xx) - sel16x2_1(yy), 1238 sel16x2_0(xx) - sel16x2_0(yy) ); 1239 } 1240 1241 UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy ) 1242 { 1243 return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ), 1244 hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) ); 1245 } 1246 1247 UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy ) 1248 { 1249 return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ), 1250 hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) ); 1251 } 1252 1253 UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy ) 1254 { 1255 return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ), 1256 hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) ); 1257 } 1258 1259 UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy ) 1260 { 1261 return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ), 1262 hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) ); 1263 } 1264 1265 UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy ) 1266 { 1267 return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ), 1268 qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) ); 1269 } 1270 1271 UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy ) 1272 { 1273 return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ), 1274 qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) ); 1275 } 1276 1277 UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy ) 1278 { 1279 return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ), 1280 qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) ); 1281 } 1282 1283 UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy ) 1284 { 1285 return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ), 1286 qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) ); 1287 } 1288 1289 /* ------ 8x4 ------ */ 1290 1291 UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy ) 1292 { 1293 return mk8x4( 1294 sel8x4_3(xx) + sel8x4_3(yy), 1295 sel8x4_2(xx) + sel8x4_2(yy), 1296 sel8x4_1(xx) + sel8x4_1(yy), 1297 sel8x4_0(xx) + sel8x4_0(yy) 1298 ); 1299 } 1300 1301 UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy ) 1302 { 1303 return mk8x4( 1304 sel8x4_3(xx) - sel8x4_3(yy), 1305 sel8x4_2(xx) - sel8x4_2(yy), 1306 sel8x4_1(xx) - sel8x4_1(yy), 1307 sel8x4_0(xx) - sel8x4_0(yy) 1308 ); 1309 } 1310 1311 UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy ) 1312 { 1313 return mk8x4( 1314 hadd8U( sel8x4_3(xx), sel8x4_3(yy) ), 1315 hadd8U( sel8x4_2(xx), sel8x4_2(yy) ), 1316 hadd8U( sel8x4_1(xx), sel8x4_1(yy) ), 1317 hadd8U( sel8x4_0(xx), sel8x4_0(yy) ) 1318 ); 1319 } 1320 1321 UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy ) 1322 { 1323 return mk8x4( 1324 hadd8S( sel8x4_3(xx), sel8x4_3(yy) ), 1325 hadd8S( sel8x4_2(xx), sel8x4_2(yy) ), 1326 hadd8S( sel8x4_1(xx), sel8x4_1(yy) ), 1327 hadd8S( sel8x4_0(xx), sel8x4_0(yy) ) 1328 ); 1329 } 1330 1331 UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy ) 1332 { 1333 return mk8x4( 1334 hsub8U( sel8x4_3(xx), sel8x4_3(yy) ), 1335 hsub8U( sel8x4_2(xx), sel8x4_2(yy) ), 1336 hsub8U( sel8x4_1(xx), sel8x4_1(yy) ), 1337 hsub8U( sel8x4_0(xx), sel8x4_0(yy) ) 1338 ); 1339 } 1340 1341 UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy ) 1342 { 1343 return mk8x4( 1344 hsub8S( sel8x4_3(xx), sel8x4_3(yy) ), 1345 hsub8S( sel8x4_2(xx), sel8x4_2(yy) ), 1346 hsub8S( sel8x4_1(xx), sel8x4_1(yy) ), 1347 hsub8S( sel8x4_0(xx), sel8x4_0(yy) ) 1348 ); 1349 } 1350 1351 UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy ) 1352 { 1353 return mk8x4( 1354 qadd8U( sel8x4_3(xx), sel8x4_3(yy) ), 1355 qadd8U( sel8x4_2(xx), sel8x4_2(yy) ), 1356 qadd8U( sel8x4_1(xx), sel8x4_1(yy) ), 1357 qadd8U( sel8x4_0(xx), sel8x4_0(yy) ) 1358 ); 1359 } 1360 1361 UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy ) 1362 { 1363 return mk8x4( 1364 qadd8S( sel8x4_3(xx), sel8x4_3(yy) ), 1365 qadd8S( sel8x4_2(xx), sel8x4_2(yy) ), 1366 qadd8S( sel8x4_1(xx), sel8x4_1(yy) ), 1367 qadd8S( sel8x4_0(xx), sel8x4_0(yy) ) 1368 ); 1369 } 1370 1371 UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy ) 1372 { 1373 return mk8x4( 1374 qsub8U( sel8x4_3(xx), sel8x4_3(yy) ), 1375 qsub8U( sel8x4_2(xx), sel8x4_2(yy) ), 1376 qsub8U( sel8x4_1(xx), sel8x4_1(yy) ), 1377 qsub8U( sel8x4_0(xx), sel8x4_0(yy) ) 1378 ); 1379 } 1380 1381 UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy ) 1382 { 1383 return mk8x4( 1384 qsub8S( sel8x4_3(xx), sel8x4_3(yy) ), 1385 qsub8S( sel8x4_2(xx), sel8x4_2(yy) ), 1386 qsub8S( sel8x4_1(xx), sel8x4_1(yy) ), 1387 qsub8S( sel8x4_0(xx), sel8x4_0(yy) ) 1388 ); 1389 } 1390 1391 UInt h_generic_calc_CmpNEZ16x2 ( UInt xx ) 1392 { 1393 return mk16x2( 1394 cmpnez16( sel16x2_1(xx) ), 1395 cmpnez16( sel16x2_0(xx) ) 1396 ); 1397 } 1398 1399 UInt h_generic_calc_CmpNEZ8x4 ( UInt xx ) 1400 { 1401 return mk8x4( 1402 cmpnez8( sel8x4_3(xx) ), 1403 cmpnez8( sel8x4_2(xx) ), 1404 cmpnez8( sel8x4_1(xx) ), 1405 cmpnez8( sel8x4_0(xx) ) 1406 ); 1407 } 1408 1409 UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy ) 1410 { 1411 return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) ) 1412 + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) ) 1413 + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) ) 1414 + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) ); 1415 } 1416 1417 UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy ) 1418 { 1419 return qadd32S( xx, yy ); 1420 } 1421 1422 UInt h_generic_calc_QSub32S ( UInt xx, UInt yy ) 1423 { 1424 return qsub32S( xx, yy ); 1425 } 1426 1427 1428 /*------------------------------------------------------------------*/ 1429 /* Decimal Floating Point (DFP) externally visible helper functions */ 1430 /* that implement Iop_BCDtoDPB and Iop_DPBtoBCD */ 1431 /*------------------------------------------------------------------*/ 1432 1433 #define NOT( x ) ( ( ( x ) == 0) ? 1 : 0) 1434 #define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) ) 1435 #define PUT( x, y ) ( ( x )<< ( y ) ) 1436 1437 static ULong dpb_to_bcd( ULong chunk ) 1438 { 1439 Short a, b, c, d, e, f, g, h, i, j, k, m; 1440 Short p, q, r, s, t, u, v, w, x, y; 1441 ULong value; 1442 1443 /* convert 10 bit densely packed BCD to BCD */ 1444 p = GET( chunk, 9 ); 1445 q = GET( chunk, 8 ); 1446 r = GET( chunk, 7 ); 1447 s = GET( chunk, 6 ); 1448 t = GET( chunk, 5 ); 1449 u = GET( chunk, 4 ); 1450 v = GET( chunk, 3 ); 1451 w = GET( chunk, 2 ); 1452 x = GET( chunk, 1 ); 1453 y = GET( chunk, 0 ); 1454 1455 /* The BCD bit values are given by the following boolean equations.*/ 1456 a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) ); 1457 b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) ); 1458 c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) ); 1459 d = r; 1460 e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w ); 1461 f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) ); 1462 g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) ); 1463 h = u; 1464 i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) ); 1465 j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x ) 1466 | ( p & w & NOT(x) & v ) | ( w & NOT(v) ); 1467 k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x ) 1468 | ( q & v & w & NOT(x) ) | ( x & NOT(v) ); 1469 m = y; 1470 1471 value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7) 1472 | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2) 1473 | PUT(k, 1) | PUT(m, 0); 1474 return value; 1475 } 1476 1477 static ULong bcd_to_dpb( ULong chunk ) 1478 { 1479 Short a, b, c, d, e, f, g, h, i, j, k, m; 1480 Short p, q, r, s, t, u, v, w, x, y; 1481 ULong value; 1482 /* Convert a 3 digit BCD value to a 10 bit Densely Packed Binary (DPD) value 1483 The boolean equations to calculate the value of each of the DPD bit 1484 is given in Appendix B of Book 1: Power ISA User Instruction set. The 1485 bits for the DPD number are [abcdefghijkm]. The bits for the BCD value 1486 are [pqrstuvwxy]. The boolean logic equations in psuedo C code are: 1487 */ 1488 a = GET( chunk, 11 ); 1489 b = GET( chunk, 10 ); 1490 c = GET( chunk, 9 ); 1491 d = GET( chunk, 8 ); 1492 e = GET( chunk, 7 ); 1493 f = GET( chunk, 6 ); 1494 g = GET( chunk, 5 ); 1495 h = GET( chunk, 4 ); 1496 i = GET( chunk, 3 ); 1497 j = GET( chunk, 2 ); 1498 k = GET( chunk, 1 ); 1499 m = GET( chunk, 0 ); 1500 1501 p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) ); 1502 q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) ); 1503 r = d; 1504 s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) ) 1505 | ( f & NOT(a) & NOT(e) ) | ( e & i ); 1506 t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) ) 1507 | ( g & NOT(a) & NOT(e) ) | ( a & i ); 1508 u = h; 1509 v = a | e | i; 1510 w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a; 1511 x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e; 1512 y = m; 1513 1514 value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5) 1515 | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y; 1516 1517 return value; 1518 } 1519 1520 ULong h_calc_DPBtoBCD( ULong dpb ) 1521 { 1522 ULong result, chunk; 1523 Int i; 1524 1525 result = 0; 1526 1527 for (i = 0; i < 5; i++) { 1528 chunk = dpb >> ( 4 - i ) * 10; 1529 result = result << 12; 1530 result |= dpb_to_bcd( chunk & 0x3FF ); 1531 } 1532 return result; 1533 } 1534 1535 ULong h_calc_BCDtoDPB( ULong bcd ) 1536 { 1537 ULong result, chunk; 1538 Int i; 1539 1540 result = 0; 1541 1542 for (i = 0; i < 5; i++) { 1543 chunk = bcd >> ( 4 - i ) * 12; 1544 result = result << 10; 1545 result |= bcd_to_dpb( chunk & 0xFFF ); 1546 } 1547 return result; 1548 } 1549 #undef NOT 1550 #undef GET 1551 #undef PUT 1552 1553 1554 /* ----------------------------------------------------- */ 1555 /* Signed and unsigned integer division, that behave like 1556 the ARMv7 UDIV ansd SDIV instructions. 1557 1558 sdiv32 also behaves like 64-bit v8 SDIV on w-regs. 1559 udiv32 also behaves like 64-bit v8 UDIV on w-regs. 1560 */ 1561 /* ----------------------------------------------------- */ 1562 1563 UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y ) 1564 { 1565 // Division by zero --> zero 1566 if (UNLIKELY(y == 0)) return 0; 1567 // C requires rounding towards zero, which is also what we need. 1568 return x / y; 1569 } 1570 1571 ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y ) 1572 { 1573 // Division by zero --> zero 1574 if (UNLIKELY(y == 0)) return 0; 1575 // C requires rounding towards zero, which is also what we need. 1576 return x / y; 1577 } 1578 1579 Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y ) 1580 { 1581 // Division by zero --> zero 1582 if (UNLIKELY(y == 0)) return 0; 1583 // The single case that produces an unrepresentable result 1584 if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000) 1585 && ((UInt)y) == ((UInt)0xFFFFFFFF) )) 1586 return (Int)(UInt)0x80000000; 1587 // Else return the result rounded towards zero. C89 says 1588 // this is implementation defined (in the signed case), but gcc 1589 // promises to round towards zero. Nevertheless, at startup, 1590 // in main_main.c, do a check for that. 1591 return x / y; 1592 } 1593 1594 Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y ) 1595 { 1596 // Division by zero --> zero 1597 if (UNLIKELY(y == 0)) return 0; 1598 // The single case that produces an unrepresentable result 1599 if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL ) 1600 && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) )) 1601 return (Long)(ULong)0x8000000000000000ULL; 1602 // Else return the result rounded towards zero. C89 says 1603 // this is implementation defined (in the signed case), but gcc 1604 // promises to round towards zero. Nevertheless, at startup, 1605 // in main_main.c, do a check for that. 1606 return x / y; 1607 } 1608 1609 1610 /*---------------------------------------------------------------*/ 1611 /*--- end host_generic_simd64.c ---*/ 1612 /*---------------------------------------------------------------*/ 1613