/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"


/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}
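/* Worked example (illustrative): lane N of a tuple is the Nth group
   counting from the least significant end, so lane 3 of a 16x4 vector
   is the most significant one:
      mk16x4(0x0123, 0x4567, 0x89AB, 0xCDEF) == 0x0123456789ABCDEFULL
      sel16x4_2(0x0123456789ABCDEFULL)       == 0x4567
   In general sel16x4_N(mk16x4(w3,w2,w1,w0)) == wN for N in 0..3. */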
/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim = 0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
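/* Illustrative values for the saturating scalar helpers: the sum or
   difference is computed in a wider type and then clamped to the lane
   range, rather than wrapping as plain C arithmetic would:
      qadd8S (100, 100)       == 127     (200 clamps to the Char max)
      qsub8U (10, 20)         == 0       (negative results clamp to 0)
      qadd16U(0xFF00, 0x0200) == 0xFFFF  (0x10100 clamps to 0xFFFF)  */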
static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
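/* The averaging helpers round upwards, e.g. avg8U(1, 2) == 2, since
   they compute (x + y + 1) >> 1 in a wider type; this matches the
   round-to-nearest-up behaviour of the usual SIMD average
   instructions. */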
static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}
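/* The halving helpers return the full-precision sum or difference
   shifted right by one, so they cannot overflow.  The signed variants
   round towards negative infinity (arithmetic shift), e.g.
   hadd16S(-1, -2) == -2, while hadd16U(0xFFFF, 0xFFFF) == 0xFFFF. */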
/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
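/* Illustrative contrast between wrapping and saturating addition on a
   single 8-bit lane (the other lanes are zero):
      h_generic_calc_Add8x8 (0x00000000000000FFULL, 1ULL) == 0x0ULL
      h_generic_calc_QAdd8Ux8(0x00000000000000FFULL, 1ULL) == 0xFFULL
   Add8x8 wraps each lane modulo 256; QAdd8Ux8 clamps at 0xFF. */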
/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}
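/* Worked example for the high-half multiplies above: the full 32-bit
   product is formed per lane and only its top 16 bits are kept, e.g.
   mulhi16S(0x4000, 0x4000) == 0x1000, because
   0x4000 * 0x4000 == 0x10000000 and 0x10000000 >> 16 == 0x1000. */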
/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}
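/* Lane ordering of the binary narrowing ops above: the narrowed lanes
   of aa end up above those of bb.  Illustratively,
      h_generic_calc_QNarrowBin32Sto16Sx4(0x00012345FFFFFFFFULL,
                                          0x0000000100010000ULL)
   == 0x7FFFFFFF00017FFFULL,
   since 0x00012345 and 0x00010000 exceed 32767 and saturate to 0x7FFF,
   while 0xFFFFFFFF (-1) narrows to 0xFFFF and 1 to 0x0001. */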
/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home: a lane-wise table lookup.
   Result byte i is aa's byte selected by the low 3 bits of bb's
   byte i. */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Given the semantics
   of these primops (ShlN16x4, etc), it is in fact an error if we
   are ever given an out-of-range shift amount. */
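/* Worked example: SarN16x4 (below) shifts each lane arithmetically,
   so sign bits are replicated:
      h_generic_calc_SarN16x4(0xFF00000880000010ULL, 4)
   == 0xFFF00000F8000001ULL
   Lane 0xFF00 (-256) becomes 0xFFF0 (-16), lane 0x8000 (-32768)
   becomes 0xF800 (-2048), and the non-negative lanes 0x0008 and
   0x0010 become 0x0000 and 0x0001 respectively. */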
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}
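/* Unlike a plain 64-bit shift, the lane shifts never carry across lane
   boundaries, e.g.
      h_generic_calc_ShlN8x8(0x0080008000800080ULL, 1) == 0x0ULL
   whereas (0x0080008000800080ULL << 1) == 0x0100010001000100ULL. */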
/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
   return (((UInt)w1) << 16) | ((UInt)w0);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}
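/* Worked example (illustrative): mk8x4(0xDE, 0xAD, 0xBE, 0xEF) ==
   0xDEADBEEF and sel8x4_2(0xDEADBEEF) == 0xAD; as with the 64-bit
   variants, lane numbering starts at the least significant end. */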
/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}
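/* Halving versus saturating on the same inputs (illustrative):
      hadd8S(0x70, 0x70) == 0x70   ((112 + 112) >> 1 == 112)
      qadd8S(0x70, 0x70) == 0x7F   (224 clamps to the Char maximum)
   Halving keeps full precision by discarding the low bit; saturation
   keeps the low bit but clamps the range. */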
UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}
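/* Worked example for the sum-of-absolute-differences helper above:
      h_generic_calc_Sad8Ux4(0x01020304, 0x04030201)
   == |1-4| + |2-3| + |3-2| + |4-1| == 8. */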
/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 10-bit densely packed decimal (DPD) group to its
      12-bit BCD equivalent (three 4-bit digits). */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
       | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
       | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
           | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
           | PUT(k, 1) | PUT(m, 0);
   return value;
}
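/* Worked example: when all three digits are in 0..7 the DPD encoding
   degenerates to the digits' low three bits packed as pqr stu 0 wxy,
   so dpb_to_bcd(0x0A3) == 0x123 (0b0010100011 decodes to digits 1, 2,
   3), and conversely bcd_to_dpb (below) maps 0x123 back to 0x0A3. */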
ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3 digit BCD value to a 10 bit Densely Packed Decimal
      (DPD) value.  The boolean equations to calculate the value of
      each of the DPD bits are given in Appendix B of Book 1: Power
      ISA User Instruction Set.  The bits for the BCD value are
      [abcdefghijkm]; the bits for the DPD value are [pqrstuvwxy].
      The boolean logic equations in pseudo-C code are:
   */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
       | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
       | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
           | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;

   return value;
}

ULong h_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   /* Expand five 10-bit DPD groups (50 bits) into five 12-bit BCD
      groups (60 bits), most significant group first. */
   for (i = 0; i < 5; i++) {
      chunk  = dpb >> ( 4 - i ) * 10;
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   /* Compress five 12-bit BCD groups (60 bits) into five 10-bit DPD
      groups (50 bits), most significant group first. */
   for (i = 0; i < 5; i++) {
      chunk  = bcd >> ( 4 - i ) * 12;
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT

/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/