/*---------------------------------------------------------------*/
/*--- begin                              host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2011 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"


/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}
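/* Worked example (illustrative values only, not part of the original
   helpers): arguments run from the highest lane downwards, so

      mk16x4(0x0123, 0x4567, 0x89AB, 0xCDEF) == 0x0123456789ABCDEFULL

   and sel16x4_2 of that value returns 0x4567.  mk32x2 follows the
   same convention, with w1 forming the high 32 bits. */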
/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
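/* Worked examples for the scalar helpers (illustrative values only):
   qadd8S(100, 100) clamps 200 to the Char maximum 127, and
   qsub8U(10, 20) clamps -10 to 0.  mulhi16S(16384, 16384) forms the
   full product 0x10000000 and returns its top half, 0x1000, whereas
   mul16 on the same operands returns the bottom half, 0. */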
static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
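/* For instance (illustrative values): qnarrow32Sto16S(0x00012345)
   sees the signed value 74565, which exceeds 32767, so it returns
   0x7FFF, whereas narrow32to16 of the same input simply truncates
   to 0x2345.  The averaging helpers round upwards: avg8U(1, 2) == 2. */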
static inline Short max16S ( Short xx, Short yy )
{
   return (xx > yy) ? xx : yy;
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return (xx < yy) ? xx : yy;
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UInt)xx;
   UInt yyu = (UInt)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
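/* Usage sketch for the signed saturating add (hypothetical operand
   values): lane 3 of the first operand holds 0x7FFF, so adding 1
   saturates instead of wrapping:

      h_generic_calc_QAdd16Sx4(0x7FFF000000000000ULL,
                               0x0001000000000000ULL)
         == 0x7FFF000000000000ULL
*/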
ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}
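/* Mul16x4 keeps only the low 16 bits of each lane's product
   (illustrative values): 0x0100 * 0x0100 == 0x10000, whose low half
   is zero, so

      h_generic_calc_Mul16x4(0x0100010001000100ULL,
                             0x0100010001000100ULL) == 0ULL
*/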
ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}
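/* The comparisons produce an all-ones or all-zeroes mask per lane
   (illustrative values):

      h_generic_calc_CmpEQ16x4(0x0001000200030004ULL,
                               0x0001FFFF0003FFFFULL)
         == 0xFFFF0000FFFF0000ULL
*/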
/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}
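/* The HI/LO interleaves alternate lanes of the two operands, highest
   lanes first (illustrative values):

      h_generic_calc_InterleaveHI16x4(0xAAAABBBBCCCCDDDDULL,
                                      0x1111222233334444ULL)
         == 0xAAAA1111BBBB2222ULL
*/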
/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}

/* ------------ Shifting ------------ */
/* These primops are undefined if the shift amount equals or exceeds
   the lane width, so the shift amount is masked here to keep the
   scalar shifts in range.  Given the semantics of these primops
   (ShlN16x4, etc), it is in any case an error for us ever to be
   given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}
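/* SarN* shifts in copies of each lane's sign bit (illustrative
   values): lane 3 below holds 0x8000, i.e. -32768 as a Short, so an
   arithmetic shift right by 4 gives 0xF800, i.e. -2048:

      h_generic_calc_SarN16x4(0x8000000000000000ULL, 4)
         == 0xF800000000000000ULL
*/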
/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
   return (((UInt)w1) << 16) | ((UInt)w0);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}
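/* As with the 64-bit tuples, arguments run from the highest lane
   downwards (worked example): mk8x4(0xDE, 0xAD, 0xBE, 0xEF) yields
   0xDEADBEEF, and sel8x4_1 of that value returns 0xBE. */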
/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}
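/* The halving ops form the full 32-bit sum or difference before
   shifting right by one, so the intermediate never wraps
   (illustrative values): h_generic_calc_HAdd16Sx2(0x0003FFFFU,
   0x0005FFFDU) adds 3+5 and (-1)+(-3) lane-wise, halves each
   result, and yields 0x0004FFFEU. */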
UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}


/*---------------------------------------------------------------*/
/*--- end                                host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/