/*---------------------------------------------------------------*/
/*--- begin                              host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2010 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"


/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}
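
/* Illustrative example: lane 3 is the most significant 16 bits and
   lane 0 the least, so
      mk16x4(0x0123, 0x4567, 0x89AB, 0xCDEF) == 0x0123456789ABCDEFULL,
   and for that value sel16x4_3(..) == 0x0123 and sel16x4_0(..) == 0xCDEF.
   The 32x2 helpers above and the 8x8 helpers below use the same
   most-significant-lane-first packing order. */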

/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 = (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8) | (((UInt)w4) << 0);
   UInt lo32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}


/* Scalar helpers. */

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
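
/* Illustrative examples of the saturating helpers above:
   qadd16S(30000, 10000) clamps to 32767, qadd8U(200, 100) clamps to
   255, and qsub8U(10, 20) clamps to 0.  mulhi16S/mulhi16U return the
   high 16 bits of the widened product, e.g.
   mulhi16U(0xFFFF, 0xFFFF) == 0xFFFE. */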

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16 ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8 ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Uto8 ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
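
/* Illustrative narrowing examples: qnarrow32Sto16(100000) == 32767 and
   qnarrow32Sto16(-100000) == -32768.  Note that qnarrow16Uto8 treats
   its 16-bit input as signed and clamps it to [0,255], so
   qnarrow16Uto8(0x8000) == 0 while qnarrow16Uto8(0x0123) == 0xFF. */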

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}

static inline Short max16S ( Short xx, Short yy )
{
   return toUShort((xx > yy) ? xx : yy);
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return toUShort((xx < yy) ? xx : yy);
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu ? xxu - yyu : yyu - xxu;
}
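
/* Illustrative halving-add examples: the intermediate sum is formed at
   32-bit width, so it cannot wrap; hadd16U(0xFFFF, 0x0001) == 0x8000,
   and hadd8S(-1, -2) == -2 (the signed variants use an arithmetic
   shift, so they round towards minus infinity). */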

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
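
/* Illustrative example of plain vs. saturating lane addition:
   h_generic_calc_Add8x8(0xFFULL, 0x01ULL) wraps the low byte to 0x00,
   whereas h_generic_calc_QAdd8Ux8(0xFFULL, 0x01ULL) clamps it to 0xFF. */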

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}
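
/* Illustrative example: for a lane product 0x4000 * 0x4000 = 0x10000000,
   h_generic_calc_Mul16x4 keeps only the low 16 bits of the product
   (0x0000) while h_generic_calc_MulHi16Sx4 keeps the high 16 bits
   (0x1000). */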

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrow32Sx2 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16(d),
             qnarrow32Sto16(c),
             qnarrow32Sto16(b),
             qnarrow32Sto16(a)
          );
}

ULong h_generic_calc_QNarrow16Sx4 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8(h),
             qnarrow16Sto8(g),
             qnarrow16Sto8(f),
             qnarrow16Sto8(e),
             qnarrow16Sto8(d),
             qnarrow16Sto8(c),
             qnarrow16Sto8(b),
             qnarrow16Sto8(a)
          );
}

ULong h_generic_calc_QNarrow16Ux4 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Uto8(h),
             qnarrow16Uto8(g),
             qnarrow16Uto8(f),
             qnarrow16Uto8(e),
             qnarrow16Uto8(d),
             qnarrow16Uto8(c),
             qnarrow16Uto8(b),
             qnarrow16Uto8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
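
/* Illustrative example: each byte of bb selects, via its low 3 bits, a
   byte of aa, so
   h_generic_calc_Perm8x8(0x1122334455667788ULL, 0x0001020304050607ULL)
   == 0x8877665544332211ULL, i.e. a byte reversal of aa. */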

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  In fact, given the
   semantics of these primops (ShlN16x4, etc) it is an error if in
   fact we are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}
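
/* Illustrative example: for a 16-bit lane holding 0x8000 and nn == 4,
   h_generic_calc_SarN16x4 produces 0xF800 (the sign bit is replicated)
   while h_generic_calc_ShrN16x4 produces 0x0800. */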

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}
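
/* Illustrative example: the averaging ops round upwards, so
   avg8U(1, 2) == 2, whereas the halving ops truncate, so
   hadd8U(1, 2) == 1.  avg8U(0xFF, 0xFF) == 0xFF, since the sum is
   formed at 32-bit width. */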

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   return (((UInt)w1) << 16) | ((UInt)w2);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 = (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8) | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}
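
/* Illustrative example of the 32-bit (16x2) variants:
   h_generic_calc_Add16x2(0x00010002, 0x00030004) == 0x00040006, and
   h_generic_calc_QAdd16Ux2(0xFFFF0000, 0x00010000) == 0xFFFF0000,
   since the upper halfword saturates. */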

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}


/*---------------------------------------------------------------*/
/*--- end                                host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/