
/*---------------------------------------------------------------*/
/*--- begin                             host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info (at) open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Generic helper functions for doing 64-bit SIMD arithmetic in cases
   where the instruction selectors cannot generate code in-line.
   These are purely back-end entities and cannot be seen/referenced
   from IR. */

#include "libvex_basictypes.h"
#include "host_generic_simd64.h"



/* Tuple/select functions for 32x2 vectors. */

static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
   return (((ULong)w1) << 32) | ((ULong)w0);
}

static inline UInt sel32x2_1 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64 >> 32);
}
static inline UInt sel32x2_0 ( ULong w64 ) {
   return 0xFFFFFFFF & toUInt(w64);
}
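
/* Example: lane 1 is the high 32 bits and lane 0 the low 32 bits, so
   mk32x2(0xDEADBEEF, 0xCAFEBABE) == 0xDEADBEEFCAFEBABEULL, and
   sel32x2_1/sel32x2_0 recover the two halves again. */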


/* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
   with 64-bit shifts so we give it a hand. */

static inline ULong mk16x4 ( UShort w3, UShort w2,
                             UShort w1, UShort w0 ) {
   UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
   UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
   return mk32x2(hi32, lo32);
}

static inline UShort sel16x4_3 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & (hi32 >> 16));
}
static inline UShort sel16x4_2 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUShort(0xFFFF & hi32);
}
static inline UShort sel16x4_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & (lo32 >> 16));
}
static inline UShort sel16x4_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUShort(0xFFFF & lo32);
}


/* Tuple/select functions for 8x8 vectors. */

static inline ULong mk8x8 ( UChar w7, UChar w6,
                            UChar w5, UChar w4,
                            UChar w3, UChar w2,
                            UChar w1, UChar w0 ) {
   UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
               | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
   UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return mk32x2(hi32, lo32);
}

static inline UChar sel8x8_7 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 24));
}
static inline UChar sel8x8_6 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 16));
}
static inline UChar sel8x8_5 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 8));
}
static inline UChar sel8x8_4 ( ULong w64 ) {
   UInt hi32 = toUInt(w64 >> 32);
   return toUChar(0xFF & (hi32 >> 0));
}
static inline UChar sel8x8_3 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 24));
}
static inline UChar sel8x8_2 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 16));
}
static inline UChar sel8x8_1 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 8));
}
static inline UChar sel8x8_0 ( ULong w64 ) {
   UInt lo32 = (UInt)w64;
   return toUChar(0xFF & (lo32 >> 0));
}

static inline UChar index8x8 ( ULong w64, UChar ix ) {
   ix &= 7;
   return toUChar((w64 >> (8*ix)) & 0xFF);
}
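
/* Example: index8x8 views w64 as eight byte lanes and selects lane
   (ix & 7).  For w64 == 0x7766554433221100ULL, index8x8(w64, 3) ==
   0x33, and index8x8(w64, 11) == 0x33 too, since only the low three
   bits of ix are used. */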


/* Scalar helpers. */

static inline Int qadd32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) + ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qadd16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qadd8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) + ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qadd16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qadd8U ( UChar xx, UChar yy )
{
   UInt t = ((UInt)xx) + ((UInt)yy);
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}
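
/* Examples: the widen-clamp-truncate pattern above saturates instead
   of wrapping: qadd8S(100, 100) == 127, qadd8S(-100, -100) == -128,
   qadd8U(200, 100) == 255. */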

static inline Int qsub32S ( Int xx, Int yy )
{
   Long t = ((Long)xx) - ((Long)yy);
   const Long loLim = -0x80000000LL;
   const Long hiLim =  0x7FFFFFFFLL;
   if (t < loLim) t = loLim;
   if (t > hiLim) t = hiLim;
   return (Int)t;
}

static inline Short qsub16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -32768) t = -32768;
   if (t > 32767)  t = 32767;
   return (Short)t;
}

static inline Char qsub8S ( Char xx, Char yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < -128) t = -128;
   if (t > 127)  t = 127;
   return (Char)t;
}

static inline UShort qsub16U ( UShort xx, UShort yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)      t = 0;
   if (t > 0xFFFF) t = 0xFFFF;
   return (UShort)t;
}

static inline UChar qsub8U ( UChar xx, UChar yy )
{
   Int t = ((Int)xx) - ((Int)yy);
   if (t < 0)    t = 0;
   if (t > 0xFF) t = 0xFF;
   return (UChar)t;
}

static inline Short mul16 ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Short)t;
}

static inline Int mul32 ( Int xx, Int yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   return (Int)t;
}

static inline Short mulhi16S ( Short xx, Short yy )
{
   Int t = ((Int)xx) * ((Int)yy);
   t >>=/*s*/ 16;
   return (Short)t;
}

static inline UShort mulhi16U ( UShort xx, UShort yy )
{
   UInt t = ((UInt)xx) * ((UInt)yy);
   t >>=/*u*/ 16;
   return (UShort)t;
}
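
/* Example: mul16/mul32 return the low half of the widened product,
   mulhi16S/mulhi16U the high half: mulhi16S(0x4000, 0x4000) == 0x1000,
   since 16384 * 16384 == 0x10000000. */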

static inline UInt cmpeq32 ( UInt xx, UInt yy )
{
   return xx==yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpeq16 ( UShort xx, UShort yy )
{
   return toUShort(xx==yy ? 0xFFFF : 0);
}

static inline UChar cmpeq8 ( UChar xx, UChar yy )
{
   return toUChar(xx==yy ? 0xFF : 0);
}

static inline UInt cmpgt32S ( Int xx, Int yy )
{
   return xx>yy ? 0xFFFFFFFF : 0;
}

static inline UShort cmpgt16S ( Short xx, Short yy )
{
   return toUShort(xx>yy ? 0xFFFF : 0);
}

static inline UChar cmpgt8S ( Char xx, Char yy )
{
   return toUChar(xx>yy ? 0xFF : 0);
}

static inline UInt cmpnez32 ( UInt xx )
{
   return xx==0 ? 0 : 0xFFFFFFFF;
}

static inline UShort cmpnez16 ( UShort xx )
{
   return toUShort(xx==0 ? 0 : 0xFFFF);
}

static inline UChar cmpnez8 ( UChar xx )
{
   return toUChar(xx==0 ? 0 : 0xFF);
}

static inline Short qnarrow32Sto16S ( UInt xx0 )
{
   Int xx = (Int)xx0;
   if (xx < -32768) xx = -32768;
   if (xx > 32767)  xx = 32767;
   return (Short)xx;
}

static inline Char qnarrow16Sto8S ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < -128) xx = -128;
   if (xx > 127)  xx = 127;
   return (Char)xx;
}

static inline UChar qnarrow16Sto8U ( UShort xx0 )
{
   Short xx = (Short)xx0;
   if (xx < 0)   xx = 0;
   if (xx > 255) xx = 255;
   return (UChar)xx;
}
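
/* Examples: qnarrow16Sto8S(0x012C) == 127 (0x012C is 300 as a Short)
   and qnarrow16Sto8S(0xFED4) == -128 (0xFED4 is -300); the unsigned
   variant also clamps negative inputs, so qnarrow16Sto8U(0xFFFF) == 0
   (0xFFFF is -1). */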

static inline UShort narrow32to16 ( UInt xx )
{
   return (UShort)xx;
}

static inline UChar narrow16to8 ( UShort xx )
{
   return (UChar)xx;
}

/* shifts: we don't care about out-of-range ones, since
   that is dealt with at a higher level. */

static inline UChar shl8 ( UChar v, UInt n )
{
   return toUChar(v << n);
}

static inline UChar sar8 ( UChar v, UInt n )
{
   return toUChar(((Char)v) >> n);
}

static inline UShort shl16 ( UShort v, UInt n )
{
   return toUShort(v << n);
}

static inline UShort shr16 ( UShort v, UInt n )
{
   return toUShort((((UShort)v) >> n));
}

static inline UShort sar16 ( UShort v, UInt n )
{
   return toUShort(((Short)v) >> n);
}

static inline UInt shl32 ( UInt v, UInt n )
{
   return v << n;
}

static inline UInt shr32 ( UInt v, UInt n )
{
   return (((UInt)v) >> n);
}

static inline UInt sar32 ( UInt v, UInt n )
{
   return ((Int)v) >> n;
}

static inline UChar avg8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UChar)r;
}

static inline UShort avg16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi + 1) >> 1;
   return (UShort)r;
}
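
/* Example: the "+ 1" makes these round-to-nearest-up averages, as in
   the x86 PAVGB/PAVGW instructions: avg8U(1, 2) == 2, not 1. */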

static inline Short max16S ( Short xx, Short yy )
{
   return (xx > yy) ? xx : yy;
}

static inline UChar max8U ( UChar xx, UChar yy )
{
   return toUChar((xx > yy) ? xx : yy);
}

static inline Short min16S ( Short xx, Short yy )
{
   return (xx < yy) ? xx : yy;
}

static inline UChar min8U ( UChar xx, UChar yy )
{
   return toUChar((xx < yy) ? xx : yy);
}

static inline UShort hadd16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UShort)r;
}

static inline Short hadd16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Short)r;
}

static inline UShort hsub16U ( UShort xx, UShort yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UShort)r;
}

static inline Short hsub16S ( Short xx, Short yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Short)r;
}

static inline UChar hadd8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi + yyi) >> 1;
   return (UChar)r;
}

static inline Char hadd8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi + yyi) >> 1;
   return (Char)r;
}

static inline UChar hsub8U ( UChar xx, UChar yy )
{
   UInt xxi = (UInt)xx;
   UInt yyi = (UInt)yy;
   UInt r   = (xxi - yyi) >> 1;
   return (UChar)r;
}

static inline Char hsub8S ( Char xx, Char yy )
{
   Int xxi = (Int)xx;
   Int yyi = (Int)yy;
   Int r   = (xxi - yyi) >> 1;
   return (Char)r;
}
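
/* Examples: the halving ops widen, add or subtract, then shift right
   by one, so they cannot overflow: hadd8U(255, 255) == 255, and
   hadd8S(-128, -128) == -128 (the signed cases rely on arithmetic
   right shift of a negative Int). */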

static inline UInt absdiff8U ( UChar xx, UChar yy )
{
   UInt xxu = (UChar)xx;
   UInt yyu = (UChar)yy;
   return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
}

/* ----------------------------------------------------- */
/* Start of the externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------------ Normal addition ------------ */

ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) + sel32x2_1(yy),
             sel32x2_0(xx) + sel32x2_0(yy)
          );
}

ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
          );
}
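
/* Example: each lane wraps independently, so
   h_generic_calc_Add8x8(0x00000000000000FFULL, 0x0000000000000001ULL)
   == 0x0000000000000000ULL: lane 0 wraps from 0xFF to 0x00 and no
   carry propagates into lane 1. */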

/* ------------ Saturating addition ------------ */

ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Normal subtraction ------------ */

ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             sel32x2_1(xx) - sel32x2_1(yy),
             sel32x2_0(xx) - sel32x2_0(yy)
          );
}

ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
             toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
             toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
             toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
             toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
             toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
             toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
             toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
             toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
             toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
             toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
          );
}

/* ------------ Saturating subtraction ------------ */

ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
             qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
             qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
             qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
             qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
             qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
             qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
             qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
             qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
             qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
             qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ Multiplication ------------ */

ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mul16( sel16x4_3(xx), sel16x4_3(yy) ),
             mul16( sel16x4_2(xx), sel16x4_2(yy) ),
             mul16( sel16x4_1(xx), sel16x4_1(yy) ),
             mul16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             mul32( sel32x2_1(xx), sel32x2_1(yy) ),
             mul32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
             mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
             mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
             mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ Comparison ------------ */

ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
{
   return mk32x2(
             cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
             cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
          );
}

ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
             cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
             cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
             cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
{
   return mk8x8(
             cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
             cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
             cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
             cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
             cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
             cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
             cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
             cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
          );
}
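
/* Example: the comparisons yield an all-ones or all-zeroes mask per
   lane: h_generic_calc_CmpGT16Sx4(0x0001000000000000ULL, 0ULL) ==
   0xFFFF000000000000ULL. */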

ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
{
   return mk32x2(
             cmpnez32( sel32x2_1(xx) ),
             cmpnez32( sel32x2_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
{
   return mk16x4(
             cmpnez16( sel16x4_3(xx) ),
             cmpnez16( sel16x4_2(xx) ),
             cmpnez16( sel16x4_1(xx) ),
             cmpnez16( sel16x4_0(xx) )
          );
}

ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
{
   return mk8x8(
             cmpnez8( sel8x8_7(xx) ),
             cmpnez8( sel8x8_6(xx) ),
             cmpnez8( sel8x8_5(xx) ),
             cmpnez8( sel8x8_4(xx) ),
             cmpnez8( sel8x8_3(xx) ),
             cmpnez8( sel8x8_2(xx) ),
             cmpnez8( sel8x8_1(xx) ),
             cmpnez8( sel8x8_0(xx) )
          );
}

/* ------------ Saturating narrowing ------------ */

ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             qnarrow32Sto16S(d),
             qnarrow32Sto16S(c),
             qnarrow32Sto16S(b),
             qnarrow32Sto16S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8S(h),
             qnarrow16Sto8S(g),
             qnarrow16Sto8S(f),
             qnarrow16Sto8S(e),
             qnarrow16Sto8S(d),
             qnarrow16Sto8S(c),
             qnarrow16Sto8S(b),
             qnarrow16Sto8S(a)
          );
}

ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             qnarrow16Sto8U(h),
             qnarrow16Sto8U(g),
             qnarrow16Sto8U(f),
             qnarrow16Sto8U(e),
             qnarrow16Sto8U(d),
             qnarrow16Sto8U(c),
             qnarrow16Sto8U(b),
             qnarrow16Sto8U(a)
          );
}
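
/* Example: the QNarrowBin ops narrow aa into the high half of the
   result and bb into the low half, saturating each lane:
   h_generic_calc_QNarrowBin16Sto8Ux8(0x7FFF800000FF0001ULL,
                                      0x7FFF800000FF0001ULL)
   == 0xFF00FF01FF00FF01ULL. */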

/* ------------ Truncating narrowing ------------ */

ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
{
   UInt d = sel32x2_1(aa);
   UInt c = sel32x2_0(aa);
   UInt b = sel32x2_1(bb);
   UInt a = sel32x2_0(bb);
   return mk16x4(
             narrow32to16(d),
             narrow32to16(c),
             narrow32to16(b),
             narrow32to16(a)
          );
}

ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
{
   UShort h = sel16x4_3(aa);
   UShort g = sel16x4_2(aa);
   UShort f = sel16x4_1(aa);
   UShort e = sel16x4_0(aa);
   UShort d = sel16x4_3(bb);
   UShort c = sel16x4_2(bb);
   UShort b = sel16x4_1(bb);
   UShort a = sel16x4_0(bb);
   return mk8x8(
             narrow16to8(h),
             narrow16to8(g),
             narrow16to8(f),
             narrow16to8(e),
             narrow16to8(d),
             narrow16to8(c),
             narrow16to8(b),
             narrow16to8(a)
          );
}

/* ------------ Interleaving ------------ */

ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_7(aa),
             sel8x8_7(bb),
             sel8x8_6(aa),
             sel8x8_6(bb),
             sel8x8_5(aa),
             sel8x8_5(bb),
             sel8x8_4(aa),
             sel8x8_4(bb)
          );
}

ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             sel8x8_3(aa),
             sel8x8_3(bb),
             sel8x8_2(aa),
             sel8x8_2(bb),
             sel8x8_1(aa),
             sel8x8_1(bb),
             sel8x8_0(aa),
             sel8x8_0(bb)
          );
}
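
/* Example: interleaving alternates lanes of aa and bb:
   h_generic_calc_InterleaveLO8x8(0x0706050403020100ULL,
                                  0x0F0E0D0C0B0A0908ULL)
   == 0x030B020A01090008ULL. */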

ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_3(bb),
             sel16x4_2(aa),
             sel16x4_2(bb)
          );
}

ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_1(aa),
             sel16x4_1(bb),
             sel16x4_0(aa),
             sel16x4_0(bb)
          );
}

ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_1(aa),
             sel32x2_1(bb)
          );
}

ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
{
   return mk32x2(
             sel32x2_0(aa),
             sel32x2_0(bb)
          );
}

/* ------------ Concatenation ------------ */

ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_3(aa),
             sel16x4_1(aa),
             sel16x4_3(bb),
             sel16x4_1(bb)
          );
}

ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
{
   return mk16x4(
             sel16x4_2(aa),
             sel16x4_0(aa),
             sel16x4_2(bb),
             sel16x4_0(bb)
          );
}

/* misc hack looking for a proper home */
ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
{
   return mk8x8(
             index8x8(aa, sel8x8_7(bb)),
             index8x8(aa, sel8x8_6(bb)),
             index8x8(aa, sel8x8_5(bb)),
             index8x8(aa, sel8x8_4(bb)),
             index8x8(aa, sel8x8_3(bb)),
             index8x8(aa, sel8x8_2(bb)),
             index8x8(aa, sel8x8_1(bb)),
             index8x8(aa, sel8x8_0(bb))
          );
}
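
/* Example: each result byte of Perm8x8 is the aa byte indexed by the
   low three bits of the corresponding bb byte, so
   h_generic_calc_Perm8x8(0x0706050403020100ULL, 0x0001020304050607ULL)
   reverses the lanes of aa, giving 0x0001020304050607ULL. */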

/* ------------ Shifting ------------ */
/* Note that because these primops are undefined if the shift amount
   equals or exceeds the lane width, the shift amount is masked so
   that the scalar shifts are always in range.  Given the semantics
   of these primops (ShlN16x4, etc), it is in fact an error if we
   are ever given an out-of-range shift amount.
*/
ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shl32( sel32x2_1(xx), nn ),
             shl32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shl16( sel16x4_3(xx), nn ),
             shl16( sel16x4_2(xx), nn ),
             shl16( sel16x4_1(xx), nn ),
             shl16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             shl8( sel8x8_7(xx), nn ),
             shl8( sel8x8_6(xx), nn ),
             shl8( sel8x8_5(xx), nn ),
             shl8( sel8x8_4(xx), nn ),
             shl8( sel8x8_3(xx), nn ),
             shl8( sel8x8_2(xx), nn ),
             shl8( sel8x8_1(xx), nn ),
             shl8( sel8x8_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             shr32( sel32x2_1(xx), nn ),
             shr32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             shr16( sel16x4_3(xx), nn ),
             shr16( sel16x4_2(xx), nn ),
             shr16( sel16x4_1(xx), nn ),
             shr16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
{
   /* vassert(nn < 32); */
   nn &= 31;
   return mk32x2(
             sar32( sel32x2_1(xx), nn ),
             sar32( sel32x2_0(xx), nn )
          );
}

ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
{
   /* vassert(nn < 16); */
   nn &= 15;
   return mk16x4(
             sar16( sel16x4_3(xx), nn ),
             sar16( sel16x4_2(xx), nn ),
             sar16( sel16x4_1(xx), nn ),
             sar16( sel16x4_0(xx), nn )
          );
}

ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
{
   /* vassert(nn < 8); */
   nn &= 7;
   return mk8x8(
             sar8( sel8x8_7(xx), nn ),
             sar8( sel8x8_6(xx), nn ),
             sar8( sel8x8_5(xx), nn ),
             sar8( sel8x8_4(xx), nn ),
             sar8( sel8x8_3(xx), nn ),
             sar8( sel8x8_2(xx), nn ),
             sar8( sel8x8_1(xx), nn ),
             sar8( sel8x8_0(xx), nn )
          );
}
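
/* Example: SarN replicates each lane's sign bit:
   h_generic_calc_SarN16x4(0xFF00000000000000ULL, 8) ==
   0xFFFF000000000000ULL, whereas ShrN16x4 with the same arguments
   gives 0x00FF000000000000ULL. */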

/* ------------ Averaging ------------ */

ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
             avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
             avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
             avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
             avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
             avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
             avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
             avg8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
{
   return mk16x4(
             avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
             avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
             avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
             avg16U( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

/* ------------ max/min ------------ */

ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             max16S( sel16x4_3(xx), sel16x4_3(yy) ),
             max16S( sel16x4_2(xx), sel16x4_2(yy) ),
             max16S( sel16x4_1(xx), sel16x4_1(yy) ),
             max16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             max8U( sel8x8_7(xx), sel8x8_7(yy) ),
             max8U( sel8x8_6(xx), sel8x8_6(yy) ),
             max8U( sel8x8_5(xx), sel8x8_5(yy) ),
             max8U( sel8x8_4(xx), sel8x8_4(yy) ),
             max8U( sel8x8_3(xx), sel8x8_3(yy) ),
             max8U( sel8x8_2(xx), sel8x8_2(yy) ),
             max8U( sel8x8_1(xx), sel8x8_1(yy) ),
             max8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
{
   return mk16x4(
             min16S( sel16x4_3(xx), sel16x4_3(yy) ),
             min16S( sel16x4_2(xx), sel16x4_2(yy) ),
             min16S( sel16x4_1(xx), sel16x4_1(yy) ),
             min16S( sel16x4_0(xx), sel16x4_0(yy) )
          );
}

ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
{
   return mk8x8(
             min8U( sel8x8_7(xx), sel8x8_7(yy) ),
             min8U( sel8x8_6(xx), sel8x8_6(yy) ),
             min8U( sel8x8_5(xx), sel8x8_5(yy) ),
             min8U( sel8x8_4(xx), sel8x8_4(yy) ),
             min8U( sel8x8_3(xx), sel8x8_3(yy) ),
             min8U( sel8x8_2(xx), sel8x8_2(yy) ),
             min8U( sel8x8_1(xx), sel8x8_1(yy) ),
             min8U( sel8x8_0(xx), sel8x8_0(yy) )
          );
}

/* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */

/* Tuple/select functions for 16x2 vectors. */
static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
   return (((UInt)w1) << 16) | ((UInt)w0);
}

static inline UShort sel16x2_1 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32 >> 16);
}
static inline UShort sel16x2_0 ( UInt w32 ) {
   return 0xFFFF & (UShort)(w32);
}

static inline UInt mk8x4 ( UChar w3, UChar w2,
                           UChar w1, UChar w0 ) {
   UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
              | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   return w32;
}

static inline UChar sel8x4_3 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 24));
}
static inline UChar sel8x4_2 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 16));
}
static inline UChar sel8x4_1 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 8));
}
static inline UChar sel8x4_0 ( UInt w32 ) {
   return toUChar(0xFF & (w32 >> 0));
}
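
/* Examples: mk16x2(0x1122, 0x3344) == 0x11223344 and
   mk8x4(0x11, 0x22, 0x33, 0x44) == 0x11223344; the sel16x2_* and
   sel8x4_* functions extract the corresponding lanes again. */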


/* ----------------------------------------------------- */
/* More externally visible functions.  These simply
   implement the corresponding IR primops. */
/* ----------------------------------------------------- */

/* ------ 16x2 ------ */

UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
                  sel16x2_0(xx) + sel16x2_0(yy) );
}

UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
{
   return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
                  sel16x2_0(xx) - sel16x2_0(yy) );
}

UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
}

UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
{
   return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
                  qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
}

/* ------ 8x4 ------ */

UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) + sel8x4_3(yy),
             sel8x4_2(xx) + sel8x4_2(yy),
             sel8x4_1(xx) + sel8x4_1(yy),
             sel8x4_0(xx) + sel8x4_0(yy)
          );
}

UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
{
   return mk8x4(
             sel8x4_3(xx) - sel8x4_3(yy),
             sel8x4_2(xx) - sel8x4_2(yy),
             sel8x4_1(xx) - sel8x4_1(yy),
             sel8x4_0(xx) - sel8x4_0(yy)
          );
}

UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
{
   return mk8x4(
             qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
             qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
             qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
             qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
          );
}

UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
{
   return mk16x2(
             cmpnez16( sel16x2_1(xx) ),
             cmpnez16( sel16x2_0(xx) )
          );
}

UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
{
   return mk8x4(
             cmpnez8( sel8x4_3(xx) ),
             cmpnez8( sel8x4_2(xx) ),
             cmpnez8( sel8x4_1(xx) ),
             cmpnez8( sel8x4_0(xx) )
          );
}

UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
{
   return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
          + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
          + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
          + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
}

UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
{
   return qadd32S( xx, yy );
}

UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
{
   return qsub32S( xx, yy );
}
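
/* Example: h_generic_calc_QAdd32S(0x7FFFFFFF, 1) == 0x7FFFFFFF; the
   UInt arguments are reinterpreted as signed, and the sum saturates
   at INT_MAX rather than wrapping to 0x80000000. */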


/*------------------------------------------------------------------*/
/* Decimal Floating Point (DFP) externally visible helper functions */
/* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
/*------------------------------------------------------------------*/

#define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
#define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
#define PUT( x, y ) ( ( x ) << ( y ) )

ULong dpb_to_bcd( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;

   /* Convert a 10-bit densely packed decimal (DPD) group to BCD. */
   p = GET( chunk, 9 );
   q = GET( chunk, 8 );
   r = GET( chunk, 7 );
   s = GET( chunk, 6 );
   t = GET( chunk, 5 );
   u = GET( chunk, 4 );
   v = GET( chunk, 3 );
   w = GET( chunk, 2 );
   x = GET( chunk, 1 );
   y = GET( chunk, 0 );

   /* The BCD bit values are given by the following boolean equations. */
   a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   d = r;
   e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   h = u;
   i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
            | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
            | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   m = y;

   value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
            | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
            | PUT(k, 1) | PUT(m, 0);
   return value;
}

ULong bcd_to_dpb( ULong chunk )
{
   Short a, b, c, d, e, f, g, h, i, j, k, m;
   Short p, q, r, s, t, u, v, w, x, y;
   ULong value;
   /* Convert a 3-digit BCD value to a 10-bit Densely Packed Decimal
    (DPD) value.  The boolean equations that compute each of the DPD
    bits are given in Appendix B of Power ISA Book 1 (User Instruction
    Set).  The bits of the BCD value are [abcdefghijkm] and the bits
    of the DPD number are [pqrstuvwxy].  The boolean logic equations
    in pseudo-C code are:
    */
   a = GET( chunk, 11 );
   b = GET( chunk, 10 );
   c = GET( chunk, 9 );
   d = GET( chunk, 8 );
   e = GET( chunk, 7 );
   f = GET( chunk, 6 );
   g = GET( chunk, 5 );
   h = GET( chunk, 4 );
   i = GET( chunk, 3 );
   j = GET( chunk, 2 );
   k = GET( chunk, 1 );
   m = GET( chunk, 0 );

   p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   r = d;
   s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
            | ( f & NOT(a) & NOT(e) ) | ( e & i );
   t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
            | ( g & NOT(a) & NOT(e) ) | ( a & i );
   u = h;
   v = a | e | i;
   w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   y = m;

   value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
            | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | PUT(y, 0);

   return value;
}
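
/* Worked example: for the BCD group 0x123 (digits 1,2,3, all in the
   range 0-7) the DPD encoding simply packs the three low-order 3-bit
   values with v == 0, so bcd_to_dpb(0x123) == 0x0A3; for 0x999 it
   gives 0x0FF.  dpb_to_bcd inverts the mapping exactly in both
   cases. */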

ULong h_DPBtoBCD( ULong dpb )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = dpb >> ( ( 4 - i ) * 10 );
      result = result << 12;
      result |= dpb_to_bcd( chunk & 0x3FF );
   }
   return result;
}

ULong h_BCDtoDPB( ULong bcd )
{
   ULong result, chunk;
   Int i;

   result = 0;

   for (i = 0; i < 5; i++) {
      chunk = bcd >> ( ( 4 - i ) * 12 );
      result = result << 10;
      result |= bcd_to_dpb( chunk & 0xFFF );
   }
   return result;
}
#undef NOT
#undef GET
#undef PUT

/*---------------------------------------------------------------*/
/*--- end                               host_generic_simd64.c ---*/
/*---------------------------------------------------------------*/