Home | History | Annotate | Download | only in priv
      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                             host_generic_simd64.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2015 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
     37    where the instruction selectors cannot generate code in-line.
     38    These are purely back-end entities and cannot be seen/referenced
     39    from IR.  There are also helpers for 32-bit arithmetic in here. */
     40 
     41 #include "libvex_basictypes.h"
     42 #include "main_util.h"              // LIKELY, UNLIKELY
     43 #include "host_generic_simd64.h"
     44 
     45 
     46 
     47 /* Tuple/select functions for 32x2 vectors. */
     48 
     49 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
     50    return (((ULong)w1) << 32) | ((ULong)w0);
     51 }
     52 
     53 static inline UInt sel32x2_1 ( ULong w64 ) {
     54    return 0xFFFFFFFF & toUInt(w64 >> 32);
     55 }
     56 static inline UInt sel32x2_0 ( ULong w64 ) {
     57    return 0xFFFFFFFF & toUInt(w64);
     58 }
     59 
     60 
     61 /* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
     62    with 64-bit shifts so we give it a hand. */
     63 
     64 static inline ULong mk16x4 ( UShort w3, UShort w2,
     65                              UShort w1, UShort w0 ) {
     66    UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
     67    UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
     68    return mk32x2(hi32, lo32);
     69 }
     70 
     71 static inline UShort sel16x4_3 ( ULong w64 ) {
     72    UInt hi32 = toUInt(w64 >> 32);
     73    return toUShort(0xFFFF & (hi32 >> 16));
     74 }
     75 static inline UShort sel16x4_2 ( ULong w64 ) {
     76    UInt hi32 = toUInt(w64 >> 32);
     77    return toUShort(0xFFFF & hi32);
     78 }
     79 static inline UShort sel16x4_1 ( ULong w64 ) {
     80    UInt lo32 = (UInt)w64;
     81    return toUShort(0xFFFF & (lo32 >> 16));
     82 }
     83 static inline UShort sel16x4_0 ( ULong w64 ) {
     84    UInt lo32 = (UInt)w64;
     85    return toUShort(0xFFFF & lo32);
     86 }
     87 
     88 
     89 /* Tuple/select functions for 8x8 vectors. */
     90 
     91 static inline ULong mk8x8 ( UChar w7, UChar w6,
     92                             UChar w5, UChar w4,
     93                             UChar w3, UChar w2,
     94                             UChar w1, UChar w0 ) {
     95    UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
     96                | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
     97    UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
     98                | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
     99    return mk32x2(hi32, lo32);
    100 }
    101 
    102 static inline UChar sel8x8_7 ( ULong w64 ) {
    103    UInt hi32 = toUInt(w64 >> 32);
    104    return toUChar(0xFF & (hi32 >> 24));
    105 }
    106 static inline UChar sel8x8_6 ( ULong w64 ) {
    107    UInt hi32 = toUInt(w64 >> 32);
    108    return toUChar(0xFF & (hi32 >> 16));
    109 }
    110 static inline UChar sel8x8_5 ( ULong w64 ) {
    111    UInt hi32 = toUInt(w64 >> 32);
    112    return toUChar(0xFF & (hi32 >> 8));
    113 }
    114 static inline UChar sel8x8_4 ( ULong w64 ) {
    115    UInt hi32 = toUInt(w64 >> 32);
    116    return toUChar(0xFF & (hi32 >> 0));
    117 }
    118 static inline UChar sel8x8_3 ( ULong w64 ) {
    119    UInt lo32 = (UInt)w64;
    120    return toUChar(0xFF & (lo32 >> 24));
    121 }
    122 static inline UChar sel8x8_2 ( ULong w64 ) {
    123    UInt lo32 = (UInt)w64;
    124    return toUChar(0xFF & (lo32 >> 16));
    125 }
    126 static inline UChar sel8x8_1 ( ULong w64 ) {
    127    UInt lo32 = (UInt)w64;
    128    return toUChar(0xFF & (lo32 >> 8));
    129 }
    130 static inline UChar sel8x8_0 ( ULong w64 ) {
    131    UInt lo32 = (UInt)w64;
    132    return toUChar(0xFF & (lo32 >> 0));
    133 }
    134 
    135 static inline UChar index8x8 ( ULong w64, UChar ix ) {
    136    ix &= 7;
    137    return toUChar((w64 >> (8*ix)) & 0xFF);
    138 }
    139 
    140 
    141 /* Scalar helpers. */
    142 
    143 static inline Int qadd32S ( Int xx, Int yy )
    144 {
    145    Long t = ((Long)xx) + ((Long)yy);
    146    const Long loLim = -0x80000000LL;
    147    const Long hiLim =  0x7FFFFFFFLL;
    148    if (t < loLim) t = loLim;
    149    if (t > hiLim) t = hiLim;
    150    return (Int)t;
    151 }
    152 
    153 static inline Short qadd16S ( Short xx, Short yy )
    154 {
    155    Int t = ((Int)xx) + ((Int)yy);
    156    if (t < -32768) t = -32768;
    157    if (t > 32767)  t = 32767;
    158    return (Short)t;
    159 }
    160 
    161 static inline Char qadd8S ( Char xx, Char yy )
    162 {
    163    Int t = ((Int)xx) + ((Int)yy);
    164    if (t < -128) t = -128;
    165    if (t > 127)  t = 127;
    166    return (Char)t;
    167 }
    168 
    169 static inline UShort qadd16U ( UShort xx, UShort yy )
    170 {
    171    UInt t = ((UInt)xx) + ((UInt)yy);
    172    if (t > 0xFFFF) t = 0xFFFF;
    173    return (UShort)t;
    174 }
    175 
    176 static inline UChar qadd8U ( UChar xx, UChar yy )
    177 {
    178    UInt t = ((UInt)xx) + ((UInt)yy);
    179    if (t > 0xFF) t = 0xFF;
    180    return (UChar)t;
    181 }
    182 
    183 static inline Int qsub32S ( Int xx, Int yy )
    184 {
    185    Long t = ((Long)xx) - ((Long)yy);
    186    const Long loLim = -0x80000000LL;
    187    const Long hiLim =  0x7FFFFFFFLL;
    188    if (t < loLim) t = loLim;
    189    if (t > hiLim) t = hiLim;
    190    return (Int)t;
    191 }
    192 
    193 static inline Short qsub16S ( Short xx, Short yy )
    194 {
    195    Int t = ((Int)xx) - ((Int)yy);
    196    if (t < -32768) t = -32768;
    197    if (t > 32767)  t = 32767;
    198    return (Short)t;
    199 }
    200 
    201 static inline Char qsub8S ( Char xx, Char yy )
    202 {
    203    Int t = ((Int)xx) - ((Int)yy);
    204    if (t < -128) t = -128;
    205    if (t > 127)  t = 127;
    206    return (Char)t;
    207 }
    208 
    209 static inline UShort qsub16U ( UShort xx, UShort yy )
    210 {
    211    Int t = ((Int)xx) - ((Int)yy);
    212    if (t < 0)      t = 0;
    213    if (t > 0xFFFF) t = 0xFFFF;
    214    return (UShort)t;
    215 }
    216 
    217 static inline UChar qsub8U ( UChar xx, UChar yy )
    218 {
    219    Int t = ((Int)xx) - ((Int)yy);
    220    if (t < 0)    t = 0;
    221    if (t > 0xFF) t = 0xFF;
    222    return (UChar)t;
    223 }
    224 
    225 static inline Short mul16 ( Short xx, Short yy )
    226 {
    227    Int t = ((Int)xx) * ((Int)yy);
    228    return (Short)t;
    229 }
    230 
    231 static inline Int mul32 ( Int xx, Int yy )
    232 {
    233    Int t = ((Int)xx) * ((Int)yy);
    234    return (Int)t;
    235 }
    236 
    237 static inline Short mulhi16S ( Short xx, Short yy )
    238 {
    239    Int t = ((Int)xx) * ((Int)yy);
    240    t >>=/*s*/ 16;
    241    return (Short)t;
    242 }
    243 
    244 static inline UShort mulhi16U ( UShort xx, UShort yy )
    245 {
    246    UInt t = ((UInt)xx) * ((UInt)yy);
    247    t >>=/*u*/ 16;
    248    return (UShort)t;
    249 }
    250 
    251 static inline UInt cmpeq32 ( UInt xx, UInt yy )
    252 {
    253    return xx==yy ? 0xFFFFFFFF : 0;
    254 }
    255 
    256 static inline UShort cmpeq16 ( UShort xx, UShort yy )
    257 {
    258    return toUShort(xx==yy ? 0xFFFF : 0);
    259 }
    260 
    261 static inline UChar cmpeq8 ( UChar xx, UChar yy )
    262 {
    263    return toUChar(xx==yy ? 0xFF : 0);
    264 }
    265 
    266 static inline UInt cmpgt32S ( Int xx, Int yy )
    267 {
    268    return xx>yy ? 0xFFFFFFFF : 0;
    269 }
    270 
    271 static inline UShort cmpgt16S ( Short xx, Short yy )
    272 {
    273    return toUShort(xx>yy ? 0xFFFF : 0);
    274 }
    275 
    276 static inline UChar cmpgt8S ( Char xx, Char yy )
    277 {
    278    return toUChar(xx>yy ? 0xFF : 0);
    279 }
    280 
    281 static inline UInt cmpnez32 ( UInt xx )
    282 {
    283    return xx==0 ? 0 : 0xFFFFFFFF;
    284 }
    285 
    286 static inline UShort cmpnez16 ( UShort xx )
    287 {
    288    return toUShort(xx==0 ? 0 : 0xFFFF);
    289 }
    290 
    291 static inline UChar cmpnez8 ( UChar xx )
    292 {
    293    return toUChar(xx==0 ? 0 : 0xFF);
    294 }
    295 
    296 static inline Short qnarrow32Sto16S ( UInt xx0 )
    297 {
    298    Int xx = (Int)xx0;
    299    if (xx < -32768) xx = -32768;
    300    if (xx > 32767)  xx = 32767;
    301    return (Short)xx;
    302 }
    303 
    304 static inline Char qnarrow16Sto8S ( UShort xx0 )
    305 {
    306    Short xx = (Short)xx0;
    307    if (xx < -128) xx = -128;
    308    if (xx > 127)  xx = 127;
    309    return (Char)xx;
    310 }
    311 
    312 static inline UChar qnarrow16Sto8U ( UShort xx0 )
    313 {
    314    Short xx = (Short)xx0;
    315    if (xx < 0)   xx = 0;
    316    if (xx > 255) xx = 255;
    317    return (UChar)xx;
    318 }
    319 
    320 static inline UShort narrow32to16 ( UInt xx )
    321 {
    322    return (UShort)xx;
    323 }
    324 
    325 static inline UChar narrow16to8 ( UShort xx )
    326 {
    327    return (UChar)xx;
    328 }
    329 
    330 /* shifts: we don't care about out-of-range ones, since
    331    that is dealt with at a higher level. */
    332 
    333 static inline UChar shl8 ( UChar v, UInt n )
    334 {
    335    return toUChar(v << n);
    336 }
    337 
    338 static inline UChar sar8 ( UChar v, UInt n )
    339 {
    340    return toUChar(((Char)v) >> n);
    341 }
    342 
    343 static inline UShort shl16 ( UShort v, UInt n )
    344 {
    345    return toUShort(v << n);
    346 }
    347 
    348 static inline UShort shr16 ( UShort v, UInt n )
    349 {
    350    return toUShort((((UShort)v) >> n));
    351 }
    352 
    353 static inline UShort sar16 ( UShort v, UInt n )
    354 {
    355    return toUShort(((Short)v) >> n);
    356 }
    357 
    358 static inline UInt shl32 ( UInt v, UInt n )
    359 {
    360    return v << n;
    361 }
    362 
    363 static inline UInt shr32 ( UInt v, UInt n )
    364 {
    365    return (((UInt)v) >> n);
    366 }
    367 
    368 static inline UInt sar32 ( UInt v, UInt n )
    369 {
    370    return ((Int)v) >> n;
    371 }
    372 
    373 static inline UChar avg8U ( UChar xx, UChar yy )
    374 {
    375    UInt xxi = (UInt)xx;
    376    UInt yyi = (UInt)yy;
    377    UInt r   = (xxi + yyi + 1) >> 1;
    378    return (UChar)r;
    379 }
    380 
    381 static inline UShort avg16U ( UShort xx, UShort yy )
    382 {
    383    UInt xxi = (UInt)xx;
    384    UInt yyi = (UInt)yy;
    385    UInt r   = (xxi + yyi + 1) >> 1;
    386    return (UShort)r;
    387 }
    388 
    389 static inline Short max16S ( Short xx, Short yy )
    390 {
    391    return toUShort((xx > yy) ? xx : yy);
    392 }
    393 
    394 static inline UChar max8U ( UChar xx, UChar yy )
    395 {
    396    return toUChar((xx > yy) ? xx : yy);
    397 }
    398 
    399 static inline Short min16S ( Short xx, Short yy )
    400 {
    401    return toUShort((xx < yy) ? xx : yy);
    402 }
    403 
    404 static inline UChar min8U ( UChar xx, UChar yy )
    405 {
    406    return toUChar((xx < yy) ? xx : yy);
    407 }
    408 
    409 static inline UShort hadd16U ( UShort xx, UShort yy )
    410 {
    411    UInt xxi = (UInt)xx;
    412    UInt yyi = (UInt)yy;
    413    UInt r   = (xxi + yyi) >> 1;
    414    return (UShort)r;
    415 }
    416 
    417 static inline Short hadd16S ( Short xx, Short yy )
    418 {
    419    Int xxi = (Int)xx;
    420    Int yyi = (Int)yy;
    421    Int r   = (xxi + yyi) >> 1;
    422    return (Short)r;
    423 }
    424 
    425 static inline UShort hsub16U ( UShort xx, UShort yy )
    426 {
    427    UInt xxi = (UInt)xx;
    428    UInt yyi = (UInt)yy;
    429    UInt r   = (xxi - yyi) >> 1;
    430    return (UShort)r;
    431 }
    432 
    433 static inline Short hsub16S ( Short xx, Short yy )
    434 {
    435    Int xxi = (Int)xx;
    436    Int yyi = (Int)yy;
    437    Int r   = (xxi - yyi) >> 1;
    438    return (Short)r;
    439 }
    440 
    441 static inline UChar hadd8U ( UChar xx, UChar yy )
    442 {
    443    UInt xxi = (UInt)xx;
    444    UInt yyi = (UInt)yy;
    445    UInt r   = (xxi + yyi) >> 1;
    446    return (UChar)r;
    447 }
    448 
    449 static inline Char hadd8S ( Char xx, Char yy )
    450 {
    451    Int xxi = (Int)xx;
    452    Int yyi = (Int)yy;
    453    Int r   = (xxi + yyi) >> 1;
    454    return (Char)r;
    455 }
    456 
    457 static inline UChar hsub8U ( UChar xx, UChar yy )
    458 {
    459    UInt xxi = (UInt)xx;
    460    UInt yyi = (UInt)yy;
    461    UInt r   = (xxi - yyi) >> 1;
    462    return (UChar)r;
    463 }
    464 
    465 static inline Char hsub8S ( Char xx, Char yy )
    466 {
    467    Int xxi = (Int)xx;
    468    Int yyi = (Int)yy;
    469    Int r   = (xxi - yyi) >> 1;
    470    return (Char)r;
    471 }
    472 
    473 static inline UInt absdiff8U ( UChar xx, UChar yy )
    474 {
    475    UInt xxu = (UChar)xx;
    476    UInt yyu = (UChar)yy;
    477    return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
    478 }
    479 
    480 /* ----------------------------------------------------- */
    481 /* Start of the externally visible functions.  These simply
    482    implement the corresponding IR primops. */
    483 /* ----------------------------------------------------- */
    484 
    485 /* ------------ Normal addition ------------ */
    486 
    487 ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
    488 {
    489    return mk32x2(
    490              sel32x2_1(xx) + sel32x2_1(yy),
    491              sel32x2_0(xx) + sel32x2_0(yy)
    492           );
    493 }
    494 
    495 ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
    496 {
    497    return mk16x4(
    498              toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
    499              toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
    500              toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
    501              toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
    502           );
    503 }
    504 
    505 ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
    506 {
    507    return mk8x8(
    508              toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
    509              toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
    510              toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
    511              toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
    512              toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
    513              toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
    514              toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
    515              toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
    516           );
    517 }
    518 
    519 /* ------------ Saturating addition ------------ */
    520 
    521 ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
    522 {
    523    return mk16x4(
    524              qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
    525              qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
    526              qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
    527              qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
    528           );
    529 }
    530 
    531 ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
    532 {
    533    return mk8x8(
    534              qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
    535              qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
    536              qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
    537              qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
    538              qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
    539              qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
    540              qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
    541              qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
    542           );
    543 }
    544 
    545 ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
    546 {
    547    return mk16x4(
    548              qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
    549              qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
    550              qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
    551              qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
    552           );
    553 }
    554 
    555 ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
    556 {
    557    return mk8x8(
    558              qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
    559              qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
    560              qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
    561              qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
    562              qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
    563              qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
    564              qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
    565              qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
    566           );
    567 }
    568 
    569 /* ------------ Normal subtraction ------------ */
    570 
    571 ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
    572 {
    573    return mk32x2(
    574              sel32x2_1(xx) - sel32x2_1(yy),
    575              sel32x2_0(xx) - sel32x2_0(yy)
    576           );
    577 }
    578 
    579 ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
    580 {
    581    return mk16x4(
    582              toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
    583              toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
    584              toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
    585              toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
    586           );
    587 }
    588 
    589 ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
    590 {
    591    return mk8x8(
    592              toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
    593              toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
    594              toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
    595              toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
    596              toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
    597              toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
    598              toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
    599              toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
    600           );
    601 }
    602 
    603 /* ------------ Saturating subtraction ------------ */
    604 
    605 ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
    606 {
    607    return mk16x4(
    608              qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
    609              qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
    610              qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
    611              qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
    612           );
    613 }
    614 
    615 ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
    616 {
    617    return mk8x8(
    618              qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
    619              qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
    620              qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
    621              qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
    622              qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
    623              qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
    624              qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
    625              qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
    626           );
    627 }
    628 
    629 ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
    630 {
    631    return mk16x4(
    632              qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
    633              qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
    634              qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
    635              qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
    636           );
    637 }
    638 
    639 ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
    640 {
    641    return mk8x8(
    642              qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
    643              qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
    644              qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
    645              qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
    646              qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
    647              qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
    648              qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
    649              qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
    650           );
    651 }
    652 
    653 /* ------------ Multiplication ------------ */
    654 
    655 ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
    656 {
    657    return mk16x4(
    658              mul16( sel16x4_3(xx), sel16x4_3(yy) ),
    659              mul16( sel16x4_2(xx), sel16x4_2(yy) ),
    660              mul16( sel16x4_1(xx), sel16x4_1(yy) ),
    661              mul16( sel16x4_0(xx), sel16x4_0(yy) )
    662           );
    663 }
    664 
    665 ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
    666 {
    667    return mk32x2(
    668              mul32( sel32x2_1(xx), sel32x2_1(yy) ),
    669              mul32( sel32x2_0(xx), sel32x2_0(yy) )
    670           );
    671 }
    672 
    673 ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
    674 {
    675    return mk16x4(
    676              mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
    677              mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
    678              mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
    679              mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
    680           );
    681 }
    682 
    683 ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
    684 {
    685    return mk16x4(
    686              mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
    687              mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
    688              mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
    689              mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
    690           );
    691 }
    692 
    693 /* ------------ Comparison ------------ */
    694 
    695 ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
    696 {
    697    return mk32x2(
    698              cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
    699              cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
    700           );
    701 }
    702 
    703 ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
    704 {
    705    return mk16x4(
    706              cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
    707              cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
    708              cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
    709              cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
    710           );
    711 }
    712 
    713 ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
    714 {
    715    return mk8x8(
    716              cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
    717              cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
    718              cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
    719              cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
    720              cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
    721              cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
    722              cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
    723              cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
    724           );
    725 }
    726 
    727 ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
    728 {
    729    return mk32x2(
    730              cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
    731              cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
    732           );
    733 }
    734 
    735 ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
    736 {
    737    return mk16x4(
    738              cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
    739              cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
    740              cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
    741              cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
    742           );
    743 }
    744 
    745 ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
    746 {
    747    return mk8x8(
    748              cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
    749              cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
    750              cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
    751              cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
    752              cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
    753              cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
    754              cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
    755              cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
    756           );
    757 }
    758 
    759 ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
    760 {
    761    return mk32x2(
    762              cmpnez32( sel32x2_1(xx) ),
    763              cmpnez32( sel32x2_0(xx) )
    764           );
    765 }
    766 
    767 ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
    768 {
    769    return mk16x4(
    770              cmpnez16( sel16x4_3(xx) ),
    771              cmpnez16( sel16x4_2(xx) ),
    772              cmpnez16( sel16x4_1(xx) ),
    773              cmpnez16( sel16x4_0(xx) )
    774           );
    775 }
    776 
    777 ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
    778 {
    779    return mk8x8(
    780              cmpnez8( sel8x8_7(xx) ),
    781              cmpnez8( sel8x8_6(xx) ),
    782              cmpnez8( sel8x8_5(xx) ),
    783              cmpnez8( sel8x8_4(xx) ),
    784              cmpnez8( sel8x8_3(xx) ),
    785              cmpnez8( sel8x8_2(xx) ),
    786              cmpnez8( sel8x8_1(xx) ),
    787              cmpnez8( sel8x8_0(xx) )
    788           );
    789 }
    790 
    791 /* ------------ Saturating narrowing ------------ */
    792 
    793 ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
    794 {
    795    UInt d = sel32x2_1(aa);
    796    UInt c = sel32x2_0(aa);
    797    UInt b = sel32x2_1(bb);
    798    UInt a = sel32x2_0(bb);
    799    return mk16x4(
    800              qnarrow32Sto16S(d),
    801              qnarrow32Sto16S(c),
    802              qnarrow32Sto16S(b),
    803              qnarrow32Sto16S(a)
    804           );
    805 }
    806 
    807 ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
    808 {
    809    UShort h = sel16x4_3(aa);
    810    UShort g = sel16x4_2(aa);
    811    UShort f = sel16x4_1(aa);
    812    UShort e = sel16x4_0(aa);
    813    UShort d = sel16x4_3(bb);
    814    UShort c = sel16x4_2(bb);
    815    UShort b = sel16x4_1(bb);
    816    UShort a = sel16x4_0(bb);
    817    return mk8x8(
    818              qnarrow16Sto8S(h),
    819              qnarrow16Sto8S(g),
    820              qnarrow16Sto8S(f),
    821              qnarrow16Sto8S(e),
    822              qnarrow16Sto8S(d),
    823              qnarrow16Sto8S(c),
    824              qnarrow16Sto8S(b),
    825              qnarrow16Sto8S(a)
    826           );
    827 }
    828 
    829 ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
    830 {
    831    UShort h = sel16x4_3(aa);
    832    UShort g = sel16x4_2(aa);
    833    UShort f = sel16x4_1(aa);
    834    UShort e = sel16x4_0(aa);
    835    UShort d = sel16x4_3(bb);
    836    UShort c = sel16x4_2(bb);
    837    UShort b = sel16x4_1(bb);
    838    UShort a = sel16x4_0(bb);
    839    return mk8x8(
    840              qnarrow16Sto8U(h),
    841              qnarrow16Sto8U(g),
    842              qnarrow16Sto8U(f),
    843              qnarrow16Sto8U(e),
    844              qnarrow16Sto8U(d),
    845              qnarrow16Sto8U(c),
    846              qnarrow16Sto8U(b),
    847              qnarrow16Sto8U(a)
    848           );
    849 }
    850 
    851 /* ------------ Truncating narrowing ------------ */
    852 
    853 ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
    854 {
    855    UInt d = sel32x2_1(aa);
    856    UInt c = sel32x2_0(aa);
    857    UInt b = sel32x2_1(bb);
    858    UInt a = sel32x2_0(bb);
    859    return mk16x4(
    860              narrow32to16(d),
    861              narrow32to16(c),
    862              narrow32to16(b),
    863              narrow32to16(a)
    864           );
    865 }
    866 
    867 ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
    868 {
    869    UShort h = sel16x4_3(aa);
    870    UShort g = sel16x4_2(aa);
    871    UShort f = sel16x4_1(aa);
    872    UShort e = sel16x4_0(aa);
    873    UShort d = sel16x4_3(bb);
    874    UShort c = sel16x4_2(bb);
    875    UShort b = sel16x4_1(bb);
    876    UShort a = sel16x4_0(bb);
    877    return mk8x8(
    878              narrow16to8(h),
    879              narrow16to8(g),
    880              narrow16to8(f),
    881              narrow16to8(e),
    882              narrow16to8(d),
    883              narrow16to8(c),
    884              narrow16to8(b),
    885              narrow16to8(a)
    886           );
    887 }
    888 
    889 /* ------------ Interleaving ------------ */
    890 
    891 ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
    892 {
    893    return mk8x8(
    894              sel8x8_7(aa),
    895              sel8x8_7(bb),
    896              sel8x8_6(aa),
    897              sel8x8_6(bb),
    898              sel8x8_5(aa),
    899              sel8x8_5(bb),
    900              sel8x8_4(aa),
    901              sel8x8_4(bb)
    902           );
    903 }
    904 
    905 ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
    906 {
    907    return mk8x8(
    908              sel8x8_3(aa),
    909              sel8x8_3(bb),
    910              sel8x8_2(aa),
    911              sel8x8_2(bb),
    912              sel8x8_1(aa),
    913              sel8x8_1(bb),
    914              sel8x8_0(aa),
    915              sel8x8_0(bb)
    916           );
    917 }
    918 
    919 ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
    920 {
    921    return mk16x4(
    922              sel16x4_3(aa),
    923              sel16x4_3(bb),
    924              sel16x4_2(aa),
    925              sel16x4_2(bb)
    926           );
    927 }
    928 
    929 ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
    930 {
    931    return mk16x4(
    932              sel16x4_1(aa),
    933              sel16x4_1(bb),
    934              sel16x4_0(aa),
    935              sel16x4_0(bb)
    936           );
    937 }
    938 
    939 ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
    940 {
    941    return mk32x2(
    942              sel32x2_1(aa),
    943              sel32x2_1(bb)
    944           );
    945 }
    946 
    947 ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
    948 {
    949    return mk32x2(
    950              sel32x2_0(aa),
    951              sel32x2_0(bb)
    952           );
    953 }
    954 
    955 /* ------------ Concatenation ------------ */
    956 
    957 ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
    958 {
    959    return mk16x4(
    960              sel16x4_3(aa),
    961              sel16x4_1(aa),
    962              sel16x4_3(bb),
    963              sel16x4_1(bb)
    964           );
    965 }
    966 
    967 ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
    968 {
    969    return mk16x4(
    970              sel16x4_2(aa),
    971              sel16x4_0(aa),
    972              sel16x4_2(bb),
    973              sel16x4_0(bb)
    974           );
    975 }
    976 
    977 /* misc hack looking for a proper home */
    978 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
    979 {
    980    return mk8x8(
    981              index8x8(aa, sel8x8_7(bb)),
    982              index8x8(aa, sel8x8_6(bb)),
    983              index8x8(aa, sel8x8_5(bb)),
    984              index8x8(aa, sel8x8_4(bb)),
    985              index8x8(aa, sel8x8_3(bb)),
    986              index8x8(aa, sel8x8_2(bb)),
    987              index8x8(aa, sel8x8_1(bb)),
    988              index8x8(aa, sel8x8_0(bb))
    989           );
    990 }
    991 
    992 /* ------------ Shifting ------------ */
    993 /* Note that because these primops are undefined if the shift amount
    994    equals or exceeds the lane width, the shift amount is masked so
    995    that the scalar shifts are always in range.  In fact, given the
    996    semantics of these primops (ShlN16x4, etc) it is an error if in
    997    fact we are ever given an out-of-range shift amount.
    998 */
    999 ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
   1000 {
   1001    /* vassert(nn < 32); */
   1002    nn &= 31;
   1003    return mk32x2(
   1004              shl32( sel32x2_1(xx), nn ),
   1005              shl32( sel32x2_0(xx), nn )
   1006           );
   1007 }
   1008 
   1009 ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
   1010 {
   1011    /* vassert(nn < 16); */
   1012    nn &= 15;
   1013    return mk16x4(
   1014              shl16( sel16x4_3(xx), nn ),
   1015              shl16( sel16x4_2(xx), nn ),
   1016              shl16( sel16x4_1(xx), nn ),
   1017              shl16( sel16x4_0(xx), nn )
   1018           );
   1019 }
   1020 
   1021 ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
   1022 {
   1023    /* vassert(nn < 8); */
   1024    nn &= 7;
   1025    return mk8x8(
   1026              shl8( sel8x8_7(xx), nn ),
   1027              shl8( sel8x8_6(xx), nn ),
   1028              shl8( sel8x8_5(xx), nn ),
   1029              shl8( sel8x8_4(xx), nn ),
   1030              shl8( sel8x8_3(xx), nn ),
   1031              shl8( sel8x8_2(xx), nn ),
   1032              shl8( sel8x8_1(xx), nn ),
   1033              shl8( sel8x8_0(xx), nn )
   1034           );
   1035 }
   1036 
   1037 ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
   1038 {
   1039    /* vassert(nn < 32); */
   1040    nn &= 31;
   1041    return mk32x2(
   1042              shr32( sel32x2_1(xx), nn ),
   1043              shr32( sel32x2_0(xx), nn )
   1044           );
   1045 }
   1046 
   1047 ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
   1048 {
   1049    /* vassert(nn < 16); */
   1050    nn &= 15;
   1051    return mk16x4(
   1052              shr16( sel16x4_3(xx), nn ),
   1053              shr16( sel16x4_2(xx), nn ),
   1054              shr16( sel16x4_1(xx), nn ),
   1055              shr16( sel16x4_0(xx), nn )
   1056           );
   1057 }
   1058 
   1059 ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
   1060 {
   1061    /* vassert(nn < 32); */
   1062    nn &= 31;
   1063    return mk32x2(
   1064              sar32( sel32x2_1(xx), nn ),
   1065              sar32( sel32x2_0(xx), nn )
   1066           );
   1067 }
   1068 
   1069 ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
   1070 {
   1071    /* vassert(nn < 16); */
   1072    nn &= 15;
   1073    return mk16x4(
   1074              sar16( sel16x4_3(xx), nn ),
   1075              sar16( sel16x4_2(xx), nn ),
   1076              sar16( sel16x4_1(xx), nn ),
   1077              sar16( sel16x4_0(xx), nn )
   1078           );
   1079 }
   1080 
   1081 ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
   1082 {
   1083    /* vassert(nn < 8); */
   1084    nn &= 7;
   1085    return mk8x8(
   1086              sar8( sel8x8_7(xx), nn ),
   1087              sar8( sel8x8_6(xx), nn ),
   1088              sar8( sel8x8_5(xx), nn ),
   1089              sar8( sel8x8_4(xx), nn ),
   1090              sar8( sel8x8_3(xx), nn ),
   1091              sar8( sel8x8_2(xx), nn ),
   1092              sar8( sel8x8_1(xx), nn ),
   1093              sar8( sel8x8_0(xx), nn )
   1094           );
   1095 }
   1096 
   1097 /* ------------ Averaging ------------ */
   1098 
   1099 ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
   1100 {
   1101    return mk8x8(
   1102              avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1103              avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1104              avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1105              avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1106              avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1107              avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1108              avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1109              avg8U( sel8x8_0(xx), sel8x8_0(yy) )
   1110           );
   1111 }
   1112 
   1113 ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
   1114 {
   1115    return mk16x4(
   1116              avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
   1117              avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
   1118              avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
   1119              avg16U( sel16x4_0(xx), sel16x4_0(yy) )
   1120           );
   1121 }
   1122 
   1123 /* ------------ max/min ------------ */
   1124 
   1125 ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
   1126 {
   1127    return mk16x4(
   1128              max16S( sel16x4_3(xx), sel16x4_3(yy) ),
   1129              max16S( sel16x4_2(xx), sel16x4_2(yy) ),
   1130              max16S( sel16x4_1(xx), sel16x4_1(yy) ),
   1131              max16S( sel16x4_0(xx), sel16x4_0(yy) )
   1132           );
   1133 }
   1134 
   1135 ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
   1136 {
   1137    return mk8x8(
   1138              max8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1139              max8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1140              max8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1141              max8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1142              max8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1143              max8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1144              max8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1145              max8U( sel8x8_0(xx), sel8x8_0(yy) )
   1146           );
   1147 }
   1148 
   1149 ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
   1150 {
   1151    return mk16x4(
   1152              min16S( sel16x4_3(xx), sel16x4_3(yy) ),
   1153              min16S( sel16x4_2(xx), sel16x4_2(yy) ),
   1154              min16S( sel16x4_1(xx), sel16x4_1(yy) ),
   1155              min16S( sel16x4_0(xx), sel16x4_0(yy) )
   1156           );
   1157 }
   1158 
   1159 ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
   1160 {
   1161    return mk8x8(
   1162              min8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1163              min8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1164              min8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1165              min8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1166              min8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1167              min8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1168              min8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1169              min8U( sel8x8_0(xx), sel8x8_0(yy) )
   1170           );
   1171 }
   1172 
   1173 UInt h_generic_calc_GetMSBs8x8 ( ULong xx )
   1174 {
   1175    UInt r = 0;
   1176    if (xx & (1ULL << (64-1))) r |= (1<<7);
   1177    if (xx & (1ULL << (56-1))) r |= (1<<6);
   1178    if (xx & (1ULL << (48-1))) r |= (1<<5);
   1179    if (xx & (1ULL << (40-1))) r |= (1<<4);
   1180    if (xx & (1ULL << (32-1))) r |= (1<<3);
   1181    if (xx & (1ULL << (24-1))) r |= (1<<2);
   1182    if (xx & (1ULL << (16-1))) r |= (1<<1);
   1183    if (xx & (1ULL << ( 8-1))) r |= (1<<0);
   1184    return r;
   1185 }
   1186 
   1187 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
   1188 
   1189 /* Tuple/select functions for 16x2 vectors. */
   1190 static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   1191    return (((UInt)w1) << 16) | ((UInt)w2);
   1192 }
   1193 
   1194 static inline UShort sel16x2_1 ( UInt w32 ) {
   1195    return 0xFFFF & (UShort)(w32 >> 16);
   1196 }
   1197 static inline UShort sel16x2_0 ( UInt w32 ) {
   1198    return 0xFFFF & (UShort)(w32);
   1199 }
   1200 
   1201 static inline UInt mk8x4 ( UChar w3, UChar w2,
   1202                            UChar w1, UChar w0 ) {
   1203    UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
   1204               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   1205    return w32;
   1206 }
   1207 
   1208 static inline UChar sel8x4_3 ( UInt w32 ) {
   1209    return toUChar(0xFF & (w32 >> 24));
   1210 }
   1211 static inline UChar sel8x4_2 ( UInt w32 ) {
   1212    return toUChar(0xFF & (w32 >> 16));
   1213 }
   1214 static inline UChar sel8x4_1 ( UInt w32 ) {
   1215    return toUChar(0xFF & (w32 >> 8));
   1216 }
   1217 static inline UChar sel8x4_0 ( UInt w32 ) {
   1218    return toUChar(0xFF & (w32 >> 0));
   1219 }
   1220 
   1221 
   1222 /* ----------------------------------------------------- */
   1223 /* More externally visible functions.  These simply
   1224    implement the corresponding IR primops. */
   1225 /* ----------------------------------------------------- */
   1226 
   1227 /* ------ 16x2 ------ */
   1228 
   1229 UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
   1230 {
   1231    return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
   1232                   sel16x2_0(xx) + sel16x2_0(yy) );
   1233 }
   1234 
   1235 UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
   1236 {
   1237    return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
   1238                   sel16x2_0(xx) - sel16x2_0(yy) );
   1239 }
   1240 
   1241 UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
   1242 {
   1243    return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1244                   hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1245 }
   1246 
   1247 UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
   1248 {
   1249    return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1250                   hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1251 }
   1252 
   1253 UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
   1254 {
   1255    return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1256                   hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1257 }
   1258 
   1259 UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
   1260 {
   1261    return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1262                   hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1263 }
   1264 
   1265 UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
   1266 {
   1267    return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1268                   qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1269 }
   1270 
   1271 UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
   1272 {
   1273    return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1274                   qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1275 }
   1276 
   1277 UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
   1278 {
   1279    return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1280                   qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1281 }
   1282 
   1283 UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
   1284 {
   1285    return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1286                   qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1287 }
   1288 
   1289 /* ------ 8x4 ------ */
   1290 
   1291 UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
   1292 {
   1293    return mk8x4(
   1294              sel8x4_3(xx) + sel8x4_3(yy),
   1295              sel8x4_2(xx) + sel8x4_2(yy),
   1296              sel8x4_1(xx) + sel8x4_1(yy),
   1297              sel8x4_0(xx) + sel8x4_0(yy)
   1298           );
   1299 }
   1300 
   1301 UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
   1302 {
   1303    return mk8x4(
   1304              sel8x4_3(xx) - sel8x4_3(yy),
   1305              sel8x4_2(xx) - sel8x4_2(yy),
   1306              sel8x4_1(xx) - sel8x4_1(yy),
   1307              sel8x4_0(xx) - sel8x4_0(yy)
   1308           );
   1309 }
   1310 
   1311 UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
   1312 {
   1313    return mk8x4(
   1314              hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1315              hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1316              hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1317              hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
   1318           );
   1319 }
   1320 
   1321 UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
   1322 {
   1323    return mk8x4(
   1324              hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1325              hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1326              hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1327              hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
   1328           );
   1329 }
   1330 
   1331 UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
   1332 {
   1333    return mk8x4(
   1334              hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1335              hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1336              hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1337              hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
   1338           );
   1339 }
   1340 
   1341 UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
   1342 {
   1343    return mk8x4(
   1344              hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1345              hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1346              hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1347              hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
   1348           );
   1349 }
   1350 
   1351 UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
   1352 {
   1353    return mk8x4(
   1354              qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1355              qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1356              qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1357              qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
   1358           );
   1359 }
   1360 
   1361 UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
   1362 {
   1363    return mk8x4(
   1364              qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1365              qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1366              qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1367              qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
   1368           );
   1369 }
   1370 
   1371 UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
   1372 {
   1373    return mk8x4(
   1374              qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1375              qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1376              qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1377              qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
   1378           );
   1379 }
   1380 
   1381 UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
   1382 {
   1383    return mk8x4(
   1384              qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1385              qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1386              qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1387              qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
   1388           );
   1389 }
   1390 
   1391 UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
   1392 {
   1393    return mk16x2(
   1394              cmpnez16( sel16x2_1(xx) ),
   1395              cmpnez16( sel16x2_0(xx) )
   1396           );
   1397 }
   1398 
   1399 UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
   1400 {
   1401    return mk8x4(
   1402              cmpnez8( sel8x4_3(xx) ),
   1403              cmpnez8( sel8x4_2(xx) ),
   1404              cmpnez8( sel8x4_1(xx) ),
   1405              cmpnez8( sel8x4_0(xx) )
   1406           );
   1407 }
   1408 
   1409 UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
   1410 {
   1411    return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
   1412           + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
   1413           + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
   1414           + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
   1415 }
   1416 
   1417 UInt h_generic_calc_QAdd32S ( UInt xx, UInt yy )
   1418 {
   1419    return qadd32S( xx, yy );
   1420 }
   1421 
   1422 UInt h_generic_calc_QSub32S ( UInt xx, UInt yy )
   1423 {
   1424    return qsub32S( xx, yy );
   1425 }
   1426 
   1427 
   1428 /*------------------------------------------------------------------*/
   1429 /* Decimal Floating Point (DFP) externally visible helper functions */
   1430 /* that implement Iop_BCDtoDPB and Iop_DPBtoBCD                     */
   1431 /*------------------------------------------------------------------*/
   1432 
   1433 #define NOT( x )    ( ( ( x ) == 0) ? 1 : 0)
   1434 #define GET( x, y ) ( ( ( x ) & ( 0x1UL << ( y ) ) ) >> ( y ) )
   1435 #define PUT( x, y ) ( ( x )<< ( y ) )
   1436 
   1437 static ULong dpb_to_bcd( ULong chunk )
   1438 {
   1439    Short a, b, c, d, e, f, g, h, i, j, k, m;
   1440    Short p, q, r, s, t, u, v, w, x, y;
   1441    ULong value;
   1442 
   1443    /* convert 10 bit densely packed BCD to BCD */
   1444    p = GET( chunk, 9 );
   1445    q = GET( chunk, 8 );
   1446    r = GET( chunk, 7 );
   1447    s = GET( chunk, 6 );
   1448    t = GET( chunk, 5 );
   1449    u = GET( chunk, 4 );
   1450    v = GET( chunk, 3 );
   1451    w = GET( chunk, 2 );
   1452    x = GET( chunk, 1 );
   1453    y = GET( chunk, 0 );
   1454 
   1455    /* The BCD bit values are given by the following boolean equations.*/
   1456    a = ( NOT(s) & v & w ) | ( t & v & w & s ) | ( v & w & NOT(x) );
   1457    b = ( p & s & x & NOT(t) ) | ( p & NOT(w) ) | ( p & NOT(v) );
   1458    c = ( q & s & x & NOT(t) ) | ( q & NOT(w) ) | ( q & NOT(v) );
   1459    d = r;
   1460    e = ( v & NOT(w) & x ) | ( s & v & w & x ) | ( NOT(t) & v & x & w );
   1461    f = ( p & t & v & w & x & NOT(s) ) | ( s & NOT(x) & v ) | ( s & NOT(v) );
   1462    g = ( q & t & w & v & x & NOT(s) ) | ( t & NOT(x) & v ) | ( t & NOT(v) );
   1463    h = u;
   1464    i = ( t & v & w & x ) | ( s & v & w & x ) | ( v & NOT(w) & NOT(x) );
   1465    j = ( p & NOT(s) & NOT(t) & w & v ) | ( s & v & NOT(w) & x )
   1466             | ( p & w & NOT(x) & v ) | ( w & NOT(v) );
   1467    k = ( q & NOT(s) & NOT(t) & v & w ) | ( t & v & NOT(w) & x )
   1468             | ( q & v & w & NOT(x) ) | ( x & NOT(v) );
   1469    m = y;
   1470 
   1471    value = PUT(a, 11) | PUT(b, 10) | PUT(c, 9) | PUT(d, 8) | PUT(e, 7)
   1472             | PUT(f, 6) | PUT(g, 5) | PUT(h, 4) | PUT(i, 3) | PUT(j, 2)
   1473             | PUT(k, 1) | PUT(m, 0);
   1474    return value;
   1475 }
   1476 
   1477 static ULong bcd_to_dpb( ULong chunk )
   1478 {
   1479    Short a, b, c, d, e, f, g, h, i, j, k, m;
   1480    Short p, q, r, s, t, u, v, w, x, y;
   1481    ULong value;
   1482    /* Convert a 3 digit BCD value to a 10 bit Densely Packed Binary (DPD) value
   1483     The boolean equations to calculate the value of each of the DPD bit
   1484     is given in Appendix B  of Book 1: Power ISA User Instruction set.  The
   1485     bits for the DPD number are [abcdefghijkm].  The bits for the BCD value
   1486     are [pqrstuvwxy].  The boolean logic equations in psuedo C code are:
   1487     */
   1488    a = GET( chunk, 11 );
   1489    b = GET( chunk, 10 );
   1490    c = GET( chunk, 9 );
   1491    d = GET( chunk, 8 );
   1492    e = GET( chunk, 7 );
   1493    f = GET( chunk, 6 );
   1494    g = GET( chunk, 5 );
   1495    h = GET( chunk, 4 );
   1496    i = GET( chunk, 3 );
   1497    j = GET( chunk, 2 );
   1498    k = GET( chunk, 1 );
   1499    m = GET( chunk, 0 );
   1500 
   1501    p = ( f & a & i & NOT(e) ) | ( j & a & NOT(i) ) | ( b & NOT(a) );
   1502    q = ( g & a & i & NOT(e) ) | ( k & a & NOT(i) ) | ( c & NOT(a) );
   1503    r = d;
   1504    s = ( j & NOT(a) & e & NOT(i) ) | ( f & NOT(i) & NOT(e) )
   1505             | ( f & NOT(a) & NOT(e) ) | ( e & i );
   1506    t = ( k & NOT(a) & e & NOT(i) ) | ( g & NOT(i) & NOT(e) )
   1507             | ( g & NOT(a) & NOT(e) ) | ( a & i );
   1508    u = h;
   1509    v = a | e | i;
   1510    w = ( NOT(e) & j & NOT(i) ) | ( e & i ) | a;
   1511    x = ( NOT(a) & k & NOT(i) ) | ( a & i ) | e;
   1512    y = m;
   1513 
   1514    value = PUT(p, 9) | PUT(q, 8) | PUT(r, 7) | PUT(s, 6) | PUT(t, 5)
   1515             | PUT(u, 4) | PUT(v, 3) | PUT(w, 2) | PUT(x, 1) | y;
   1516 
   1517    return value;
   1518 }
   1519 
   1520 ULong h_calc_DPBtoBCD( ULong dpb )
   1521 {
   1522    ULong result, chunk;
   1523    Int i;
   1524 
   1525    result = 0;
   1526 
   1527    for (i = 0; i < 5; i++) {
   1528       chunk = dpb >> ( 4 - i ) * 10;
   1529       result = result << 12;
   1530       result |= dpb_to_bcd( chunk & 0x3FF );
   1531    }
   1532    return result;
   1533 }
   1534 
   1535 ULong h_calc_BCDtoDPB( ULong bcd )
   1536 {
   1537    ULong result, chunk;
   1538    Int i;
   1539 
   1540    result = 0;
   1541 
   1542    for (i = 0; i < 5; i++) {
   1543       chunk = bcd >> ( 4 - i ) * 12;
   1544       result = result << 10;
   1545       result |= bcd_to_dpb( chunk & 0xFFF );
   1546    }
   1547    return result;
   1548 }
   1549 #undef NOT
   1550 #undef GET
   1551 #undef PUT
   1552 
   1553 
   1554 /* ----------------------------------------------------- */
   1555 /* Signed and unsigned integer division, that behave like
    1556    the ARMv7 UDIV and SDIV instructions.
   1557 
   1558    sdiv32 also behaves like 64-bit v8 SDIV on w-regs.
   1559    udiv32 also behaves like 64-bit v8 UDIV on w-regs.
   1560 */
   1561 /* ----------------------------------------------------- */
   1562 
   1563 UInt h_calc_udiv32_w_arm_semantics ( UInt x, UInt y )
   1564 {
   1565    // Division by zero --> zero
   1566    if (UNLIKELY(y == 0)) return 0;
   1567    // C requires rounding towards zero, which is also what we need.
   1568    return x / y;
   1569 }
   1570 
   1571 ULong h_calc_udiv64_w_arm_semantics ( ULong x, ULong y )
   1572 {
   1573    // Division by zero --> zero
   1574    if (UNLIKELY(y == 0)) return 0;
   1575    // C requires rounding towards zero, which is also what we need.
   1576    return x / y;
   1577 }
   1578 
   1579 Int h_calc_sdiv32_w_arm_semantics ( Int x, Int y )
   1580 {
   1581    // Division by zero --> zero
   1582    if (UNLIKELY(y == 0)) return 0;
   1583    // The single case that produces an unrepresentable result
   1584    if (UNLIKELY( ((UInt)x) == ((UInt)0x80000000)
   1585                  && ((UInt)y) == ((UInt)0xFFFFFFFF) ))
   1586       return (Int)(UInt)0x80000000;
   1587    // Else return the result rounded towards zero.  C89 says
   1588    // this is implementation defined (in the signed case), but gcc
   1589    // promises to round towards zero.  Nevertheless, at startup,
   1590    // in main_main.c, do a check for that.
   1591    return x / y;
   1592 }
   1593 
   1594 Long h_calc_sdiv64_w_arm_semantics ( Long x, Long y )
   1595 {
   1596    // Division by zero --> zero
   1597    if (UNLIKELY(y == 0)) return 0;
   1598    // The single case that produces an unrepresentable result
   1599    if (UNLIKELY( ((ULong)x) == ((ULong)0x8000000000000000ULL )
   1600                  && ((ULong)y) == ((ULong)0xFFFFFFFFFFFFFFFFULL ) ))
   1601       return (Long)(ULong)0x8000000000000000ULL;
   1602    // Else return the result rounded towards zero.  C89 says
   1603    // this is implementation defined (in the signed case), but gcc
   1604    // promises to round towards zero.  Nevertheless, at startup,
   1605    // in main_main.c, do a check for that.
   1606    return x / y;
   1607 }
   1608 
   1609 
   1610 /*---------------------------------------------------------------*/
   1611 /*--- end                               host_generic_simd64.c ---*/
   1612 /*---------------------------------------------------------------*/
   1613