      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                             host_generic_simd64.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2010 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
     37    where the instruction selectors cannot generate code in-line.
     38    These are purely back-end entities and cannot be seen/referenced
     39    from IR. */
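
         /* Naming convention: each externally visible h_generic_calc_<Op>
            function below implements the correspondingly named IR primop
            (Iop_<Op>) by splitting its operand(s) into lanes, applying a
            scalar helper to each lane, and reassembling the result with
            the mk* helpers. */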
     40 
     41 #include "libvex_basictypes.h"
     42 #include "host_generic_simd64.h"
     43 
     44 
     45 
     46 /* Tuple/select functions for 32x2 vectors. */
     47 
     48 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
     49    return (((ULong)w1) << 32) | ((ULong)w0);
     50 }
     51 
     52 static inline UInt sel32x2_1 ( ULong w64 ) {
     53    return 0xFFFFFFFF & toUInt(w64 >> 32);
     54 }
     55 static inline UInt sel32x2_0 ( ULong w64 ) {
     56    return 0xFFFFFFFF & toUInt(w64);
     57 }
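
         /* For illustration: with w64 == 0x1122334455667788ULL,
            sel32x2_1(w64) == 0x11223344 and sel32x2_0(w64) == 0x55667788,
            and mk32x2(0x11223344, 0x55667788) rebuilds the original value.
            The same hi/lo numbering is used by all the mk*/sel* helpers
            below: lane 0 is always the least significant lane. */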
     58 
     59 
      60 /* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
      61    with 64-bit shifts, so we give it a hand. */
     62 
     63 static inline ULong mk16x4 ( UShort w3, UShort w2,
     64                              UShort w1, UShort w0 ) {
     65    UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
     66    UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
     67    return mk32x2(hi32, lo32);
     68 }
     69 
     70 static inline UShort sel16x4_3 ( ULong w64 ) {
     71    UInt hi32 = toUInt(w64 >> 32);
     72    return toUShort(0xFFFF & (hi32 >> 16));
     73 }
     74 static inline UShort sel16x4_2 ( ULong w64 ) {
     75    UInt hi32 = toUInt(w64 >> 32);
     76    return toUShort(0xFFFF & hi32);
     77 }
     78 static inline UShort sel16x4_1 ( ULong w64 ) {
     79    UInt lo32 = (UInt)w64;
     80    return toUShort(0xFFFF & (lo32 >> 16));
     81 }
     82 static inline UShort sel16x4_0 ( ULong w64 ) {
     83    UInt lo32 = (UInt)w64;
     84    return toUShort(0xFFFF & lo32);
     85 }
     86 
     87 
     88 /* Tuple/select functions for 8x8 vectors. */
     89 
     90 static inline ULong mk8x8 ( UChar w7, UChar w6,
     91                             UChar w5, UChar w4,
     92                             UChar w3, UChar w2,
     93                             UChar w1, UChar w0 ) {
     94    UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
     95                | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
     96    UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
     97                | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
     98    return mk32x2(hi32, lo32);
     99 }
    100 
    101 static inline UChar sel8x8_7 ( ULong w64 ) {
    102    UInt hi32 = toUInt(w64 >> 32);
    103    return toUChar(0xFF & (hi32 >> 24));
    104 }
    105 static inline UChar sel8x8_6 ( ULong w64 ) {
    106    UInt hi32 = toUInt(w64 >> 32);
    107    return toUChar(0xFF & (hi32 >> 16));
    108 }
    109 static inline UChar sel8x8_5 ( ULong w64 ) {
    110    UInt hi32 = toUInt(w64 >> 32);
    111    return toUChar(0xFF & (hi32 >> 8));
    112 }
    113 static inline UChar sel8x8_4 ( ULong w64 ) {
    114    UInt hi32 = toUInt(w64 >> 32);
    115    return toUChar(0xFF & (hi32 >> 0));
    116 }
    117 static inline UChar sel8x8_3 ( ULong w64 ) {
    118    UInt lo32 = (UInt)w64;
    119    return toUChar(0xFF & (lo32 >> 24));
    120 }
    121 static inline UChar sel8x8_2 ( ULong w64 ) {
    122    UInt lo32 = (UInt)w64;
    123    return toUChar(0xFF & (lo32 >> 16));
    124 }
    125 static inline UChar sel8x8_1 ( ULong w64 ) {
    126    UInt lo32 = (UInt)w64;
    127    return toUChar(0xFF & (lo32 >> 8));
    128 }
    129 static inline UChar sel8x8_0 ( ULong w64 ) {
    130    UInt lo32 = (UInt)w64;
    131    return toUChar(0xFF & (lo32 >> 0));
    132 }
    133 
    134 static inline UChar index8x8 ( ULong w64, UChar ix ) {
    135    ix &= 7;
    136    return toUChar((w64 >> (8*ix)) & 0xFF);
    137 }
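
         /* For illustration: index8x8(0x0706050403020100ULL, 3) == 0x03.
            Since the index is masked to 3 bits, ix == 11 selects the same
            byte as ix == 3. */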
    138 
    139 
    140 /* Scalar helpers. */
    141 
    142 static inline Short qadd16S ( Short xx, Short yy )
    143 {
    144    Int t = ((Int)xx) + ((Int)yy);
    145    if (t < -32768) t = -32768;
    146    if (t > 32767)  t = 32767;
    147    return (Short)t;
    148 }
    149 
    150 static inline Char qadd8S ( Char xx, Char yy )
    151 {
    152    Int t = ((Int)xx) + ((Int)yy);
    153    if (t < -128) t = -128;
    154    if (t > 127)  t = 127;
    155    return (Char)t;
    156 }
    157 
    158 static inline UShort qadd16U ( UShort xx, UShort yy )
    159 {
    160    UInt t = ((UInt)xx) + ((UInt)yy);
    161    if (t > 0xFFFF) t = 0xFFFF;
    162    return (UShort)t;
    163 }
    164 
    165 static inline UChar qadd8U ( UChar xx, UChar yy )
    166 {
    167    UInt t = ((UInt)xx) + ((UInt)yy);
    168    if (t > 0xFF) t = 0xFF;
    169    return (UChar)t;
    170 }
    171 
    172 static inline Short qsub16S ( Short xx, Short yy )
    173 {
    174    Int t = ((Int)xx) - ((Int)yy);
    175    if (t < -32768) t = -32768;
    176    if (t > 32767)  t = 32767;
    177    return (Short)t;
    178 }
    179 
    180 static inline Char qsub8S ( Char xx, Char yy )
    181 {
    182    Int t = ((Int)xx) - ((Int)yy);
    183    if (t < -128) t = -128;
    184    if (t > 127)  t = 127;
    185    return (Char)t;
    186 }
    187 
    188 static inline UShort qsub16U ( UShort xx, UShort yy )
    189 {
    190    Int t = ((Int)xx) - ((Int)yy);
    191    if (t < 0)      t = 0;
    192    if (t > 0xFFFF) t = 0xFFFF;
    193    return (UShort)t;
    194 }
    195 
    196 static inline UChar qsub8U ( UChar xx, UChar yy )
    197 {
    198    Int t = ((Int)xx) - ((Int)yy);
    199    if (t < 0)    t = 0;
    200    if (t > 0xFF) t = 0xFF;
    201    return (UChar)t;
    202 }
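
         /* Worked examples of the saturating behaviour:
               qadd16S(30000, 10000)  == 32767    (clamped at the top)
               qsub16S(-30000, 10000) == -32768   (clamped at the bottom)
               qadd8U(200, 100)       == 255
               qsub8U(10, 20)         == 0
            Without saturation these results would wrap around. */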
    203 
    204 static inline Short mul16 ( Short xx, Short yy )
    205 {
    206    Int t = ((Int)xx) * ((Int)yy);
    207    return (Short)t;
    208 }
    209 
    210 static inline Int mul32 ( Int xx, Int yy )
    211 {
    212    Int t = ((Int)xx) * ((Int)yy);
    213    return (Int)t;
    214 }
    215 
    216 static inline Short mulhi16S ( Short xx, Short yy )
    217 {
    218    Int t = ((Int)xx) * ((Int)yy);
    219    t >>=/*s*/ 16;
    220    return (Short)t;
    221 }
    222 
    223 static inline UShort mulhi16U ( UShort xx, UShort yy )
    224 {
    225    UInt t = ((UInt)xx) * ((UInt)yy);
    226    t >>=/*u*/ 16;
    227    return (UShort)t;
    228 }
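
         /* mulhi16S/mulhi16U return the high 16 bits of the full 32-bit
            product, e.g. mulhi16S(16384, 4) == 1 (the product is 0x10000)
            and mulhi16U(0xFFFF, 0xFFFF) == 0xFFFE (the product is
            0xFFFE0001). */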
    229 
    230 static inline UInt cmpeq32 ( UInt xx, UInt yy )
    231 {
    232    return xx==yy ? 0xFFFFFFFF : 0;
    233 }
    234 
    235 static inline UShort cmpeq16 ( UShort xx, UShort yy )
    236 {
    237    return toUShort(xx==yy ? 0xFFFF : 0);
    238 }
    239 
    240 static inline UChar cmpeq8 ( UChar xx, UChar yy )
    241 {
    242    return toUChar(xx==yy ? 0xFF : 0);
    243 }
    244 
    245 static inline UInt cmpgt32S ( Int xx, Int yy )
    246 {
    247    return xx>yy ? 0xFFFFFFFF : 0;
    248 }
    249 
    250 static inline UShort cmpgt16S ( Short xx, Short yy )
    251 {
    252    return toUShort(xx>yy ? 0xFFFF : 0);
    253 }
    254 
    255 static inline UChar cmpgt8S ( Char xx, Char yy )
    256 {
    257    return toUChar(xx>yy ? 0xFF : 0);
    258 }
    259 
    260 static inline UInt cmpnez32 ( UInt xx )
    261 {
    262    return xx==0 ? 0 : 0xFFFFFFFF;
    263 }
    264 
    265 static inline UShort cmpnez16 ( UShort xx )
    266 {
    267    return toUShort(xx==0 ? 0 : 0xFFFF);
    268 }
    269 
    270 static inline UChar cmpnez8 ( UChar xx )
    271 {
    272    return toUChar(xx==0 ? 0 : 0xFF);
    273 }
    274 
    275 static inline Short qnarrow32Sto16 ( UInt xx0 )
    276 {
    277    Int xx = (Int)xx0;
    278    if (xx < -32768) xx = -32768;
    279    if (xx > 32767)  xx = 32767;
    280    return (Short)xx;
    281 }
    282 
    283 static inline Char qnarrow16Sto8 ( UShort xx0 )
    284 {
    285    Short xx = (Short)xx0;
    286    if (xx < -128) xx = -128;
    287    if (xx > 127)  xx = 127;
    288    return (Char)xx;
    289 }
    290 
    291 static inline UChar qnarrow16Uto8 ( UShort xx0 )
    292 {
    293    Short xx = (Short)xx0;
    294    if (xx < 0)   xx = 0;
    295    if (xx > 255) xx = 255;
    296    return (UChar)xx;
    297 }
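
         /* The narrowing helpers saturate rather than truncate, e.g.
            qnarrow32Sto16(100000) == 32767 and qnarrow16Uto8(300) == 255.
            Note that qnarrow16Uto8 interprets its argument as signed, so
            0x8000 (-32768) saturates to 0: a signed source value is being
            squeezed into the unsigned 0..255 range. */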
    298 
    299 /* shifts: we don't care about out-of-range ones, since
    300    that is dealt with at a higher level. */
    301 
    302 static inline UChar shl8 ( UChar v, UInt n )
    303 {
    304    return toUChar(v << n);
    305 }
    306 
    307 static inline UChar sar8 ( UChar v, UInt n )
    308 {
    309    return toUChar(((Char)v) >> n);
    310 }
    311 
    312 static inline UShort shl16 ( UShort v, UInt n )
    313 {
    314    return toUShort(v << n);
    315 }
    316 
    317 static inline UShort shr16 ( UShort v, UInt n )
    318 {
    319    return toUShort((((UShort)v) >> n));
    320 }
    321 
    322 static inline UShort sar16 ( UShort v, UInt n )
    323 {
    324    return toUShort(((Short)v) >> n);
    325 }
    326 
    327 static inline UInt shl32 ( UInt v, UInt n )
    328 {
    329    return v << n;
    330 }
    331 
    332 static inline UInt shr32 ( UInt v, UInt n )
    333 {
    334    return (((UInt)v) >> n);
    335 }
    336 
    337 static inline UInt sar32 ( UInt v, UInt n )
    338 {
    339    return ((Int)v) >> n;
    340 }
    341 
    342 static inline UChar avg8U ( UChar xx, UChar yy )
    343 {
    344    UInt xxi = (UInt)xx;
    345    UInt yyi = (UInt)yy;
    346    UInt r   = (xxi + yyi + 1) >> 1;
    347    return (UChar)r;
    348 }
    349 
    350 static inline UShort avg16U ( UShort xx, UShort yy )
    351 {
    352    UInt xxi = (UInt)xx;
    353    UInt yyi = (UInt)yy;
    354    UInt r   = (xxi + yyi + 1) >> 1;
    355    return (UShort)r;
    356 }
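
         /* The "+ 1" makes these averages round halves upwards, e.g.
            avg8U(1, 2) == 2 (true average 1.5), whereas the halving
            helpers below, which lack the rounding bit, would give 1. */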
    357 
     358 static inline Short max16S ( Short xx, Short yy )
     359 {
     360    return (xx > yy) ? xx : yy;
     361 }
    362 
    363 static inline UChar max8U ( UChar xx, UChar yy )
    364 {
    365    return toUChar((xx > yy) ? xx : yy);
    366 }
    367 
     368 static inline Short min16S ( Short xx, Short yy )
     369 {
     370    return (xx < yy) ? xx : yy;
     371 }
    372 
    373 static inline UChar min8U ( UChar xx, UChar yy )
    374 {
    375    return toUChar((xx < yy) ? xx : yy);
    376 }
    377 
    378 static inline UShort hadd16U ( UShort xx, UShort yy )
    379 {
    380    UInt xxi = (UInt)xx;
    381    UInt yyi = (UInt)yy;
    382    UInt r   = (xxi + yyi) >> 1;
    383    return (UShort)r;
    384 }
    385 
    386 static inline Short hadd16S ( Short xx, Short yy )
    387 {
    388    Int xxi = (Int)xx;
    389    Int yyi = (Int)yy;
    390    Int r   = (xxi + yyi) >> 1;
    391    return (Short)r;
    392 }
    393 
    394 static inline UShort hsub16U ( UShort xx, UShort yy )
    395 {
    396    UInt xxi = (UInt)xx;
    397    UInt yyi = (UInt)yy;
    398    UInt r   = (xxi - yyi) >> 1;
    399    return (UShort)r;
    400 }
    401 
    402 static inline Short hsub16S ( Short xx, Short yy )
    403 {
    404    Int xxi = (Int)xx;
    405    Int yyi = (Int)yy;
    406    Int r   = (xxi - yyi) >> 1;
    407    return (Short)r;
    408 }
    409 
    410 static inline UChar hadd8U ( UChar xx, UChar yy )
    411 {
    412    UInt xxi = (UInt)xx;
    413    UInt yyi = (UInt)yy;
    414    UInt r   = (xxi + yyi) >> 1;
    415    return (UChar)r;
    416 }
    417 
    418 static inline Char hadd8S ( Char xx, Char yy )
    419 {
    420    Int xxi = (Int)xx;
    421    Int yyi = (Int)yy;
    422    Int r   = (xxi + yyi) >> 1;
    423    return (Char)r;
    424 }
    425 
    426 static inline UChar hsub8U ( UChar xx, UChar yy )
    427 {
    428    UInt xxi = (UInt)xx;
    429    UInt yyi = (UInt)yy;
    430    UInt r   = (xxi - yyi) >> 1;
    431    return (UChar)r;
    432 }
    433 
    434 static inline Char hsub8S ( Char xx, Char yy )
    435 {
    436    Int xxi = (Int)xx;
    437    Int yyi = (Int)yy;
    438    Int r   = (xxi - yyi) >> 1;
    439    return (Char)r;
    440 }
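
         /* The halving helpers compute (x op y) >> 1 with no rounding bit:
            hadd16U(1, 2) == 1, and hadd16S(-1, -2) == -2, assuming the
            usual arithmetic right shift of negative signed values, which
            this file already relies on in sar8/sar16/sar32. */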
    441 
    442 static inline UInt absdiff8U ( UChar xx, UChar yy )
    443 {
    444    UInt xxu = (UChar)xx;
    445    UInt yyu = (UChar)yy;
    446    return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
    447 }
    448 
    449 /* ----------------------------------------------------- */
    450 /* Start of the externally visible functions.  These simply
    451    implement the corresponding IR primops. */
    452 /* ----------------------------------------------------- */
    453 
    454 /* ------------ Normal addition ------------ */
    455 
    456 ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
    457 {
    458    return mk32x2(
    459              sel32x2_1(xx) + sel32x2_1(yy),
    460              sel32x2_0(xx) + sel32x2_0(yy)
    461           );
    462 }
    463 
    464 ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
    465 {
    466    return mk16x4(
    467              toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
    468              toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
    469              toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
    470              toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
    471           );
    472 }
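
         /* For illustration, a direct call (the back ends emit such calls
            only when they cannot generate the operation in-line):

               h_generic_calc_Add16x4(0x0001000200030004ULL,
                                      0x000A000B000C000DULL)
                  == 0x000B000D000F0011ULL

            Lanes that overflow simply wrap, since this is the
            non-saturating variant. */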
    473 
    474 ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
    475 {
    476    return mk8x8(
    477              toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
    478              toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
    479              toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
    480              toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
    481              toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
    482              toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
    483              toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
    484              toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
    485           );
    486 }
    487 
    488 /* ------------ Saturating addition ------------ */
    489 
    490 ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
    491 {
    492    return mk16x4(
    493              qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
    494              qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
    495              qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
    496              qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
    497           );
    498 }
    499 
    500 ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
    501 {
    502    return mk8x8(
    503              qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
    504              qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
    505              qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
    506              qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
    507              qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
    508              qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
    509              qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
    510              qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
    511           );
    512 }
    513 
    514 ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
    515 {
    516    return mk16x4(
    517              qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
    518              qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
    519              qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
    520              qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
    521           );
    522 }
    523 
    524 ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
    525 {
    526    return mk8x8(
    527              qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
    528              qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
    529              qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
    530              qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
    531              qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
    532              qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
    533              qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
    534              qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
    535           );
    536 }
    537 
    538 /* ------------ Normal subtraction ------------ */
    539 
    540 ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
    541 {
    542    return mk32x2(
    543              sel32x2_1(xx) - sel32x2_1(yy),
    544              sel32x2_0(xx) - sel32x2_0(yy)
    545           );
    546 }
    547 
    548 ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
    549 {
    550    return mk16x4(
    551              toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
    552              toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
    553              toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
    554              toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
    555           );
    556 }
    557 
    558 ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
    559 {
    560    return mk8x8(
    561              toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
    562              toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
    563              toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
    564              toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
    565              toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
    566              toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
    567              toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
    568              toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
    569           );
    570 }
    571 
    572 /* ------------ Saturating subtraction ------------ */
    573 
    574 ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
    575 {
    576    return mk16x4(
    577              qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
    578              qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
    579              qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
    580              qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
    581           );
    582 }
    583 
    584 ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
    585 {
    586    return mk8x8(
    587              qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
    588              qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
    589              qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
    590              qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
    591              qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
    592              qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
    593              qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
    594              qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
    595           );
    596 }
    597 
    598 ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
    599 {
    600    return mk16x4(
    601              qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
    602              qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
    603              qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
    604              qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
    605           );
    606 }
    607 
    608 ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
    609 {
    610    return mk8x8(
    611              qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
    612              qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
    613              qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
    614              qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
    615              qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
    616              qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
    617              qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
    618              qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
    619           );
    620 }
    621 
    622 /* ------------ Multiplication ------------ */
    623 
    624 ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
    625 {
    626    return mk16x4(
    627              mul16( sel16x4_3(xx), sel16x4_3(yy) ),
    628              mul16( sel16x4_2(xx), sel16x4_2(yy) ),
    629              mul16( sel16x4_1(xx), sel16x4_1(yy) ),
    630              mul16( sel16x4_0(xx), sel16x4_0(yy) )
    631           );
    632 }
    633 
    634 ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
    635 {
    636    return mk32x2(
    637              mul32( sel32x2_1(xx), sel32x2_1(yy) ),
    638              mul32( sel32x2_0(xx), sel32x2_0(yy) )
    639           );
    640 }
    641 
    642 ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
    643 {
    644    return mk16x4(
    645              mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
    646              mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
    647              mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
    648              mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
    649           );
    650 }
    651 
    652 ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
    653 {
    654    return mk16x4(
    655              mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
    656              mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
    657              mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
    658              mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
    659           );
    660 }
    661 
    662 /* ------------ Comparison ------------ */
    663 
    664 ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
    665 {
    666    return mk32x2(
    667              cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
    668              cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
    669           );
    670 }
    671 
    672 ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
    673 {
    674    return mk16x4(
    675              cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
    676              cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
    677              cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
    678              cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
    679           );
    680 }
    681 
    682 ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
    683 {
    684    return mk8x8(
    685              cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
    686              cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
    687              cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
    688              cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
    689              cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
    690              cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
    691              cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
    692              cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
    693           );
    694 }
    695 
    696 ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
    697 {
    698    return mk32x2(
    699              cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
    700              cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
    701           );
    702 }
    703 
    704 ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
    705 {
    706    return mk16x4(
    707              cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
    708              cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
    709              cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
    710              cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
    711           );
    712 }
    713 
    714 ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
    715 {
    716    return mk8x8(
    717              cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
    718              cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
    719              cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
    720              cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
    721              cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
    722              cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
    723              cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
    724              cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
    725           );
    726 }
    727 
    728 ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
    729 {
    730    return mk32x2(
    731              cmpnez32( sel32x2_1(xx) ),
    732              cmpnez32( sel32x2_0(xx) )
    733           );
    734 }
    735 
    736 ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
    737 {
    738    return mk16x4(
    739              cmpnez16( sel16x4_3(xx) ),
    740              cmpnez16( sel16x4_2(xx) ),
    741              cmpnez16( sel16x4_1(xx) ),
    742              cmpnez16( sel16x4_0(xx) )
    743           );
    744 }
    745 
    746 ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
    747 {
    748    return mk8x8(
    749              cmpnez8( sel8x8_7(xx) ),
    750              cmpnez8( sel8x8_6(xx) ),
    751              cmpnez8( sel8x8_5(xx) ),
    752              cmpnez8( sel8x8_4(xx) ),
    753              cmpnez8( sel8x8_3(xx) ),
    754              cmpnez8( sel8x8_2(xx) ),
    755              cmpnez8( sel8x8_1(xx) ),
    756              cmpnez8( sel8x8_0(xx) )
    757           );
    758 }
    759 
    760 /* ------------ Saturating narrowing ------------ */
    761 
    762 ULong h_generic_calc_QNarrow32Sx2 ( ULong aa, ULong bb )
    763 {
    764    UInt d = sel32x2_1(aa);
    765    UInt c = sel32x2_0(aa);
    766    UInt b = sel32x2_1(bb);
    767    UInt a = sel32x2_0(bb);
    768    return mk16x4(
    769              qnarrow32Sto16(d),
    770              qnarrow32Sto16(c),
    771              qnarrow32Sto16(b),
    772              qnarrow32Sto16(a)
    773           );
    774 }
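
         /* In all three QNarrow helpers the narrowed lanes of aa form the
            high 32 bits of the result and the narrowed lanes of bb the
            low 32 bits. */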
    775 
    776 ULong h_generic_calc_QNarrow16Sx4 ( ULong aa, ULong bb )
    777 {
    778    UShort h = sel16x4_3(aa);
    779    UShort g = sel16x4_2(aa);
    780    UShort f = sel16x4_1(aa);
    781    UShort e = sel16x4_0(aa);
    782    UShort d = sel16x4_3(bb);
    783    UShort c = sel16x4_2(bb);
    784    UShort b = sel16x4_1(bb);
    785    UShort a = sel16x4_0(bb);
    786    return mk8x8(
    787              qnarrow16Sto8(h),
    788              qnarrow16Sto8(g),
    789              qnarrow16Sto8(f),
    790              qnarrow16Sto8(e),
    791              qnarrow16Sto8(d),
    792              qnarrow16Sto8(c),
    793              qnarrow16Sto8(b),
    794              qnarrow16Sto8(a)
    795           );
    796 }
    797 
    798 ULong h_generic_calc_QNarrow16Ux4 ( ULong aa, ULong bb )
    799 {
    800    UShort h = sel16x4_3(aa);
    801    UShort g = sel16x4_2(aa);
    802    UShort f = sel16x4_1(aa);
    803    UShort e = sel16x4_0(aa);
    804    UShort d = sel16x4_3(bb);
    805    UShort c = sel16x4_2(bb);
    806    UShort b = sel16x4_1(bb);
    807    UShort a = sel16x4_0(bb);
    808    return mk8x8(
    809              qnarrow16Uto8(h),
    810              qnarrow16Uto8(g),
    811              qnarrow16Uto8(f),
    812              qnarrow16Uto8(e),
    813              qnarrow16Uto8(d),
    814              qnarrow16Uto8(c),
    815              qnarrow16Uto8(b),
    816              qnarrow16Uto8(a)
    817           );
    818 }
    819 
    820 /* ------------ Interleaving ------------ */
    821 
    822 ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
    823 {
    824    return mk8x8(
    825              sel8x8_7(aa),
    826              sel8x8_7(bb),
    827              sel8x8_6(aa),
    828              sel8x8_6(bb),
    829              sel8x8_5(aa),
    830              sel8x8_5(bb),
    831              sel8x8_4(aa),
    832              sel8x8_4(bb)
    833           );
    834 }
    835 
    836 ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
    837 {
    838    return mk8x8(
    839              sel8x8_3(aa),
    840              sel8x8_3(bb),
    841              sel8x8_2(aa),
    842              sel8x8_2(bb),
    843              sel8x8_1(aa),
    844              sel8x8_1(bb),
    845              sel8x8_0(aa),
    846              sel8x8_0(bb)
    847           );
    848 }
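
         /* For illustration:
               InterleaveLO8x8(0x0706050403020100ULL, 0x1716151413121110ULL)
                  == 0x0313021201110010ULL
            i.e. the low halves of aa and bb are interleaved byte by byte,
            with the aa byte in the more significant position of each pair. */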
    849 
    850 ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
    851 {
    852    return mk16x4(
    853              sel16x4_3(aa),
    854              sel16x4_3(bb),
    855              sel16x4_2(aa),
    856              sel16x4_2(bb)
    857           );
    858 }
    859 
    860 ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
    861 {
    862    return mk16x4(
    863              sel16x4_1(aa),
    864              sel16x4_1(bb),
    865              sel16x4_0(aa),
    866              sel16x4_0(bb)
    867           );
    868 }
    869 
    870 ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
    871 {
    872    return mk32x2(
    873              sel32x2_1(aa),
    874              sel32x2_1(bb)
    875           );
    876 }
    877 
    878 ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
    879 {
    880    return mk32x2(
    881              sel32x2_0(aa),
    882              sel32x2_0(bb)
    883           );
    884 }
    885 
    886 /* ------------ Concatenation ------------ */
    887 
    888 ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
    889 {
    890    return mk16x4(
    891              sel16x4_3(aa),
    892              sel16x4_1(aa),
    893              sel16x4_3(bb),
    894              sel16x4_1(bb)
    895           );
    896 }
    897 
    898 ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
    899 {
    900    return mk16x4(
    901              sel16x4_2(aa),
    902              sel16x4_0(aa),
    903              sel16x4_2(bb),
    904              sel16x4_0(bb)
    905           );
    906 }
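
         /* CatOddLanes16x4 gathers lanes 3 and 1 of each operand and
            CatEvenLanes16x4 lanes 2 and 0; in both cases the lanes taken
            from aa end up in the high half of the result. */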
    907 
    908 /* misc hack looking for a proper home */
    909 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
    910 {
    911    return mk8x8(
    912              index8x8(aa, sel8x8_7(bb)),
    913              index8x8(aa, sel8x8_6(bb)),
    914              index8x8(aa, sel8x8_5(bb)),
    915              index8x8(aa, sel8x8_4(bb)),
    916              index8x8(aa, sel8x8_3(bb)),
    917              index8x8(aa, sel8x8_2(bb)),
    918              index8x8(aa, sel8x8_1(bb)),
    919              index8x8(aa, sel8x8_0(bb))
    920           );
    921 }
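
         /* Each result byte i is the byte of aa selected by the low three
            bits of byte i of bb; the higher bits of each selector byte are
            ignored (see index8x8 above).  For example, with
            bb == 0x0001020304050607ULL the bytes of aa come out reversed. */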
    922 
    923 /* ------------ Shifting ------------ */
     924 /* Note that because these primops are undefined if the shift amount
     925    equals or exceeds the lane width, the shift amount is masked so
     926    that the scalar shifts are always in range.  Given the semantics
     927    of these primops (ShlN16x4, etc), it is an error for an
     928    out-of-range shift amount ever to reach this code.
     929 */
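         /* For illustration:
               h_generic_calc_SarN16x4(0xFFFF000100020003ULL, 1)
                  == 0xFFFF000000010001ULL
            i.e. each 16-bit lane is shifted arithmetically, so the negative
            lane keeps its sign. */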
    930 ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
    931 {
    932    /* vassert(nn < 32); */
    933    nn &= 31;
    934    return mk32x2(
    935              shl32( sel32x2_1(xx), nn ),
    936              shl32( sel32x2_0(xx), nn )
    937           );
    938 }
    939 
    940 ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
    941 {
    942    /* vassert(nn < 16); */
    943    nn &= 15;
    944    return mk16x4(
    945              shl16( sel16x4_3(xx), nn ),
    946              shl16( sel16x4_2(xx), nn ),
    947              shl16( sel16x4_1(xx), nn ),
    948              shl16( sel16x4_0(xx), nn )
    949           );
    950 }
    951 
    952 ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
    953 {
    954    /* vassert(nn < 8); */
    955    nn &= 7;
    956    return mk8x8(
    957              shl8( sel8x8_7(xx), nn ),
    958              shl8( sel8x8_6(xx), nn ),
    959              shl8( sel8x8_5(xx), nn ),
    960              shl8( sel8x8_4(xx), nn ),
    961              shl8( sel8x8_3(xx), nn ),
    962              shl8( sel8x8_2(xx), nn ),
    963              shl8( sel8x8_1(xx), nn ),
    964              shl8( sel8x8_0(xx), nn )
    965           );
    966 }
    967 
    968 ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
    969 {
    970    /* vassert(nn < 32); */
    971    nn &= 31;
    972    return mk32x2(
    973              shr32( sel32x2_1(xx), nn ),
    974              shr32( sel32x2_0(xx), nn )
    975           );
    976 }
    977 
    978 ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
    979 {
    980    /* vassert(nn < 16); */
    981    nn &= 15;
    982    return mk16x4(
    983              shr16( sel16x4_3(xx), nn ),
    984              shr16( sel16x4_2(xx), nn ),
    985              shr16( sel16x4_1(xx), nn ),
    986              shr16( sel16x4_0(xx), nn )
    987           );
    988 }
    989 
    990 ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
    991 {
    992    /* vassert(nn < 32); */
    993    nn &= 31;
    994    return mk32x2(
    995              sar32( sel32x2_1(xx), nn ),
    996              sar32( sel32x2_0(xx), nn )
    997           );
    998 }
    999 
   1000 ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
   1001 {
   1002    /* vassert(nn < 16); */
   1003    nn &= 15;
   1004    return mk16x4(
   1005              sar16( sel16x4_3(xx), nn ),
   1006              sar16( sel16x4_2(xx), nn ),
   1007              sar16( sel16x4_1(xx), nn ),
   1008              sar16( sel16x4_0(xx), nn )
   1009           );
   1010 }
   1011 
   1012 ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
   1013 {
   1014    /* vassert(nn < 8); */
   1015    nn &= 7;
   1016    return mk8x8(
   1017              sar8( sel8x8_7(xx), nn ),
   1018              sar8( sel8x8_6(xx), nn ),
   1019              sar8( sel8x8_5(xx), nn ),
   1020              sar8( sel8x8_4(xx), nn ),
   1021              sar8( sel8x8_3(xx), nn ),
   1022              sar8( sel8x8_2(xx), nn ),
   1023              sar8( sel8x8_1(xx), nn ),
   1024              sar8( sel8x8_0(xx), nn )
   1025           );
   1026 }
   1027 
   1028 /* ------------ Averaging ------------ */
   1029 
   1030 ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
   1031 {
   1032    return mk8x8(
   1033              avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1034              avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1035              avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1036              avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1037              avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1038              avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1039              avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1040              avg8U( sel8x8_0(xx), sel8x8_0(yy) )
   1041           );
   1042 }
   1043 
   1044 ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
   1045 {
   1046    return mk16x4(
   1047              avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
   1048              avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
   1049              avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
   1050              avg16U( sel16x4_0(xx), sel16x4_0(yy) )
   1051           );
   1052 }
   1053 
   1054 /* ------------ max/min ------------ */
   1055 
   1056 ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
   1057 {
   1058    return mk16x4(
   1059              max16S( sel16x4_3(xx), sel16x4_3(yy) ),
   1060              max16S( sel16x4_2(xx), sel16x4_2(yy) ),
   1061              max16S( sel16x4_1(xx), sel16x4_1(yy) ),
   1062              max16S( sel16x4_0(xx), sel16x4_0(yy) )
   1063           );
   1064 }
   1065 
   1066 ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
   1067 {
   1068    return mk8x8(
   1069              max8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1070              max8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1071              max8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1072              max8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1073              max8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1074              max8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1075              max8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1076              max8U( sel8x8_0(xx), sel8x8_0(yy) )
   1077           );
   1078 }
   1079 
   1080 ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
   1081 {
   1082    return mk16x4(
   1083              min16S( sel16x4_3(xx), sel16x4_3(yy) ),
   1084              min16S( sel16x4_2(xx), sel16x4_2(yy) ),
   1085              min16S( sel16x4_1(xx), sel16x4_1(yy) ),
   1086              min16S( sel16x4_0(xx), sel16x4_0(yy) )
   1087           );
   1088 }
   1089 
   1090 ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
   1091 {
   1092    return mk8x8(
   1093              min8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1094              min8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1095              min8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1096              min8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1097              min8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1098              min8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1099              min8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1100              min8U( sel8x8_0(xx), sel8x8_0(yy) )
   1101           );
   1102 }
   1103 
   1104 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
   1105 
   1106 /* Tuple/select functions for 16x2 vectors. */
    1107 static inline UInt mk16x2 ( UShort w1, UShort w0 ) {
    1108    return (((UInt)w1) << 16) | ((UInt)w0);
    1109 }
   1110 
   1111 static inline UShort sel16x2_1 ( UInt w32 ) {
   1112    return 0xFFFF & (UShort)(w32 >> 16);
   1113 }
   1114 static inline UShort sel16x2_0 ( UInt w32 ) {
   1115    return 0xFFFF & (UShort)(w32);
   1116 }
   1117 
   1118 static inline UInt mk8x4 ( UChar w3, UChar w2,
   1119                            UChar w1, UChar w0 ) {
   1120    UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
   1121               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   1122    return w32;
   1123 }
   1124 
   1125 static inline UChar sel8x4_3 ( UInt w32 ) {
   1126    return toUChar(0xFF & (w32 >> 24));
   1127 }
   1128 static inline UChar sel8x4_2 ( UInt w32 ) {
   1129    return toUChar(0xFF & (w32 >> 16));
   1130 }
   1131 static inline UChar sel8x4_1 ( UInt w32 ) {
   1132    return toUChar(0xFF & (w32 >> 8));
   1133 }
   1134 static inline UChar sel8x4_0 ( UInt w32 ) {
   1135    return toUChar(0xFF & (w32 >> 0));
   1136 }
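
         /* These follow the same numbering convention as the 64-bit
            helpers above: lane 0 is the least significant, e.g.
            sel16x2_1(0xAAAABBBB) == 0xAAAA and sel8x4_0(0xAABBCCDD) == 0xDD. */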
   1137 
   1138 
   1139 /* ----------------------------------------------------- */
   1140 /* More externally visible functions.  These simply
   1141    implement the corresponding IR primops. */
   1142 /* ----------------------------------------------------- */
   1143 
   1144 /* ------ 16x2 ------ */
   1145 
   1146 UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
   1147 {
   1148    return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
   1149                   sel16x2_0(xx) + sel16x2_0(yy) );
   1150 }
   1151 
   1152 UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
   1153 {
   1154    return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
   1155                   sel16x2_0(xx) - sel16x2_0(yy) );
   1156 }
   1157 
   1158 UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
   1159 {
   1160    return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1161                   hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1162 }
   1163 
   1164 UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
   1165 {
   1166    return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1167                   hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1168 }
   1169 
   1170 UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
   1171 {
   1172    return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1173                   hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1174 }
   1175 
   1176 UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
   1177 {
   1178    return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1179                   hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1180 }
   1181 
   1182 UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
   1183 {
   1184    return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1185                   qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1186 }
   1187 
   1188 UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
   1189 {
   1190    return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1191                   qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1192 }
   1193 
   1194 UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
   1195 {
   1196    return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1197                   qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1198 }
   1199 
   1200 UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
   1201 {
   1202    return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1203                   qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1204 }
   1205 
   1206 /* ------ 8x4 ------ */
   1207 
   1208 UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
   1209 {
   1210    return mk8x4(
   1211              sel8x4_3(xx) + sel8x4_3(yy),
   1212              sel8x4_2(xx) + sel8x4_2(yy),
   1213              sel8x4_1(xx) + sel8x4_1(yy),
   1214              sel8x4_0(xx) + sel8x4_0(yy)
   1215           );
   1216 }
   1217 
   1218 UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
   1219 {
   1220    return mk8x4(
   1221              sel8x4_3(xx) - sel8x4_3(yy),
   1222              sel8x4_2(xx) - sel8x4_2(yy),
   1223              sel8x4_1(xx) - sel8x4_1(yy),
   1224              sel8x4_0(xx) - sel8x4_0(yy)
   1225           );
   1226 }
   1227 
   1228 UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
   1229 {
   1230    return mk8x4(
   1231              hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1232              hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1233              hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1234              hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
   1235           );
   1236 }
   1237 
   1238 UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
   1239 {
   1240    return mk8x4(
   1241              hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1242              hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1243              hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1244              hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
   1245           );
   1246 }
   1247 
   1248 UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
   1249 {
   1250    return mk8x4(
   1251              hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1252              hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1253              hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1254              hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
   1255           );
   1256 }
   1257 
   1258 UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
   1259 {
   1260    return mk8x4(
   1261              hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1262              hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1263              hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1264              hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
   1265           );
   1266 }
   1267 
   1268 UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
   1269 {
   1270    return mk8x4(
   1271              qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1272              qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1273              qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1274              qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
   1275           );
   1276 }
   1277 
   1278 UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
   1279 {
   1280    return mk8x4(
   1281              qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1282              qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1283              qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1284              qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
   1285           );
   1286 }
   1287 
   1288 UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
   1289 {
   1290    return mk8x4(
   1291              qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1292              qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1293              qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1294              qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
   1295           );
   1296 }
   1297 
   1298 UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
   1299 {
   1300    return mk8x4(
   1301              qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1302              qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1303              qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1304              qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
   1305           );
   1306 }
   1307 
   1308 UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
   1309 {
   1310    return mk16x2(
   1311              cmpnez16( sel16x2_1(xx) ),
   1312              cmpnez16( sel16x2_0(xx) )
   1313           );
   1314 }
   1315 
   1316 UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
   1317 {
   1318    return mk8x4(
   1319              cmpnez8( sel8x4_3(xx) ),
   1320              cmpnez8( sel8x4_2(xx) ),
   1321              cmpnez8( sel8x4_1(xx) ),
   1322              cmpnez8( sel8x4_0(xx) )
   1323           );
   1324 }
   1325 
   1326 UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
   1327 {
   1328    return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
   1329           + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
   1330           + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
   1331           + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
   1332 }
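
         /* Sum of absolute differences over the four byte lanes, e.g.
            h_generic_calc_Sad8Ux4(0x01020304, 0x05040302) == 4+2+0+2 == 8. */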
   1333 
   1334 
   1335 /*---------------------------------------------------------------*/
   1336 /*--- end                               host_generic_simd64.c ---*/
   1337 /*---------------------------------------------------------------*/
   1338