      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                             host_generic_simd64.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2011 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* Generic helper functions for doing 64-bit SIMD arithmetic in cases
     37    where the instruction selectors cannot generate code in-line.
     38    These are purely back-end entities and cannot be seen/referenced
     39    from IR. */
     40 
     41 #include "libvex_basictypes.h"
     42 #include "host_generic_simd64.h"
     43 
     44 
     45 
     46 /* Tuple/select functions for 32x2 vectors. */
     47 
     48 static inline ULong mk32x2 ( UInt w1, UInt w0 ) {
     49    return (((ULong)w1) << 32) | ((ULong)w0);
     50 }
     51 
     52 static inline UInt sel32x2_1 ( ULong w64 ) {
     53    return 0xFFFFFFFF & toUInt(w64 >> 32);
     54 }
     55 static inline UInt sel32x2_0 ( ULong w64 ) {
     56    return 0xFFFFFFFF & toUInt(w64);
     57 }
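/* Worked example: mk32x2(0xDEADBEEF, 0x01234567) yields
   0xDEADBEEF01234567ULL; applied to that value, sel32x2_1 gives back
   0xDEADBEEF and sel32x2_0 gives back 0x01234567. */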
     58 
     59 
     60 /* Tuple/select functions for 16x4 vectors.  gcc is pretty hopeless
     61    with 64-bit shifts so we give it a hand. */
     62 
     63 static inline ULong mk16x4 ( UShort w3, UShort w2,
     64                              UShort w1, UShort w0 ) {
     65    UInt hi32 = (((UInt)w3) << 16) | ((UInt)w2);
     66    UInt lo32 = (((UInt)w1) << 16) | ((UInt)w0);
     67    return mk32x2(hi32, lo32);
     68 }
     69 
     70 static inline UShort sel16x4_3 ( ULong w64 ) {
     71    UInt hi32 = toUInt(w64 >> 32);
     72    return toUShort(0xFFFF & (hi32 >> 16));
     73 }
     74 static inline UShort sel16x4_2 ( ULong w64 ) {
     75    UInt hi32 = toUInt(w64 >> 32);
     76    return toUShort(0xFFFF & hi32);
     77 }
     78 static inline UShort sel16x4_1 ( ULong w64 ) {
     79    UInt lo32 = (UInt)w64;
     80    return toUShort(0xFFFF & (lo32 >> 16));
     81 }
     82 static inline UShort sel16x4_0 ( ULong w64 ) {
     83    UInt lo32 = (UInt)w64;
     84    return toUShort(0xFFFF & lo32);
     85 }
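/* Worked example: mk16x4(0x1111, 0x2222, 0x3333, 0x4444) yields
   0x1111222233334444ULL; sel16x4_3 of that value is 0x1111 and
   sel16x4_0 is 0x4444, so lane 3 is the most significant 16 bits. */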
     86 
     87 
     88 /* Tuple/select functions for 8x8 vectors. */
     89 
     90 static inline ULong mk8x8 ( UChar w7, UChar w6,
     91                             UChar w5, UChar w4,
     92                             UChar w3, UChar w2,
     93                             UChar w1, UChar w0 ) {
     94    UInt hi32 =   (((UInt)w7) << 24) | (((UInt)w6) << 16)
     95                | (((UInt)w5) << 8)  | (((UInt)w4) << 0);
     96    UInt lo32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
     97                | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
     98    return mk32x2(hi32, lo32);
     99 }
    100 
    101 static inline UChar sel8x8_7 ( ULong w64 ) {
    102    UInt hi32 = toUInt(w64 >> 32);
    103    return toUChar(0xFF & (hi32 >> 24));
    104 }
    105 static inline UChar sel8x8_6 ( ULong w64 ) {
    106    UInt hi32 = toUInt(w64 >> 32);
    107    return toUChar(0xFF & (hi32 >> 16));
    108 }
    109 static inline UChar sel8x8_5 ( ULong w64 ) {
    110    UInt hi32 = toUInt(w64 >> 32);
    111    return toUChar(0xFF & (hi32 >> 8));
    112 }
    113 static inline UChar sel8x8_4 ( ULong w64 ) {
    114    UInt hi32 = toUInt(w64 >> 32);
    115    return toUChar(0xFF & (hi32 >> 0));
    116 }
    117 static inline UChar sel8x8_3 ( ULong w64 ) {
    118    UInt lo32 = (UInt)w64;
    119    return toUChar(0xFF & (lo32 >> 24));
    120 }
    121 static inline UChar sel8x8_2 ( ULong w64 ) {
    122    UInt lo32 = (UInt)w64;
    123    return toUChar(0xFF & (lo32 >> 16));
    124 }
    125 static inline UChar sel8x8_1 ( ULong w64 ) {
    126    UInt lo32 = (UInt)w64;
    127    return toUChar(0xFF & (lo32 >> 8));
    128 }
    129 static inline UChar sel8x8_0 ( ULong w64 ) {
    130    UInt lo32 = (UInt)w64;
    131    return toUChar(0xFF & (lo32 >> 0));
    132 }
    133 
    134 static inline UChar index8x8 ( ULong w64, UChar ix ) {
    135    ix &= 7;
    136    return toUChar((w64 >> (8*ix)) & 0xFF);
    137 }
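/* Worked example: for w64 = 0x7766554433221100ULL, sel8x8_7(w64) is 0x77,
   sel8x8_0(w64) is 0x00, and index8x8(w64, 5) is 0x55.  Because index8x8
   masks the index with 7, index8x8(w64, 13) also returns 0x55. */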
    138 
    139 
    140 /* Scalar helpers. */
    141 
    142 static inline Short qadd16S ( Short xx, Short yy )
    143 {
    144    Int t = ((Int)xx) + ((Int)yy);
    145    if (t < -32768) t = -32768;
    146    if (t > 32767)  t = 32767;
    147    return (Short)t;
    148 }
    149 
    150 static inline Char qadd8S ( Char xx, Char yy )
    151 {
    152    Int t = ((Int)xx) + ((Int)yy);
    153    if (t < -128) t = -128;
    154    if (t > 127)  t = 127;
    155    return (Char)t;
    156 }
    157 
    158 static inline UShort qadd16U ( UShort xx, UShort yy )
    159 {
    160    UInt t = ((UInt)xx) + ((UInt)yy);
    161    if (t > 0xFFFF) t = 0xFFFF;
    162    return (UShort)t;
    163 }
    164 
    165 static inline UChar qadd8U ( UChar xx, UChar yy )
    166 {
    167    UInt t = ((UInt)xx) + ((UInt)yy);
    168    if (t > 0xFF) t = 0xFF;
    169    return (UChar)t;
    170 }
    171 
    172 static inline Short qsub16S ( Short xx, Short yy )
    173 {
    174    Int t = ((Int)xx) - ((Int)yy);
    175    if (t < -32768) t = -32768;
    176    if (t > 32767)  t = 32767;
    177    return (Short)t;
    178 }
    179 
    180 static inline Char qsub8S ( Char xx, Char yy )
    181 {
    182    Int t = ((Int)xx) - ((Int)yy);
    183    if (t < -128) t = -128;
    184    if (t > 127)  t = 127;
    185    return (Char)t;
    186 }
    187 
    188 static inline UShort qsub16U ( UShort xx, UShort yy )
    189 {
    190    Int t = ((Int)xx) - ((Int)yy);
    191    if (t < 0)      t = 0;
    192    if (t > 0xFFFF) t = 0xFFFF;
    193    return (UShort)t;
    194 }
    195 
    196 static inline UChar qsub8U ( UChar xx, UChar yy )
    197 {
    198    Int t = ((Int)xx) - ((Int)yy);
    199    if (t < 0)    t = 0;
    200    if (t > 0xFF) t = 0xFF;
    201    return (UChar)t;
    202 }
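/* The saturating helpers clamp at the lane type's limits instead of
   wrapping.  A minimal sketch of the behaviour (kept disabled; the
   function and variable names below are purely illustrative): */
#if 0
static void illustrate_saturation ( void )
{
   Short a = qadd16S(30000, 30000);   /* 60000 clamps to  32767 */
   Short b = qsub16S(-30000, 10000);  /* -40000 clamps to -32768 */
   UChar c = qadd8U (200, 100);       /* 300 clamps to 255 */
   UChar d = qsub8U (10, 20);         /* -10 clamps to 0 */
   (void)a; (void)b; (void)c; (void)d;
}
#endif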
    203 
    204 static inline Short mul16 ( Short xx, Short yy )
    205 {
    206    Int t = ((Int)xx) * ((Int)yy);
    207    return (Short)t;
    208 }
    209 
    210 static inline Int mul32 ( Int xx, Int yy )
    211 {
    212    Int t = ((Int)xx) * ((Int)yy);
    213    return (Int)t;
    214 }
    215 
    216 static inline Short mulhi16S ( Short xx, Short yy )
    217 {
    218    Int t = ((Int)xx) * ((Int)yy);
    219    t >>=/*s*/ 16;
    220    return (Short)t;
    221 }
    222 
    223 static inline UShort mulhi16U ( UShort xx, UShort yy )
    224 {
    225    UInt t = ((UInt)xx) * ((UInt)yy);
    226    t >>=/*u*/ 16;
    227    return (UShort)t;
    228 }
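/* Worked example: mul16(16384, 4) is 0, because the 32-bit product 65536
   (0x10000) is truncated to 16 bits, whereas mulhi16S(16384, 4) is 1,
   the high 16 bits of the same product. */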
    229 
    230 static inline UInt cmpeq32 ( UInt xx, UInt yy )
    231 {
    232    return xx==yy ? 0xFFFFFFFF : 0;
    233 }
    234 
    235 static inline UShort cmpeq16 ( UShort xx, UShort yy )
    236 {
    237    return toUShort(xx==yy ? 0xFFFF : 0);
    238 }
    239 
    240 static inline UChar cmpeq8 ( UChar xx, UChar yy )
    241 {
    242    return toUChar(xx==yy ? 0xFF : 0);
    243 }
    244 
    245 static inline UInt cmpgt32S ( Int xx, Int yy )
    246 {
    247    return xx>yy ? 0xFFFFFFFF : 0;
    248 }
    249 
    250 static inline UShort cmpgt16S ( Short xx, Short yy )
    251 {
    252    return toUShort(xx>yy ? 0xFFFF : 0);
    253 }
    254 
    255 static inline UChar cmpgt8S ( Char xx, Char yy )
    256 {
    257    return toUChar(xx>yy ? 0xFF : 0);
    258 }
    259 
    260 static inline UInt cmpnez32 ( UInt xx )
    261 {
    262    return xx==0 ? 0 : 0xFFFFFFFF;
    263 }
    264 
    265 static inline UShort cmpnez16 ( UShort xx )
    266 {
    267    return toUShort(xx==0 ? 0 : 0xFFFF);
    268 }
    269 
    270 static inline UChar cmpnez8 ( UChar xx )
    271 {
    272    return toUChar(xx==0 ? 0 : 0xFF);
    273 }
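/* Worked example: the comparison helpers produce all-ones or all-zeroes
   lane masks, e.g. cmpgt16S(5, -3) == 0xFFFF, cmpgt16S(-3, 5) == 0,
   cmpnez8(7) == 0xFF and cmpnez8(0) == 0. */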
    274 
    275 static inline Short qnarrow32Sto16S ( UInt xx0 )
    276 {
    277    Int xx = (Int)xx0;
    278    if (xx < -32768) xx = -32768;
    279    if (xx > 32767)  xx = 32767;
    280    return (Short)xx;
    281 }
    282 
    283 static inline Char qnarrow16Sto8S ( UShort xx0 )
    284 {
    285    Short xx = (Short)xx0;
    286    if (xx < -128) xx = -128;
    287    if (xx > 127)  xx = 127;
    288    return (Char)xx;
    289 }
    290 
    291 static inline UChar qnarrow16Sto8U ( UShort xx0 )
    292 {
    293    Short xx = (Short)xx0;
    294    if (xx < 0)   xx = 0;
    295    if (xx > 255) xx = 255;
    296    return (UChar)xx;
    297 }
    298 
    299 static inline UShort narrow32to16 ( UInt xx )
    300 {
    301    return (UShort)xx;
    302 }
    303 
    304 static inline UChar narrow16to8 ( UShort xx )
    305 {
    306    return (UChar)xx;
    307 }
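/* Worked example: qnarrow32Sto16S(100000) == 32767 and
   qnarrow32Sto16S((UInt)-100000) == -32768 (saturating), while
   narrow32to16(0x12345678) == 0x5678 (plain truncation).  Similarly
   qnarrow16Sto8U((UShort)-5) == 0 but narrow16to8((UShort)-5) == 0xFB. */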
    308 
    309 /* shifts: we don't care about out-of-range ones, since
    310    that is dealt with at a higher level. */
    311 
    312 static inline UChar shl8 ( UChar v, UInt n )
    313 {
    314    return toUChar(v << n);
    315 }
    316 
    317 static inline UChar sar8 ( UChar v, UInt n )
    318 {
    319    return toUChar(((Char)v) >> n);
    320 }
    321 
    322 static inline UShort shl16 ( UShort v, UInt n )
    323 {
    324    return toUShort(v << n);
    325 }
    326 
    327 static inline UShort shr16 ( UShort v, UInt n )
    328 {
    329    return toUShort((((UShort)v) >> n));
    330 }
    331 
    332 static inline UShort sar16 ( UShort v, UInt n )
    333 {
    334    return toUShort(((Short)v) >> n);
    335 }
    336 
    337 static inline UInt shl32 ( UInt v, UInt n )
    338 {
    339    return v << n;
    340 }
    341 
    342 static inline UInt shr32 ( UInt v, UInt n )
    343 {
    344    return (((UInt)v) >> n);
    345 }
    346 
    347 static inline UInt sar32 ( UInt v, UInt n )
    348 {
    349    return ((Int)v) >> n;
    350 }
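/* Worked example (assuming the usual arithmetic behaviour of ">>" on
   signed values): shr16(0x8000, 4) == 0x0800 zero-fills, whereas
   sar16(0x8000, 4) == 0xF800, because the lane is treated as the signed
   value -32768 and the sign bit is replicated. */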
    351 
    352 static inline UChar avg8U ( UChar xx, UChar yy )
    353 {
    354    UInt xxi = (UInt)xx;
    355    UInt yyi = (UInt)yy;
    356    UInt r   = (xxi + yyi + 1) >> 1;
    357    return (UChar)r;
    358 }
    359 
    360 static inline UShort avg16U ( UShort xx, UShort yy )
    361 {
    362    UInt xxi = (UInt)xx;
    363    UInt yyi = (UInt)yy;
    364    UInt r   = (xxi + yyi + 1) >> 1;
    365    return (UShort)r;
    366 }
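/* Worked example: avg8U(1, 2) == 2 (the "+ 1" rounds halves upward) and
   avg8U(250, 252) == 251; the sum is formed in 32 bits, so it cannot
   overflow the 8-bit operands. */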
    367 
    368 static inline Short max16S ( Short xx, Short yy )
    369 {
    370    return toUShort((xx > yy) ? xx : yy);
    371 }
    372 
    373 static inline UChar max8U ( UChar xx, UChar yy )
    374 {
    375    return toUChar((xx > yy) ? xx : yy);
    376 }
    377 
    378 static inline Short min16S ( Short xx, Short yy )
    379 {
    380    return toUShort((xx < yy) ? xx : yy);
    381 }
    382 
    383 static inline UChar min8U ( UChar xx, UChar yy )
    384 {
    385    return toUChar((xx < yy) ? xx : yy);
    386 }
    387 
    388 static inline UShort hadd16U ( UShort xx, UShort yy )
    389 {
    390    UInt xxi = (UInt)xx;
    391    UInt yyi = (UInt)yy;
    392    UInt r   = (xxi + yyi) >> 1;
    393    return (UShort)r;
    394 }
    395 
    396 static inline Short hadd16S ( Short xx, Short yy )
    397 {
    398    Int xxi = (Int)xx;
    399    Int yyi = (Int)yy;
    400    Int r   = (xxi + yyi) >> 1;
    401    return (Short)r;
    402 }
    403 
    404 static inline UShort hsub16U ( UShort xx, UShort yy )
    405 {
    406    UInt xxi = (UInt)xx;
    407    UInt yyi = (UInt)yy;
    408    UInt r   = (xxi - yyi) >> 1;
    409    return (UShort)r;
    410 }
    411 
    412 static inline Short hsub16S ( Short xx, Short yy )
    413 {
    414    Int xxi = (Int)xx;
    415    Int yyi = (Int)yy;
    416    Int r   = (xxi - yyi) >> 1;
    417    return (Short)r;
    418 }
    419 
    420 static inline UChar hadd8U ( UChar xx, UChar yy )
    421 {
    422    UInt xxi = (UInt)xx;
    423    UInt yyi = (UInt)yy;
    424    UInt r   = (xxi + yyi) >> 1;
    425    return (UChar)r;
    426 }
    427 
    428 static inline Char hadd8S ( Char xx, Char yy )
    429 {
    430    Int xxi = (Int)xx;
    431    Int yyi = (Int)yy;
    432    Int r   = (xxi + yyi) >> 1;
    433    return (Char)r;
    434 }
    435 
    436 static inline UChar hsub8U ( UChar xx, UChar yy )
    437 {
    438    UInt xxi = (UInt)xx;
    439    UInt yyi = (UInt)yy;
    440    UInt r   = (xxi - yyi) >> 1;
    441    return (UChar)r;
    442 }
    443 
    444 static inline Char hsub8S ( Char xx, Char yy )
    445 {
    446    Int xxi = (Int)xx;
    447    Int yyi = (Int)yy;
    448    Int r   = (xxi - yyi) >> 1;
    449    return (Char)r;
    450 }
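/* Worked example: the halving helpers keep the extra precision of the
   intermediate sum or difference, e.g. hadd16U(0xFFFF, 0xFFFF) == 0xFFFF
   (no wraparound), hadd16S(1, 2) == 1 and hadd16S(-1, -2) == -2, since
   the signed ">> 1" (with the usual arithmetic-shift behaviour) rounds
   towards minus infinity. */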
    451 
    452 static inline UInt absdiff8U ( UChar xx, UChar yy )
    453 {
    454    UInt xxu = (UChar)xx;
    455    UInt yyu = (UChar)yy;
    456    return xxu >= yyu  ? xxu - yyu  : yyu - xxu;
    457 }
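/* Worked example: absdiff8U(10, 250) == absdiff8U(250, 10) == 240. */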
    458 
    459 /* ----------------------------------------------------- */
    460 /* Start of the externally visible functions.  These simply
    461    implement the corresponding IR primops. */
    462 /* ----------------------------------------------------- */
    463 
    464 /* ------------ Normal addition ------------ */
    465 
    466 ULong h_generic_calc_Add32x2 ( ULong xx, ULong yy )
    467 {
    468    return mk32x2(
    469              sel32x2_1(xx) + sel32x2_1(yy),
    470              sel32x2_0(xx) + sel32x2_0(yy)
    471           );
    472 }
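/* A minimal sketch (kept disabled; names are only for illustration) of
   the lane independence of these helpers: */
#if 0
static void illustrate_lane_independence ( void )
{
   /* The low lane wraps to 0; no carry propagates into the high lane,
      which simply becomes 2. */
   ULong r = h_generic_calc_Add32x2(0x00000001FFFFFFFFULL,
                                    0x0000000100000001ULL);
   /* r == 0x0000000200000000ULL */
   (void)r;
}
#endif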
    473 
    474 ULong h_generic_calc_Add16x4 ( ULong xx, ULong yy )
    475 {
    476    return mk16x4(
    477              toUShort( sel16x4_3(xx) + sel16x4_3(yy) ),
    478              toUShort( sel16x4_2(xx) + sel16x4_2(yy) ),
    479              toUShort( sel16x4_1(xx) + sel16x4_1(yy) ),
    480              toUShort( sel16x4_0(xx) + sel16x4_0(yy) )
    481           );
    482 }
    483 
    484 ULong h_generic_calc_Add8x8 ( ULong xx, ULong yy )
    485 {
    486    return mk8x8(
    487              toUChar( sel8x8_7(xx) + sel8x8_7(yy) ),
    488              toUChar( sel8x8_6(xx) + sel8x8_6(yy) ),
    489              toUChar( sel8x8_5(xx) + sel8x8_5(yy) ),
    490              toUChar( sel8x8_4(xx) + sel8x8_4(yy) ),
    491              toUChar( sel8x8_3(xx) + sel8x8_3(yy) ),
    492              toUChar( sel8x8_2(xx) + sel8x8_2(yy) ),
    493              toUChar( sel8x8_1(xx) + sel8x8_1(yy) ),
    494              toUChar( sel8x8_0(xx) + sel8x8_0(yy) )
    495           );
    496 }
    497 
    498 /* ------------ Saturating addition ------------ */
    499 
    500 ULong h_generic_calc_QAdd16Sx4 ( ULong xx, ULong yy )
    501 {
    502    return mk16x4(
    503              qadd16S( sel16x4_3(xx), sel16x4_3(yy) ),
    504              qadd16S( sel16x4_2(xx), sel16x4_2(yy) ),
    505              qadd16S( sel16x4_1(xx), sel16x4_1(yy) ),
    506              qadd16S( sel16x4_0(xx), sel16x4_0(yy) )
    507           );
    508 }
    509 
    510 ULong h_generic_calc_QAdd8Sx8 ( ULong xx, ULong yy )
    511 {
    512    return mk8x8(
    513              qadd8S( sel8x8_7(xx), sel8x8_7(yy) ),
    514              qadd8S( sel8x8_6(xx), sel8x8_6(yy) ),
    515              qadd8S( sel8x8_5(xx), sel8x8_5(yy) ),
    516              qadd8S( sel8x8_4(xx), sel8x8_4(yy) ),
    517              qadd8S( sel8x8_3(xx), sel8x8_3(yy) ),
    518              qadd8S( sel8x8_2(xx), sel8x8_2(yy) ),
    519              qadd8S( sel8x8_1(xx), sel8x8_1(yy) ),
    520              qadd8S( sel8x8_0(xx), sel8x8_0(yy) )
    521           );
    522 }
    523 
    524 ULong h_generic_calc_QAdd16Ux4 ( ULong xx, ULong yy )
    525 {
    526    return mk16x4(
    527              qadd16U( sel16x4_3(xx), sel16x4_3(yy) ),
    528              qadd16U( sel16x4_2(xx), sel16x4_2(yy) ),
    529              qadd16U( sel16x4_1(xx), sel16x4_1(yy) ),
    530              qadd16U( sel16x4_0(xx), sel16x4_0(yy) )
    531           );
    532 }
    533 
    534 ULong h_generic_calc_QAdd8Ux8 ( ULong xx, ULong yy )
    535 {
    536    return mk8x8(
    537              qadd8U( sel8x8_7(xx), sel8x8_7(yy) ),
    538              qadd8U( sel8x8_6(xx), sel8x8_6(yy) ),
    539              qadd8U( sel8x8_5(xx), sel8x8_5(yy) ),
    540              qadd8U( sel8x8_4(xx), sel8x8_4(yy) ),
    541              qadd8U( sel8x8_3(xx), sel8x8_3(yy) ),
    542              qadd8U( sel8x8_2(xx), sel8x8_2(yy) ),
    543              qadd8U( sel8x8_1(xx), sel8x8_1(yy) ),
    544              qadd8U( sel8x8_0(xx), sel8x8_0(yy) )
    545           );
    546 }
    547 
    548 /* ------------ Normal subtraction ------------ */
    549 
    550 ULong h_generic_calc_Sub32x2 ( ULong xx, ULong yy )
    551 {
    552    return mk32x2(
    553              sel32x2_1(xx) - sel32x2_1(yy),
    554              sel32x2_0(xx) - sel32x2_0(yy)
    555           );
    556 }
    557 
    558 ULong h_generic_calc_Sub16x4 ( ULong xx, ULong yy )
    559 {
    560    return mk16x4(
    561              toUShort( sel16x4_3(xx) - sel16x4_3(yy) ),
    562              toUShort( sel16x4_2(xx) - sel16x4_2(yy) ),
    563              toUShort( sel16x4_1(xx) - sel16x4_1(yy) ),
    564              toUShort( sel16x4_0(xx) - sel16x4_0(yy) )
    565           );
    566 }
    567 
    568 ULong h_generic_calc_Sub8x8 ( ULong xx, ULong yy )
    569 {
    570    return mk8x8(
    571              toUChar( sel8x8_7(xx) - sel8x8_7(yy) ),
    572              toUChar( sel8x8_6(xx) - sel8x8_6(yy) ),
    573              toUChar( sel8x8_5(xx) - sel8x8_5(yy) ),
    574              toUChar( sel8x8_4(xx) - sel8x8_4(yy) ),
    575              toUChar( sel8x8_3(xx) - sel8x8_3(yy) ),
    576              toUChar( sel8x8_2(xx) - sel8x8_2(yy) ),
    577              toUChar( sel8x8_1(xx) - sel8x8_1(yy) ),
    578              toUChar( sel8x8_0(xx) - sel8x8_0(yy) )
    579           );
    580 }
    581 
    582 /* ------------ Saturating subtraction ------------ */
    583 
    584 ULong h_generic_calc_QSub16Sx4 ( ULong xx, ULong yy )
    585 {
    586    return mk16x4(
    587              qsub16S( sel16x4_3(xx), sel16x4_3(yy) ),
    588              qsub16S( sel16x4_2(xx), sel16x4_2(yy) ),
    589              qsub16S( sel16x4_1(xx), sel16x4_1(yy) ),
    590              qsub16S( sel16x4_0(xx), sel16x4_0(yy) )
    591           );
    592 }
    593 
    594 ULong h_generic_calc_QSub8Sx8 ( ULong xx, ULong yy )
    595 {
    596    return mk8x8(
    597              qsub8S( sel8x8_7(xx), sel8x8_7(yy) ),
    598              qsub8S( sel8x8_6(xx), sel8x8_6(yy) ),
    599              qsub8S( sel8x8_5(xx), sel8x8_5(yy) ),
    600              qsub8S( sel8x8_4(xx), sel8x8_4(yy) ),
    601              qsub8S( sel8x8_3(xx), sel8x8_3(yy) ),
    602              qsub8S( sel8x8_2(xx), sel8x8_2(yy) ),
    603              qsub8S( sel8x8_1(xx), sel8x8_1(yy) ),
    604              qsub8S( sel8x8_0(xx), sel8x8_0(yy) )
    605           );
    606 }
    607 
    608 ULong h_generic_calc_QSub16Ux4 ( ULong xx, ULong yy )
    609 {
    610    return mk16x4(
    611              qsub16U( sel16x4_3(xx), sel16x4_3(yy) ),
    612              qsub16U( sel16x4_2(xx), sel16x4_2(yy) ),
    613              qsub16U( sel16x4_1(xx), sel16x4_1(yy) ),
    614              qsub16U( sel16x4_0(xx), sel16x4_0(yy) )
    615           );
    616 }
    617 
    618 ULong h_generic_calc_QSub8Ux8 ( ULong xx, ULong yy )
    619 {
    620    return mk8x8(
    621              qsub8U( sel8x8_7(xx), sel8x8_7(yy) ),
    622              qsub8U( sel8x8_6(xx), sel8x8_6(yy) ),
    623              qsub8U( sel8x8_5(xx), sel8x8_5(yy) ),
    624              qsub8U( sel8x8_4(xx), sel8x8_4(yy) ),
    625              qsub8U( sel8x8_3(xx), sel8x8_3(yy) ),
    626              qsub8U( sel8x8_2(xx), sel8x8_2(yy) ),
    627              qsub8U( sel8x8_1(xx), sel8x8_1(yy) ),
    628              qsub8U( sel8x8_0(xx), sel8x8_0(yy) )
    629           );
    630 }
    631 
    632 /* ------------ Multiplication ------------ */
    633 
    634 ULong h_generic_calc_Mul16x4 ( ULong xx, ULong yy )
    635 {
    636    return mk16x4(
    637              mul16( sel16x4_3(xx), sel16x4_3(yy) ),
    638              mul16( sel16x4_2(xx), sel16x4_2(yy) ),
    639              mul16( sel16x4_1(xx), sel16x4_1(yy) ),
    640              mul16( sel16x4_0(xx), sel16x4_0(yy) )
    641           );
    642 }
    643 
    644 ULong h_generic_calc_Mul32x2 ( ULong xx, ULong yy )
    645 {
    646    return mk32x2(
    647              mul32( sel32x2_1(xx), sel32x2_1(yy) ),
    648              mul32( sel32x2_0(xx), sel32x2_0(yy) )
    649           );
    650 }
    651 
    652 ULong h_generic_calc_MulHi16Sx4 ( ULong xx, ULong yy )
    653 {
    654    return mk16x4(
    655              mulhi16S( sel16x4_3(xx), sel16x4_3(yy) ),
    656              mulhi16S( sel16x4_2(xx), sel16x4_2(yy) ),
    657              mulhi16S( sel16x4_1(xx), sel16x4_1(yy) ),
    658              mulhi16S( sel16x4_0(xx), sel16x4_0(yy) )
    659           );
    660 }
    661 
    662 ULong h_generic_calc_MulHi16Ux4 ( ULong xx, ULong yy )
    663 {
    664    return mk16x4(
    665              mulhi16U( sel16x4_3(xx), sel16x4_3(yy) ),
    666              mulhi16U( sel16x4_2(xx), sel16x4_2(yy) ),
    667              mulhi16U( sel16x4_1(xx), sel16x4_1(yy) ),
    668              mulhi16U( sel16x4_0(xx), sel16x4_0(yy) )
    669           );
    670 }
    671 
    672 /* ------------ Comparison ------------ */
    673 
    674 ULong h_generic_calc_CmpEQ32x2 ( ULong xx, ULong yy )
    675 {
    676    return mk32x2(
    677              cmpeq32( sel32x2_1(xx), sel32x2_1(yy) ),
    678              cmpeq32( sel32x2_0(xx), sel32x2_0(yy) )
    679           );
    680 }
    681 
    682 ULong h_generic_calc_CmpEQ16x4 ( ULong xx, ULong yy )
    683 {
    684    return mk16x4(
    685              cmpeq16( sel16x4_3(xx), sel16x4_3(yy) ),
    686              cmpeq16( sel16x4_2(xx), sel16x4_2(yy) ),
    687              cmpeq16( sel16x4_1(xx), sel16x4_1(yy) ),
    688              cmpeq16( sel16x4_0(xx), sel16x4_0(yy) )
    689           );
    690 }
    691 
    692 ULong h_generic_calc_CmpEQ8x8 ( ULong xx, ULong yy )
    693 {
    694    return mk8x8(
    695              cmpeq8( sel8x8_7(xx), sel8x8_7(yy) ),
    696              cmpeq8( sel8x8_6(xx), sel8x8_6(yy) ),
    697              cmpeq8( sel8x8_5(xx), sel8x8_5(yy) ),
    698              cmpeq8( sel8x8_4(xx), sel8x8_4(yy) ),
    699              cmpeq8( sel8x8_3(xx), sel8x8_3(yy) ),
    700              cmpeq8( sel8x8_2(xx), sel8x8_2(yy) ),
    701              cmpeq8( sel8x8_1(xx), sel8x8_1(yy) ),
    702              cmpeq8( sel8x8_0(xx), sel8x8_0(yy) )
    703           );
    704 }
    705 
    706 ULong h_generic_calc_CmpGT32Sx2 ( ULong xx, ULong yy )
    707 {
    708    return mk32x2(
    709              cmpgt32S( sel32x2_1(xx), sel32x2_1(yy) ),
    710              cmpgt32S( sel32x2_0(xx), sel32x2_0(yy) )
    711           );
    712 }
    713 
    714 ULong h_generic_calc_CmpGT16Sx4 ( ULong xx, ULong yy )
    715 {
    716    return mk16x4(
    717              cmpgt16S( sel16x4_3(xx), sel16x4_3(yy) ),
    718              cmpgt16S( sel16x4_2(xx), sel16x4_2(yy) ),
    719              cmpgt16S( sel16x4_1(xx), sel16x4_1(yy) ),
    720              cmpgt16S( sel16x4_0(xx), sel16x4_0(yy) )
    721           );
    722 }
    723 
    724 ULong h_generic_calc_CmpGT8Sx8 ( ULong xx, ULong yy )
    725 {
    726    return mk8x8(
    727              cmpgt8S( sel8x8_7(xx), sel8x8_7(yy) ),
    728              cmpgt8S( sel8x8_6(xx), sel8x8_6(yy) ),
    729              cmpgt8S( sel8x8_5(xx), sel8x8_5(yy) ),
    730              cmpgt8S( sel8x8_4(xx), sel8x8_4(yy) ),
    731              cmpgt8S( sel8x8_3(xx), sel8x8_3(yy) ),
    732              cmpgt8S( sel8x8_2(xx), sel8x8_2(yy) ),
    733              cmpgt8S( sel8x8_1(xx), sel8x8_1(yy) ),
    734              cmpgt8S( sel8x8_0(xx), sel8x8_0(yy) )
    735           );
    736 }
    737 
    738 ULong h_generic_calc_CmpNEZ32x2 ( ULong xx )
    739 {
    740    return mk32x2(
    741              cmpnez32( sel32x2_1(xx) ),
    742              cmpnez32( sel32x2_0(xx) )
    743           );
    744 }
    745 
    746 ULong h_generic_calc_CmpNEZ16x4 ( ULong xx )
    747 {
    748    return mk16x4(
    749              cmpnez16( sel16x4_3(xx) ),
    750              cmpnez16( sel16x4_2(xx) ),
    751              cmpnez16( sel16x4_1(xx) ),
    752              cmpnez16( sel16x4_0(xx) )
    753           );
    754 }
    755 
    756 ULong h_generic_calc_CmpNEZ8x8 ( ULong xx )
    757 {
    758    return mk8x8(
    759              cmpnez8( sel8x8_7(xx) ),
    760              cmpnez8( sel8x8_6(xx) ),
    761              cmpnez8( sel8x8_5(xx) ),
    762              cmpnez8( sel8x8_4(xx) ),
    763              cmpnez8( sel8x8_3(xx) ),
    764              cmpnez8( sel8x8_2(xx) ),
    765              cmpnez8( sel8x8_1(xx) ),
    766              cmpnez8( sel8x8_0(xx) )
    767           );
    768 }
    769 
    770 /* ------------ Saturating narrowing ------------ */
    771 
    772 ULong h_generic_calc_QNarrowBin32Sto16Sx4 ( ULong aa, ULong bb )
    773 {
    774    UInt d = sel32x2_1(aa);
    775    UInt c = sel32x2_0(aa);
    776    UInt b = sel32x2_1(bb);
    777    UInt a = sel32x2_0(bb);
    778    return mk16x4(
    779              qnarrow32Sto16S(d),
    780              qnarrow32Sto16S(c),
    781              qnarrow32Sto16S(b),
    782              qnarrow32Sto16S(a)
    783           );
    784 }
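/* Worked example: the first argument supplies the two high result lanes
   and the second the two low ones, so
   h_generic_calc_QNarrowBin32Sto16Sx4(0x0000000100000002ULL,
                                       0x0000000300000004ULL)
   == 0x0001000200030004ULL, with each 32-bit lane saturated to 16 bits. */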
    785 
    786 ULong h_generic_calc_QNarrowBin16Sto8Sx8 ( ULong aa, ULong bb )
    787 {
    788    UShort h = sel16x4_3(aa);
    789    UShort g = sel16x4_2(aa);
    790    UShort f = sel16x4_1(aa);
    791    UShort e = sel16x4_0(aa);
    792    UShort d = sel16x4_3(bb);
    793    UShort c = sel16x4_2(bb);
    794    UShort b = sel16x4_1(bb);
    795    UShort a = sel16x4_0(bb);
    796    return mk8x8(
    797              qnarrow16Sto8S(h),
    798              qnarrow16Sto8S(g),
    799              qnarrow16Sto8S(f),
    800              qnarrow16Sto8S(e),
    801              qnarrow16Sto8S(d),
    802              qnarrow16Sto8S(c),
    803              qnarrow16Sto8S(b),
    804              qnarrow16Sto8S(a)
    805           );
    806 }
    807 
    808 ULong h_generic_calc_QNarrowBin16Sto8Ux8 ( ULong aa, ULong bb )
    809 {
    810    UShort h = sel16x4_3(aa);
    811    UShort g = sel16x4_2(aa);
    812    UShort f = sel16x4_1(aa);
    813    UShort e = sel16x4_0(aa);
    814    UShort d = sel16x4_3(bb);
    815    UShort c = sel16x4_2(bb);
    816    UShort b = sel16x4_1(bb);
    817    UShort a = sel16x4_0(bb);
    818    return mk8x8(
    819              qnarrow16Sto8U(h),
    820              qnarrow16Sto8U(g),
    821              qnarrow16Sto8U(f),
    822              qnarrow16Sto8U(e),
    823              qnarrow16Sto8U(d),
    824              qnarrow16Sto8U(c),
    825              qnarrow16Sto8U(b),
    826              qnarrow16Sto8U(a)
    827           );
    828 }
    829 
    830 /* ------------ Truncating narrowing ------------ */
    831 
    832 ULong h_generic_calc_NarrowBin32to16x4 ( ULong aa, ULong bb )
    833 {
    834    UInt d = sel32x2_1(aa);
    835    UInt c = sel32x2_0(aa);
    836    UInt b = sel32x2_1(bb);
    837    UInt a = sel32x2_0(bb);
    838    return mk16x4(
    839              narrow32to16(d),
    840              narrow32to16(c),
    841              narrow32to16(b),
    842              narrow32to16(a)
    843           );
    844 }
    845 
    846 ULong h_generic_calc_NarrowBin16to8x8 ( ULong aa, ULong bb )
    847 {
    848    UShort h = sel16x4_3(aa);
    849    UShort g = sel16x4_2(aa);
    850    UShort f = sel16x4_1(aa);
    851    UShort e = sel16x4_0(aa);
    852    UShort d = sel16x4_3(bb);
    853    UShort c = sel16x4_2(bb);
    854    UShort b = sel16x4_1(bb);
    855    UShort a = sel16x4_0(bb);
    856    return mk8x8(
    857              narrow16to8(h),
    858              narrow16to8(g),
    859              narrow16to8(f),
    860              narrow16to8(e),
    861              narrow16to8(d),
    862              narrow16to8(c),
    863              narrow16to8(b),
    864              narrow16to8(a)
    865           );
    866 }
    867 
    868 /* ------------ Interleaving ------------ */
    869 
    870 ULong h_generic_calc_InterleaveHI8x8 ( ULong aa, ULong bb )
    871 {
    872    return mk8x8(
    873              sel8x8_7(aa),
    874              sel8x8_7(bb),
    875              sel8x8_6(aa),
    876              sel8x8_6(bb),
    877              sel8x8_5(aa),
    878              sel8x8_5(bb),
    879              sel8x8_4(aa),
    880              sel8x8_4(bb)
    881           );
    882 }
    883 
    884 ULong h_generic_calc_InterleaveLO8x8 ( ULong aa, ULong bb )
    885 {
    886    return mk8x8(
    887              sel8x8_3(aa),
    888              sel8x8_3(bb),
    889              sel8x8_2(aa),
    890              sel8x8_2(bb),
    891              sel8x8_1(aa),
    892              sel8x8_1(bb),
    893              sel8x8_0(aa),
    894              sel8x8_0(bb)
    895           );
    896 }
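/* Worked example: with aa = 0x0706050403020100ULL (byte i holds i) and
   bb = 0x1716151413121110ULL,
   h_generic_calc_InterleaveLO8x8(aa, bb) == 0x0313021201110010ULL and
   h_generic_calc_InterleaveHI8x8(aa, bb) == 0x0717061605150414ULL,
   i.e. the lanes of aa land in the odd result positions. */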
    897 
    898 ULong h_generic_calc_InterleaveHI16x4 ( ULong aa, ULong bb )
    899 {
    900    return mk16x4(
    901              sel16x4_3(aa),
    902              sel16x4_3(bb),
    903              sel16x4_2(aa),
    904              sel16x4_2(bb)
    905           );
    906 }
    907 
    908 ULong h_generic_calc_InterleaveLO16x4 ( ULong aa, ULong bb )
    909 {
    910    return mk16x4(
    911              sel16x4_1(aa),
    912              sel16x4_1(bb),
    913              sel16x4_0(aa),
    914              sel16x4_0(bb)
    915           );
    916 }
    917 
    918 ULong h_generic_calc_InterleaveHI32x2 ( ULong aa, ULong bb )
    919 {
    920    return mk32x2(
    921              sel32x2_1(aa),
    922              sel32x2_1(bb)
    923           );
    924 }
    925 
    926 ULong h_generic_calc_InterleaveLO32x2 ( ULong aa, ULong bb )
    927 {
    928    return mk32x2(
    929              sel32x2_0(aa),
    930              sel32x2_0(bb)
    931           );
    932 }
    933 
    934 /* ------------ Concatenation ------------ */
    935 
    936 ULong h_generic_calc_CatOddLanes16x4 ( ULong aa, ULong bb )
    937 {
    938    return mk16x4(
    939              sel16x4_3(aa),
    940              sel16x4_1(aa),
    941              sel16x4_3(bb),
    942              sel16x4_1(bb)
    943           );
    944 }
    945 
    946 ULong h_generic_calc_CatEvenLanes16x4 ( ULong aa, ULong bb )
    947 {
    948    return mk16x4(
    949              sel16x4_2(aa),
    950              sel16x4_0(aa),
    951              sel16x4_2(bb),
    952              sel16x4_0(bb)
    953           );
    954 }
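/* Worked example: with aa = 0x0003000200010000ULL (16-bit lane i holds i)
   and bb = 0x0013001200110010ULL,
   h_generic_calc_CatOddLanes16x4(aa, bb)  == 0x0003000100130011ULL and
   h_generic_calc_CatEvenLanes16x4(aa, bb) == 0x0002000000120010ULL. */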
    955 
    956 /* misc hack looking for a proper home */
    957 ULong h_generic_calc_Perm8x8 ( ULong aa, ULong bb )
    958 {
    959    return mk8x8(
    960              index8x8(aa, sel8x8_7(bb)),
    961              index8x8(aa, sel8x8_6(bb)),
    962              index8x8(aa, sel8x8_5(bb)),
    963              index8x8(aa, sel8x8_4(bb)),
    964              index8x8(aa, sel8x8_3(bb)),
    965              index8x8(aa, sel8x8_2(bb)),
    966              index8x8(aa, sel8x8_1(bb)),
    967              index8x8(aa, sel8x8_0(bb))
    968           );
    969 }
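/* Worked example: each byte of bb selects (modulo 8) a byte of aa, so
   bb = 0x0706050403020100ULL returns aa unchanged and
   bb = 0x0001020304050607ULL reverses the bytes of aa. */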
    970 
    971 /* ------------ Shifting ------------ */
    972 /* Note that because these primops are undefined if the shift amount
    973    equals or exceeds the lane width, the shift amount is masked so
    974    that the scalar shifts are always in range.  In fact, given the
     975    semantics of these primops (ShlN16x4, etc), it is an error if we
     976    are ever given an out-of-range shift amount.
    977 */
    978 ULong h_generic_calc_ShlN32x2 ( ULong xx, UInt nn )
    979 {
    980    /* vassert(nn < 32); */
    981    nn &= 31;
    982    return mk32x2(
    983              shl32( sel32x2_1(xx), nn ),
    984              shl32( sel32x2_0(xx), nn )
    985           );
    986 }
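/* Worked example: h_generic_calc_ShlN32x2(0x00000001FFFFFFFFULL, 4)
   == 0x00000010FFFFFFF0ULL; bits shifted out of a lane are discarded
   rather than carried into its neighbour. */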
    987 
    988 ULong h_generic_calc_ShlN16x4 ( ULong xx, UInt nn )
    989 {
    990    /* vassert(nn < 16); */
    991    nn &= 15;
    992    return mk16x4(
    993              shl16( sel16x4_3(xx), nn ),
    994              shl16( sel16x4_2(xx), nn ),
    995              shl16( sel16x4_1(xx), nn ),
    996              shl16( sel16x4_0(xx), nn )
    997           );
    998 }
    999 
   1000 ULong h_generic_calc_ShlN8x8  ( ULong xx, UInt nn )
   1001 {
   1002    /* vassert(nn < 8); */
   1003    nn &= 7;
   1004    return mk8x8(
   1005              shl8( sel8x8_7(xx), nn ),
   1006              shl8( sel8x8_6(xx), nn ),
   1007              shl8( sel8x8_5(xx), nn ),
   1008              shl8( sel8x8_4(xx), nn ),
   1009              shl8( sel8x8_3(xx), nn ),
   1010              shl8( sel8x8_2(xx), nn ),
   1011              shl8( sel8x8_1(xx), nn ),
   1012              shl8( sel8x8_0(xx), nn )
   1013           );
   1014 }
   1015 
   1016 ULong h_generic_calc_ShrN32x2 ( ULong xx, UInt nn )
   1017 {
   1018    /* vassert(nn < 32); */
   1019    nn &= 31;
   1020    return mk32x2(
   1021              shr32( sel32x2_1(xx), nn ),
   1022              shr32( sel32x2_0(xx), nn )
   1023           );
   1024 }
   1025 
   1026 ULong h_generic_calc_ShrN16x4 ( ULong xx, UInt nn )
   1027 {
   1028    /* vassert(nn < 16); */
   1029    nn &= 15;
   1030    return mk16x4(
   1031              shr16( sel16x4_3(xx), nn ),
   1032              shr16( sel16x4_2(xx), nn ),
   1033              shr16( sel16x4_1(xx), nn ),
   1034              shr16( sel16x4_0(xx), nn )
   1035           );
   1036 }
   1037 
   1038 ULong h_generic_calc_SarN32x2 ( ULong xx, UInt nn )
   1039 {
   1040    /* vassert(nn < 32); */
   1041    nn &= 31;
   1042    return mk32x2(
   1043              sar32( sel32x2_1(xx), nn ),
   1044              sar32( sel32x2_0(xx), nn )
   1045           );
   1046 }
   1047 
   1048 ULong h_generic_calc_SarN16x4 ( ULong xx, UInt nn )
   1049 {
   1050    /* vassert(nn < 16); */
   1051    nn &= 15;
   1052    return mk16x4(
   1053              sar16( sel16x4_3(xx), nn ),
   1054              sar16( sel16x4_2(xx), nn ),
   1055              sar16( sel16x4_1(xx), nn ),
   1056              sar16( sel16x4_0(xx), nn )
   1057           );
   1058 }
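/* Worked example: h_generic_calc_ShrN16x4(0xFF00FF00FF00FF00ULL, 4)
   == 0x0FF00FF00FF00FF0ULL (zero fill), whereas
   h_generic_calc_SarN16x4 of the same arguments gives
   0xFFF0FFF0FFF0FFF0ULL, since each 0xFF00 lane is the signed value -256. */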
   1059 
   1060 ULong h_generic_calc_SarN8x8 ( ULong xx, UInt nn )
   1061 {
   1062    /* vassert(nn < 8); */
   1063    nn &= 7;
   1064    return mk8x8(
   1065              sar8( sel8x8_7(xx), nn ),
   1066              sar8( sel8x8_6(xx), nn ),
   1067              sar8( sel8x8_5(xx), nn ),
   1068              sar8( sel8x8_4(xx), nn ),
   1069              sar8( sel8x8_3(xx), nn ),
   1070              sar8( sel8x8_2(xx), nn ),
   1071              sar8( sel8x8_1(xx), nn ),
   1072              sar8( sel8x8_0(xx), nn )
   1073           );
   1074 }
   1075 
   1076 /* ------------ Averaging ------------ */
   1077 
   1078 ULong h_generic_calc_Avg8Ux8 ( ULong xx, ULong yy )
   1079 {
   1080    return mk8x8(
   1081              avg8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1082              avg8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1083              avg8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1084              avg8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1085              avg8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1086              avg8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1087              avg8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1088              avg8U( sel8x8_0(xx), sel8x8_0(yy) )
   1089           );
   1090 }
   1091 
   1092 ULong h_generic_calc_Avg16Ux4 ( ULong xx, ULong yy )
   1093 {
   1094    return mk16x4(
   1095              avg16U( sel16x4_3(xx), sel16x4_3(yy) ),
   1096              avg16U( sel16x4_2(xx), sel16x4_2(yy) ),
   1097              avg16U( sel16x4_1(xx), sel16x4_1(yy) ),
   1098              avg16U( sel16x4_0(xx), sel16x4_0(yy) )
   1099           );
   1100 }
   1101 
   1102 /* ------------ max/min ------------ */
   1103 
   1104 ULong h_generic_calc_Max16Sx4 ( ULong xx, ULong yy )
   1105 {
   1106    return mk16x4(
   1107              max16S( sel16x4_3(xx), sel16x4_3(yy) ),
   1108              max16S( sel16x4_2(xx), sel16x4_2(yy) ),
   1109              max16S( sel16x4_1(xx), sel16x4_1(yy) ),
   1110              max16S( sel16x4_0(xx), sel16x4_0(yy) )
   1111           );
   1112 }
   1113 
   1114 ULong h_generic_calc_Max8Ux8 ( ULong xx, ULong yy )
   1115 {
   1116    return mk8x8(
   1117              max8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1118              max8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1119              max8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1120              max8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1121              max8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1122              max8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1123              max8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1124              max8U( sel8x8_0(xx), sel8x8_0(yy) )
   1125           );
   1126 }
   1127 
   1128 ULong h_generic_calc_Min16Sx4 ( ULong xx, ULong yy )
   1129 {
   1130    return mk16x4(
   1131              min16S( sel16x4_3(xx), sel16x4_3(yy) ),
   1132              min16S( sel16x4_2(xx), sel16x4_2(yy) ),
   1133              min16S( sel16x4_1(xx), sel16x4_1(yy) ),
   1134              min16S( sel16x4_0(xx), sel16x4_0(yy) )
   1135           );
   1136 }
   1137 
   1138 ULong h_generic_calc_Min8Ux8 ( ULong xx, ULong yy )
   1139 {
   1140    return mk8x8(
   1141              min8U( sel8x8_7(xx), sel8x8_7(yy) ),
   1142              min8U( sel8x8_6(xx), sel8x8_6(yy) ),
   1143              min8U( sel8x8_5(xx), sel8x8_5(yy) ),
   1144              min8U( sel8x8_4(xx), sel8x8_4(yy) ),
   1145              min8U( sel8x8_3(xx), sel8x8_3(yy) ),
   1146              min8U( sel8x8_2(xx), sel8x8_2(yy) ),
   1147              min8U( sel8x8_1(xx), sel8x8_1(yy) ),
   1148              min8U( sel8x8_0(xx), sel8x8_0(yy) )
   1149           );
   1150 }
   1151 
   1152 /* ------------ SOME 32-bit SIMD HELPERS TOO ------------ */
   1153 
   1154 /* Tuple/select functions for 16x2 vectors. */
   1155 static inline UInt mk16x2 ( UShort w1, UShort w2 ) {
   1156    return (((UInt)w1) << 16) | ((UInt)w2);
   1157 }
   1158 
   1159 static inline UShort sel16x2_1 ( UInt w32 ) {
   1160    return 0xFFFF & (UShort)(w32 >> 16);
   1161 }
   1162 static inline UShort sel16x2_0 ( UInt w32 ) {
   1163    return 0xFFFF & (UShort)(w32);
   1164 }
   1165 
   1166 static inline UInt mk8x4 ( UChar w3, UChar w2,
   1167                            UChar w1, UChar w0 ) {
   1168    UInt w32 =   (((UInt)w3) << 24) | (((UInt)w2) << 16)
   1169               | (((UInt)w1) << 8)  | (((UInt)w0) << 0);
   1170    return w32;
   1171 }
   1172 
   1173 static inline UChar sel8x4_3 ( UInt w32 ) {
   1174    return toUChar(0xFF & (w32 >> 24));
   1175 }
   1176 static inline UChar sel8x4_2 ( UInt w32 ) {
   1177    return toUChar(0xFF & (w32 >> 16));
   1178 }
   1179 static inline UChar sel8x4_1 ( UInt w32 ) {
   1180    return toUChar(0xFF & (w32 >> 8));
   1181 }
   1182 static inline UChar sel8x4_0 ( UInt w32 ) {
   1183    return toUChar(0xFF & (w32 >> 0));
   1184 }
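/* Worked example: mk16x2(0xAAAA, 0xBBBB) == 0xAAAABBBB (note that, despite
   the parameter names, w1 is the high half and w2 the low half), and
   mk8x4(0x11, 0x22, 0x33, 0x44) == 0x11223344, with sel8x4_2 of that
   value returning 0x22. */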
   1185 
   1186 
   1187 /* ----------------------------------------------------- */
   1188 /* More externally visible functions.  These simply
   1189    implement the corresponding IR primops. */
   1190 /* ----------------------------------------------------- */
   1191 
   1192 /* ------ 16x2 ------ */
   1193 
   1194 UInt h_generic_calc_Add16x2 ( UInt xx, UInt yy )
   1195 {
   1196    return mk16x2( sel16x2_1(xx) + sel16x2_1(yy),
   1197                   sel16x2_0(xx) + sel16x2_0(yy) );
   1198 }
   1199 
   1200 UInt h_generic_calc_Sub16x2 ( UInt xx, UInt yy )
   1201 {
   1202    return mk16x2( sel16x2_1(xx) - sel16x2_1(yy),
   1203                   sel16x2_0(xx) - sel16x2_0(yy) );
   1204 }
   1205 
   1206 UInt h_generic_calc_HAdd16Ux2 ( UInt xx, UInt yy )
   1207 {
   1208    return mk16x2( hadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1209                   hadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1210 }
   1211 
   1212 UInt h_generic_calc_HAdd16Sx2 ( UInt xx, UInt yy )
   1213 {
   1214    return mk16x2( hadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1215                   hadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1216 }
   1217 
   1218 UInt h_generic_calc_HSub16Ux2 ( UInt xx, UInt yy )
   1219 {
   1220    return mk16x2( hsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1221                   hsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1222 }
   1223 
   1224 UInt h_generic_calc_HSub16Sx2 ( UInt xx, UInt yy )
   1225 {
   1226    return mk16x2( hsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1227                   hsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1228 }
   1229 
   1230 UInt h_generic_calc_QAdd16Ux2 ( UInt xx, UInt yy )
   1231 {
   1232    return mk16x2( qadd16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1233                   qadd16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1234 }
   1235 
   1236 UInt h_generic_calc_QAdd16Sx2 ( UInt xx, UInt yy )
   1237 {
   1238    return mk16x2( qadd16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1239                   qadd16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1240 }
   1241 
   1242 UInt h_generic_calc_QSub16Ux2 ( UInt xx, UInt yy )
   1243 {
   1244    return mk16x2( qsub16U( sel16x2_1(xx), sel16x2_1(yy) ),
   1245                   qsub16U( sel16x2_0(xx), sel16x2_0(yy) ) );
   1246 }
   1247 
   1248 UInt h_generic_calc_QSub16Sx2 ( UInt xx, UInt yy )
   1249 {
   1250    return mk16x2( qsub16S( sel16x2_1(xx), sel16x2_1(yy) ),
   1251                   qsub16S( sel16x2_0(xx), sel16x2_0(yy) ) );
   1252 }
   1253 
   1254 /* ------ 8x4 ------ */
   1255 
   1256 UInt h_generic_calc_Add8x4 ( UInt xx, UInt yy )
   1257 {
   1258    return mk8x4(
   1259              sel8x4_3(xx) + sel8x4_3(yy),
   1260              sel8x4_2(xx) + sel8x4_2(yy),
   1261              sel8x4_1(xx) + sel8x4_1(yy),
   1262              sel8x4_0(xx) + sel8x4_0(yy)
   1263           );
   1264 }
   1265 
   1266 UInt h_generic_calc_Sub8x4 ( UInt xx, UInt yy )
   1267 {
   1268    return mk8x4(
   1269              sel8x4_3(xx) - sel8x4_3(yy),
   1270              sel8x4_2(xx) - sel8x4_2(yy),
   1271              sel8x4_1(xx) - sel8x4_1(yy),
   1272              sel8x4_0(xx) - sel8x4_0(yy)
   1273           );
   1274 }
   1275 
   1276 UInt h_generic_calc_HAdd8Ux4 ( UInt xx, UInt yy )
   1277 {
   1278    return mk8x4(
   1279              hadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1280              hadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1281              hadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1282              hadd8U( sel8x4_0(xx), sel8x4_0(yy) )
   1283           );
   1284 }
   1285 
   1286 UInt h_generic_calc_HAdd8Sx4 ( UInt xx, UInt yy )
   1287 {
   1288    return mk8x4(
   1289              hadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1290              hadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1291              hadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1292              hadd8S( sel8x4_0(xx), sel8x4_0(yy) )
   1293           );
   1294 }
   1295 
   1296 UInt h_generic_calc_HSub8Ux4 ( UInt xx, UInt yy )
   1297 {
   1298    return mk8x4(
   1299              hsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1300              hsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1301              hsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1302              hsub8U( sel8x4_0(xx), sel8x4_0(yy) )
   1303           );
   1304 }
   1305 
   1306 UInt h_generic_calc_HSub8Sx4 ( UInt xx, UInt yy )
   1307 {
   1308    return mk8x4(
   1309              hsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1310              hsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1311              hsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1312              hsub8S( sel8x4_0(xx), sel8x4_0(yy) )
   1313           );
   1314 }
   1315 
   1316 UInt h_generic_calc_QAdd8Ux4 ( UInt xx, UInt yy )
   1317 {
   1318    return mk8x4(
   1319              qadd8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1320              qadd8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1321              qadd8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1322              qadd8U( sel8x4_0(xx), sel8x4_0(yy) )
   1323           );
   1324 }
   1325 
   1326 UInt h_generic_calc_QAdd8Sx4 ( UInt xx, UInt yy )
   1327 {
   1328    return mk8x4(
   1329              qadd8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1330              qadd8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1331              qadd8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1332              qadd8S( sel8x4_0(xx), sel8x4_0(yy) )
   1333           );
   1334 }
   1335 
   1336 UInt h_generic_calc_QSub8Ux4 ( UInt xx, UInt yy )
   1337 {
   1338    return mk8x4(
   1339              qsub8U( sel8x4_3(xx), sel8x4_3(yy) ),
   1340              qsub8U( sel8x4_2(xx), sel8x4_2(yy) ),
   1341              qsub8U( sel8x4_1(xx), sel8x4_1(yy) ),
   1342              qsub8U( sel8x4_0(xx), sel8x4_0(yy) )
   1343           );
   1344 }
   1345 
   1346 UInt h_generic_calc_QSub8Sx4 ( UInt xx, UInt yy )
   1347 {
   1348    return mk8x4(
   1349              qsub8S( sel8x4_3(xx), sel8x4_3(yy) ),
   1350              qsub8S( sel8x4_2(xx), sel8x4_2(yy) ),
   1351              qsub8S( sel8x4_1(xx), sel8x4_1(yy) ),
   1352              qsub8S( sel8x4_0(xx), sel8x4_0(yy) )
   1353           );
   1354 }
   1355 
   1356 UInt h_generic_calc_CmpNEZ16x2 ( UInt xx )
   1357 {
   1358    return mk16x2(
   1359              cmpnez16( sel16x2_1(xx) ),
   1360              cmpnez16( sel16x2_0(xx) )
   1361           );
   1362 }
   1363 
   1364 UInt h_generic_calc_CmpNEZ8x4 ( UInt xx )
   1365 {
   1366    return mk8x4(
   1367              cmpnez8( sel8x4_3(xx) ),
   1368              cmpnez8( sel8x4_2(xx) ),
   1369              cmpnez8( sel8x4_1(xx) ),
   1370              cmpnez8( sel8x4_0(xx) )
   1371           );
   1372 }
   1373 
   1374 UInt h_generic_calc_Sad8Ux4 ( UInt xx, UInt yy )
   1375 {
   1376    return absdiff8U( sel8x4_3(xx), sel8x4_3(yy) )
   1377           + absdiff8U( sel8x4_2(xx), sel8x4_2(yy) )
   1378           + absdiff8U( sel8x4_1(xx), sel8x4_1(yy) )
   1379           + absdiff8U( sel8x4_0(xx), sel8x4_0(yy) );
   1380 }
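/* Worked example: h_generic_calc_Sad8Ux4(0x01020304, 0x04030201)
   == 3 + 1 + 1 + 3 == 8, the sum of the absolute byte differences. */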
   1381 
   1382 
   1383 /*---------------------------------------------------------------*/
   1384 /*--- end                               host_generic_simd64.c ---*/
   1385 /*---------------------------------------------------------------*/
   1386