Home | History | Annotate | Download | only in priv
      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                            host_generic_simd128.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2010-2013 OpenWorks GbR
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 */
     30 
     31 /* Generic helper functions for doing 128-bit SIMD arithmetic in cases
     32    where the instruction selectors cannot generate code in-line.
     33    These are purely back-end entities and cannot be seen/referenced
     34    from IR. */
     35 
     36 #include "libvex_basictypes.h"
     37 #include "host_generic_simd128.h"
     38 
     39 
     40 /* Primitive helpers always take args of the real type (signed vs
     41    unsigned) but return an unsigned result, so there's no conversion
     42    weirdness when stuffing results back in the V128 union fields,
     43    which are all unsigned. */
     44 
     45 static inline UInt mul32 ( Int xx, Int yy )
     46 {
     47    Int t = ((Int)xx) * ((Int)yy);
     48    return toUInt(t);
     49 }
     50 
     51 static inline UInt max32S ( Int xx, Int yy )
     52 {
     53    return toUInt((xx > yy) ? xx : yy);
     54 }
     55 
     56 static inline UInt min32S ( Int xx, Int yy )
     57 {
     58    return toUInt((xx < yy) ? xx : yy);
     59 }
     60 
     61 static inline UInt max32U ( UInt xx, UInt yy )
     62 {
     63    return toUInt((xx > yy) ? xx : yy);
     64 }
     65 
     66 static inline UInt min32U ( UInt xx, UInt yy )
     67 {
     68    return toUInt((xx < yy) ? xx : yy);
     69 }
     70 
     71 static inline UShort max16U ( UShort xx, UShort yy )
     72 {
     73    return toUShort((xx > yy) ? xx : yy);
     74 }
     75 
     76 static inline UShort min16U ( UShort xx, UShort yy )
     77 {
     78    return toUShort((xx < yy) ? xx : yy);
     79 }
     80 
     81 static inline UChar max8S ( Char xx, Char yy )
     82 {
     83    return toUChar((xx > yy) ? xx : yy);
     84 }
     85 
     86 static inline UChar min8S ( Char xx, Char yy )
     87 {
     88    return toUChar((xx < yy) ? xx : yy);
     89 }
     90 
     91 static inline ULong cmpEQ64 ( Long xx, Long yy )
     92 {
     93    return (((Long)xx) == ((Long)yy))
     94              ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
     95 }
     96 
     97 static inline ULong cmpGT64S ( Long xx, Long yy )
     98 {
     99    return (((Long)xx) > ((Long)yy))
    100              ? 0xFFFFFFFFFFFFFFFFULL : 0ULL;
    101 }
    102 
    103 static inline ULong sar64 ( ULong v, UInt n )
    104 {
    105    return ((Long)v) >> n;
    106 }
    107 
    108 static inline UChar sar8 ( UChar v, UInt n )
    109 {
    110    return toUChar(((Char)v) >> n);
    111 }
    112 
    113 static inline UShort qnarrow32Sto16U ( UInt xx0 )
    114 {
    115    Int xx = (Int)xx0;
    116    if (xx < 0)     xx = 0;
    117    if (xx > 65535) xx = 65535;
    118    return (UShort)xx;
    119 }
    120 
    121 static inline UShort narrow32to16 ( UInt xx )
    122 {
    123    return (UShort)xx;
    124 }
    125 
    126 static inline UChar narrow16to8 ( UShort xx )
    127 {
    128    return (UChar)xx;
    129 }
    130 
    131 
    132 void VEX_REGPARM(3)
    133      h_generic_calc_Mul32x4 ( /*OUT*/V128* res,
    134                               V128* argL, V128* argR )
    135 {
    136    res->w32[0] = mul32(argL->w32[0], argR->w32[0]);
    137    res->w32[1] = mul32(argL->w32[1], argR->w32[1]);
    138    res->w32[2] = mul32(argL->w32[2], argR->w32[2]);
    139    res->w32[3] = mul32(argL->w32[3], argR->w32[3]);
    140 }
    141 
    142 void VEX_REGPARM(3)
    143      h_generic_calc_Max32Sx4 ( /*OUT*/V128* res,
    144                                V128* argL, V128* argR )
    145 {
    146    res->w32[0] = max32S(argL->w32[0], argR->w32[0]);
    147    res->w32[1] = max32S(argL->w32[1], argR->w32[1]);
    148    res->w32[2] = max32S(argL->w32[2], argR->w32[2]);
    149    res->w32[3] = max32S(argL->w32[3], argR->w32[3]);
    150 }
    151 
    152 void VEX_REGPARM(3)
    153      h_generic_calc_Min32Sx4 ( /*OUT*/V128* res,
    154                                V128* argL, V128* argR )
    155 {
    156    res->w32[0] = min32S(argL->w32[0], argR->w32[0]);
    157    res->w32[1] = min32S(argL->w32[1], argR->w32[1]);
    158    res->w32[2] = min32S(argL->w32[2], argR->w32[2]);
    159    res->w32[3] = min32S(argL->w32[3], argR->w32[3]);
    160 }
    161 
    162 void VEX_REGPARM(3)
    163      h_generic_calc_Max32Ux4 ( /*OUT*/V128* res,
    164                                V128* argL, V128* argR )
    165 {
    166    res->w32[0] = max32U(argL->w32[0], argR->w32[0]);
    167    res->w32[1] = max32U(argL->w32[1], argR->w32[1]);
    168    res->w32[2] = max32U(argL->w32[2], argR->w32[2]);
    169    res->w32[3] = max32U(argL->w32[3], argR->w32[3]);
    170 }
    171 
    172 void VEX_REGPARM(3)
    173      h_generic_calc_Min32Ux4 ( /*OUT*/V128* res,
    174                                V128* argL, V128* argR )
    175 {
    176    res->w32[0] = min32U(argL->w32[0], argR->w32[0]);
    177    res->w32[1] = min32U(argL->w32[1], argR->w32[1]);
    178    res->w32[2] = min32U(argL->w32[2], argR->w32[2]);
    179    res->w32[3] = min32U(argL->w32[3], argR->w32[3]);
    180 }
    181 
    182 void VEX_REGPARM(3)
    183      h_generic_calc_Max16Ux8 ( /*OUT*/V128* res,
    184                                V128* argL, V128* argR )
    185 {
    186    res->w16[0] = max16U(argL->w16[0], argR->w16[0]);
    187    res->w16[1] = max16U(argL->w16[1], argR->w16[1]);
    188    res->w16[2] = max16U(argL->w16[2], argR->w16[2]);
    189    res->w16[3] = max16U(argL->w16[3], argR->w16[3]);
    190    res->w16[4] = max16U(argL->w16[4], argR->w16[4]);
    191    res->w16[5] = max16U(argL->w16[5], argR->w16[5]);
    192    res->w16[6] = max16U(argL->w16[6], argR->w16[6]);
    193    res->w16[7] = max16U(argL->w16[7], argR->w16[7]);
    194 }
    195 
    196 void VEX_REGPARM(3)
    197      h_generic_calc_Min16Ux8 ( /*OUT*/V128* res,
    198                                V128* argL, V128* argR )
    199 {
    200    res->w16[0] = min16U(argL->w16[0], argR->w16[0]);
    201    res->w16[1] = min16U(argL->w16[1], argR->w16[1]);
    202    res->w16[2] = min16U(argL->w16[2], argR->w16[2]);
    203    res->w16[3] = min16U(argL->w16[3], argR->w16[3]);
    204    res->w16[4] = min16U(argL->w16[4], argR->w16[4]);
    205    res->w16[5] = min16U(argL->w16[5], argR->w16[5]);
    206    res->w16[6] = min16U(argL->w16[6], argR->w16[6]);
    207    res->w16[7] = min16U(argL->w16[7], argR->w16[7]);
    208 }
    209 
    210 void VEX_REGPARM(3)
    211      h_generic_calc_Max8Sx16 ( /*OUT*/V128* res,
    212                                V128* argL, V128* argR )
    213 {
    214    res->w8[ 0] = max8S(argL->w8[ 0], argR->w8[ 0]);
    215    res->w8[ 1] = max8S(argL->w8[ 1], argR->w8[ 1]);
    216    res->w8[ 2] = max8S(argL->w8[ 2], argR->w8[ 2]);
    217    res->w8[ 3] = max8S(argL->w8[ 3], argR->w8[ 3]);
    218    res->w8[ 4] = max8S(argL->w8[ 4], argR->w8[ 4]);
    219    res->w8[ 5] = max8S(argL->w8[ 5], argR->w8[ 5]);
    220    res->w8[ 6] = max8S(argL->w8[ 6], argR->w8[ 6]);
    221    res->w8[ 7] = max8S(argL->w8[ 7], argR->w8[ 7]);
    222    res->w8[ 8] = max8S(argL->w8[ 8], argR->w8[ 8]);
    223    res->w8[ 9] = max8S(argL->w8[ 9], argR->w8[ 9]);
    224    res->w8[10] = max8S(argL->w8[10], argR->w8[10]);
    225    res->w8[11] = max8S(argL->w8[11], argR->w8[11]);
    226    res->w8[12] = max8S(argL->w8[12], argR->w8[12]);
    227    res->w8[13] = max8S(argL->w8[13], argR->w8[13]);
    228    res->w8[14] = max8S(argL->w8[14], argR->w8[14]);
    229    res->w8[15] = max8S(argL->w8[15], argR->w8[15]);
    230 }
    231 
    232 void VEX_REGPARM(3)
    233      h_generic_calc_Min8Sx16 ( /*OUT*/V128* res,
    234                                V128* argL, V128* argR )
    235 {
    236    res->w8[ 0] = min8S(argL->w8[ 0], argR->w8[ 0]);
    237    res->w8[ 1] = min8S(argL->w8[ 1], argR->w8[ 1]);
    238    res->w8[ 2] = min8S(argL->w8[ 2], argR->w8[ 2]);
    239    res->w8[ 3] = min8S(argL->w8[ 3], argR->w8[ 3]);
    240    res->w8[ 4] = min8S(argL->w8[ 4], argR->w8[ 4]);
    241    res->w8[ 5] = min8S(argL->w8[ 5], argR->w8[ 5]);
    242    res->w8[ 6] = min8S(argL->w8[ 6], argR->w8[ 6]);
    243    res->w8[ 7] = min8S(argL->w8[ 7], argR->w8[ 7]);
    244    res->w8[ 8] = min8S(argL->w8[ 8], argR->w8[ 8]);
    245    res->w8[ 9] = min8S(argL->w8[ 9], argR->w8[ 9]);
    246    res->w8[10] = min8S(argL->w8[10], argR->w8[10]);
    247    res->w8[11] = min8S(argL->w8[11], argR->w8[11]);
    248    res->w8[12] = min8S(argL->w8[12], argR->w8[12]);
    249    res->w8[13] = min8S(argL->w8[13], argR->w8[13]);
    250    res->w8[14] = min8S(argL->w8[14], argR->w8[14]);
    251    res->w8[15] = min8S(argL->w8[15], argR->w8[15]);
    252 }
    253 
    254 void VEX_REGPARM(3)
    255      h_generic_calc_CmpEQ64x2 ( /*OUT*/V128* res,
    256                                 V128* argL, V128* argR )
    257 {
    258    res->w64[0] = cmpEQ64(argL->w64[0], argR->w64[0]);
    259    res->w64[1] = cmpEQ64(argL->w64[1], argR->w64[1]);
    260 }
    261 
    262 void VEX_REGPARM(3)
    263      h_generic_calc_CmpGT64Sx2 ( /*OUT*/V128* res,
    264                                  V128* argL, V128* argR )
    265 {
    266    res->w64[0] = cmpGT64S(argL->w64[0], argR->w64[0]);
    267    res->w64[1] = cmpGT64S(argL->w64[1], argR->w64[1]);
    268 }
    269 
    270 /* ------------ Shifting ------------ */
    271 /* Note that because these primops are undefined if the shift amount
    272    equals or exceeds the lane width, the shift amount is masked so
    273    that the scalar shifts are always in range.  In fact, given the
    274    semantics of these primops (Sar64x2, etc) it is an error if in
    275    fact we are ever given an out-of-range shift amount.
    276 */
    277 void /*not-regparm*/
    278      h_generic_calc_SarN64x2 ( /*OUT*/V128* res,
    279                                V128* argL, UInt nn)
    280 {
    281    /* vassert(nn < 64); */
    282    nn &= 63;
    283    res->w64[0] = sar64(argL->w64[0], nn);
    284    res->w64[1] = sar64(argL->w64[1], nn);
    285 }
    286 
    287 void /*not-regparm*/
    288      h_generic_calc_SarN8x16 ( /*OUT*/V128* res,
    289                               V128* argL, UInt nn)
    290 {
    291    /* vassert(nn < 8); */
    292    nn &= 7;
    293    res->w8[ 0] = sar8(argL->w8[ 0], nn);
    294    res->w8[ 1] = sar8(argL->w8[ 1], nn);
    295    res->w8[ 2] = sar8(argL->w8[ 2], nn);
    296    res->w8[ 3] = sar8(argL->w8[ 3], nn);
    297    res->w8[ 4] = sar8(argL->w8[ 4], nn);
    298    res->w8[ 5] = sar8(argL->w8[ 5], nn);
    299    res->w8[ 6] = sar8(argL->w8[ 6], nn);
    300    res->w8[ 7] = sar8(argL->w8[ 7], nn);
    301    res->w8[ 8] = sar8(argL->w8[ 8], nn);
    302    res->w8[ 9] = sar8(argL->w8[ 9], nn);
    303    res->w8[10] = sar8(argL->w8[10], nn);
    304    res->w8[11] = sar8(argL->w8[11], nn);
    305    res->w8[12] = sar8(argL->w8[12], nn);
    306    res->w8[13] = sar8(argL->w8[13], nn);
    307    res->w8[14] = sar8(argL->w8[14], nn);
    308    res->w8[15] = sar8(argL->w8[15], nn);
    309 }
    310 
    311 void VEX_REGPARM(3)
    312      h_generic_calc_QNarrowBin32Sto16Ux8 ( /*OUT*/V128* res,
    313                                            V128* argL, V128* argR )
    314 {
    315    res->w16[0] = qnarrow32Sto16U(argR->w32[0]);
    316    res->w16[1] = qnarrow32Sto16U(argR->w32[1]);
    317    res->w16[2] = qnarrow32Sto16U(argR->w32[2]);
    318    res->w16[3] = qnarrow32Sto16U(argR->w32[3]);
    319    res->w16[4] = qnarrow32Sto16U(argL->w32[0]);
    320    res->w16[5] = qnarrow32Sto16U(argL->w32[1]);
    321    res->w16[6] = qnarrow32Sto16U(argL->w32[2]);
    322    res->w16[7] = qnarrow32Sto16U(argL->w32[3]);
    323 }
    324 
    325 void VEX_REGPARM(3)
    326      h_generic_calc_NarrowBin16to8x16 ( /*OUT*/V128* res,
    327                                         V128* argL, V128* argR )
    328 {
    329    res->w8[ 0] = narrow16to8(argR->w16[0]);
    330    res->w8[ 1] = narrow16to8(argR->w16[1]);
    331    res->w8[ 2] = narrow16to8(argR->w16[2]);
    332    res->w8[ 3] = narrow16to8(argR->w16[3]);
    333    res->w8[ 4] = narrow16to8(argR->w16[4]);
    334    res->w8[ 5] = narrow16to8(argR->w16[5]);
    335    res->w8[ 6] = narrow16to8(argR->w16[6]);
    336    res->w8[ 7] = narrow16to8(argR->w16[7]);
    337    res->w8[ 8] = narrow16to8(argL->w16[0]);
    338    res->w8[ 9] = narrow16to8(argL->w16[1]);
    339    res->w8[10] = narrow16to8(argL->w16[2]);
    340    res->w8[11] = narrow16to8(argL->w16[3]);
    341    res->w8[12] = narrow16to8(argL->w16[4]);
    342    res->w8[13] = narrow16to8(argL->w16[5]);
    343    res->w8[14] = narrow16to8(argL->w16[6]);
    344    res->w8[15] = narrow16to8(argL->w16[7]);
    345 }
    346 
    347 void VEX_REGPARM(3)
    348      h_generic_calc_NarrowBin32to16x8 ( /*OUT*/V128* res,
    349                                         V128* argL, V128* argR )
    350 {
    351    res->w16[0] = narrow32to16(argR->w32[0]);
    352    res->w16[1] = narrow32to16(argR->w32[1]);
    353    res->w16[2] = narrow32to16(argR->w32[2]);
    354    res->w16[3] = narrow32to16(argR->w32[3]);
    355    res->w16[4] = narrow32to16(argL->w32[0]);
    356    res->w16[5] = narrow32to16(argL->w32[1]);
    357    res->w16[6] = narrow32to16(argL->w32[2]);
    358    res->w16[7] = narrow32to16(argL->w32[3]);
    359 }
    360 
    361 void VEX_REGPARM(3)
    362      h_generic_calc_Perm32x4 ( /*OUT*/V128* res,
    363                                V128* argL, V128* argR )
    364 {
    365    res->w32[0] = argL->w32[ argR->w32[0] & 3 ];
    366    res->w32[1] = argL->w32[ argR->w32[1] & 3 ];
    367    res->w32[2] = argL->w32[ argR->w32[2] & 3 ];
    368    res->w32[3] = argL->w32[ argR->w32[3] & 3 ];
    369 }
    370 
    371 UInt /*not-regparm*/
    372      h_generic_calc_GetMSBs8x16 ( ULong w64hi, ULong w64lo )
    373 {
    374    UInt r = 0;
    375    if (w64hi & (1ULL << (64-1))) r |= (1<<15);
    376    if (w64hi & (1ULL << (56-1))) r |= (1<<14);
    377    if (w64hi & (1ULL << (48-1))) r |= (1<<13);
    378    if (w64hi & (1ULL << (40-1))) r |= (1<<12);
    379    if (w64hi & (1ULL << (32-1))) r |= (1<<11);
    380    if (w64hi & (1ULL << (24-1))) r |= (1<<10);
    381    if (w64hi & (1ULL << (16-1))) r |= (1<<9);
    382    if (w64hi & (1ULL << ( 8-1))) r |= (1<<8);
    383    if (w64lo & (1ULL << (64-1))) r |= (1<<7);
    384    if (w64lo & (1ULL << (56-1))) r |= (1<<6);
    385    if (w64lo & (1ULL << (48-1))) r |= (1<<5);
    386    if (w64lo & (1ULL << (40-1))) r |= (1<<4);
    387    if (w64lo & (1ULL << (32-1))) r |= (1<<3);
    388    if (w64lo & (1ULL << (24-1))) r |= (1<<2);
    389    if (w64lo & (1ULL << (16-1))) r |= (1<<1);
    390    if (w64lo & (1ULL << ( 8-1))) r |= (1<<0);
    391    return r;
    392 }
    393 
    394 /*---------------------------------------------------------------*/
    395 /*--- end                              host_generic_simd128.c ---*/
    396 /*---------------------------------------------------------------*/
    397