Home | History | Annotate | Download | only in priv
      1 
      2 /*---------------------------------------------------------------*/
      3 /*--- begin                               guest_generic_x87.c ---*/
      4 /*---------------------------------------------------------------*/
      5 
      6 /*
      7    This file is part of Valgrind, a dynamic binary instrumentation
      8    framework.
      9 
     10    Copyright (C) 2004-2017 OpenWorks LLP
     11       info (at) open-works.net
     12 
     13    This program is free software; you can redistribute it and/or
     14    modify it under the terms of the GNU General Public License as
     15    published by the Free Software Foundation; either version 2 of the
     16    License, or (at your option) any later version.
     17 
     18    This program is distributed in the hope that it will be useful, but
     19    WITHOUT ANY WARRANTY; without even the implied warranty of
     20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     21    General Public License for more details.
     22 
     23    You should have received a copy of the GNU General Public License
     24    along with this program; if not, write to the Free Software
     25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
     26    02110-1301, USA.
     27 
     28    The GNU General Public License is contained in the file COPYING.
     29 
     30    Neither the names of the U.S. Department of Energy nor the
     31    University of California nor the names of its contributors may be
     32    used to endorse or promote products derived from this software
     33    without prior written permission.
     34 */
     35 
     36 /* This file contains functions for doing some x87-specific
     37    operations.  Both the amd64 and x86 front ends (guests) indirectly
     38    call these functions via guest helper calls.  By putting them here,
     39    code duplication is avoided.  Some of these functions are tricky
     40    and hard to verify, so there is much to be said for only having one
     41    copy thereof.
     42 */
     43 
     44 #include "libvex_basictypes.h"
     45 
     46 #include "main_util.h"
     47 #include "guest_generic_x87.h"
     48 
     49 
     50 /* 80 and 64-bit floating point formats:
     51 
     52    80-bit:
     53 
     54     S  0       0-------0      zero
     55     S  0       0X------X      denormals
     56     S  1-7FFE  1X------X      normals (all normals have leading 1)
     57     S  7FFF    10------0      infinity
     58     S  7FFF    10X-----X      snan
     59     S  7FFF    11X-----X      qnan
     60 
     61    S is the sign bit.  For runs X----X, at least one of the Xs must be
     62    nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
     63    there is an explicitly represented leading 1, and a sign bit,
     64    giving 80 in total.
     65 
     66    64-bit avoids the confusion of an explicitly represented leading 1
     67    and so is simpler:
     68 
     69     S  0      0------0   zero
     70     S  0      X------X   denormals
     71     S  1-7FE  any        normals
     72     S  7FF    0------0   infinity
     73     S  7FF    0X-----X   snan
     74     S  7FF    1X-----X   qnan
     75 
     76    Exponent is 11 bits, fractional part is 52 bits, and there is a
     77    sign bit, giving 64 in total.
     78 */
     79 
     80 
     81 static inline UInt read_bit_array ( UChar* arr, UInt n )
     82 {
     83    UChar c = arr[n >> 3];
     84    c >>= (n&7);
     85    return c & 1;
     86 }
     87 
     88 static inline void write_bit_array ( UChar* arr, UInt n, UInt b )
     89 {
     90    UChar c = arr[n >> 3];
     91    c = toUChar( c & ~(1 << (n&7)) );
     92    c = toUChar( c | ((b&1) << (n&7)) );
     93    arr[n >> 3] = c;
     94 }
     95 
     96 /* Convert an IEEE754 double (64-bit) into an x87 extended double
     97    (80-bit), mimicing the hardware fairly closely.  Both numbers are
     98    stored little-endian.  Limitations, all of which could be fixed,
     99    given some level of hassle:
    100 
    101    * Identity of NaNs is not preserved.
    102 
    103    See comments in the code for more details.
    104 */
    105 void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
    106 {
    107    Bool  mantissaIsZero;
    108    Int   bexp, i, j, shift;
    109    UChar sign;
    110 
    111    sign = toUChar( (f64[7] >> 7) & 1 );
    112    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
    113    bexp &= 0x7FF;
    114 
    115    mantissaIsZero = False;
    116    if (bexp == 0 || bexp == 0x7FF) {
    117       /* We'll need to know whether or not the mantissa (bits 51:0) is
    118          all zeroes in order to handle these cases.  So figure it
    119          out. */
    120       mantissaIsZero
    121          = toBool(
    122               (f64[6] & 0x0F) == 0
    123               && f64[5] == 0 && f64[4] == 0 && f64[3] == 0
    124               && f64[2] == 0 && f64[1] == 0 && f64[0] == 0
    125            );
    126    }
    127 
    128    /* If the exponent is zero, either we have a zero or a denormal.
    129       Produce a zero.  This is a hack in that it forces denormals to
    130       zero.  Could do better. */
    131    if (bexp == 0) {
    132       f80[9] = toUChar( sign << 7 );
    133       f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
    134              = f80[3] = f80[2] = f80[1] = f80[0] = 0;
    135 
    136       if (mantissaIsZero)
    137          /* It really is zero, so that's all we can do. */
    138          return;
    139 
    140       /* There is at least one 1-bit in the mantissa.  So it's a
    141          potentially denormalised double -- but we can produce a
    142          normalised long double.  Count the leading zeroes in the
    143          mantissa so as to decide how much to bump the exponent down
    144          by.  Note, this is SLOW. */
    145       shift = 0;
    146       for (i = 51; i >= 0; i--) {
    147         if (read_bit_array(f64, i))
    148            break;
    149         shift++;
    150       }
    151 
    152       /* and copy into place as many bits as we can get our hands on. */
    153       j = 63;
    154       for (i = 51 - shift; i >= 0; i--) {
    155          write_bit_array( f80, j,
    156      	 read_bit_array( f64, i ) );
    157          j--;
    158       }
    159 
    160       /* Set the exponent appropriately, and we're done. */
    161       bexp -= shift;
    162       bexp += (16383 - 1023);
    163       f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
    164       f80[8] = toUChar( bexp & 0xFF );
    165       return;
    166    }
    167 
    168    /* If the exponent is 7FF, this is either an Infinity, a SNaN or
    169       QNaN, as determined by examining bits 51:0, thus:
    170           0  ... 0    Inf
    171           0X ... X    SNaN
    172           1X ... X    QNaN
    173       where at least one of the Xs is not zero.
    174    */
    175    if (bexp == 0x7FF) {
    176       if (mantissaIsZero) {
    177          /* Produce an appropriately signed infinity:
    178             S 1--1 (15)  1  0--0 (63)
    179          */
    180          f80[9] = toUChar( (sign << 7) | 0x7F );
    181          f80[8] = 0xFF;
    182          f80[7] = 0x80;
    183          f80[6] = f80[5] = f80[4] = f80[3]
    184                 = f80[2] = f80[1] = f80[0] = 0;
    185          return;
    186       }
    187       /* So it's either a QNaN or SNaN.  Distinguish by considering
    188          bit 51.  Note, this destroys all the trailing bits
    189          (identity?) of the NaN.  IEEE754 doesn't require preserving
    190          these (it only requires that there be one QNaN value and one
    191          SNaN value), but x87 does seem to have some ability to
    192          preserve them.  Anyway, here, the NaN's identity is
    193          destroyed.  Could be improved. */
    194       if (f64[6] & 8) {
    195          /* QNaN.  Make a canonical QNaN:
    196             S 1--1 (15)  1 1  0--0 (62)
    197          */
    198          f80[9] = toUChar( (sign << 7) | 0x7F );
    199          f80[8] = 0xFF;
    200          f80[7] = 0xC0;
    201          f80[6] = f80[5] = f80[4] = f80[3]
    202                 = f80[2] = f80[1] = f80[0] = 0x00;
    203       } else {
    204          /* SNaN.  Make a SNaN:
    205             S 1--1 (15)  1 0  1--1 (62)
    206          */
    207          f80[9] = toUChar( (sign << 7) | 0x7F );
    208          f80[8] = 0xFF;
    209          f80[7] = 0xBF;
    210          f80[6] = f80[5] = f80[4] = f80[3]
    211                 = f80[2] = f80[1] = f80[0] = 0xFF;
    212       }
    213       return;
    214    }
    215 
    216    /* It's not a zero, denormal, infinity or nan.  So it must be a
    217       normalised number.  Rebias the exponent and build the new
    218       number.  */
    219    bexp += (16383 - 1023);
    220 
    221    f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
    222    f80[8] = toUChar( bexp & 0xFF );
    223    f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78)
    224                               | ((f64[5] >> 5) & 7) );
    225    f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) );
    226    f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) );
    227    f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) );
    228    f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) );
    229    f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) );
    230    f80[1] = toUChar( ((f64[0] << 3) & 0xF8) );
    231    f80[0] = toUChar( 0 );
    232 }
    233 
    234 
    235 /* Convert an x87 extended double (80-bit) into an IEEE 754 double
    236    (64-bit), mimicking the hardware fairly closely.  Both numbers are
    237    stored little-endian.  Limitations, both of which could be fixed,
    238    given some level of hassle:
    239 
    240    * Rounding following truncation could be a bit better.
    241 
    242    * Identity of NaNs is not preserved.
    243 
    244    See comments in the code for more details.
    245 */
    246 void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
    247 {
    248    Bool  isInf;
    249    Int   bexp, i, j;
    250    UChar sign;
    251 
    252    sign = toUChar((f80[9] >> 7) & 1);
    253    bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
    254    bexp &= 0x7FFF;
    255 
    256    /* If the exponent is zero, either we have a zero or a denormal.
    257       But an extended precision denormal becomes a double precision
    258       zero, so in either case, just produce the appropriately signed
    259       zero. */
    260    if (bexp == 0) {
    261       f64[7] = toUChar(sign << 7);
    262       f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
    263       return;
    264    }
    265 
    266    /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
    267       QNaN, as determined by examining bits 62:0, thus:
    268           10  ... 0    Inf
    269           10X ... X    SNaN
    270           11X ... X    QNaN
    271       where at least one of the Xs is not zero.
    272    */
    273    if (bexp == 0x7FFF) {
    274       isInf = toBool(
    275                  (f80[7] & 0x7F) == 0
    276                  && f80[6] == 0 && f80[5] == 0 && f80[4] == 0
    277                  && f80[3] == 0 && f80[2] == 0 && f80[1] == 0
    278                  && f80[0] == 0
    279               );
    280       if (isInf) {
    281          if (0 == (f80[7] & 0x80))
    282             goto wierd_NaN;
    283          /* Produce an appropriately signed infinity:
    284             S 1--1 (11)  0--0 (52)
    285          */
    286          f64[7] = toUChar((sign << 7) | 0x7F);
    287          f64[6] = 0xF0;
    288          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
    289          return;
    290       }
    291       /* So it's either a QNaN or SNaN.  Distinguish by considering
    292          bit 61.  Note, this destroys all the trailing bits
    293          (identity?) of the NaN.  IEEE754 doesn't require preserving
    294          these (it only requires that there be one QNaN value and one
    295          SNaN value), but x87 does seem to have some ability to
    296          preserve them.  Anyway, here, the NaN's identity is
    297          destroyed.  Could be improved. */
    298       if (f80[7] & 0x40) {
    299          /* QNaN.  Make a canonical QNaN:
    300             S 1--1 (11)  1  0--0 (51)
    301          */
    302          f64[7] = toUChar((sign << 7) | 0x7F);
    303          f64[6] = 0xF8;
    304          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0x00;
    305       } else {
    306          /* SNaN.  Make a SNaN:
    307             S 1--1 (11)  0  1--1 (51)
    308          */
    309          f64[7] = toUChar((sign << 7) | 0x7F);
    310          f64[6] = 0xF7;
    311          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
    312       }
    313       return;
    314    }
    315 
    316    /* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is
    317       zero, the x87 FPU appears to consider the number denormalised
    318       and converts it to a QNaN. */
    319    if (0 == (f80[7] & 0x80)) {
    320       wierd_NaN:
    321       /* Strange hardware QNaN:
    322          S 1--1 (11)  1  0--0 (51)
    323       */
    324       /* On a PIII, these QNaNs always appear with sign==1.  I have
    325          no idea why. */
    326       f64[7] = (1 /*sign*/ << 7) | 0x7F;
    327       f64[6] = 0xF8;
    328       f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
    329       return;
    330    }
    331 
    332    /* It's not a zero, denormal, infinity or nan.  So it must be a
    333       normalised number.  Rebias the exponent and consider. */
    334    bexp -= (16383 - 1023);
    335    if (bexp >= 0x7FF) {
    336       /* It's too big for a double.  Construct an infinity. */
    337       f64[7] = toUChar((sign << 7) | 0x7F);
    338       f64[6] = 0xF0;
    339       f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
    340       return;
    341    }
    342 
    343    if (bexp <= 0) {
    344       /* It's too small for a normalised double.  First construct a
    345          zero and then see if it can be improved into a denormal.  */
    346       f64[7] = toUChar(sign << 7);
    347       f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
    348 
    349       if (bexp < -52)
    350          /* Too small even for a denormal. */
    351          return;
    352 
    353       /* Ok, let's make a denormal.  Note, this is SLOW. */
    354       /* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
    355          indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */
    356       /* bexp is in range -52 .. 0 inclusive */
    357       for (i = 63; i >= 0; i--) {
    358          j = i - 12 + bexp;
    359          if (j < 0) break;
    360          /* We shouldn't really call vassert from generated code. */
    361          vassert(j >= 0 && j < 52);
    362          write_bit_array ( f64,
    363                            j,
    364                            read_bit_array ( f80, i ) );
    365       }
    366       /* and now we might have to round ... */
    367       if (read_bit_array(f80, 10+1 - bexp) == 1)
    368          goto do_rounding;
    369 
    370       return;
    371    }
    372 
    373    /* Ok, it's a normalised number which is representable as a double.
    374       Copy the exponent and mantissa into place. */
    375    /*
    376    for (i = 0; i < 52; i++)
    377       write_bit_array ( f64,
    378                         i,
    379                         read_bit_array ( f80, i+11 ) );
    380    */
    381    f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) );
    382    f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) );
    383    f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) );
    384    f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) );
    385    f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) );
    386    f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) );
    387 
    388    f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) );
    389 
    390    f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) );
    391 
    392    /* Now consider any rounding that needs to happen as a result of
    393       truncating the mantissa. */
    394    if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {
    395 
    396       /* If the bottom bits of f80 are "100 0000 0000", then the
    397          infinitely precise value is deemed to be mid-way between the
    398          two closest representable values.  Since we're doing
    399          round-to-nearest (the default mode), in that case it is the
    400          bit immediately above which indicates whether we should round
    401          upwards or not -- if 0, we don't.  All that is encapsulated
    402          in the following simple test. */
    403       if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
    404          return;
    405 
    406       do_rounding:
    407       /* Round upwards.  This is a kludge.  Once in every 2^24
    408          roundings (statistically) the bottom three bytes are all 0xFF
    409          and so we don't round at all.  Could be improved. */
    410       if (f64[0] != 0xFF) {
    411          f64[0]++;
    412       }
    413       else
    414       if (f64[0] == 0xFF && f64[1] != 0xFF) {
    415          f64[0] = 0;
    416          f64[1]++;
    417       }
    418       else
    419       if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {
    420          f64[0] = 0;
    421          f64[1] = 0;
    422          f64[2]++;
    423       }
    424       /* else we don't round, but we should. */
    425    }
    426 }
    427 
    428 
    429 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
    430 /* Extract the signed significand or exponent component as per
    431    fxtract.  Arg and result are doubles travelling under the guise of
    432    ULongs.  Returns significand when getExp is zero and exponent
    433    otherwise. */
    434 ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp )
    435 {
    436    ULong  uSig, uExp;
    437    /* Long   sSig; */
    438    Int    sExp, i;
    439    UInt   sign, expExp;
    440 
    441    /*
    442     S  7FF    0------0   infinity
    443     S  7FF    0X-----X   snan
    444     S  7FF    1X-----X   qnan
    445    */
    446    const ULong posInf  = 0x7FF0000000000000ULL;
    447    const ULong negInf  = 0xFFF0000000000000ULL;
    448    const ULong nanMask = 0x7FF0000000000000ULL;
    449    const ULong qNan    = 0x7FF8000000000000ULL;
    450    const ULong posZero = 0x0000000000000000ULL;
    451    const ULong negZero = 0x8000000000000000ULL;
    452    const ULong bit51   = 1ULL << 51;
    453    const ULong bit52   = 1ULL << 52;
    454    const ULong sigMask = bit52 - 1;
    455 
    456    /* Mimic Core i5 behaviour for special cases. */
    457    if (arg == posInf)
    458       return getExp ? posInf : posInf;
    459    if (arg == negInf)
    460       return getExp ? posInf : negInf;
    461    if ((arg & nanMask) == nanMask)
    462       return qNan | (arg & (1ULL << 63));
    463    if (arg == posZero)
    464       return getExp ? negInf : posZero;
    465    if (arg == negZero)
    466       return getExp ? negInf : negZero;
    467 
    468    /* Split into sign, exponent and significand. */
    469    sign = ((UInt)(arg >> 63)) & 1;
    470 
    471    /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */
    472    uSig = arg & sigMask;
    473 
    474    /* Get the exponent. */
    475    sExp = ((Int)(arg >> 52)) & 0x7FF;
    476 
    477    /* Deal with denormals: if the exponent is zero, then the
    478       significand cannot possibly be zero (negZero/posZero are handled
    479       above).  Shift the significand left until bit 51 of it becomes
    480       1, and decrease the exponent accordingly.
    481    */
    482    if (sExp == 0) {
    483       for (i = 0; i < 52; i++) {
    484          if (uSig & bit51)
    485             break;
    486          uSig <<= 1;
    487          sExp--;
    488       }
    489       uSig <<= 1;
    490    } else {
    491       /* Add the implied leading-1 in the significand. */
    492       uSig |= bit52;
    493    }
    494 
    495    /* Roll in the sign. */
    496    /* sSig = uSig; */
    497    /* if (sign) sSig =- sSig; */
    498 
    499    /* Convert sig into a double.  This should be an exact conversion.
    500       Then divide by 2^52, which should give a value in the range 1.0
    501       to 2.0-epsilon, at least for normalised args. */
    502    /* dSig = (Double)sSig; */
    503    /* dSig /= 67108864.0;  */ /* 2^26 */
    504    /* dSig /= 67108864.0;  */ /* 2^26 */
    505    uSig &= sigMask;
    506    uSig |= 0x3FF0000000000000ULL;
    507    if (sign)
    508       uSig ^= negZero;
    509 
    510    /* Convert exp into a double.  Also an exact conversion. */
    511    /* dExp = (Double)(sExp - 1023); */
    512    sExp -= 1023;
    513    if (sExp == 0) {
    514       uExp = 0;
    515    } else {
    516       uExp   = sExp < 0 ? -sExp : sExp;
    517       expExp = 0x3FF +52;
    518       /* 1 <= uExp <= 1074 */
    519       /* Skip first 42 iterations of normalisation loop as we know they
    520          will always happen */
    521       uExp <<= 42;
    522       expExp -= 42;
    523       for (i = 0; i < 52-42; i++) {
    524          if (uExp & bit52)
    525             break;
    526          uExp <<= 1;
    527          expExp--;
    528       }
    529       uExp &= sigMask;
    530       uExp |= ((ULong)expExp) << 52;
    531       if (sExp < 0) uExp ^= negZero;
    532    }
    533 
    534    return getExp ? uExp : uSig;
    535 }
    536 
    537 
    538 
    539 /*---------------------------------------------------------*/
    540 /*--- SSE4.2 PCMP{E,I}STR{I,M} helpers                  ---*/
    541 /*---------------------------------------------------------*/
    542 
    543 /* We need the definitions for OSZACP eflags/rflags offsets.
    544    #including guest_{amd64,x86}_defs.h causes chaos, so just copy the
    545    required values directly.  They are not going to change in the
    546    foreseeable future :-)
    547 */
    548 
/* Bit positions of the O, S, Z, A, C and P condition flags within
   eflags/rflags (overflow, sign, zero, auxiliary carry, carry and
   parity respectively in x86 terminology). */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks selecting each of the flags above. */
#define MASK_O    (1 << SHIFT_O)
#define MASK_S    (1 << SHIFT_S)
#define MASK_Z    (1 << SHIFT_Z)
#define MASK_A    (1 << SHIFT_A)
#define MASK_C    (1 << SHIFT_C)
#define MASK_P    (1 << SHIFT_P)
    562 
    563 
    564 /* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
    565    Delight. */
    566 static UInt clz32 ( UInt x )
    567 {
    568    Int y, m, n;
    569    y = -(x >> 16);
    570    m = (y >> 16) & 16;
    571    n = 16 - m;
    572    x = x >> m;
    573    y = x - 0x100;
    574    m = (y >> 16) & 8;
    575    n = n + m;
    576    x = x << m;
    577    y = x - 0x1000;
    578    m = (y >> 16) & 4;
    579    n = n + m;
    580    x = x << m;
    581    y = x - 0x4000;
    582    m = (y >> 16) & 2;
    583    n = n + m;
    584    x = x << m;
    585    y = x >> 14;
    586    m = y & ~(y >> 1);
    587    return n + 2 - m;
    588 }
    589 
    590 static UInt ctz32 ( UInt x )
    591 {
    592    return 32 - clz32((~x) & (x-1));
    593 }
    594 
    595 /* Convert a 4-bit value to a 32-bit value by cloning each bit 8
    596    times.  There's surely a better way to do this, but I don't know
    597    what it is. */
    598 static UInt bits4_to_bytes4 ( UInt bits4 )
    599 {
    600    UInt r = 0;
    601    r |= (bits4 & 1) ? 0x000000FF : 0;
    602    r |= (bits4 & 2) ? 0x0000FF00 : 0;
    603    r |= (bits4 & 4) ? 0x00FF0000 : 0;
    604    r |= (bits4 & 8) ? 0xFF000000 : 0;
    605    return r;
    606 }
    607 
    608 
    609 /* Convert a 2-bit value to a 32-bit value by cloning each bit 16
    610    times.  There's surely a better way to do this, but I don't know
    611    what it is. */
    612 static UInt bits2_to_bytes4 ( UInt bits2 )
    613 {
    614    UInt r = 0;
    615    r |= (bits2 & 1) ? 0x0000FFFF : 0;
    616    r |= (bits2 & 2) ? 0xFFFF0000 : 0;
    617    return r;
    618 }
    619 
    620 
    621 /* Given partial results from a pcmpXstrX operation (intRes1,
    622    basically), generate an I- or M-format output value, also the new
    623    OSZACP flags.  */
    624 static
    625 void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV,
    626                                    /*OUT*/UInt* resOSZACP,
    627                                    UInt intRes1,
    628                                    UInt zmaskL, UInt zmaskR,
    629                                    UInt validL,
    630                                    UInt pol, UInt idx,
    631                                    Bool isxSTRM )
    632 {
    633    vassert((pol >> 2) == 0);
    634    vassert((idx >> 1) == 0);
    635 
    636    UInt intRes2 = 0;
    637    switch (pol) {
    638       case 0: intRes2 = intRes1;          break; // pol +
    639       case 1: intRes2 = ~intRes1;         break; // pol -
    640       case 2: intRes2 = intRes1;          break; // pol m+
    641       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    642    }
    643    intRes2 &= 0xFFFF;
    644 
    645    if (isxSTRM) {
    646 
    647       // generate M-format output (a bit or byte mask in XMM0)
    648       if (idx) {
    649          resV->w32[0] = bits4_to_bytes4( (intRes2 >>  0) & 0xF );
    650          resV->w32[1] = bits4_to_bytes4( (intRes2 >>  4) & 0xF );
    651          resV->w32[2] = bits4_to_bytes4( (intRes2 >>  8) & 0xF );
    652          resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF );
    653       } else {
    654          resV->w32[0] = intRes2 & 0xFFFF;
    655          resV->w32[1] = 0;
    656          resV->w32[2] = 0;
    657          resV->w32[3] = 0;
    658       }
    659 
    660    } else {
    661 
    662       // generate I-format output (an index in ECX)
    663       // generate ecx value
    664       UInt newECX = 0;
    665       if (idx) {
    666          // index of ms-1-bit
    667          newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
    668       } else {
    669          // index of ls-1-bit
    670          newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
    671       }
    672 
    673       resV->w32[0] = newECX;
    674       resV->w32[1] = 0;
    675       resV->w32[2] = 0;
    676       resV->w32[3] = 0;
    677 
    678    }
    679 
    680    // generate new flags, common to all ISTRI and ISTRM cases
    681    *resOSZACP    // A, P are zero
    682      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    683      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    684      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    685      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    686 }
    687 
    688 
    689 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
    690    basically), generate an I- or M-format output value, also the new
    691    OSZACP flags.  */
    692 static
    693 void compute_PCMPxSTRx_gen_output_wide (/*OUT*/V128* resV,
    694                                         /*OUT*/UInt* resOSZACP,
    695                                         UInt intRes1,
    696                                         UInt zmaskL, UInt zmaskR,
    697                                         UInt validL,
    698                                         UInt pol, UInt idx,
    699                                         Bool isxSTRM )
    700 {
    701    vassert((pol >> 2) == 0);
    702    vassert((idx >> 1) == 0);
    703 
    704    UInt intRes2 = 0;
    705    switch (pol) {
    706       case 0: intRes2 = intRes1;          break; // pol +
    707       case 1: intRes2 = ~intRes1;         break; // pol -
    708       case 2: intRes2 = intRes1;          break; // pol m+
    709       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    710    }
    711    intRes2 &= 0xFF;
    712 
    713    if (isxSTRM) {
    714 
    715       // generate M-format output (a bit or byte mask in XMM0)
    716       if (idx) {
    717          resV->w32[0] = bits2_to_bytes4( (intRes2 >> 0) & 0x3 );
    718          resV->w32[1] = bits2_to_bytes4( (intRes2 >> 2) & 0x3 );
    719          resV->w32[2] = bits2_to_bytes4( (intRes2 >> 4) & 0x3 );
    720          resV->w32[3] = bits2_to_bytes4( (intRes2 >> 6) & 0x3 );
    721       } else {
    722          resV->w32[0] = intRes2 & 0xFF;
    723          resV->w32[1] = 0;
    724          resV->w32[2] = 0;
    725          resV->w32[3] = 0;
    726       }
    727 
    728    } else {
    729 
    730       // generate I-format output (an index in ECX)
    731       // generate ecx value
    732       UInt newECX = 0;
    733       if (idx) {
    734          // index of ms-1-bit
    735          newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
    736       } else {
    737          // index of ls-1-bit
    738          newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
    739       }
    740 
    741       resV->w32[0] = newECX;
    742       resV->w32[1] = 0;
    743       resV->w32[2] = 0;
    744       resV->w32[3] = 0;
    745 
    746    }
    747 
    748    // generate new flags, common to all ISTRI and ISTRM cases
    749    *resOSZACP    // A, P are zero
    750      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    751      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    752      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    753      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    754 }
    755 
    756 
    757 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    758    variants on 8-bit data.
    759 
    760    For xSTRI variants, the new ECX value is placed in the 32 bits
    761    pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
    762    variants, the result is a 128 bit value and is placed at *resV in
    763    the obvious way.
    764 
    765    For all variants, the new OSZACP value is placed at *resOSZACP.
    766 
    767    argLV and argRV are the vector args.  The caller must prepare a
    768    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
    769    must be 1 for each zero byte of of the respective arg.  For ESTRx
    770    variants this is derived from the explicit length indication, and
    771    must be 0 in all places except at the bit index corresponding to
    772    the valid length (0 .. 16).  If the valid length is 16 then the
    773    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
    774 
   imm8 is the original immediate from the instruction.  isxSTRM
   indicates whether this is a xSTRM or xSTRI variant, which controls
   how much of *resV is written.

   If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *resV nor *resOSZACP are
   altered.
    782 */
    783 
    784 Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,
    785                          /*OUT*/UInt* resOSZACP,
    786                          V128* argLV,  V128* argRV,
    787                          UInt zmaskL, UInt zmaskR,
    788                          UInt imm8,   Bool isxSTRM )
    789 {
    790    vassert(imm8 < 0x80);
    791    vassert((zmaskL >> 16) == 0);
    792    vassert((zmaskR >> 16) == 0);
    793 
    794    /* Explicitly reject any imm8 values that haven't been validated,
    795       even if they would probably work.  Life is too short to have
    796       unvalidated cases in the code base. */
    797    switch (imm8) {
    798       case 0x00: case 0x02:
    799       case 0x08: case 0x0A: case 0x0C: case 0x0E:
    800       case 0x10: case 0x12: case 0x14:
    801       case 0x18: case 0x1A:
    802       case 0x30:            case 0x34:
    803       case 0x38: case 0x3A:
    804       case 0x40: case 0x42: case 0x44: case 0x46:
    805                  case 0x4A:
    806                  case 0x62:
    807       case 0x70: case 0x72:
    808          break;
    809       default:
    810          return False;
    811    }
    812 
    813    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    814    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    815    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    816    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    817 
    818    /*----------------------------------------*/
    819    /*-- strcmp on byte data                --*/
    820    /*----------------------------------------*/
    821 
    822    if (agg == 2/*equal each, aka strcmp*/
    823        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
    824       Int    i;
    825       UChar* argL = (UChar*)argLV;
    826       UChar* argR = (UChar*)argRV;
    827       UInt boolResII = 0;
    828       for (i = 15; i >= 0; i--) {
    829          UChar cL  = argL[i];
    830          UChar cR  = argR[i];
    831          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    832       }
    833       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    834       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    835 
    836       // do invalidation, common to all equal-each cases
    837       UInt intRes1
    838          = (boolResII & validL & validR)  // if both valid, use cmpres
    839            | (~ (validL | validR));       // if both invalid, force 1
    840                                           // else force 0
    841       intRes1 &= 0xFFFF;
    842 
    843       // generate I-format output
    844       compute_PCMPxSTRx_gen_output(
    845          resV, resOSZACP,
    846          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
    847       );
    848 
    849       return True;
    850    }
    851 
    852    /*----------------------------------------*/
    853    /*-- set membership on byte data        --*/
    854    /*----------------------------------------*/
    855 
    856    if (agg == 0/*equal any, aka find chars in a set*/
    857        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
    858       /* argL: the string,  argR: charset */
    859       UInt   si, ci;
    860       UChar* argL    = (UChar*)argLV;
    861       UChar* argR    = (UChar*)argRV;
    862       UInt   boolRes = 0;
    863       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    864       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    865 
    866       for (si = 0; si < 16; si++) {
    867          if ((validL & (1 << si)) == 0)
    868             // run off the end of the string.
    869             break;
    870          UInt m = 0;
    871          for (ci = 0; ci < 16; ci++) {
    872             if ((validR & (1 << ci)) == 0) break;
    873             if (argR[ci] == argL[si]) { m = 1; break; }
    874          }
    875          boolRes |= (m << si);
    876       }
    877 
    878       // boolRes is "pre-invalidated"
    879       UInt intRes1 = boolRes & 0xFFFF;
    880 
    881       // generate I-format output
    882       compute_PCMPxSTRx_gen_output(
    883          resV, resOSZACP,
    884          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
    885       );
    886 
    887       return True;
    888    }
    889 
    890    /*----------------------------------------*/
    891    /*-- substring search on byte data      --*/
    892    /*----------------------------------------*/
    893 
    894    if (agg == 3/*equal ordered, aka substring search*/
    895        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
    896 
    897       /* argL: haystack,  argR: needle */
    898       UInt   ni, hi;
    899       UChar* argL    = (UChar*)argLV;
    900       UChar* argR    = (UChar*)argRV;
    901       UInt   boolRes = 0;
    902       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    903       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    904       for (hi = 0; hi < 16; hi++) {
    905          UInt m = 1;
    906          for (ni = 0; ni < 16; ni++) {
    907             if ((validR & (1 << ni)) == 0) break;
    908             UInt i = ni + hi;
    909             if (i >= 16) break;
    910             if (argL[i] != argR[ni]) { m = 0; break; }
    911          }
    912          boolRes |= (m << hi);
    913          if ((validL & (1 << hi)) == 0)
    914             // run off the end of the haystack
    915             break;
    916       }
    917 
    918       // boolRes is "pre-invalidated"
    919       UInt intRes1 = boolRes & 0xFFFF;
    920 
    921       // generate I-format output
    922       compute_PCMPxSTRx_gen_output(
    923          resV, resOSZACP,
    924          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
    925       );
    926 
    927       return True;
    928    }
    929 
    930    /*----------------------------------------*/
    931    /*-- ranges, unsigned byte data         --*/
    932    /*----------------------------------------*/
    933 
    934    if (agg == 1/*ranges*/
    935        && fmt == 0/*ub*/) {
    936 
    937       /* argL: string,  argR: range-pairs */
    938       UInt   ri, si;
    939       UChar* argL    = (UChar*)argLV;
    940       UChar* argR    = (UChar*)argRV;
    941       UInt   boolRes = 0;
    942       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    943       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    944       for (si = 0; si < 16; si++) {
    945          if ((validL & (1 << si)) == 0)
    946             // run off the end of the string
    947             break;
    948          UInt m = 0;
    949          for (ri = 0; ri < 16; ri += 2) {
    950             if ((validR & (3 << ri)) != (3 << ri)) break;
    951             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    952                m = 1; break;
    953             }
    954          }
    955          boolRes |= (m << si);
    956       }
    957 
    958       // boolRes is "pre-invalidated"
    959       UInt intRes1 = boolRes & 0xFFFF;
    960 
    961       // generate I-format output
    962       compute_PCMPxSTRx_gen_output(
    963          resV, resOSZACP,
    964          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
    965       );
    966 
    967       return True;
    968    }
    969 
    970    /*----------------------------------------*/
    971    /*-- ranges, signed byte data           --*/
    972    /*----------------------------------------*/
    973 
    974    if (agg == 1/*ranges*/
    975        && fmt == 2/*sb*/) {
    976 
    977       /* argL: string,  argR: range-pairs */
    978       UInt   ri, si;
    979       Char*  argL    = (Char*)argLV;
    980       Char*  argR    = (Char*)argRV;
    981       UInt   boolRes = 0;
    982       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    983       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    984       for (si = 0; si < 16; si++) {
    985          if ((validL & (1 << si)) == 0)
    986             // run off the end of the string
    987             break;
    988          UInt m = 0;
    989          for (ri = 0; ri < 16; ri += 2) {
    990             if ((validR & (3 << ri)) != (3 << ri)) break;
    991             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    992                m = 1; break;
    993             }
    994          }
    995          boolRes |= (m << si);
    996       }
    997 
    998       // boolRes is "pre-invalidated"
    999       UInt intRes1 = boolRes & 0xFFFF;
   1000 
   1001       // generate I-format output
   1002       compute_PCMPxSTRx_gen_output(
   1003          resV, resOSZACP,
   1004          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
   1005       );
   1006 
   1007       return True;
   1008    }
   1009 
   1010    return False;
   1011 }
   1012 
   1013 
   1014 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
   1015    variants on 16-bit characters.
   1016 
   1017    For xSTRI variants, the new ECX value is placed in the 32 bits
   1018    pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
   1019    variants, the result is a 128 bit value and is placed at *resV in
   1020    the obvious way.
   1021 
   1022    For all variants, the new OSZACP value is placed at *resOSZACP.
   1023 
   1024    argLV and argRV are the vector args.  The caller must prepare an
   1025    8-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   1026    must be 1 for each zero 16-bit character of the respective arg.  For ESTRx
   1027    variants this is derived from the explicit length indication, and
   1028    must be 0 in all places except at the bit index corresponding to
   1029    the valid length (0 .. 8).  If the valid length is 8 then the
   1030    mask must be all zeroes.  In all cases, bits 31:8 must be zero.
   1031 
   1032    imm8 is the original immediate from the instruction.  isxSTRM
   1033    indicates whether this is an xSTRM or xSTRI variant, which controls
   1034    how much of *resV is written.
   1035 
   1036    If the given imm8 case can be handled, the return value is True.
   1037    If not, False is returned, and neither *resV nor *resOSZACP is
   1038    altered.
   1039 */
   1040 
   1041 Bool compute_PCMPxSTRx_wide ( /*OUT*/V128* resV,
   1042                               /*OUT*/UInt* resOSZACP,
   1043                               V128* argLV,  V128* argRV,
   1044                               UInt zmaskL, UInt zmaskR,
   1045                               UInt imm8,   Bool isxSTRM )
   1046 {
   1047    vassert(imm8 < 0x80);
   1048    vassert((zmaskL >> 8) == 0);
   1049    vassert((zmaskR >> 8) == 0);
   1050 
   1051    /* Explicitly reject any imm8 values that haven't been validated,
   1052       even if they would probably work.  Life is too short to have
   1053       unvalidated cases in the code base. */
   1054    switch (imm8) {
   1055       case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
   1056                  case 0x13: case 0x19: case 0x1B:
   1057                             case 0x39: case 0x3B:
   1058                  case 0x45:            case 0x4B:
   1059          break;
   1060       default:
   1061          return False;
   1062    }
   1063 
   1064    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
   1065    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
   1066    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
   1067    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
   1068 
   1069    /*----------------------------------------*/
   1070    /*-- strcmp on wide data                --*/
   1071    /*----------------------------------------*/
   1072 
   1073    if (agg == 2/*equal each, aka strcmp*/
   1074        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
   1075       Int     i;
   1076       UShort* argL = (UShort*)argLV;
   1077       UShort* argR = (UShort*)argRV;
   1078       UInt boolResII = 0;
   1079       for (i = 7; i >= 0; i--) {
   1080          UShort cL  = argL[i];
   1081          UShort cR  = argR[i];
   1082          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
   1083       }
   1084       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1085       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1086 
   1087       // do invalidation, common to all equal-each cases
   1088       UInt intRes1
   1089          = (boolResII & validL & validR)  // if both valid, use cmpres
   1090            | (~ (validL | validR));       // if both invalid, force 1
   1091                                           // else force 0
   1092       intRes1 &= 0xFF;
   1093 
   1094       // generate I-format output
   1095       compute_PCMPxSTRx_gen_output_wide(
   1096          resV, resOSZACP,
   1097          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
   1098       );
   1099 
   1100       return True;
   1101    }
   1102 
   1103    /*----------------------------------------*/
   1104    /*-- set membership on wide data        --*/
   1105    /*----------------------------------------*/
   1106 
   1107    if (agg == 0/*equal any, aka find chars in a set*/
   1108        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
   1109       /* argL: the string,  argR: charset */
   1110       UInt    si, ci;
   1111       UShort* argL    = (UShort*)argLV;
   1112       UShort* argR    = (UShort*)argRV;
   1113       UInt    boolRes = 0;
   1114       UInt    validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1115       UInt    validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1116 
   1117       for (si = 0; si < 8; si++) {
   1118          if ((validL & (1 << si)) == 0)
   1119             // run off the end of the string.
   1120             break;
   1121          UInt m = 0;
   1122          for (ci = 0; ci < 8; ci++) {
   1123             if ((validR & (1 << ci)) == 0) break;
   1124             if (argR[ci] == argL[si]) { m = 1; break; }
   1125          }
   1126          boolRes |= (m << si);
   1127       }
   1128 
   1129       // boolRes is "pre-invalidated"
   1130       UInt intRes1 = boolRes & 0xFF;
   1131 
   1132       // generate I-format output
   1133       compute_PCMPxSTRx_gen_output_wide(
   1134          resV, resOSZACP,
   1135          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
   1136       );
   1137 
   1138       return True;
   1139    }
   1140 
   1141    /*----------------------------------------*/
   1142    /*-- substring search on wide data      --*/
   1143    /*----------------------------------------*/
   1144 
   1145    if (agg == 3/*equal ordered, aka substring search*/
   1146        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
   1147 
   1148       /* argL: haystack,  argR: needle */
   1149       UInt    ni, hi;
   1150       UShort* argL    = (UShort*)argLV;
   1151       UShort* argR    = (UShort*)argRV;
   1152       UInt    boolRes = 0;
   1153       UInt    validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1154       UInt    validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1155       for (hi = 0; hi < 8; hi++) {
   1156          UInt m = 1;
   1157          for (ni = 0; ni < 8; ni++) {
   1158             if ((validR & (1 << ni)) == 0) break;
   1159             UInt i = ni + hi;
   1160             if (i >= 8) break;
   1161             if (argL[i] != argR[ni]) { m = 0; break; }
   1162          }
   1163          boolRes |= (m << hi);
   1164          if ((validL & (1 << hi)) == 0)
   1165             // run off the end of the haystack
   1166             break;
   1167       }
   1168 
   1169       // boolRes is "pre-invalidated"
   1170       UInt intRes1 = boolRes & 0xFF;
   1171 
   1172       // generate I-format output
   1173       compute_PCMPxSTRx_gen_output_wide(
   1174          resV, resOSZACP,
   1175          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
   1176       );
   1177 
   1178       return True;
   1179    }
   1180 
   1181    /*----------------------------------------*/
   1182    /*-- ranges, unsigned wide data         --*/
   1183    /*----------------------------------------*/
   1184 
   1185    if (agg == 1/*ranges*/
   1186        && fmt == 1/*uw*/) {
   1187 
   1188       /* argL: string,  argR: range-pairs */
   1189       UInt    ri, si;
   1190       UShort* argL    = (UShort*)argLV;
   1191       UShort* argR    = (UShort*)argRV;
   1192       UInt    boolRes = 0;
   1193       UInt    validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1194       UInt    validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1195       for (si = 0; si < 8; si++) {
   1196          if ((validL & (1 << si)) == 0)
   1197             // run off the end of the string
   1198             break;
   1199          UInt m = 0;
   1200          for (ri = 0; ri < 8; ri += 2) {
   1201             if ((validR & (3 << ri)) != (3 << ri)) break;
   1202             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
   1203                m = 1; break;
   1204             }
   1205          }
   1206          boolRes |= (m << si);
   1207       }
   1208 
   1209       // boolRes is "pre-invalidated"
   1210       UInt intRes1 = boolRes & 0xFF;
   1211 
   1212       // generate I-format output
   1213       compute_PCMPxSTRx_gen_output_wide(
   1214          resV, resOSZACP,
   1215          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
   1216       );
   1217 
   1218       return True;
   1219    }
   1220 
   1221    return False;
   1222 }
   1223 
   1224 
   1225 /*---------------------------------------------------------------*/
   1226 /*--- end                                 guest_generic_x87.c ---*/
   1227 /*---------------------------------------------------------------*/
   1228