Home | History | Annotate | Download | only in amd64
      1 
      2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
      3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
      4    aspect. */
      5 
      6 #include <string.h>
      7 #include <stdio.h>
      8 #include <assert.h>
      9 
/* Short aliases for the integer types used throughout this file.
   NOTE(review): clz32/ctz32 and the 0xFFFF masking below treat UInt
   as a 32-bit quantity -- holds on amd64, confirm elsewhere. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  unsigned long long int ULong;
/* Boolean stored in a single unsigned byte; use False/True below. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
     17 
     18 //typedef  unsigned char  V128[16];
/* One vector operand: the same 16 bytes of storage viewed either as
   individual bytes (uChar) or as four UInt words (uInt).  uInt[0] is
   where the xSTRI index result is read back from. */
typedef
   union {
      UChar uChar[16];
      UInt  uInt[4];
   }
   V128;
     25 
/* Bit positions of the O, S, Z, A, C and P condition flags.  Taken
   together they select bits 11, 7, 6, 4, 2 and 0, i.e. the same bits
   as the 0x8D5 mask the h_* routines apply to the captured flags. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks for the flags above (type unsigned long long). */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
     39 
     40 
     41 UInt clz32 ( UInt x )
     42 {
     43    Int y, m, n;
     44    y = -(x >> 16);
     45    m = (y >> 16) & 16;
     46    n = 16 - m;
     47    x = x >> m;
     48    y = x - 0x100;
     49    m = (y >> 16) & 8;
     50    n = n + m;
     51    x = x << m;
     52    y = x - 0x1000;
     53    m = (y >> 16) & 4;
     54    n = n + m;
     55    x = x << m;
     56    y = x - 0x4000;
     57    m = (y >> 16) & 2;
     58    n = n + m;
     59    x = x << m;
     60    y = x >> 14;
     61    m = y & ~(y >> 1);
     62    return n + 2 - m;
     63 }
     64 
     65 UInt ctz32 ( UInt x )
     66 {
     67    return 32 - clz32((~x) & (x-1));
     68 }
     69 
     70 void expand ( V128* dst, char* summary )
     71 {
     72    Int i;
     73    assert( strlen(summary) == 16 );
     74    for (i = 0; i < 16; i++) {
     75       UChar xx = 0;
     76       UChar x = summary[15-i];
     77       if      (x >= '0' && x <= '9') { xx = x - '0'; }
     78       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
     79       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
     80       else assert(0);
     81 
     82       assert(xx < 16);
     83       xx = (xx << 4) | xx;
     84       assert(xx < 256);
     85       dst->uChar[i] = xx;
     86    }
     87 }
     88 
     89 void try_istri ( char* which,
     90                  UInt(*h_fn)(V128*,V128*),
     91                  UInt(*s_fn)(V128*,V128*),
     92                  char* summL, char* summR )
     93 {
     94    assert(strlen(which) == 2);
     95    V128 argL, argR;
     96    expand(&argL, summL);
     97    expand(&argR, summR);
     98    UInt h_res = h_fn(&argL, &argR);
     99    UInt s_res = s_fn(&argL, &argR);
    100    printf("istri %s  %s %s -> %08x %08x %s\n",
    101           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
    102 }
    103 
    104 UInt zmask_from_V128 ( V128* arg )
    105 {
    106    UInt i, res = 0;
    107    for (i = 0; i < 16; i++) {
    108       res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
    109    }
    110    return res;
    111 }
    112 
    113 //////////////////////////////////////////////////////////
    114 //                                                      //
    115 //                       GENERAL                        //
    116 //                                                      //
    117 //////////////////////////////////////////////////////////
    118 
    119 
    120 /* Given partial results from a pcmpXstrX operation (intRes1,
    121    basically), generate an I format (index value for ECX) output, and
    122    also the new OSZACP flags.
    123 */
    124 static
    125 void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
    126                                     /*OUT*/UInt* resOSZACP,
    127                                     UInt intRes1,
    128                                     UInt zmaskL, UInt zmaskR,
    129                                     UInt validL,
    130                                     UInt pol, UInt idx )
    131 {
    132    assert((pol >> 2) == 0);
    133    assert((idx >> 1) == 0);
    134 
    135    UInt intRes2 = 0;
    136    switch (pol) {
    137       case 0: intRes2 = intRes1;          break; // pol +
    138       case 1: intRes2 = ~intRes1;         break; // pol -
    139       case 2: intRes2 = intRes1;          break; // pol m+
    140       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    141    }
    142    intRes2 &= 0xFFFF;
    143 
    144    // generate ecx value
    145    UInt newECX = 0;
    146    if (idx) {
    147      // index of ms-1-bit
    148      newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
    149    } else {
    150      // index of ls-1-bit
    151      newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
    152    }
    153 
    154    *(UInt*)(&resV[0]) = newECX;
    155 
    156    // generate new flags, common to all ISTRI and ISTRM cases
    157    *resOSZACP    // A, P are zero
    158      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    159      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    160      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    161      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    162 }
    163 
    164 
    165 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    166    variants.
    167 
    168    For xSTRI variants, the new ECX value is placed in the 32 bits
    169    pointed to by *resV.  For xSTRM variants, the result is a 128 bit
    170    value and is placed at *resV in the obvious way.
    171 
    172    For all variants, the new OSZACP value is placed at *resOSZACP.
    173 
   argLV and argRV are the vector args.  The caller must prepare a
   16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   must be 1 for each zero byte of the respective arg.  For ESTRx
   variants this is derived from the explicit length indication, and
   must be 0 in all places except at the bit index corresponding to
   the valid length (0 .. 16).  If the valid length is 16 then the
   mask must be all zeroes.  In all cases, bits 31:16 must be zero.
    181 
   imm8 is the original immediate from the instruction.  isSTRM
   indicates whether this is a xSTRM or xSTRI variant, which controls
   how much of *resV is written.
    185 
   If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *resV nor *resOSZACP are
   altered.
    189 */
    190 
    191 Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
    192                      /*OUT*/UInt* resOSZACP,
    193                      V128* argLV,  V128* argRV,
    194                      UInt zmaskL, UInt zmaskR,
    195                      UInt imm8,   Bool isSTRM )
    196 {
    197    assert(imm8 < 0x80);
    198    assert((zmaskL >> 16) == 0);
    199    assert((zmaskR >> 16) == 0);
    200 
    201    /* Explicitly reject any imm8 values that haven't been validated,
    202       even if they would probably work.  Life is too short to have
    203       unvalidated cases in the code base. */
    204    switch (imm8) {
    205       case 0x00:
    206       case 0x02: case 0x08: case 0x0C: case 0x12: case 0x1A:
    207       case 0x38: case 0x3A: case 0x44: case 0x4A:
    208          break;
    209       default:
    210          return False;
    211    }
    212 
    213    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    214    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    215    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    216    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    217 
    218    /*----------------------------------------*/
    219    /*-- strcmp on byte data                --*/
    220    /*----------------------------------------*/
    221 
    222    if (agg == 2/*equal each, aka strcmp*/
    223        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    224        && !isSTRM) {
    225       Int    i;
    226       UChar* argL = (UChar*)argLV;
    227       UChar* argR = (UChar*)argRV;
    228       UInt boolResII = 0;
    229       for (i = 15; i >= 0; i--) {
    230          UChar cL  = argL[i];
    231          UChar cR  = argR[i];
    232          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    233       }
    234       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    235       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    236 
    237       // do invalidation, common to all equal-each cases
    238       UInt intRes1
    239          = (boolResII & validL & validR)  // if both valid, use cmpres
    240            | (~ (validL | validR));       // if both invalid, force 1
    241                                           // else force 0
    242       intRes1 &= 0xFFFF;
    243 
    244       // generate I-format output
    245       pcmpXstrX_WRK_gen_output_fmt_I(
    246          resV, resOSZACP,
    247          intRes1, zmaskL, zmaskR, validL, pol, idx
    248       );
    249 
    250       return True;
    251    }
    252 
    253    /*----------------------------------------*/
    254    /*-- set membership on byte data        --*/
    255    /*----------------------------------------*/
    256 
    257    if (agg == 0/*equal any, aka find chars in a set*/
    258        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    259        && !isSTRM) {
    260       /* argL: the string,  argR: charset */
    261       UInt   si, ci;
    262       UChar* argL    = (UChar*)argLV;
    263       UChar* argR    = (UChar*)argRV;
    264       UInt   boolRes = 0;
    265       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    266       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    267 
    268       for (si = 0; si < 16; si++) {
    269          if ((validL & (1 << si)) == 0)
    270             // run off the end of the string.
    271             break;
    272          UInt m = 0;
    273          for (ci = 0; ci < 16; ci++) {
    274             if ((validR & (1 << ci)) == 0) break;
    275             if (argR[ci] == argL[si]) { m = 1; break; }
    276          }
    277          boolRes |= (m << si);
    278       }
    279 
    280       // boolRes is "pre-invalidated"
    281       UInt intRes1 = boolRes & 0xFFFF;
    282 
    283       // generate I-format output
    284       pcmpXstrX_WRK_gen_output_fmt_I(
    285          resV, resOSZACP,
    286          intRes1, zmaskL, zmaskR, validL, pol, idx
    287       );
    288 
    289       return True;
    290    }
    291 
    292    /*----------------------------------------*/
    293    /*-- substring search on byte data      --*/
    294    /*----------------------------------------*/
    295 
    296    if (agg == 3/*equal ordered, aka substring search*/
    297        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    298        && !isSTRM) {
    299 
    300       /* argL: haystack,  argR: needle */
    301       UInt   ni, hi;
    302       UChar* argL    = (UChar*)argLV;
    303       UChar* argR    = (UChar*)argRV;
    304       UInt   boolRes = 0;
    305       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    306       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    307       for (hi = 0; hi < 16; hi++) {
    308          if ((validL & (1 << hi)) == 0)
    309             // run off the end of the haystack
    310             break;
    311          UInt m = 1;
    312          for (ni = 0; ni < 16; ni++) {
    313             if ((validR & (1 << ni)) == 0) break;
    314             UInt i = ni + hi;
    315             if (i >= 16) break;
    316             if (argL[i] != argR[ni]) { m = 0; break; }
    317          }
    318          boolRes |= (m << hi);
    319       }
    320 
    321       // boolRes is "pre-invalidated"
    322       UInt intRes1 = boolRes & 0xFFFF;
    323 
    324       // generate I-format output
    325       pcmpXstrX_WRK_gen_output_fmt_I(
    326          resV, resOSZACP,
    327          intRes1, zmaskL, zmaskR, validL, pol, idx
    328       );
    329 
    330       return True;
    331    }
    332 
    333    /*----------------------------------------*/
    334    /*-- ranges, unsigned byte data         --*/
    335    /*----------------------------------------*/
    336 
    337    if (agg == 1/*ranges*/
    338        && fmt == 0/*ub*/
    339        && !isSTRM) {
    340 
    341       /* argL: string,  argR: range-pairs */
    342       UInt   ri, si;
    343       UChar* argL    = (UChar*)argLV;
    344       UChar* argR    = (UChar*)argRV;
    345       UInt   boolRes = 0;
    346       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    347       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    348       for (si = 0; si < 16; si++) {
    349          if ((validL & (1 << si)) == 0)
    350             // run off the end of the string
    351             break;
    352          UInt m = 0;
    353          for (ri = 0; ri < 16; ri += 2) {
    354             if ((validR & (3 << ri)) != (3 << ri)) break;
    355             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    356                m = 1; break;
    357             }
    358          }
    359          boolRes |= (m << si);
    360       }
    361 
    362       // boolRes is "pre-invalidated"
    363       UInt intRes1 = boolRes & 0xFFFF;
    364 
    365       // generate I-format output
    366       pcmpXstrX_WRK_gen_output_fmt_I(
    367          resV, resOSZACP,
    368          intRes1, zmaskL, zmaskR, validL, pol, idx
    369       );
    370 
    371       return True;
    372    }
    373 
    374    return False;
    375 }
    376 
    377 
    378 //////////////////////////////////////////////////////////
    379 //                                                      //
    380 //                       ISTRI_4A                       //
    381 //                                                      //
    382 //////////////////////////////////////////////////////////
    383 
    384 UInt h_pcmpistri_4A ( V128* argL, V128* argR )
    385 {
    386    V128 block[2];
    387    memcpy(&block[0], argL, sizeof(V128));
    388    memcpy(&block[1], argR, sizeof(V128));
    389    ULong res, flags;
    390    __asm__ __volatile__(
    391       "subq      $1024,  %%rsp"             "\n\t"
    392       "movdqu    0(%2),  %%xmm2"            "\n\t"
    393       "movdqu    16(%2), %%xmm11"           "\n\t"
    394       "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
    395       "pushfq"                              "\n\t"
    396       "popq      %%rdx"                     "\n\t"
    397       "movq      %%rcx,  %0"                "\n\t"
    398       "movq      %%rdx,  %1"                "\n\t"
    399       "addq      $1024,  %%rsp"             "\n\t"
    400       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    401       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    402    );
    403    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    404 }
    405 
    406 UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
    407 {
    408    V128 resV;
    409    UInt resOSZACP, resECX;
    410    Bool ok
    411       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    412                        zmask_from_V128(argLU),
    413                        zmask_from_V128(argRU),
    414                        0x4A, False/*!isSTRM*/
    415         );
    416    assert(ok);
    417    resECX = resV.uInt[0];
    418    return (resOSZACP << 16) | resECX;
    419 }
    420 
    421 void istri_4A ( void )
    422 {
    423    char* wot = "4A";
    424    UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
    425    UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
    426 
    427    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    428 
    429    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    430    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    431    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    432    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    433 
    434    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    435    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    436    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    437 
    438    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    439    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    440    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    441    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    442 
    443    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    444    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    445    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    446 
    447    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    448 
    449    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    450    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    451    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    452 
    453    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    454    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    455    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    456 
    457    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    458    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    459    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    460 
    461    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    462    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    463    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    464 
    465    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    466    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    467 }
    468 
    469 //////////////////////////////////////////////////////////
    470 //                                                      //
    471 //                       ISTRI_3A                       //
    472 //                                                      //
    473 //////////////////////////////////////////////////////////
    474 
    475 UInt h_pcmpistri_3A ( V128* argL, V128* argR )
    476 {
    477    V128 block[2];
    478    memcpy(&block[0], argL, sizeof(V128));
    479    memcpy(&block[1], argR, sizeof(V128));
    480    ULong res, flags;
    481    __asm__ __volatile__(
    482       "subq      $1024,  %%rsp"             "\n\t"
    483       "movdqu    0(%2),  %%xmm2"            "\n\t"
    484       "movdqu    16(%2), %%xmm11"           "\n\t"
    485       "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
    486       "pushfq"                              "\n\t"
    487       "popq      %%rdx"                     "\n\t"
    488       "movq      %%rcx,  %0"                "\n\t"
    489       "movq      %%rdx,  %1"                "\n\t"
    490       "addq      $1024,  %%rsp"             "\n\t"
    491       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    492       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    493    );
    494    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    495 }
    496 
    497 UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
    498 {
    499    V128 resV;
    500    UInt resOSZACP, resECX;
    501    Bool ok
    502       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    503                        zmask_from_V128(argLU),
    504                        zmask_from_V128(argRU),
    505                        0x3A, False/*!isSTRM*/
    506         );
    507    assert(ok);
    508    resECX = resV.uInt[0];
    509    return (resOSZACP << 16) | resECX;
    510 }
    511 
    512 void istri_3A ( void )
    513 {
    514    char* wot = "3A";
    515    UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
    516    UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
    517 
    518    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    519 
    520    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    521    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    522    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    523    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    524 
    525    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    526    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    527    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    528 
    529    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    530    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    531    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    532    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    533 
    534    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    535    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    536    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    537 
    538    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    539 
    540    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    541    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    542    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    543 
    544    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    545    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    546    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    547 
    548    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    549    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    550    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    551 
    552    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    553    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    554    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    555 
    556    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    557    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    558 }
    559 
    560 
    561 
    562 //////////////////////////////////////////////////////////
    563 //                                                      //
    564 //                       ISTRI_0C                       //
    565 //                                                      //
    566 //////////////////////////////////////////////////////////
    567 
    568 __attribute__((noinline))
    569 UInt h_pcmpistri_0C ( V128* argL, V128* argR )
    570 {
    571    V128 block[2];
    572    memcpy(&block[0], argL, sizeof(V128));
    573    memcpy(&block[1], argR, sizeof(V128));
    574    ULong res = 0, flags = 0;
    575    __asm__ __volatile__(
    576       "movdqa    0(%2),  %%xmm2"            "\n\t"
    577       "movdqa    16(%2), %%xmm11"           "\n\t"
    578       "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    579       //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    580       //"movd %%xmm0, %%ecx" "\n\t"
    581       "pushfq"                              "\n\t"
    582       "popq      %%rdx"                     "\n\t"
    583       "movq      %%rcx,  %0"                "\n\t"
    584       "movq      %%rdx,  %1"                "\n\t"
    585       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    586       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    587    );
    588    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    589 }
    590 
    591 UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
    592 {
    593    V128 resV;
    594    UInt resOSZACP, resECX;
    595    Bool ok
    596       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    597                        zmask_from_V128(argLU),
    598                        zmask_from_V128(argRU),
    599                        0x0C, False/*!isSTRM*/
    600         );
    601    assert(ok);
    602    resECX = resV.uInt[0];
    603    return (resOSZACP << 16) | resECX;
    604 }
    605 
    606 void istri_0C ( void )
    607 {
    608    char* wot = "0C";
    609    UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
    610    UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
    611 
    612    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
    613 
    614    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
    615 
    616    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
    617    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
    618    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
    619 
    620    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
    621 
    622    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
    623    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
    624    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
    625    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
    626    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
    627 
    628    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
    629    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
    630    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
    631 
    632    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
    633    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
    634 
    635    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
    636    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
    637    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
    638 
    639    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
    640    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
    641    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
    642 }
    643 
    644 
    645 //////////////////////////////////////////////////////////
    646 //                                                      //
    647 //                       ISTRI_08                       //
    648 //                                                      //
    649 //////////////////////////////////////////////////////////
    650 
    651 UInt h_pcmpistri_08 ( V128* argL, V128* argR )
    652 {
    653    V128 block[2];
    654    memcpy(&block[0], argL, sizeof(V128));
    655    memcpy(&block[1], argR, sizeof(V128));
    656    ULong res, flags;
    657    __asm__ __volatile__(
    658       "subq      $1024,  %%rsp"             "\n\t"
    659       "movdqu    0(%2),  %%xmm2"            "\n\t"
    660       "movdqu    16(%2), %%xmm11"           "\n\t"
    661       "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
    662       "pushfq"                              "\n\t"
    663       "popq      %%rdx"                     "\n\t"
    664       "movq      %%rcx,  %0"                "\n\t"
    665       "movq      %%rdx,  %1"                "\n\t"
    666       "addq      $1024,  %%rsp"             "\n\t"
    667       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    668       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    669    );
    670    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    671 }
    672 
    673 UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
    674 {
    675    V128 resV;
    676    UInt resOSZACP, resECX;
    677    Bool ok
    678       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    679                        zmask_from_V128(argLU),
    680                        zmask_from_V128(argRU),
    681                        0x08, False/*!isSTRM*/
    682         );
    683    assert(ok);
    684    resECX = resV.uInt[0];
    685    return (resOSZACP << 16) | resECX;
    686 }
    687 
    688 void istri_08 ( void )
    689 {
    690    char* wot = "08";
    691    UInt(*h)(V128*,V128*) = h_pcmpistri_08;
    692    UInt(*s)(V128*,V128*) = s_pcmpistri_08;
    693 
    694    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    695 
    696    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    697    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    698    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    699    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    700 
    701    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    702    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    703    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    704 
    705    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    706    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    707    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    708    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    709 
    710    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    711    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    712    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    713 
    714    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    715 
    716    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    717    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    718    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    719 
    720    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    721    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    722    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    723 
    724    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    725    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    726    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    727 
    728    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    729    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    730    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    731 
    732    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    733    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    734 }
    735 
    736 
    737 
    738 //////////////////////////////////////////////////////////
    739 //                                                      //
    740 //                       ISTRI_1A                       //
    741 //                                                      //
    742 //////////////////////////////////////////////////////////
    743 
    744 UInt h_pcmpistri_1A ( V128* argL, V128* argR )
    745 {
    746    V128 block[2];
    747    memcpy(&block[0], argL, sizeof(V128));
    748    memcpy(&block[1], argR, sizeof(V128));
    749    ULong res, flags;
    750    __asm__ __volatile__(
    751       "subq      $1024,  %%rsp"             "\n\t"
    752       "movdqu    0(%2),  %%xmm2"            "\n\t"
    753       "movdqu    16(%2), %%xmm11"           "\n\t"
    754       "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
    755       "pushfq"                              "\n\t"
    756       "popq      %%rdx"                     "\n\t"
    757       "movq      %%rcx,  %0"                "\n\t"
    758       "movq      %%rdx,  %1"                "\n\t"
    759       "addq      $1024,  %%rsp"             "\n\t"
    760       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    761       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    762    );
    763    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    764 }
    765 
    766 UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
    767 {
    768    V128 resV;
    769    UInt resOSZACP, resECX;
    770    Bool ok
    771       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    772                        zmask_from_V128(argLU),
    773                        zmask_from_V128(argRU),
    774                        0x1A, False/*!isSTRM*/
    775         );
    776    assert(ok);
    777    resECX = resV.uInt[0];
    778    return (resOSZACP << 16) | resECX;
    779 }
    780 
    781 void istri_1A ( void )
    782 {
    783    char* wot = "1A";
    784    UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
    785    UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
    786 
    787    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    788 
    789    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    790    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    791    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    792    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    793 
    794    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    795    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    796    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    797 
    798    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    799    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    800    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    801    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    802 
    803    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    804    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    805    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    806 
    807    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    808 
    809    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    810    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    811    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    812 
    813    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    814    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    815    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    816 
    817    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    818    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    819    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    820 
    821    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    822    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    823    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    824 
    825    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    826    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    827 }
    828 
    829 
    830 
    831 //////////////////////////////////////////////////////////
    832 //                                                      //
    833 //                       ISTRI_02                       //
    834 //                                                      //
    835 //////////////////////////////////////////////////////////
    836 
    837 UInt h_pcmpistri_02 ( V128* argL, V128* argR )
    838 {
    839    V128 block[2];
    840    memcpy(&block[0], argL, sizeof(V128));
    841    memcpy(&block[1], argR, sizeof(V128));
    842    ULong res, flags;
    843    __asm__ __volatile__(
    844       "subq      $1024,  %%rsp"             "\n\t"
    845       "movdqu    0(%2),  %%xmm2"            "\n\t"
    846       "movdqu    16(%2), %%xmm11"           "\n\t"
    847       "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
    848 //"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
    849 //"movd %%xmm0, %%ecx" "\n\t"
    850       "pushfq"                              "\n\t"
    851       "popq      %%rdx"                     "\n\t"
    852       "movq      %%rcx,  %0"                "\n\t"
    853       "movq      %%rdx,  %1"                "\n\t"
    854       "addq      $1024,  %%rsp"             "\n\t"
    855       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    856       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    857    );
    858    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    859 }
    860 
    861 UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
    862 {
    863    V128 resV;
    864    UInt resOSZACP, resECX;
    865    Bool ok
    866       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    867                        zmask_from_V128(argLU),
    868                        zmask_from_V128(argRU),
    869                        0x02, False/*!isSTRM*/
    870         );
    871    assert(ok);
    872    resECX = resV.uInt[0];
    873    return (resOSZACP << 16) | resECX;
    874 }
    875 
    876 void istri_02 ( void )
    877 {
    878    char* wot = "02";
    879    UInt(*h)(V128*,V128*) = h_pcmpistri_02;
    880    UInt(*s)(V128*,V128*) = s_pcmpistri_02;
    881 
    882    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
    883    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
    884    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
    885    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    886 
    887    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    888    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
    889    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
    890    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
    891    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
    892 
    893    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    894    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
    895    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
    896    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
    897 
    898    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    899    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    900 
    901    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    902    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    903    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
    904    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
    905 
    906    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
    907 
    908    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    909    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    910 }
    911 
    912 
    913 //////////////////////////////////////////////////////////
    914 //                                                      //
    915 //                       ISTRI_12                       //
    916 //                                                      //
    917 //////////////////////////////////////////////////////////
    918 
    919 UInt h_pcmpistri_12 ( V128* argL, V128* argR )
    920 {
    921    V128 block[2];
    922    memcpy(&block[0], argL, sizeof(V128));
    923    memcpy(&block[1], argR, sizeof(V128));
    924    ULong res, flags;
    925    __asm__ __volatile__(
    926       "subq      $1024,  %%rsp"             "\n\t"
    927       "movdqu    0(%2),  %%xmm2"            "\n\t"
    928       "movdqu    16(%2), %%xmm11"           "\n\t"
    929       "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
    930 //"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
    931 //"movd %%xmm0, %%ecx" "\n\t"
    932       "pushfq"                              "\n\t"
    933       "popq      %%rdx"                     "\n\t"
    934       "movq      %%rcx,  %0"                "\n\t"
    935       "movq      %%rdx,  %1"                "\n\t"
    936       "addq      $1024,  %%rsp"             "\n\t"
    937       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    938       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    939    );
    940    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    941 }
    942 
    943 UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
    944 {
    945    V128 resV;
    946    UInt resOSZACP, resECX;
    947    Bool ok
    948       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    949                        zmask_from_V128(argLU),
    950                        zmask_from_V128(argRU),
    951                        0x12, False/*!isSTRM*/
    952         );
    953    assert(ok);
    954    resECX = resV.uInt[0];
    955    return (resOSZACP << 16) | resECX;
    956 }
    957 
    958 void istri_12 ( void )
    959 {
    960    char* wot = "12";
    961    UInt(*h)(V128*,V128*) = h_pcmpistri_12;
    962    UInt(*s)(V128*,V128*) = s_pcmpistri_12;
    963 
    964    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
    965    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
    966    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
    967    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    968 
    969    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    970    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
    971    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
    972    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
    973    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
    974 
    975    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    976    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
    977    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
    978    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
    979 
    980    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    981    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    982 
    983    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    984    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    985    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
    986    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
    987 
    988    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
    989 
    990    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    991    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    992 }
    993 
    994 
    995 
    996 //////////////////////////////////////////////////////////
    997 //                                                      //
    998 //                       ISTRI_44                       //
    999 //                                                      //
   1000 //////////////////////////////////////////////////////////
   1001 
   1002 UInt h_pcmpistri_44 ( V128* argL, V128* argR )
   1003 {
   1004    V128 block[2];
   1005    memcpy(&block[0], argL, sizeof(V128));
   1006    memcpy(&block[1], argR, sizeof(V128));
   1007    ULong res, flags;
   1008    __asm__ __volatile__(
   1009       "subq      $1024,  %%rsp"             "\n\t"
   1010       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1011       "movdqu    16(%2), %%xmm11"           "\n\t"
   1012       "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
   1013 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   1014 //"movd %%xmm0, %%ecx" "\n\t"
   1015       "pushfq"                              "\n\t"
   1016       "popq      %%rdx"                     "\n\t"
   1017       "movq      %%rcx,  %0"                "\n\t"
   1018       "movq      %%rdx,  %1"                "\n\t"
   1019       "addq      $1024,  %%rsp"             "\n\t"
   1020       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1021       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1022    );
   1023    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1024 }
   1025 
   1026 UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
   1027 {
   1028    V128 resV;
   1029    UInt resOSZACP, resECX;
   1030    Bool ok
   1031       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1032                        zmask_from_V128(argLU),
   1033                        zmask_from_V128(argRU),
   1034                        0x44, False/*!isSTRM*/
   1035         );
   1036    assert(ok);
   1037    resECX = resV.uInt[0];
   1038    return (resOSZACP << 16) | resECX;
   1039 }
   1040 
   1041 void istri_44 ( void )
   1042 {
   1043    char* wot = "44";
   1044    UInt(*h)(V128*,V128*) = h_pcmpistri_44;
   1045    UInt(*s)(V128*,V128*) = s_pcmpistri_44;
   1046 
   1047    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1048    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1049    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1050    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1051 
   1052    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1053    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1054    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1055    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1056    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1057 
   1058    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1059 
   1060    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1061    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1062    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1063 
   1064    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1065    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1066    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1067 
   1068    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1069    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1070 
   1071    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1072    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1073 }
   1074 
   1075 
   1076 //////////////////////////////////////////////////////////
   1077 //                                                      //
   1078 //                       ISTRI_00                       //
   1079 //                                                      //
   1080 //////////////////////////////////////////////////////////
   1081 
   1082 UInt h_pcmpistri_00 ( V128* argL, V128* argR )
   1083 {
   1084    V128 block[2];
   1085    memcpy(&block[0], argL, sizeof(V128));
   1086    memcpy(&block[1], argR, sizeof(V128));
   1087    ULong res, flags;
   1088    __asm__ __volatile__(
   1089       "subq      $1024,  %%rsp"             "\n\t"
   1090       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1091       "movdqu    16(%2), %%xmm11"           "\n\t"
   1092       "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
   1093 //"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
   1094 //"movd %%xmm0, %%ecx" "\n\t"
   1095       "pushfq"                              "\n\t"
   1096       "popq      %%rdx"                     "\n\t"
   1097       "movq      %%rcx,  %0"                "\n\t"
   1098       "movq      %%rdx,  %1"                "\n\t"
   1099       "addq      $1024,  %%rsp"             "\n\t"
   1100       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1101       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1102    );
   1103    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1104 }
   1105 
   1106 UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
   1107 {
   1108    V128 resV;
   1109    UInt resOSZACP, resECX;
   1110    Bool ok
   1111       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1112                        zmask_from_V128(argLU),
   1113                        zmask_from_V128(argRU),
   1114                        0x00, False/*!isSTRM*/
   1115         );
   1116    assert(ok);
   1117    resECX = resV.uInt[0];
   1118    return (resOSZACP << 16) | resECX;
   1119 }
   1120 
   1121 void istri_00 ( void )
   1122 {
   1123    char* wot = "00";
   1124    UInt(*h)(V128*,V128*) = h_pcmpistri_00;
   1125    UInt(*s)(V128*,V128*) = s_pcmpistri_00;
   1126 
   1127    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1128    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1129    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1130    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1131 
   1132    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1133    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1134    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1135    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1136    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1137 
   1138    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1139    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1140    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1141    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1142 
   1143    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1144    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1145 
   1146    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1147    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1148    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1149    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1150 
   1151    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1152 
   1153    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1154    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1155 }
   1156 
   1157 
   1158 //////////////////////////////////////////////////////////
   1159 //                                                      //
   1160 //                       ISTRI_38                       //
   1161 //                                                      //
   1162 //////////////////////////////////////////////////////////
   1163 
   1164 UInt h_pcmpistri_38 ( V128* argL, V128* argR )
   1165 {
   1166    V128 block[2];
   1167    memcpy(&block[0], argL, sizeof(V128));
   1168    memcpy(&block[1], argR, sizeof(V128));
   1169    ULong res, flags;
   1170    __asm__ __volatile__(
   1171       "subq      $1024,  %%rsp"             "\n\t"
   1172       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1173       "movdqu    16(%2), %%xmm11"           "\n\t"
   1174       "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
   1175       "pushfq"                              "\n\t"
   1176       "popq      %%rdx"                     "\n\t"
   1177       "movq      %%rcx,  %0"                "\n\t"
   1178       "movq      %%rdx,  %1"                "\n\t"
   1179       "addq      $1024,  %%rsp"             "\n\t"
   1180       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1181       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1182    );
   1183    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1184 }
   1185 
   1186 UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
   1187 {
   1188    V128 resV;
   1189    UInt resOSZACP, resECX;
   1190    Bool ok
   1191       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1192                        zmask_from_V128(argLU),
   1193                        zmask_from_V128(argRU),
   1194                        0x38, False/*!isSTRM*/
   1195         );
   1196    assert(ok);
   1197    resECX = resV.uInt[0];
   1198    return (resOSZACP << 16) | resECX;
   1199 }
   1200 
   1201 void istri_38 ( void )
   1202 {
   1203    char* wot = "38";
   1204    UInt(*h)(V128*,V128*) = h_pcmpistri_38;
   1205    UInt(*s)(V128*,V128*) = s_pcmpistri_38;
   1206 
   1207    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1208 
   1209    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1210    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1211    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1212    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1213 
   1214    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1215    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1216    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1217 
   1218    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1219    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1220    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1221    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1222 
   1223    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1224    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1225    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1226 
   1227    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1228 
   1229    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1230    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1231    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1232 
   1233    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1234    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1235    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1236 
   1237    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1238    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1239    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1240 
   1241    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1242    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1243    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1244 
   1245    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1246    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1247 }
   1248 
   1249 
   1250 
   1251 //////////////////////////////////////////////////////////
   1252 //                                                      //
   1253 //                         main                         //
   1254 //                                                      //
   1255 //////////////////////////////////////////////////////////
   1256 
   1257 int main ( void )
   1258 {
   1259    istri_4A();
   1260    istri_3A();
   1261    istri_08();
   1262    istri_1A();
   1263    istri_02();
   1264    istri_0C();
   1265    istri_12();
   1266    istri_44();
   1267    return 0;
   1268 }
   1269 
   1270 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
   1271    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
   1272    aspect. */
   1273 
   1274 #include <string.h>
   1275 #include <stdio.h>
   1276 #include <assert.h>
   1277 
   1278 typedef  unsigned int   UInt;
   1279 typedef  signed int     Int;
   1280 typedef  unsigned char  UChar;
   1281 typedef  unsigned long long int ULong;
   1282 typedef  UChar          Bool;
   1283 #define False ((Bool)0)
   1284 #define True  ((Bool)1)
   1285 
   1286 //typedef  unsigned char  V128[16];
   1287 typedef
   1288    union {
   1289       UChar uChar[16];
   1290       UInt  uInt[4];
   1291    }
   1292    V128;
   1293 
   1294 #define SHIFT_O   11
   1295 #define SHIFT_S   7
   1296 #define SHIFT_Z   6
   1297 #define SHIFT_A   4
   1298 #define SHIFT_C   0
   1299 #define SHIFT_P   2
   1300 
   1301 #define MASK_O    (1ULL << SHIFT_O)
   1302 #define MASK_S    (1ULL << SHIFT_S)
   1303 #define MASK_Z    (1ULL << SHIFT_Z)
   1304 #define MASK_A    (1ULL << SHIFT_A)
   1305 #define MASK_C    (1ULL << SHIFT_C)
   1306 #define MASK_P    (1ULL << SHIFT_P)
   1307 
   1308 
   1309 UInt clz32 ( UInt x )
   1310 {
   1311    Int y, m, n;
   1312    y = -(x >> 16);
   1313    m = (y >> 16) & 16;
   1314    n = 16 - m;
   1315    x = x >> m;
   1316    y = x - 0x100;
   1317    m = (y >> 16) & 8;
   1318    n = n + m;
   1319    x = x << m;
   1320    y = x - 0x1000;
   1321    m = (y >> 16) & 4;
   1322    n = n + m;
   1323    x = x << m;
   1324    y = x - 0x4000;
   1325    m = (y >> 16) & 2;
   1326    n = n + m;
   1327    x = x << m;
   1328    y = x >> 14;
   1329    m = y & ~(y >> 1);
   1330    return n + 2 - m;
   1331 }
   1332 
   1333 UInt ctz32 ( UInt x )
   1334 {
   1335    return 32 - clz32((~x) & (x-1));
   1336 }
   1337 
   1338 void expand ( V128* dst, char* summary )
   1339 {
   1340    Int i;
   1341    assert( strlen(summary) == 16 );
   1342    for (i = 0; i < 16; i++) {
   1343       UChar xx = 0;
   1344       UChar x = summary[15-i];
   1345       if      (x >= '0' && x <= '9') { xx = x - '0'; }
   1346       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
   1347       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
   1348       else assert(0);
   1349 
   1350       assert(xx < 16);
   1351       xx = (xx << 4) | xx;
   1352       assert(xx < 256);
   1353       dst->uChar[i] = xx;
   1354    }
   1355 }
   1356 
   1357 void try_istri ( char* which,
   1358                  UInt(*h_fn)(V128*,V128*),
   1359                  UInt(*s_fn)(V128*,V128*),
   1360                  char* summL, char* summR )
   1361 {
   1362    assert(strlen(which) == 2);
   1363    V128 argL, argR;
   1364    expand(&argL, summL);
   1365    expand(&argR, summR);
   1366    UInt h_res = h_fn(&argL, &argR);
   1367    UInt s_res = s_fn(&argL, &argR);
   1368    printf("istri %s  %s %s -> %08x %08x %s\n",
   1369           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
   1370 }
   1371 
   1372 UInt zmask_from_V128 ( V128* arg )
   1373 {
   1374    UInt i, res = 0;
   1375    for (i = 0; i < 16; i++) {
   1376       res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
   1377    }
   1378    return res;
   1379 }
   1380 
   1381 //////////////////////////////////////////////////////////
   1382 //                                                      //
   1383 //                       GENERAL                        //
   1384 //                                                      //
   1385 //////////////////////////////////////////////////////////
   1386 
   1387 
   1388 /* Given partial results from a pcmpXstrX operation (intRes1,
   1389    basically), generate an I format (index value for ECX) output, and
   1390    also the new OSZACP flags.
   1391 */
   1392 static
   1393 void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
   1394                                     /*OUT*/UInt* resOSZACP,
   1395                                     UInt intRes1,
   1396                                     UInt zmaskL, UInt zmaskR,
   1397                                     UInt validL,
   1398                                     UInt pol, UInt idx )
   1399 {
   1400    assert((pol >> 2) == 0);
   1401    assert((idx >> 1) == 0);
   1402 
   1403    UInt intRes2 = 0;
   1404    switch (pol) {
   1405       case 0: intRes2 = intRes1;          break; // pol +
   1406       case 1: intRes2 = ~intRes1;         break; // pol -
   1407       case 2: intRes2 = intRes1;          break; // pol m+
   1408       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
   1409    }
   1410    intRes2 &= 0xFFFF;
   1411 
   1412    // generate ecx value
   1413    UInt newECX = 0;
   1414    if (idx) {
   1415      // index of ms-1-bit
   1416      newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
   1417    } else {
   1418      // index of ls-1-bit
   1419      newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
   1420    }
   1421 
   1422    *(UInt*)(&resV[0]) = newECX;
   1423 
   1424    // generate new flags, common to all ISTRI and ISTRM cases
   1425    *resOSZACP    // A, P are zero
   1426      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
   1427      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
   1428      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
   1429      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
   1430 }
   1431 
   1432 
   1433 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
   1434    variants.
   1435 
   1436    For xSTRI variants, the new ECX value is placed in the 32 bits
   1437    pointed to by *resV.  For xSTRM variants, the result is a 128 bit
   1438    value and is placed at *resV in the obvious way.
   1439 
   1440    For all variants, the new OSZACP value is placed at *resOSZACP.
   1441 
   1442    argLV and argRV are the vector args.  The caller must prepare a
   1443    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   1444    must be 1 for each zero byte of of the respective arg.  For ESTRx
   1445    variants this is derived from the explicit length indication, and
   1446    must be 0 in all places except at the bit index corresponding to
   1447    the valid length (0 .. 16).  If the valid length is 16 then the
   1448    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
   1449 
   1450    imm8 is the original immediate from the instruction.  isSTRM
   1451    indicates whether this is a xSTRM or xSTRI variant, which controls
   1452    how much of *res is written.
   1453 
   1454    If the given imm8 case can be handled, the return value is True.
    1455    If not, False is returned, and neither *res nor *resOSZACP are
   1456    altered.
   1457 */
   1458 
   1459 Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
   1460                      /*OUT*/UInt* resOSZACP,
   1461                      V128* argLV,  V128* argRV,
   1462                      UInt zmaskL, UInt zmaskR,
   1463                      UInt imm8,   Bool isSTRM )
   1464 {
   1465    assert(imm8 < 0x80);
   1466    assert((zmaskL >> 16) == 0);
   1467    assert((zmaskR >> 16) == 0);
   1468 
   1469    /* Explicitly reject any imm8 values that haven't been validated,
   1470       even if they would probably work.  Life is too short to have
   1471       unvalidated cases in the code base. */
   1472    switch (imm8) {
   1473       case 0x02: case 0x08: case 0x0C: case 0x12: case 0x1A:
   1474       case 0x3A: case 0x44: case 0x4A:
   1475          break;
   1476       default:
   1477          return False;
   1478    }
   1479 
   1480    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
   1481    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
   1482    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
   1483    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
   1484 
   1485    /*----------------------------------------*/
   1486    /*-- strcmp on byte data                --*/
   1487    /*----------------------------------------*/
   1488 
   1489    if (agg == 2/*equal each, aka strcmp*/
   1490        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
   1491        && !isSTRM) {
   1492       Int    i;
   1493       UChar* argL = (UChar*)argLV;
   1494       UChar* argR = (UChar*)argRV;
   1495       UInt boolResII = 0;
   1496       for (i = 15; i >= 0; i--) {
   1497          UChar cL  = argL[i];
   1498          UChar cR  = argR[i];
   1499          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
   1500       }
   1501       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1502       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1503 
   1504       // do invalidation, common to all equal-each cases
   1505       UInt intRes1
   1506          = (boolResII & validL & validR)  // if both valid, use cmpres
   1507            | (~ (validL | validR));       // if both invalid, force 1
   1508                                           // else force 0
   1509       intRes1 &= 0xFFFF;
   1510 
   1511       // generate I-format output
   1512       pcmpXstrX_WRK_gen_output_fmt_I(
   1513          resV, resOSZACP,
   1514          intRes1, zmaskL, zmaskR, validL, pol, idx
   1515       );
   1516 
   1517       return True;
   1518    }
   1519 
   1520    /*----------------------------------------*/
   1521    /*-- set membership on byte data        --*/
   1522    /*----------------------------------------*/
   1523 
   1524    if (agg == 0/*equal any, aka find chars in a set*/
   1525        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
   1526        && !isSTRM) {
   1527       /* argL: the string,  argR: charset */
   1528       UInt   si, ci;
   1529       UChar* argL    = (UChar*)argLV;
   1530       UChar* argR    = (UChar*)argRV;
   1531       UInt   boolRes = 0;
   1532       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1533       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1534 
   1535       for (si = 0; si < 16; si++) {
   1536          if ((validL & (1 << si)) == 0)
   1537             // run off the end of the string.
   1538             break;
   1539          UInt m = 0;
   1540          for (ci = 0; ci < 16; ci++) {
   1541             if ((validR & (1 << ci)) == 0) break;
   1542             if (argR[ci] == argL[si]) { m = 1; break; }
   1543          }
   1544          boolRes |= (m << si);
   1545       }
   1546 
   1547       // boolRes is "pre-invalidated"
   1548       UInt intRes1 = boolRes & 0xFFFF;
   1549 
   1550       // generate I-format output
   1551       pcmpXstrX_WRK_gen_output_fmt_I(
   1552          resV, resOSZACP,
   1553          intRes1, zmaskL, zmaskR, validL, pol, idx
   1554       );
   1555 
   1556       return True;
   1557    }
   1558 
   1559    /*----------------------------------------*/
   1560    /*-- substring search on byte data      --*/
   1561    /*----------------------------------------*/
   1562 
   1563    if (agg == 3/*equal ordered, aka substring search*/
   1564        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
   1565        && !isSTRM) {
   1566 
   1567       /* argL: haystack,  argR: needle */
   1568       UInt   ni, hi;
   1569       UChar* argL    = (UChar*)argLV;
   1570       UChar* argR    = (UChar*)argRV;
   1571       UInt   boolRes = 0;
   1572       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1573       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1574       for (hi = 0; hi < 16; hi++) {
   1575          if ((validL & (1 << hi)) == 0)
   1576             // run off the end of the haystack
   1577             break;
   1578          UInt m = 1;
   1579          for (ni = 0; ni < 16; ni++) {
   1580             if ((validR & (1 << ni)) == 0) break;
   1581             UInt i = ni + hi;
   1582             if (i >= 16) break;
   1583             if (argL[i] != argR[ni]) { m = 0; break; }
   1584          }
   1585          boolRes |= (m << hi);
   1586       }
   1587 
   1588       // boolRes is "pre-invalidated"
   1589       UInt intRes1 = boolRes & 0xFFFF;
   1590 
   1591       // generate I-format output
   1592       pcmpXstrX_WRK_gen_output_fmt_I(
   1593          resV, resOSZACP,
   1594          intRes1, zmaskL, zmaskR, validL, pol, idx
   1595       );
   1596 
   1597       return True;
   1598    }
   1599 
   1600    /*----------------------------------------*/
   1601    /*-- ranges, unsigned byte data         --*/
   1602    /*----------------------------------------*/
   1603 
   1604    if (agg == 1/*ranges*/
   1605        && fmt == 0/*ub*/
   1606        && !isSTRM) {
   1607 
   1608       /* argL: string,  argR: range-pairs */
   1609       UInt   ri, si;
   1610       UChar* argL    = (UChar*)argLV;
   1611       UChar* argR    = (UChar*)argRV;
   1612       UInt   boolRes = 0;
   1613       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
   1614       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
   1615       for (si = 0; si < 16; si++) {
   1616          if ((validL & (1 << si)) == 0)
   1617             // run off the end of the string
   1618             break;
   1619          UInt m = 0;
   1620          for (ri = 0; ri < 16; ri += 2) {
   1621             if ((validR & (3 << ri)) != (3 << ri)) break;
   1622             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
   1623                m = 1; break;
   1624             }
   1625          }
   1626          boolRes |= (m << si);
   1627       }
   1628 
   1629       // boolRes is "pre-invalidated"
   1630       UInt intRes1 = boolRes & 0xFFFF;
   1631 
   1632       // generate I-format output
   1633       pcmpXstrX_WRK_gen_output_fmt_I(
   1634          resV, resOSZACP,
   1635          intRes1, zmaskL, zmaskR, validL, pol, idx
   1636       );
   1637 
   1638       return True;
   1639    }
   1640 
   1641    return False;
   1642 }
   1643 
   1644 
   1645 //////////////////////////////////////////////////////////
   1646 //                                                      //
   1647 //                       ISTRI_4A                       //
   1648 //                                                      //
   1649 //////////////////////////////////////////////////////////
   1650 
   1651 UInt h_pcmpistri_4A ( V128* argL, V128* argR )
   1652 {
   1653    V128 block[2];
   1654    memcpy(&block[0], argL, sizeof(V128));
   1655    memcpy(&block[1], argR, sizeof(V128));
   1656    ULong res, flags;
   1657    __asm__ __volatile__(
   1658       "subq      $1024,  %%rsp"             "\n\t"
   1659       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1660       "movdqu    16(%2), %%xmm11"           "\n\t"
   1661       "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
   1662       "pushfq"                              "\n\t"
   1663       "popq      %%rdx"                     "\n\t"
   1664       "movq      %%rcx,  %0"                "\n\t"
   1665       "movq      %%rdx,  %1"                "\n\t"
   1666       "addq      $1024,  %%rsp"             "\n\t"
   1667       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1668       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1669    );
   1670    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1671 }
   1672 
   1673 UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
   1674 {
   1675    V128 resV;
   1676    UInt resOSZACP, resECX;
   1677    Bool ok
   1678       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1679                        zmask_from_V128(argLU),
   1680                        zmask_from_V128(argRU),
   1681                        0x4A, False/*!isSTRM*/
   1682         );
   1683    assert(ok);
   1684    resECX = resV.uInt[0];
   1685    return (resOSZACP << 16) | resECX;
   1686 }
   1687 
   1688 void istri_4A ( void )
   1689 {
   1690    char* wot = "4A";
   1691    UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
   1692    UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
   1693 
   1694    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1695 
   1696    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1697    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1698    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1699    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1700 
   1701    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1702    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1703    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1704 
   1705    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1706    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1707    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1708    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1709 
   1710    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1711    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1712    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1713 
   1714    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1715 
   1716    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1717    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1718    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1719 
   1720    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1721    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1722    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1723 
   1724    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1725    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1726    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1727 
   1728    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1729    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1730    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1731 
   1732    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1733    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1734 }
   1735 
   1736 //////////////////////////////////////////////////////////
   1737 //                                                      //
   1738 //                       ISTRI_3A                       //
   1739 //                                                      //
   1740 //////////////////////////////////////////////////////////
   1741 
   1742 UInt h_pcmpistri_3A ( V128* argL, V128* argR )
   1743 {
   1744    V128 block[2];
   1745    memcpy(&block[0], argL, sizeof(V128));
   1746    memcpy(&block[1], argR, sizeof(V128));
   1747    ULong res, flags;
   1748    __asm__ __volatile__(
   1749       "subq      $1024,  %%rsp"             "\n\t"
   1750       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1751       "movdqu    16(%2), %%xmm11"           "\n\t"
   1752       "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
   1753       "pushfq"                              "\n\t"
   1754       "popq      %%rdx"                     "\n\t"
   1755       "movq      %%rcx,  %0"                "\n\t"
   1756       "movq      %%rdx,  %1"                "\n\t"
   1757       "addq      $1024,  %%rsp"             "\n\t"
   1758       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1759       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1760    );
   1761    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1762 }
   1763 
   1764 UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
   1765 {
   1766    V128 resV;
   1767    UInt resOSZACP, resECX;
   1768    Bool ok
   1769       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1770                        zmask_from_V128(argLU),
   1771                        zmask_from_V128(argRU),
   1772                        0x3A, False/*!isSTRM*/
   1773         );
   1774    assert(ok);
   1775    resECX = resV.uInt[0];
   1776    return (resOSZACP << 16) | resECX;
   1777 }
   1778 
   1779 void istri_3A ( void )
   1780 {
   1781    char* wot = "3A";
   1782    UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
   1783    UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
   1784 
   1785    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1786 
   1787    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1788    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1789    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1790    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1791 
   1792    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1793    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1794    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1795 
   1796    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1797    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1798    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1799    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1800 
   1801    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1802    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1803    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1804 
   1805    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1806 
   1807    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1808    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1809    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1810 
   1811    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1812    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1813    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1814 
   1815    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1816    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1817    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1818 
   1819    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1820    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1821    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1822 
   1823    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1824    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1825 }
   1826 
   1827 
   1828 
   1829 //////////////////////////////////////////////////////////
   1830 //                                                      //
   1831 //                       ISTRI_0C                       //
   1832 //                                                      //
   1833 //////////////////////////////////////////////////////////
   1834 
   1835 __attribute__((noinline))
   1836 UInt h_pcmpistri_0C ( V128* argL, V128* argR )
   1837 {
   1838    V128 block[2];
   1839    memcpy(&block[0], argL, sizeof(V128));
   1840    memcpy(&block[1], argR, sizeof(V128));
   1841    ULong res = 0, flags = 0;
   1842    __asm__ __volatile__(
   1843       "movdqa    0(%2),  %%xmm2"            "\n\t"
   1844       "movdqa    16(%2), %%xmm11"           "\n\t"
   1845       "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
   1846       //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
   1847       //"movd %%xmm0, %%ecx" "\n\t"
   1848       "pushfq"                              "\n\t"
   1849       "popq      %%rdx"                     "\n\t"
   1850       "movq      %%rcx,  %0"                "\n\t"
   1851       "movq      %%rdx,  %1"                "\n\t"
   1852       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1853       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1854    );
   1855    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1856 }
   1857 
   1858 UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
   1859 {
   1860    V128 resV;
   1861    UInt resOSZACP, resECX;
   1862    Bool ok
   1863       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1864                        zmask_from_V128(argLU),
   1865                        zmask_from_V128(argRU),
   1866                        0x0C, False/*!isSTRM*/
   1867         );
   1868    assert(ok);
   1869    resECX = resV.uInt[0];
   1870    return (resOSZACP << 16) | resECX;
   1871 }
   1872 
   1873 void istri_0C ( void )
   1874 {
   1875    char* wot = "0C";
   1876    UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
   1877    UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
   1878 
   1879    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
   1880 
   1881    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
   1882 
   1883    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
   1884    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
   1885    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
   1886 
   1887    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
   1888 
   1889    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
   1890    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
   1891    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
   1892    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
   1893    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
   1894 
   1895    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
   1896    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
   1897    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
   1898 
   1899    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
   1900    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
   1901 
   1902    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
   1903    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
   1904    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
   1905 
   1906    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
   1907    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
   1908    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
   1909 }
   1910 
   1911 
   1912 //////////////////////////////////////////////////////////
   1913 //                                                      //
   1914 //                       ISTRI_08                       //
   1915 //                                                      //
   1916 //////////////////////////////////////////////////////////
   1917 
   1918 UInt h_pcmpistri_08 ( V128* argL, V128* argR )
   1919 {
   1920    V128 block[2];
   1921    memcpy(&block[0], argL, sizeof(V128));
   1922    memcpy(&block[1], argR, sizeof(V128));
   1923    ULong res, flags;
   1924    __asm__ __volatile__(
   1925       "subq      $1024,  %%rsp"             "\n\t"
   1926       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1927       "movdqu    16(%2), %%xmm11"           "\n\t"
   1928       "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
   1929       "pushfq"                              "\n\t"
   1930       "popq      %%rdx"                     "\n\t"
   1931       "movq      %%rcx,  %0"                "\n\t"
   1932       "movq      %%rdx,  %1"                "\n\t"
   1933       "addq      $1024,  %%rsp"             "\n\t"
   1934       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1935       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1936    );
   1937    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1938 }
   1939 
   1940 UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
   1941 {
   1942    V128 resV;
   1943    UInt resOSZACP, resECX;
   1944    Bool ok
   1945       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1946                        zmask_from_V128(argLU),
   1947                        zmask_from_V128(argRU),
   1948                        0x08, False/*!isSTRM*/
   1949         );
   1950    assert(ok);
   1951    resECX = resV.uInt[0];
   1952    return (resOSZACP << 16) | resECX;
   1953 }
   1954 
   1955 void istri_08 ( void )
   1956 {
   1957    char* wot = "08";
   1958    UInt(*h)(V128*,V128*) = h_pcmpistri_08;
   1959    UInt(*s)(V128*,V128*) = s_pcmpistri_08;
   1960 
   1961    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1962 
   1963    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1964    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1965    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1966    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1967 
   1968    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1969    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1970    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1971 
   1972    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1973    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1974    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1975    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1976 
   1977    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1978    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1979    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1980 
   1981    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1982 
   1983    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1984    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1985    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1986 
   1987    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1988    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1989    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1990 
   1991    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1992    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1993    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1994 
   1995    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1996    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1997    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1998 
   1999    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   2000    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   2001 }
   2002 
   2003 
   2004 
   2005 //////////////////////////////////////////////////////////
   2006 //                                                      //
   2007 //                       ISTRI_1A                       //
   2008 //                                                      //
   2009 //////////////////////////////////////////////////////////
   2010 
   2011 UInt h_pcmpistri_1A ( V128* argL, V128* argR )
   2012 {
   2013    V128 block[2];
   2014    memcpy(&block[0], argL, sizeof(V128));
   2015    memcpy(&block[1], argR, sizeof(V128));
   2016    ULong res, flags;
   2017    __asm__ __volatile__(
   2018       "subq      $1024,  %%rsp"             "\n\t"
   2019       "movdqu    0(%2),  %%xmm2"            "\n\t"
   2020       "movdqu    16(%2), %%xmm11"           "\n\t"
   2021       "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
   2022       "pushfq"                              "\n\t"
   2023       "popq      %%rdx"                     "\n\t"
   2024       "movq      %%rcx,  %0"                "\n\t"
   2025       "movq      %%rdx,  %1"                "\n\t"
   2026       "addq      $1024,  %%rsp"             "\n\t"
   2027       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   2028       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   2029    );
   2030    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   2031 }
   2032 
   2033 UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
   2034 {
   2035    V128 resV;
   2036    UInt resOSZACP, resECX;
   2037    Bool ok
   2038       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   2039                        zmask_from_V128(argLU),
   2040                        zmask_from_V128(argRU),
   2041                        0x1A, False/*!isSTRM*/
   2042         );
   2043    assert(ok);
   2044    resECX = resV.uInt[0];
   2045    return (resOSZACP << 16) | resECX;
   2046 }
   2047 
   2048 void istri_1A ( void )
   2049 {
   2050    char* wot = "1A";
   2051    UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
   2052    UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
   2053 
   2054    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   2055 
   2056    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2057    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2058    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   2059    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   2060 
   2061    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   2062    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   2063    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   2064 
   2065    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2066    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2067    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2068    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2069 
   2070    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2071    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   2072    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   2073 
   2074    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2075 
   2076    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   2077    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   2078    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   2079 
   2080    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   2081    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   2082    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   2083 
   2084    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   2085    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   2086    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   2087 
   2088    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   2089    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   2090    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   2091 
   2092    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   2093    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   2094 }
   2095 
   2096 
   2097 
   2098 //////////////////////////////////////////////////////////
   2099 //                                                      //
   2100 //                       ISTRI_02                       //
   2101 //                                                      //
   2102 //////////////////////////////////////////////////////////
   2103 
   2104 UInt h_pcmpistri_02 ( V128* argL, V128* argR )
   2105 {
   2106    V128 block[2];
   2107    memcpy(&block[0], argL, sizeof(V128));
   2108    memcpy(&block[1], argR, sizeof(V128));
   2109    ULong res, flags;
   2110    __asm__ __volatile__(
   2111       "subq      $1024,  %%rsp"             "\n\t"
   2112       "movdqu    0(%2),  %%xmm2"            "\n\t"
   2113       "movdqu    16(%2), %%xmm11"           "\n\t"
   2114       "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
   2115 //"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
   2116 //"movd %%xmm0, %%ecx" "\n\t"
   2117       "pushfq"                              "\n\t"
   2118       "popq      %%rdx"                     "\n\t"
   2119       "movq      %%rcx,  %0"                "\n\t"
   2120       "movq      %%rdx,  %1"                "\n\t"
   2121       "addq      $1024,  %%rsp"             "\n\t"
   2122       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   2123       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   2124    );
   2125    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   2126 }
   2127 
   2128 UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
   2129 {
   2130    V128 resV;
   2131    UInt resOSZACP, resECX;
   2132    Bool ok
   2133       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   2134                        zmask_from_V128(argLU),
   2135                        zmask_from_V128(argRU),
   2136                        0x02, False/*!isSTRM*/
   2137         );
   2138    assert(ok);
   2139    resECX = resV.uInt[0];
   2140    return (resOSZACP << 16) | resECX;
   2141 }
   2142 
   2143 void istri_02 ( void )
   2144 {
   2145    char* wot = "02";
   2146    UInt(*h)(V128*,V128*) = h_pcmpistri_02;
   2147    UInt(*s)(V128*,V128*) = s_pcmpistri_02;
   2148 
   2149    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   2150    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   2151    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   2152    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   2153 
   2154    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   2155    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   2156    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   2157    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   2158    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   2159 
   2160    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   2161    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   2162    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   2163    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   2164 
   2165    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   2166    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2167 
   2168    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   2169    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   2170    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   2171    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   2172 
   2173    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   2174 
   2175    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   2176    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   2177 }
   2178 
   2179 
   2180 //////////////////////////////////////////////////////////
   2181 //                                                      //
   2182 //                       ISTRI_12                       //
   2183 //                                                      //
   2184 //////////////////////////////////////////////////////////
   2185 
   2186 UInt h_pcmpistri_12 ( V128* argL, V128* argR )
   2187 {
   2188    V128 block[2];
   2189    memcpy(&block[0], argL, sizeof(V128));
   2190    memcpy(&block[1], argR, sizeof(V128));
   2191    ULong res, flags;
   2192    __asm__ __volatile__(
   2193       "subq      $1024,  %%rsp"             "\n\t"
   2194       "movdqu    0(%2),  %%xmm2"            "\n\t"
   2195       "movdqu    16(%2), %%xmm11"           "\n\t"
   2196       "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
   2197 //"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
   2198 //"movd %%xmm0, %%ecx" "\n\t"
   2199       "pushfq"                              "\n\t"
   2200       "popq      %%rdx"                     "\n\t"
   2201       "movq      %%rcx,  %0"                "\n\t"
   2202       "movq      %%rdx,  %1"                "\n\t"
   2203       "addq      $1024,  %%rsp"             "\n\t"
   2204       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   2205       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   2206    );
   2207    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   2208 }
   2209 
   2210 UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
   2211 {
   2212    V128 resV;
   2213    UInt resOSZACP, resECX;
   2214    Bool ok
   2215       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   2216                        zmask_from_V128(argLU),
   2217                        zmask_from_V128(argRU),
   2218                        0x12, False/*!isSTRM*/
   2219         );
   2220    assert(ok);
   2221    resECX = resV.uInt[0];
   2222    return (resOSZACP << 16) | resECX;
   2223 }
   2224 
   2225 void istri_12 ( void )
   2226 {
   2227    char* wot = "12";
   2228    UInt(*h)(V128*,V128*) = h_pcmpistri_12;
   2229    UInt(*s)(V128*,V128*) = s_pcmpistri_12;
   2230 
   2231    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   2232    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   2233    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   2234    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   2235 
   2236    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   2237    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   2238    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   2239    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   2240    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   2241 
   2242    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   2243    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   2244    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   2245    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   2246 
   2247    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   2248    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   2249 
   2250    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   2251    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   2252    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   2253    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   2254 
   2255    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   2256 
   2257    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   2258    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   2259 }
   2260 
   2261 
   2262 
   2263 //////////////////////////////////////////////////////////
   2264 //                                                      //
   2265 //                       ISTRI_44                       //
   2266 //                                                      //
   2267 //////////////////////////////////////////////////////////
   2268 
   2269 UInt h_pcmpistri_44 ( V128* argL, V128* argR )
   2270 {
   2271    V128 block[2];
   2272    memcpy(&block[0], argL, sizeof(V128));
   2273    memcpy(&block[1], argR, sizeof(V128));
   2274    ULong res, flags;
   2275    __asm__ __volatile__(
   2276       "subq      $1024,  %%rsp"             "\n\t"
   2277       "movdqu    0(%2),  %%xmm2"            "\n\t"
   2278       "movdqu    16(%2), %%xmm11"           "\n\t"
   2279       "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
   2280 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   2281 //"movd %%xmm0, %%ecx" "\n\t"
   2282       "pushfq"                              "\n\t"
   2283       "popq      %%rdx"                     "\n\t"
   2284       "movq      %%rcx,  %0"                "\n\t"
   2285       "movq      %%rdx,  %1"                "\n\t"
   2286       "addq      $1024,  %%rsp"             "\n\t"
   2287       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   2288       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   2289    );
   2290    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   2291 }
   2292 
   2293 UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
   2294 {
   2295    V128 resV;
   2296    UInt resOSZACP, resECX;
   2297    Bool ok
   2298       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   2299                        zmask_from_V128(argLU),
   2300                        zmask_from_V128(argRU),
   2301                        0x44, False/*!isSTRM*/
   2302         );
   2303    assert(ok);
   2304    resECX = resV.uInt[0];
   2305    return (resOSZACP << 16) | resECX;
   2306 }
   2307 
   2308 void istri_44 ( void )
   2309 {
   2310    char* wot = "44";
   2311    UInt(*h)(V128*,V128*) = h_pcmpistri_44;
   2312    UInt(*s)(V128*,V128*) = s_pcmpistri_44;
   2313 
   2314    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   2315    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   2316    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   2317    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   2318 
   2319    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   2320    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   2321    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   2322    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   2323    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   2324 
   2325    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   2326 
   2327    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   2328    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   2329    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   2330 
   2331    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   2332    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   2333    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   2334 
   2335    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   2336    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   2337 
   2338    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   2339    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   2340 }
   2341 
   2342 
   2343 
   2344 
   2345 
   2346 //////////////////////////////////////////////////////////
   2347 //                                                      //
   2348 //                         main                         //
   2349 //                                                      //
   2350 //////////////////////////////////////////////////////////
   2351 
   2352 int main ( void )
   2353 {
   2354    istri_4A();
   2355    istri_3A();
   2356    istri_08();
   2357    istri_1A();
   2358    istri_02();
   2359    istri_0C();
   2360    istri_12();
   2361    istri_44();
   2362    istri_00();
   2363    istri_38();
   2364    return 0;
   2365 }
   2366