Home | History | Annotate | Download | only in amd64
      1 
      2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
      3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
      4    aspect. */
      5 
      6 #include <string.h>
      7 #include <stdio.h>
      8 #include <assert.h>
      9 
/* Short aliases for the basic integer types used throughout this file. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  signed char    Char;
typedef  unsigned long long int ULong;
/* Boolean type; it is an alias of UChar, with False/True as its two
   constants. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)
     18 
     19 //typedef  unsigned char  V128[16];
/* One vector operand, accessible either as 16 UChar elements or as
   4 UInt elements. */
typedef
   union {
      UChar uChar[16];
      UInt  uInt[4];
   }
   V128;
     26 
/* Bit positions of the O, S, Z, A, C and P flags in a flags value. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks derived from the positions above.  Note these are
   unsigned long long constants. */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
     40 
     41 
     42 UInt clz32 ( UInt x )
     43 {
     44    Int y, m, n;
     45    y = -(x >> 16);
     46    m = (y >> 16) & 16;
     47    n = 16 - m;
     48    x = x >> m;
     49    y = x - 0x100;
     50    m = (y >> 16) & 8;
     51    n = n + m;
     52    x = x << m;
     53    y = x - 0x1000;
     54    m = (y >> 16) & 4;
     55    n = n + m;
     56    x = x << m;
     57    y = x - 0x4000;
     58    m = (y >> 16) & 2;
     59    n = n + m;
     60    x = x << m;
     61    y = x >> 14;
     62    m = y & ~(y >> 1);
     63    return n + 2 - m;
     64 }
     65 
     66 UInt ctz32 ( UInt x )
     67 {
     68    return 32 - clz32((~x) & (x-1));
     69 }
     70 
     71 void expand ( V128* dst, char* summary )
     72 {
     73    Int i;
     74    assert( strlen(summary) == 16 );
     75    for (i = 0; i < 16; i++) {
     76       UChar xx = 0;
     77       UChar x = summary[15-i];
     78       if      (x >= '0' && x <= '9') { xx = x - '0'; }
     79       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
     80       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
     81       else assert(0);
     82 
     83       assert(xx < 16);
     84       xx = (xx << 4) | xx;
     85       assert(xx < 256);
     86       dst->uChar[i] = xx;
     87    }
     88 }
     89 
     90 void try_istri ( char* which,
     91                  UInt(*h_fn)(V128*,V128*),
     92                  UInt(*s_fn)(V128*,V128*),
     93                  char* summL, char* summR )
     94 {
     95    assert(strlen(which) == 2);
     96    V128 argL, argR;
     97    expand(&argL, summL);
     98    expand(&argR, summR);
     99    UInt h_res = h_fn(&argL, &argR);
    100    UInt s_res = s_fn(&argL, &argR);
    101    printf("istri %s  %s %s -> %08x %08x %s\n",
    102           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
    103 }
    104 
    105 UInt zmask_from_V128 ( V128* arg )
    106 {
    107    UInt i, res = 0;
    108    for (i = 0; i < 16; i++) {
    109       res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
    110    }
    111    return res;
    112 }
    113 
    114 //////////////////////////////////////////////////////////
    115 //                                                      //
    116 //                       GENERAL                        //
    117 //                                                      //
    118 //////////////////////////////////////////////////////////
    119 
    120 
    121 /* Given partial results from a pcmpXstrX operation (intRes1,
    122    basically), generate an I format (index value for ECX) output, and
    123    also the new OSZACP flags.
    124 */
    125 static
    126 void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
    127                                     /*OUT*/UInt* resOSZACP,
    128                                     UInt intRes1,
    129                                     UInt zmaskL, UInt zmaskR,
    130                                     UInt validL,
    131                                     UInt pol, UInt idx )
    132 {
    133    assert((pol >> 2) == 0);
    134    assert((idx >> 1) == 0);
    135 
    136    UInt intRes2 = 0;
    137    switch (pol) {
    138       case 0: intRes2 = intRes1;          break; // pol +
    139       case 1: intRes2 = ~intRes1;         break; // pol -
    140       case 2: intRes2 = intRes1;          break; // pol m+
    141       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    142    }
    143    intRes2 &= 0xFFFF;
    144 
    145    // generate ecx value
    146    UInt newECX = 0;
    147    if (idx) {
    148      // index of ms-1-bit
    149      newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
    150    } else {
    151      // index of ls-1-bit
    152      newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
    153    }
    154 
    155    *(UInt*)(&resV[0]) = newECX;
    156 
    157    // generate new flags, common to all ISTRI and ISTRM cases
    158    *resOSZACP    // A, P are zero
    159      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    160      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    161      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    162      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    163 }
    164 
    165 
    166 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    167    variants.
    168 
    169    For xSTRI variants, the new ECX value is placed in the 32 bits
    170    pointed to by *resV.  For xSTRM variants, the result is a 128 bit
    171    value and is placed at *resV in the obvious way.
    172 
    173    For all variants, the new OSZACP value is placed at *resOSZACP.
    174 
    175    argLV and argRV are the vector args.  The caller must prepare a
    176    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
   must be 1 for each zero byte of the respective arg.  For ESTRx
    178    variants this is derived from the explicit length indication, and
    179    must be 0 in all places except at the bit index corresponding to
    180    the valid length (0 .. 16).  If the valid length is 16 then the
    181    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
    182 
    183    imm8 is the original immediate from the instruction.  isSTRM
    184    indicates whether this is a xSTRM or xSTRI variant, which controls
    185    how much of *res is written.
    186 
    187    If the given imm8 case can be handled, the return value is True.
   If not, False is returned, and neither *res nor *resOSZACP are
    189    altered.
    190 */
    191 
    192 Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
    193                      /*OUT*/UInt* resOSZACP,
    194                      V128* argLV,  V128* argRV,
    195                      UInt zmaskL, UInt zmaskR,
    196                      UInt imm8,   Bool isSTRM )
    197 {
    198    assert(imm8 < 0x80);
    199    assert((zmaskL >> 16) == 0);
    200    assert((zmaskR >> 16) == 0);
    201 
    202    /* Explicitly reject any imm8 values that haven't been validated,
    203       even if they would probably work.  Life is too short to have
    204       unvalidated cases in the code base. */
    205    switch (imm8) {
    206       case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
    207       case 0x12: case 0x14: case 0x18: case 0x1A:
    208       case 0x30: case 0x34: case 0x38: case 0x3A:
    209       case 0x40: case 0x42: case 0x44: case 0x46: case 0x4A:
    210          break;
    211       default:
    212          return False;
    213    }
    214 
    215    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    216    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    217    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    218    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    219 
    220    /*----------------------------------------*/
    221    /*-- strcmp on byte data                --*/
    222    /*----------------------------------------*/
    223 
    224    if (agg == 2/*equal each, aka strcmp*/
    225        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    226        && !isSTRM) {
    227       Int    i;
    228       UChar* argL = (UChar*)argLV;
    229       UChar* argR = (UChar*)argRV;
    230       UInt boolResII = 0;
    231       for (i = 15; i >= 0; i--) {
    232          UChar cL  = argL[i];
    233          UChar cR  = argR[i];
    234          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    235       }
    236       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    237       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    238 
    239       // do invalidation, common to all equal-each cases
    240       UInt intRes1
    241          = (boolResII & validL & validR)  // if both valid, use cmpres
    242            | (~ (validL | validR));       // if both invalid, force 1
    243                                           // else force 0
    244       intRes1 &= 0xFFFF;
    245 
    246       // generate I-format output
    247       pcmpXstrX_WRK_gen_output_fmt_I(
    248          resV, resOSZACP,
    249          intRes1, zmaskL, zmaskR, validL, pol, idx
    250       );
    251 
    252       return True;
    253    }
    254 
    255    /*----------------------------------------*/
    256    /*-- set membership on byte data        --*/
    257    /*----------------------------------------*/
    258 
    259    if (agg == 0/*equal any, aka find chars in a set*/
    260        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    261        && !isSTRM) {
    262       /* argL: the string,  argR: charset */
    263       UInt   si, ci;
    264       UChar* argL    = (UChar*)argLV;
    265       UChar* argR    = (UChar*)argRV;
    266       UInt   boolRes = 0;
    267       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    268       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    269 
    270       for (si = 0; si < 16; si++) {
    271          if ((validL & (1 << si)) == 0)
    272             // run off the end of the string.
    273             break;
    274          UInt m = 0;
    275          for (ci = 0; ci < 16; ci++) {
    276             if ((validR & (1 << ci)) == 0) break;
    277             if (argR[ci] == argL[si]) { m = 1; break; }
    278          }
    279          boolRes |= (m << si);
    280       }
    281 
    282       // boolRes is "pre-invalidated"
    283       UInt intRes1 = boolRes & 0xFFFF;
    284 
    285       // generate I-format output
    286       pcmpXstrX_WRK_gen_output_fmt_I(
    287          resV, resOSZACP,
    288          intRes1, zmaskL, zmaskR, validL, pol, idx
    289       );
    290 
    291       return True;
    292    }
    293 
    294    /*----------------------------------------*/
    295    /*-- substring search on byte data      --*/
    296    /*----------------------------------------*/
    297 
    298    if (agg == 3/*equal ordered, aka substring search*/
    299        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    300        && !isSTRM) {
    301 
    302       /* argL: haystack,  argR: needle */
    303       UInt   ni, hi;
    304       UChar* argL    = (UChar*)argLV;
    305       UChar* argR    = (UChar*)argRV;
    306       UInt   boolRes = 0;
    307       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    308       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    309       for (hi = 0; hi < 16; hi++) {
    310          UInt m = 1;
    311          for (ni = 0; ni < 16; ni++) {
    312             if ((validR & (1 << ni)) == 0) break;
    313             UInt i = ni + hi;
    314             if (i >= 16) break;
    315             if (argL[i] != argR[ni]) { m = 0; break; }
    316          }
    317          boolRes |= (m << hi);
    318          if ((validL & (1 << hi)) == 0)
    319             // run off the end of the haystack
    320             break;
    321       }
    322 
    323       // boolRes is "pre-invalidated"
    324       UInt intRes1 = boolRes & 0xFFFF;
    325 
    326       // generate I-format output
    327       pcmpXstrX_WRK_gen_output_fmt_I(
    328          resV, resOSZACP,
    329          intRes1, zmaskL, zmaskR, validL, pol, idx
    330       );
    331 
    332       return True;
    333    }
    334 
    335    /*----------------------------------------*/
    336    /*-- ranges, unsigned byte data         --*/
    337    /*----------------------------------------*/
    338 
    339    if (agg == 1/*ranges*/
    340        && fmt == 0/*ub*/
    341        && !isSTRM) {
    342 
    343       /* argL: string,  argR: range-pairs */
    344       UInt   ri, si;
    345       UChar* argL    = (UChar*)argLV;
    346       UChar* argR    = (UChar*)argRV;
    347       UInt   boolRes = 0;
    348       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    349       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    350       for (si = 0; si < 16; si++) {
    351          if ((validL & (1 << si)) == 0)
    352             // run off the end of the string
    353             break;
    354          UInt m = 0;
    355          for (ri = 0; ri < 16; ri += 2) {
    356             if ((validR & (3 << ri)) != (3 << ri)) break;
    357             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    358                m = 1; break;
    359             }
    360          }
    361          boolRes |= (m << si);
    362       }
    363 
    364       // boolRes is "pre-invalidated"
    365       UInt intRes1 = boolRes & 0xFFFF;
    366 
    367       // generate I-format output
    368       pcmpXstrX_WRK_gen_output_fmt_I(
    369          resV, resOSZACP,
    370          intRes1, zmaskL, zmaskR, validL, pol, idx
    371       );
    372 
    373       return True;
    374    }
    375 
    376    /*----------------------------------------*/
    377    /*-- ranges, signed byte data           --*/
    378    /*----------------------------------------*/
    379 
    380    if (agg == 1/*ranges*/
    381        && fmt == 2/*sb*/
    382        && !isSTRM) {
    383 
    384       /* argL: string,  argR: range-pairs */
    385       UInt   ri, si;
    386       Char*  argL    = (Char*)argLV;
    387       Char*  argR    = (Char*)argRV;
    388       UInt   boolRes = 0;
    389       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    390       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    391       for (si = 0; si < 16; si++) {
    392          if ((validL & (1 << si)) == 0)
    393             // run off the end of the string
    394             break;
    395          UInt m = 0;
    396          for (ri = 0; ri < 16; ri += 2) {
    397             if ((validR & (3 << ri)) != (3 << ri)) break;
    398             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    399                m = 1; break;
    400             }
    401          }
    402          boolRes |= (m << si);
    403       }
    404 
    405       // boolRes is "pre-invalidated"
    406       UInt intRes1 = boolRes & 0xFFFF;
    407 
    408       // generate I-format output
    409       pcmpXstrX_WRK_gen_output_fmt_I(
    410          resV, resOSZACP,
    411          intRes1, zmaskL, zmaskR, validL, pol, idx
    412       );
    413 
    414       return True;
    415    }
    416 
    417    return False;
    418 }
    419 
    420 
    421 //////////////////////////////////////////////////////////
    422 //                                                      //
    423 //                       ISTRI_4A                       //
    424 //                                                      //
    425 //////////////////////////////////////////////////////////
    426 
    427 UInt h_pcmpistri_4A ( V128* argL, V128* argR )
    428 {
    429    V128 block[2];
    430    memcpy(&block[0], argL, sizeof(V128));
    431    memcpy(&block[1], argR, sizeof(V128));
    432    ULong res, flags;
    433    __asm__ __volatile__(
    434       "subq      $1024,  %%rsp"             "\n\t"
    435       "movdqu    0(%2),  %%xmm2"            "\n\t"
    436       "movdqu    16(%2), %%xmm11"           "\n\t"
    437       "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
    438       "pushfq"                              "\n\t"
    439       "popq      %%rdx"                     "\n\t"
    440       "movq      %%rcx,  %0"                "\n\t"
    441       "movq      %%rdx,  %1"                "\n\t"
    442       "addq      $1024,  %%rsp"             "\n\t"
    443       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    444       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    445    );
    446    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    447 }
    448 
    449 UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
    450 {
    451    V128 resV;
    452    UInt resOSZACP, resECX;
    453    Bool ok
    454       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    455                        zmask_from_V128(argLU),
    456                        zmask_from_V128(argRU),
    457                        0x4A, False/*!isSTRM*/
    458         );
    459    assert(ok);
    460    resECX = resV.uInt[0];
    461    return (resOSZACP << 16) | resECX;
    462 }
    463 
    464 void istri_4A ( void )
    465 {
    466    char* wot = "4A";
    467    UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
    468    UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
    469 
    470    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    471 
    472    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    473    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    474    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    475    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    476 
    477    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    478    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    479    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    480 
    481    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    482    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    483    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    484    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    485 
    486    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    487    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    488    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    489 
    490    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    491 
    492    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    493    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    494    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    495 
    496    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    497    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    498    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    499 
    500    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    501    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    502    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    503 
    504    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    505    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    506    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    507 
    508    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    509    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    510 }
    511 
    512 //////////////////////////////////////////////////////////
    513 //                                                      //
    514 //                       ISTRI_3A                       //
    515 //                                                      //
    516 //////////////////////////////////////////////////////////
    517 
    518 UInt h_pcmpistri_3A ( V128* argL, V128* argR )
    519 {
    520    V128 block[2];
    521    memcpy(&block[0], argL, sizeof(V128));
    522    memcpy(&block[1], argR, sizeof(V128));
    523    ULong res, flags;
    524    __asm__ __volatile__(
    525       "subq      $1024,  %%rsp"             "\n\t"
    526       "movdqu    0(%2),  %%xmm2"            "\n\t"
    527       "movdqu    16(%2), %%xmm11"           "\n\t"
    528       "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
    529       "pushfq"                              "\n\t"
    530       "popq      %%rdx"                     "\n\t"
    531       "movq      %%rcx,  %0"                "\n\t"
    532       "movq      %%rdx,  %1"                "\n\t"
    533       "addq      $1024,  %%rsp"             "\n\t"
    534       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    535       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    536    );
    537    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    538 }
    539 
    540 UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
    541 {
    542    V128 resV;
    543    UInt resOSZACP, resECX;
    544    Bool ok
    545       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    546                        zmask_from_V128(argLU),
    547                        zmask_from_V128(argRU),
    548                        0x3A, False/*!isSTRM*/
    549         );
    550    assert(ok);
    551    resECX = resV.uInt[0];
    552    return (resOSZACP << 16) | resECX;
    553 }
    554 
    555 void istri_3A ( void )
    556 {
    557    char* wot = "3A";
    558    UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
    559    UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
    560 
    561    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    562 
    563    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    564    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    565    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    566    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    567 
    568    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    569    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    570    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    571 
    572    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    573    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    574    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    575    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    576 
    577    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    578    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    579    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    580 
    581    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    582 
    583    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    584    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    585    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    586 
    587    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    588    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    589    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    590 
    591    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    592    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    593    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    594 
    595    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    596    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    597    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    598 
    599    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    600    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    601 }
    602 
    603 
    604 
    605 //////////////////////////////////////////////////////////
    606 //                                                      //
    607 //                       ISTRI_0C                       //
    608 //                                                      //
    609 //////////////////////////////////////////////////////////
    610 
    611 __attribute__((noinline))
    612 UInt h_pcmpistri_0C ( V128* argL, V128* argR )
    613 {
    614    V128 block[2];
    615    memcpy(&block[0], argL, sizeof(V128));
    616    memcpy(&block[1], argR, sizeof(V128));
    617    ULong res = 0, flags = 0;
    618    __asm__ __volatile__(
    619       "movdqu    0(%2),  %%xmm2"            "\n\t"
    620       "movdqu    16(%2), %%xmm11"           "\n\t"
    621       "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    622       //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    623       //"movd %%xmm0, %%ecx" "\n\t"
    624       "pushfq"                              "\n\t"
    625       "popq      %%rdx"                     "\n\t"
    626       "movq      %%rcx,  %0"                "\n\t"
    627       "movq      %%rdx,  %1"                "\n\t"
    628       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    629       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    630    );
    631    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    632 }
    633 
    634 UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
    635 {
    636    V128 resV;
    637    UInt resOSZACP, resECX;
    638    Bool ok
    639       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    640                        zmask_from_V128(argLU),
    641                        zmask_from_V128(argRU),
    642                        0x0C, False/*!isSTRM*/
    643         );
    644    assert(ok);
    645    resECX = resV.uInt[0];
    646    return (resOSZACP << 16) | resECX;
    647 }
    648 
    649 void istri_0C ( void )
    650 {
    651    char* wot = "0C";
    652    UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
    653    UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
    654 
    655    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
    656 
    657    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
    658 
    659    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
    660    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
    661    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
    662 
    663    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
    664 
    665    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
    666    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
    667    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
    668    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
    669    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
    670 
    671    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
    672    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
    673    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
    674 
    675    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
    676    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
    677 
    678    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
    679    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
    680    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
    681 
    682    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
    683    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
    684    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
    685 
    686    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
    687    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    688    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
    689    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
    690 }
    691 
    692 
    693 //////////////////////////////////////////////////////////
    694 //                                                      //
    695 //                       ISTRI_08                       //
    696 //                                                      //
    697 //////////////////////////////////////////////////////////
    698 
    699 UInt h_pcmpistri_08 ( V128* argL, V128* argR )
    700 {
    701    V128 block[2];
    702    memcpy(&block[0], argL, sizeof(V128));
    703    memcpy(&block[1], argR, sizeof(V128));
    704    ULong res, flags;
    705    __asm__ __volatile__(
    706       "subq      $1024,  %%rsp"             "\n\t"
    707       "movdqu    0(%2),  %%xmm2"            "\n\t"
    708       "movdqu    16(%2), %%xmm11"           "\n\t"
    709       "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
    710       "pushfq"                              "\n\t"
    711       "popq      %%rdx"                     "\n\t"
    712       "movq      %%rcx,  %0"                "\n\t"
    713       "movq      %%rdx,  %1"                "\n\t"
    714       "addq      $1024,  %%rsp"             "\n\t"
    715       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    716       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    717    );
    718    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    719 }
    720 
    721 UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
    722 {
    723    V128 resV;
    724    UInt resOSZACP, resECX;
    725    Bool ok
    726       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    727                        zmask_from_V128(argLU),
    728                        zmask_from_V128(argRU),
    729                        0x08, False/*!isSTRM*/
    730         );
    731    assert(ok);
    732    resECX = resV.uInt[0];
    733    return (resOSZACP << 16) | resECX;
    734 }
    735 
    736 void istri_08 ( void )
    737 {
    738    char* wot = "08";
    739    UInt(*h)(V128*,V128*) = h_pcmpistri_08;
    740    UInt(*s)(V128*,V128*) = s_pcmpistri_08;
    741 
    742    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    743 
    744    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    745    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    746    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    747    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    748 
    749    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    750    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    751    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    752 
    753    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    754    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    755    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    756    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    757 
    758    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    759    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    760    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    761 
    762    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    763 
    764    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    765    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    766    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    767 
    768    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    769    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    770    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    771 
    772    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    773    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    774    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    775 
    776    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    777    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    778    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    779 
    780    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    781    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    782 }
    783 
    784 
    785 
    786 //////////////////////////////////////////////////////////
    787 //                                                      //
    788 //                       ISTRI_18                       //
    789 //                                                      //
    790 //////////////////////////////////////////////////////////
    791 
    792 UInt h_pcmpistri_18 ( V128* argL, V128* argR )
    793 {
    794    V128 block[2];
    795    memcpy(&block[0], argL, sizeof(V128));
    796    memcpy(&block[1], argR, sizeof(V128));
    797    ULong res, flags;
    798    __asm__ __volatile__(
    799       "subq      $1024,  %%rsp"             "\n\t"
    800       "movdqu    0(%2),  %%xmm2"            "\n\t"
    801       "movdqu    16(%2), %%xmm11"           "\n\t"
    802       "pcmpistri $0x18,  %%xmm2, %%xmm11"   "\n\t"
    803       "pushfq"                              "\n\t"
    804       "popq      %%rdx"                     "\n\t"
    805       "movq      %%rcx,  %0"                "\n\t"
    806       "movq      %%rdx,  %1"                "\n\t"
    807       "addq      $1024,  %%rsp"             "\n\t"
    808       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    809       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    810    );
    811    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    812 }
    813 
    814 UInt s_pcmpistri_18 ( V128* argLU, V128* argRU )
    815 {
    816    V128 resV;
    817    UInt resOSZACP, resECX;
    818    Bool ok
    819       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    820                        zmask_from_V128(argLU),
    821                        zmask_from_V128(argRU),
    822                        0x18, False/*!isSTRM*/
    823         );
    824    assert(ok);
    825    resECX = resV.uInt[0];
    826    return (resOSZACP << 16) | resECX;
    827 }
    828 
    829 void istri_18 ( void )
    830 {
    831    char* wot = "18";
    832    UInt(*h)(V128*,V128*) = h_pcmpistri_18;
    833    UInt(*s)(V128*,V128*) = s_pcmpistri_18;
    834 
    835    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    836 
    837    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    838    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    839    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    840    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    841 
    842    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    843    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    844    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    845 
    846    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    847    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    848    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    849    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    850 
    851    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    852    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    853    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    854 
    855    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    856 
    857    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    858    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    859    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    860 
    861    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    862    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    863    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    864 
    865    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    866    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    867    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    868 
    869    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    870    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    871    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    872 
    873    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    874    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    875 }
    876 
    877 
    878 
    879 //////////////////////////////////////////////////////////
    880 //                                                      //
    881 //                       ISTRI_1A                       //
    882 //                                                      //
    883 //////////////////////////////////////////////////////////
    884 
    885 UInt h_pcmpistri_1A ( V128* argL, V128* argR )
    886 {
    887    V128 block[2];
    888    memcpy(&block[0], argL, sizeof(V128));
    889    memcpy(&block[1], argR, sizeof(V128));
    890    ULong res, flags;
    891    __asm__ __volatile__(
    892       "subq      $1024,  %%rsp"             "\n\t"
    893       "movdqu    0(%2),  %%xmm2"            "\n\t"
    894       "movdqu    16(%2), %%xmm11"           "\n\t"
    895       "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
    896       "pushfq"                              "\n\t"
    897       "popq      %%rdx"                     "\n\t"
    898       "movq      %%rcx,  %0"                "\n\t"
    899       "movq      %%rdx,  %1"                "\n\t"
    900       "addq      $1024,  %%rsp"             "\n\t"
    901       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    902       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    903    );
    904    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    905 }
    906 
    907 UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
    908 {
    909    V128 resV;
    910    UInt resOSZACP, resECX;
    911    Bool ok
    912       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    913                        zmask_from_V128(argLU),
    914                        zmask_from_V128(argRU),
    915                        0x1A, False/*!isSTRM*/
    916         );
    917    assert(ok);
    918    resECX = resV.uInt[0];
    919    return (resOSZACP << 16) | resECX;
    920 }
    921 
    922 void istri_1A ( void )
    923 {
    924    char* wot = "1A";
    925    UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
    926    UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
    927 
    928    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    929 
    930    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    931    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    932    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    933    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    934 
    935    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    936    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    937    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    938 
    939    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    940    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    941    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    942    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    943 
    944    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    945    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    946    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    947 
    948    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    949 
    950    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    951    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    952    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    953 
    954    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    955    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    956    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    957 
    958    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    959    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    960    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    961 
    962    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    963    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    964    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    965 
    966    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    967    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    968 }
    969 
    970 
    971 
    972 //////////////////////////////////////////////////////////
    973 //                                                      //
    974 //                       ISTRI_02                       //
    975 //                                                      //
    976 //////////////////////////////////////////////////////////
    977 
    978 UInt h_pcmpistri_02 ( V128* argL, V128* argR )
    979 {
    980    V128 block[2];
    981    memcpy(&block[0], argL, sizeof(V128));
    982    memcpy(&block[1], argR, sizeof(V128));
    983    ULong res, flags;
    984    __asm__ __volatile__(
    985       "subq      $1024,  %%rsp"             "\n\t"
    986       "movdqu    0(%2),  %%xmm2"            "\n\t"
    987       "movdqu    16(%2), %%xmm11"           "\n\t"
    988       "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
    989 //"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
    990 //"movd %%xmm0, %%ecx" "\n\t"
    991       "pushfq"                              "\n\t"
    992       "popq      %%rdx"                     "\n\t"
    993       "movq      %%rcx,  %0"                "\n\t"
    994       "movq      %%rdx,  %1"                "\n\t"
    995       "addq      $1024,  %%rsp"             "\n\t"
    996       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    997       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    998    );
    999    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1000 }
   1001 
   1002 UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
   1003 {
   1004    V128 resV;
   1005    UInt resOSZACP, resECX;
   1006    Bool ok
   1007       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1008                        zmask_from_V128(argLU),
   1009                        zmask_from_V128(argRU),
   1010                        0x02, False/*!isSTRM*/
   1011         );
   1012    assert(ok);
   1013    resECX = resV.uInt[0];
   1014    return (resOSZACP << 16) | resECX;
   1015 }
   1016 
   1017 void istri_02 ( void )
   1018 {
   1019    char* wot = "02";
   1020    UInt(*h)(V128*,V128*) = h_pcmpistri_02;
   1021    UInt(*s)(V128*,V128*) = s_pcmpistri_02;
   1022 
   1023    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1024    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1025    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1026    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1027 
   1028    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1029    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1030    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1031    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1032    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1033 
   1034    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1035    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1036    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1037    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1038 
   1039    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1040    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1041 
   1042    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1043    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1044    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1045    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1046 
   1047    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1048 
   1049    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1050    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1051 }
   1052 
   1053 
   1054 //////////////////////////////////////////////////////////
   1055 //                                                      //
   1056 //                       ISTRI_12                       //
   1057 //                                                      //
   1058 //////////////////////////////////////////////////////////
   1059 
   1060 UInt h_pcmpistri_12 ( V128* argL, V128* argR )
   1061 {
   1062    V128 block[2];
   1063    memcpy(&block[0], argL, sizeof(V128));
   1064    memcpy(&block[1], argR, sizeof(V128));
   1065    ULong res, flags;
   1066    __asm__ __volatile__(
   1067       "subq      $1024,  %%rsp"             "\n\t"
   1068       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1069       "movdqu    16(%2), %%xmm11"           "\n\t"
   1070       "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
   1071 //"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
   1072 //"movd %%xmm0, %%ecx" "\n\t"
   1073       "pushfq"                              "\n\t"
   1074       "popq      %%rdx"                     "\n\t"
   1075       "movq      %%rcx,  %0"                "\n\t"
   1076       "movq      %%rdx,  %1"                "\n\t"
   1077       "addq      $1024,  %%rsp"             "\n\t"
   1078       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1079       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1080    );
   1081    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1082 }
   1083 
   1084 UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
   1085 {
   1086    V128 resV;
   1087    UInt resOSZACP, resECX;
   1088    Bool ok
   1089       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1090                        zmask_from_V128(argLU),
   1091                        zmask_from_V128(argRU),
   1092                        0x12, False/*!isSTRM*/
   1093         );
   1094    assert(ok);
   1095    resECX = resV.uInt[0];
   1096    return (resOSZACP << 16) | resECX;
   1097 }
   1098 
   1099 void istri_12 ( void )
   1100 {
   1101    char* wot = "12";
   1102    UInt(*h)(V128*,V128*) = h_pcmpistri_12;
   1103    UInt(*s)(V128*,V128*) = s_pcmpistri_12;
   1104 
   1105    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1106    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1107    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1108    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1109 
   1110    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1111    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1112    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1113    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1114    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1115 
   1116    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1117    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1118    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1119    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1120 
   1121    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1122    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1123 
   1124    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1125    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1126    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1127    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1128 
   1129    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1130 
   1131    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1132    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1133 }
   1134 
   1135 
   1136 
   1137 //////////////////////////////////////////////////////////
   1138 //                                                      //
   1139 //                       ISTRI_44                       //
   1140 //                                                      //
   1141 //////////////////////////////////////////////////////////
   1142 
   1143 UInt h_pcmpistri_44 ( V128* argL, V128* argR )
   1144 {
   1145    V128 block[2];
   1146    memcpy(&block[0], argL, sizeof(V128));
   1147    memcpy(&block[1], argR, sizeof(V128));
   1148    ULong res, flags;
   1149    __asm__ __volatile__(
   1150       "subq      $1024,  %%rsp"             "\n\t"
   1151       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1152       "movdqu    16(%2), %%xmm11"           "\n\t"
   1153       "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
   1154 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   1155 //"movd %%xmm0, %%ecx" "\n\t"
   1156       "pushfq"                              "\n\t"
   1157       "popq      %%rdx"                     "\n\t"
   1158       "movq      %%rcx,  %0"                "\n\t"
   1159       "movq      %%rdx,  %1"                "\n\t"
   1160       "addq      $1024,  %%rsp"             "\n\t"
   1161       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1162       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1163    );
   1164    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1165 }
   1166 
   1167 UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
   1168 {
   1169    V128 resV;
   1170    UInt resOSZACP, resECX;
   1171    Bool ok
   1172       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1173                        zmask_from_V128(argLU),
   1174                        zmask_from_V128(argRU),
   1175                        0x44, False/*!isSTRM*/
   1176         );
   1177    assert(ok);
   1178    resECX = resV.uInt[0];
   1179    return (resOSZACP << 16) | resECX;
   1180 }
   1181 
   1182 void istri_44 ( void )
   1183 {
   1184    char* wot = "44";
   1185    UInt(*h)(V128*,V128*) = h_pcmpistri_44;
   1186    UInt(*s)(V128*,V128*) = s_pcmpistri_44;
   1187 
   1188    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1189    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1190    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1191    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1192 
   1193    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1194    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1195    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1196    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1197    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1198 
   1199    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1200 
   1201    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1202    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1203    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1204 
   1205    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1206    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1207    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1208 
   1209    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1210    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1211 
   1212    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1213    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1214 
   1215    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1216    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1217    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1218    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1219 }
   1220 
   1221 
   1222 //////////////////////////////////////////////////////////
   1223 //                                                      //
   1224 //                       ISTRI_00                       //
   1225 //                                                      //
   1226 //////////////////////////////////////////////////////////
   1227 
   1228 UInt h_pcmpistri_00 ( V128* argL, V128* argR )
   1229 {
   1230    V128 block[2];
   1231    memcpy(&block[0], argL, sizeof(V128));
   1232    memcpy(&block[1], argR, sizeof(V128));
   1233    ULong res, flags;
   1234    __asm__ __volatile__(
   1235       "subq      $1024,  %%rsp"             "\n\t"
   1236       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1237       "movdqu    16(%2), %%xmm11"           "\n\t"
   1238       "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
   1239 //"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
   1240 //"movd %%xmm0, %%ecx" "\n\t"
   1241       "pushfq"                              "\n\t"
   1242       "popq      %%rdx"                     "\n\t"
   1243       "movq      %%rcx,  %0"                "\n\t"
   1244       "movq      %%rdx,  %1"                "\n\t"
   1245       "addq      $1024,  %%rsp"             "\n\t"
   1246       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1247       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1248    );
   1249    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1250 }
   1251 
   1252 UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
   1253 {
   1254    V128 resV;
   1255    UInt resOSZACP, resECX;
   1256    Bool ok
   1257       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1258                        zmask_from_V128(argLU),
   1259                        zmask_from_V128(argRU),
   1260                        0x00, False/*!isSTRM*/
   1261         );
   1262    assert(ok);
   1263    resECX = resV.uInt[0];
   1264    return (resOSZACP << 16) | resECX;
   1265 }
   1266 
   1267 void istri_00 ( void )
   1268 {
   1269    char* wot = "00";
   1270    UInt(*h)(V128*,V128*) = h_pcmpistri_00;
   1271    UInt(*s)(V128*,V128*) = s_pcmpistri_00;
   1272 
   1273    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1274    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1275    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1276    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1277 
   1278    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1279    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1280    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1281    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1282    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1283 
   1284    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1285    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1286    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1287    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1288 
   1289    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1290    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1291 
   1292    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1293    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1294    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1295    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1296 
   1297    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1298 
   1299    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1300    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1301 }
   1302 
   1303 
   1304 //////////////////////////////////////////////////////////
   1305 //                                                      //
   1306 //                       ISTRI_38                       //
   1307 //                                                      //
   1308 //////////////////////////////////////////////////////////
   1309 
   1310 UInt h_pcmpistri_38 ( V128* argL, V128* argR )
   1311 {
   1312    V128 block[2];
   1313    memcpy(&block[0], argL, sizeof(V128));
   1314    memcpy(&block[1], argR, sizeof(V128));
   1315    ULong res, flags;
   1316    __asm__ __volatile__(
   1317       "subq      $1024,  %%rsp"             "\n\t"
   1318       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1319       "movdqu    16(%2), %%xmm11"           "\n\t"
   1320       "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
   1321       "pushfq"                              "\n\t"
   1322       "popq      %%rdx"                     "\n\t"
   1323       "movq      %%rcx,  %0"                "\n\t"
   1324       "movq      %%rdx,  %1"                "\n\t"
   1325       "addq      $1024,  %%rsp"             "\n\t"
   1326       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1327       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1328    );
   1329    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1330 }
   1331 
   1332 UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
   1333 {
   1334    V128 resV;
   1335    UInt resOSZACP, resECX;
   1336    Bool ok
   1337       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1338                        zmask_from_V128(argLU),
   1339                        zmask_from_V128(argRU),
   1340                        0x38, False/*!isSTRM*/
   1341         );
   1342    assert(ok);
   1343    resECX = resV.uInt[0];
   1344    return (resOSZACP << 16) | resECX;
   1345 }
   1346 
   1347 void istri_38 ( void )
   1348 {
   1349    char* wot = "38";
   1350    UInt(*h)(V128*,V128*) = h_pcmpistri_38;
   1351    UInt(*s)(V128*,V128*) = s_pcmpistri_38;
   1352 
   1353    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1354 
   1355    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1356    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1357    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1358    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1359 
   1360    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1361    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1362    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1363 
   1364    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1365    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1366    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1367    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1368 
   1369    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1370    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1371    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1372 
   1373    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1374 
   1375    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1376    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1377    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1378 
   1379    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1380    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1381    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1382 
   1383    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1384    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1385    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1386 
   1387    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1388    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1389    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1390 
   1391    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1392    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1393 }
   1394 
   1395 
   1396 
   1397 //////////////////////////////////////////////////////////
   1398 //                                                      //
   1399 //                       ISTRI_46                       //
   1400 //                                                      //
   1401 //////////////////////////////////////////////////////////
   1402 
   1403 UInt h_pcmpistri_46 ( V128* argL, V128* argR )
   1404 {
   1405    V128 block[2];
   1406    memcpy(&block[0], argL, sizeof(V128));
   1407    memcpy(&block[1], argR, sizeof(V128));
   1408    ULong res, flags;
   1409    __asm__ __volatile__(
   1410       "subq      $1024,  %%rsp"             "\n\t"
   1411       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1412       "movdqu    16(%2), %%xmm11"           "\n\t"
   1413       "pcmpistri $0x46,  %%xmm2, %%xmm11"   "\n\t"
   1414       "pushfq"                              "\n\t"
   1415       "popq      %%rdx"                     "\n\t"
   1416       "movq      %%rcx,  %0"                "\n\t"
   1417       "movq      %%rdx,  %1"                "\n\t"
   1418       "addq      $1024,  %%rsp"             "\n\t"
   1419       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1420       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1421    );
   1422    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1423 }
   1424 
   1425 UInt s_pcmpistri_46 ( V128* argLU, V128* argRU )
   1426 {
   1427    V128 resV;
   1428    UInt resOSZACP, resECX;
   1429    Bool ok
   1430       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1431                        zmask_from_V128(argLU),
   1432                        zmask_from_V128(argRU),
   1433                        0x46, False/*!isSTRM*/
   1434         );
   1435    assert(ok);
   1436    resECX = resV.uInt[0];
   1437    return (resOSZACP << 16) | resECX;
   1438 }
   1439 
   1440 void istri_46 ( void )
   1441 {
   1442    char* wot = "46";
   1443    UInt(*h)(V128*,V128*) = h_pcmpistri_46;
   1444    UInt(*s)(V128*,V128*) = s_pcmpistri_46;
   1445 
   1446    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1447    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1448    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1449    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1450 
   1451    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1452    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1453    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1454    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1455    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1456 
   1457    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1458 
   1459    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1460    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1461    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1462 
   1463    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1464    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1465    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1466 
   1467    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1468    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1469 
   1470    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1471    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1472 
   1473    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1474    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1475    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1476    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1477 }
   1478 
   1479 
   1480 //////////////////////////////////////////////////////////
   1481 //                                                      //
   1482 //                       ISTRI_30                       //
   1483 //                                                      //
   1484 //////////////////////////////////////////////////////////
   1485 
   1486 UInt h_pcmpistri_30 ( V128* argL, V128* argR )
   1487 {
   1488    V128 block[2];
   1489    memcpy(&block[0], argL, sizeof(V128));
   1490    memcpy(&block[1], argR, sizeof(V128));
   1491    ULong res, flags;
   1492    __asm__ __volatile__(
   1493       "subq      $1024,  %%rsp"             "\n\t"
   1494       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1495       "movdqu    16(%2), %%xmm11"           "\n\t"
   1496       "pcmpistri $0x30,  %%xmm2, %%xmm11"   "\n\t"
   1497       "pushfq"                              "\n\t"
   1498       "popq      %%rdx"                     "\n\t"
   1499       "movq      %%rcx,  %0"                "\n\t"
   1500       "movq      %%rdx,  %1"                "\n\t"
   1501       "addq      $1024,  %%rsp"             "\n\t"
   1502       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1503       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1504    );
   1505    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1506 }
   1507 
   1508 UInt s_pcmpistri_30 ( V128* argLU, V128* argRU )
   1509 {
   1510    V128 resV;
   1511    UInt resOSZACP, resECX;
   1512    Bool ok
   1513       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1514                        zmask_from_V128(argLU),
   1515                        zmask_from_V128(argRU),
   1516                        0x30, False/*!isSTRM*/
   1517         );
   1518    assert(ok);
   1519    resECX = resV.uInt[0];
   1520    return (resOSZACP << 16) | resECX;
   1521 }
   1522 
   1523 void istri_30 ( void )
   1524 {
   1525    char* wot = "30";
   1526    UInt(*h)(V128*,V128*) = h_pcmpistri_30;
   1527    UInt(*s)(V128*,V128*) = s_pcmpistri_30;
   1528 
   1529    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1530    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1531    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1532    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1533 
   1534    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1535    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1536    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1537    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1538    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1539 
   1540    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1541    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1542    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1543    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1544 
   1545    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1546    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1547 
   1548    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1549    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1550    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1551    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1552 
   1553    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1554 
   1555    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1556    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1557 }
   1558 
   1559 
   1560 //////////////////////////////////////////////////////////
   1561 //                                                      //
   1562 //                       ISTRI_40                       //
   1563 //                                                      //
   1564 //////////////////////////////////////////////////////////
   1565 
   1566 UInt h_pcmpistri_40 ( V128* argL, V128* argR )
   1567 {
   1568    V128 block[2];
   1569    memcpy(&block[0], argL, sizeof(V128));
   1570    memcpy(&block[1], argR, sizeof(V128));
   1571    ULong res, flags;
   1572    __asm__ __volatile__(
   1573       "subq      $1024,  %%rsp"             "\n\t"
   1574       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1575       "movdqu    16(%2), %%xmm11"           "\n\t"
   1576       "pcmpistri $0x40,  %%xmm2, %%xmm11"   "\n\t"
   1577       "pushfq"                              "\n\t"
   1578       "popq      %%rdx"                     "\n\t"
   1579       "movq      %%rcx,  %0"                "\n\t"
   1580       "movq      %%rdx,  %1"                "\n\t"
   1581       "addq      $1024,  %%rsp"             "\n\t"
   1582       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1583       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1584    );
   1585    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1586 }
   1587 
   1588 UInt s_pcmpistri_40 ( V128* argLU, V128* argRU )
   1589 {
   1590    V128 resV;
   1591    UInt resOSZACP, resECX;
   1592    Bool ok
   1593       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1594                        zmask_from_V128(argLU),
   1595                        zmask_from_V128(argRU),
   1596                        0x40, False/*!isSTRM*/
   1597         );
   1598    assert(ok);
   1599    resECX = resV.uInt[0];
   1600    return (resOSZACP << 16) | resECX;
   1601 }
   1602 
   1603 void istri_40 ( void )
   1604 {
   1605    char* wot = "40";
   1606    UInt(*h)(V128*,V128*) = h_pcmpistri_40;
   1607    UInt(*s)(V128*,V128*) = s_pcmpistri_40;
   1608 
   1609    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1610    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1611    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1612    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1613 
   1614    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1615    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1616    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1617    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1618    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1619 
   1620    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1621    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1622    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1623    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1624 
   1625    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1626    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1627 
   1628    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1629    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1630    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1631    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1632 
   1633    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1634 
   1635    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1636    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1637 }
   1638 
   1639 
   1640 //////////////////////////////////////////////////////////
   1641 //                                                      //
   1642 //                       ISTRI_42                       //
   1643 //                                                      //
   1644 //////////////////////////////////////////////////////////
   1645 
   1646 UInt h_pcmpistri_42 ( V128* argL, V128* argR )
   1647 {
   1648    V128 block[2];
   1649    memcpy(&block[0], argL, sizeof(V128));
   1650    memcpy(&block[1], argR, sizeof(V128));
   1651    ULong res, flags;
   1652    __asm__ __volatile__(
   1653       "subq      $1024,  %%rsp"             "\n\t"
   1654       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1655       "movdqu    16(%2), %%xmm11"           "\n\t"
   1656       "pcmpistri $0x42,  %%xmm2, %%xmm11"   "\n\t"
   1657       "pushfq"                              "\n\t"
   1658       "popq      %%rdx"                     "\n\t"
   1659       "movq      %%rcx,  %0"                "\n\t"
   1660       "movq      %%rdx,  %1"                "\n\t"
   1661       "addq      $1024,  %%rsp"             "\n\t"
   1662       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1663       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1664    );
   1665    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1666 }
   1667 
   1668 UInt s_pcmpistri_42 ( V128* argLU, V128* argRU )
   1669 {
   1670    V128 resV;
   1671    UInt resOSZACP, resECX;
   1672    Bool ok
   1673       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1674                        zmask_from_V128(argLU),
   1675                        zmask_from_V128(argRU),
   1676                        0x42, False/*!isSTRM*/
   1677         );
   1678    assert(ok);
   1679    resECX = resV.uInt[0];
   1680    return (resOSZACP << 16) | resECX;
   1681 }
   1682 
   1683 void istri_42 ( void )
   1684 {
   1685    char* wot = "42";
   1686    UInt(*h)(V128*,V128*) = h_pcmpistri_42;
   1687    UInt(*s)(V128*,V128*) = s_pcmpistri_42;
   1688 
   1689    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1690    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1691    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1692    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1693 
   1694    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1695    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1696    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1697    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1698    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1699 
   1700    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1701    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1702    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1703    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1704 
   1705    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1706    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1707 
   1708    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1709    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1710    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1711    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1712 
   1713    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1714 
   1715    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1716    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1717 }
   1718 
   1719 
   1720 //////////////////////////////////////////////////////////
   1721 //                                                      //
   1722 //                       ISTRI_0E                       //
   1723 //                                                      //
   1724 //////////////////////////////////////////////////////////
   1725 
   1726 __attribute__((noinline))
   1727 UInt h_pcmpistri_0E ( V128* argL, V128* argR )
   1728 {
   1729    V128 block[2];
   1730    memcpy(&block[0], argL, sizeof(V128));
   1731    memcpy(&block[1], argR, sizeof(V128));
   1732    ULong res = 0, flags = 0;
   1733    __asm__ __volatile__(
   1734       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1735       "movdqu    16(%2), %%xmm11"           "\n\t"
   1736       "pcmpistri $0x0E,  %%xmm2, %%xmm11"   "\n\t"
   1737       "pushfq"                              "\n\t"
   1738       "popq      %%rdx"                     "\n\t"
   1739       "movq      %%rcx,  %0"                "\n\t"
   1740       "movq      %%rdx,  %1"                "\n\t"
   1741       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1742       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1743    );
   1744    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1745 }
   1746 
   1747 UInt s_pcmpistri_0E ( V128* argLU, V128* argRU )
   1748 {
   1749    V128 resV;
   1750    UInt resOSZACP, resECX;
   1751    Bool ok
   1752       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1753                        zmask_from_V128(argLU),
   1754                        zmask_from_V128(argRU),
   1755                        0x0E, False/*!isSTRM*/
   1756         );
   1757    assert(ok);
   1758    resECX = resV.uInt[0];
   1759    return (resOSZACP << 16) | resECX;
   1760 }
   1761 
   1762 void istri_0E ( void )
   1763 {
   1764    char* wot = "0E";
   1765    UInt(*h)(V128*,V128*) = h_pcmpistri_0E;
   1766    UInt(*s)(V128*,V128*) = s_pcmpistri_0E;
   1767 
   1768    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
   1769 
   1770    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
   1771 
   1772    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
   1773    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
   1774    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
   1775 
   1776    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
   1777 
   1778    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
   1779    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
   1780    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
   1781    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
   1782    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
   1783 
   1784    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
   1785    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
   1786    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
   1787 
   1788    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
   1789    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
   1790 
   1791    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
   1792    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
   1793    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
   1794 
   1795    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
   1796    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
   1797    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
   1798 
   1799    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
   1800    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1801    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
   1802    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
   1803 }
   1804 
   1805 
   1806 //////////////////////////////////////////////////////////
   1807 //                                                      //
   1808 //                       ISTRI_34                       //
   1809 //                                                      //
   1810 //////////////////////////////////////////////////////////
   1811 
   1812 UInt h_pcmpistri_34 ( V128* argL, V128* argR )
   1813 {
   1814    V128 block[2];
   1815    memcpy(&block[0], argL, sizeof(V128));
   1816    memcpy(&block[1], argR, sizeof(V128));
   1817    ULong res, flags;
   1818    __asm__ __volatile__(
   1819       "subq      $1024,  %%rsp"             "\n\t"
   1820       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1821       "movdqu    16(%2), %%xmm11"           "\n\t"
   1822       "pcmpistri $0x34,  %%xmm2, %%xmm11"   "\n\t"
   1823       "pushfq"                              "\n\t"
   1824       "popq      %%rdx"                     "\n\t"
   1825       "movq      %%rcx,  %0"                "\n\t"
   1826       "movq      %%rdx,  %1"                "\n\t"
   1827       "addq      $1024,  %%rsp"             "\n\t"
   1828       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1829       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1830    );
   1831    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1832 }
   1833 
   1834 UInt s_pcmpistri_34 ( V128* argLU, V128* argRU )
   1835 {
   1836    V128 resV;
   1837    UInt resOSZACP, resECX;
   1838    Bool ok
   1839       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1840                        zmask_from_V128(argLU),
   1841                        zmask_from_V128(argRU),
   1842                        0x34, False/*!isSTRM*/
   1843         );
   1844    assert(ok);
   1845    resECX = resV.uInt[0];
   1846    return (resOSZACP << 16) | resECX;
   1847 }
   1848 
   1849 void istri_34 ( void )
   1850 {
   1851    char* wot = "34";
   1852    UInt(*h)(V128*,V128*) = h_pcmpistri_34;
   1853    UInt(*s)(V128*,V128*) = s_pcmpistri_34;
   1854 
   1855    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1856    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1857    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1858    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1859 
   1860    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1861    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1862    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1863    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1864    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1865 
   1866    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1867 
   1868    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1869    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1870    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1871 
   1872    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1873    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1874    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1875 
   1876    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1877    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1878 
   1879    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1880    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1881 
   1882    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1883    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1884    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1885    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1886 }
   1887 
   1888 
   1889 //////////////////////////////////////////////////////////
   1890 //                                                      //
   1891 //                       ISTRI_14                       //
   1892 //                                                      //
   1893 //////////////////////////////////////////////////////////
   1894 
   1895 UInt h_pcmpistri_14 ( V128* argL, V128* argR )
   1896 {
   1897    V128 block[2];
   1898    memcpy(&block[0], argL, sizeof(V128));
   1899    memcpy(&block[1], argR, sizeof(V128));
   1900    ULong res, flags;
   1901    __asm__ __volatile__(
   1902       "subq      $1024,  %%rsp"             "\n\t"
   1903       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1904       "movdqu    16(%2), %%xmm11"           "\n\t"
   1905       "pcmpistri $0x14,  %%xmm2, %%xmm11"   "\n\t"
   1906       "pushfq"                              "\n\t"
   1907       "popq      %%rdx"                     "\n\t"
   1908       "movq      %%rcx,  %0"                "\n\t"
   1909       "movq      %%rdx,  %1"                "\n\t"
   1910       "addq      $1024,  %%rsp"             "\n\t"
   1911       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1912       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1913    );
   1914    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1915 }
   1916 
   1917 UInt s_pcmpistri_14 ( V128* argLU, V128* argRU )
   1918 {
   1919    V128 resV;
   1920    UInt resOSZACP, resECX;
   1921    Bool ok
   1922       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1923                        zmask_from_V128(argLU),
   1924                        zmask_from_V128(argRU),
   1925                        0x14, False/*!isSTRM*/
   1926         );
   1927    assert(ok);
   1928    resECX = resV.uInt[0];
   1929    return (resOSZACP << 16) | resECX;
   1930 }
   1931 
   1932 void istri_14 ( void )
   1933 {
   1934    char* wot = "14";
   1935    UInt(*h)(V128*,V128*) = h_pcmpistri_14;
   1936    UInt(*s)(V128*,V128*) = s_pcmpistri_14;
   1937 
   1938    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1939    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1940    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1941    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1942 
   1943    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1944    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1945    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1946    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1947    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1948 
   1949    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1950 
   1951    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1952    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1953    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1954 
   1955    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1956    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1957    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1958 
   1959    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1960    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1961 
   1962    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1963    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1964 
   1965    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1966    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1967    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1968    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1969 }
   1970 
   1971 
   1972 //////////////////////////////////////////////////////////
   1973 //                                                      //
   1974 //                         main                         //
   1975 //                                                      //
   1976 //////////////////////////////////////////////////////////
   1977 
   1978 int main ( void )
   1979 {
   1980    istri_4A();
   1981    istri_3A();
   1982    istri_08();
   1983    istri_18();
   1984    istri_1A();
   1985    istri_02();
   1986    istri_0C();
   1987    istri_12();
   1988    istri_44();
   1989    istri_00();
   1990    istri_38();
   1991    istri_46();
   1992    istri_30();
   1993    istri_40();
   1994    istri_42();
   1995    istri_0E();
   1996    istri_14();
   1997    istri_34();
   1998    return 0;
   1999 }
   2000