Home | History | Annotate | Download | only in amd64
      1 
      2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
      3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
      4    aspect. */
      5 
      6 #include <string.h>
      7 #include <stdio.h>
      8 #include <assert.h>
      9 
/* Short aliases for the basic integer types used throughout this
   test. */
typedef  unsigned int   UInt;
typedef  signed int     Int;
typedef  unsigned char  UChar;
typedef  signed char    Char;
typedef  unsigned long long int ULong;
/* Boolean held in a single unsigned byte; use only False and True. */
typedef  UChar          Bool;
#define False ((Bool)0)
#define True  ((Bool)1)

/* A 16-byte vector value that can be viewed either as sixteen bytes
   or as four UInt words.  (An earlier form of this type was a plain
   UChar[16] array.) */
typedef
   union {
      UChar uChar[16];
      UInt  uInt[4];
   }
   V128;

/* Bit positions of the O, S, Z, A, C and P condition flags.  The
   hardware helpers in this file mask the value pushed by pushfq with
   0x8D5, which is exactly the union of the six masks below. */
#define SHIFT_O   11
#define SHIFT_S   7
#define SHIFT_Z   6
#define SHIFT_A   4
#define SHIFT_C   0
#define SHIFT_P   2

/* Single-bit masks for the same flags.  Note these have type
   unsigned long long. */
#define MASK_O    (1ULL << SHIFT_O)
#define MASK_S    (1ULL << SHIFT_S)
#define MASK_Z    (1ULL << SHIFT_Z)
#define MASK_A    (1ULL << SHIFT_A)
#define MASK_C    (1ULL << SHIFT_C)
#define MASK_P    (1ULL << SHIFT_P)
     40 
     41 
     42 UInt clz32 ( UInt x )
     43 {
     44    Int y, m, n;
     45    y = -(x >> 16);
     46    m = (y >> 16) & 16;
     47    n = 16 - m;
     48    x = x >> m;
     49    y = x - 0x100;
     50    m = (y >> 16) & 8;
     51    n = n + m;
     52    x = x << m;
     53    y = x - 0x1000;
     54    m = (y >> 16) & 4;
     55    n = n + m;
     56    x = x << m;
     57    y = x - 0x4000;
     58    m = (y >> 16) & 2;
     59    n = n + m;
     60    x = x << m;
     61    y = x >> 14;
     62    m = y & ~(y >> 1);
     63    return n + 2 - m;
     64 }
     65 
     66 UInt ctz32 ( UInt x )
     67 {
     68    return 32 - clz32((~x) & (x-1));
     69 }
     70 
     71 void expand ( V128* dst, char* summary )
     72 {
     73    Int i;
     74    assert( strlen(summary) == 16 );
     75    for (i = 0; i < 16; i++) {
     76       UChar xx = 0;
     77       UChar x = summary[15-i];
     78       if      (x >= '0' && x <= '9') { xx = x - '0'; }
     79       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
     80       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
     81       else assert(0);
     82 
     83       assert(xx < 16);
     84       xx = (xx << 4) | xx;
     85       assert(xx < 256);
     86       dst->uChar[i] = xx;
     87    }
     88 }
     89 
     90 void try_istri ( char* which,
     91                  UInt(*h_fn)(V128*,V128*),
     92                  UInt(*s_fn)(V128*,V128*),
     93                  char* summL, char* summR )
     94 {
     95    assert(strlen(which) == 2);
     96    V128 argL, argR;
     97    expand(&argL, summL);
     98    expand(&argR, summR);
     99    UInt h_res = h_fn(&argL, &argR);
    100    UInt s_res = s_fn(&argL, &argR);
    101    printf("istri %s  %s %s -> %08x %08x %s\n",
    102           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
    103 }
    104 
    105 UInt zmask_from_V128 ( V128* arg )
    106 {
    107    UInt i, res = 0;
    108    for (i = 0; i < 16; i++) {
    109       res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
    110    }
    111    return res;
    112 }
    113 
    114 //////////////////////////////////////////////////////////
    115 //                                                      //
    116 //                       GENERAL                        //
    117 //                                                      //
    118 //////////////////////////////////////////////////////////
    119 
    120 
    121 /* Given partial results from a pcmpXstrX operation (intRes1,
    122    basically), generate an I format (index value for ECX) output, and
    123    also the new OSZACP flags.
    124 */
    125 static
    126 void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
    127                                     /*OUT*/UInt* resOSZACP,
    128                                     UInt intRes1,
    129                                     UInt zmaskL, UInt zmaskR,
    130                                     UInt validL,
    131                                     UInt pol, UInt idx )
    132 {
    133    assert((pol >> 2) == 0);
    134    assert((idx >> 1) == 0);
    135 
    136    UInt intRes2 = 0;
    137    switch (pol) {
    138       case 0: intRes2 = intRes1;          break; // pol +
    139       case 1: intRes2 = ~intRes1;         break; // pol -
    140       case 2: intRes2 = intRes1;          break; // pol m+
    141       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    142    }
    143    intRes2 &= 0xFFFF;
    144 
    145    // generate ecx value
    146    UInt newECX = 0;
    147    if (idx) {
    148      // index of ms-1-bit
    149      newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
    150    } else {
    151      // index of ls-1-bit
    152      newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
    153    }
    154 
    155    *(UInt*)(&resV[0]) = newECX;
    156 
    157    // generate new flags, common to all ISTRI and ISTRM cases
    158    *resOSZACP    // A, P are zero
    159      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    160      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    161      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    162      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    163 }
    164 
    165 
    166 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    167    variants.
    168 
    169    For xSTRI variants, the new ECX value is placed in the 32 bits
    170    pointed to by *resV.  For xSTRM variants, the result is a 128 bit
    171    value and is placed at *resV in the obvious way.
    172 
    173    For all variants, the new OSZACP value is placed at *resOSZACP.
    174 
    175    argLV and argRV are the vector args.  The caller must prepare a
    176    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
    177    must be 1 for each zero byte of of the respective arg.  For ESTRx
    178    variants this is derived from the explicit length indication, and
    179    must be 0 in all places except at the bit index corresponding to
    180    the valid length (0 .. 16).  If the valid length is 16 then the
    181    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
    182 
    183    imm8 is the original immediate from the instruction.  isSTRM
    184    indicates whether this is a xSTRM or xSTRI variant, which controls
    185    how much of *res is written.
    186 
    187    If the given imm8 case can be handled, the return value is True.
    188    If not, False is returned, and neither *res not *resOSZACP are
    189    altered.
    190 */
    191 
    192 Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
    193                      /*OUT*/UInt* resOSZACP,
    194                      V128* argLV,  V128* argRV,
    195                      UInt zmaskL, UInt zmaskR,
    196                      UInt imm8,   Bool isSTRM )
    197 {
    198    assert(imm8 < 0x80);
    199    assert((zmaskL >> 16) == 0);
    200    assert((zmaskR >> 16) == 0);
    201 
    202    /* Explicitly reject any imm8 values that haven't been validated,
    203       even if they would probably work.  Life is too short to have
    204       unvalidated cases in the code base. */
    205    switch (imm8) {
    206       case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
    207       case 0x12: case 0x14: case 0x1A:
    208       case 0x30: case 0x34: case 0x38: case 0x3A:
    209       case 0x40: case 0x44: case 0x46: case 0x4A:
    210          break;
    211       default:
    212          return False;
    213    }
    214 
    215    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    216    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    217    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    218    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    219 
    220    /*----------------------------------------*/
    221    /*-- strcmp on byte data                --*/
    222    /*----------------------------------------*/
    223 
    224    if (agg == 2/*equal each, aka strcmp*/
    225        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    226        && !isSTRM) {
    227       Int    i;
    228       UChar* argL = (UChar*)argLV;
    229       UChar* argR = (UChar*)argRV;
    230       UInt boolResII = 0;
    231       for (i = 15; i >= 0; i--) {
    232          UChar cL  = argL[i];
    233          UChar cR  = argR[i];
    234          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    235       }
    236       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    237       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    238 
    239       // do invalidation, common to all equal-each cases
    240       UInt intRes1
    241          = (boolResII & validL & validR)  // if both valid, use cmpres
    242            | (~ (validL | validR));       // if both invalid, force 1
    243                                           // else force 0
    244       intRes1 &= 0xFFFF;
    245 
    246       // generate I-format output
    247       pcmpXstrX_WRK_gen_output_fmt_I(
    248          resV, resOSZACP,
    249          intRes1, zmaskL, zmaskR, validL, pol, idx
    250       );
    251 
    252       return True;
    253    }
    254 
    255    /*----------------------------------------*/
    256    /*-- set membership on byte data        --*/
    257    /*----------------------------------------*/
    258 
    259    if (agg == 0/*equal any, aka find chars in a set*/
    260        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    261        && !isSTRM) {
    262       /* argL: the string,  argR: charset */
    263       UInt   si, ci;
    264       UChar* argL    = (UChar*)argLV;
    265       UChar* argR    = (UChar*)argRV;
    266       UInt   boolRes = 0;
    267       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    268       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    269 
    270       for (si = 0; si < 16; si++) {
    271          if ((validL & (1 << si)) == 0)
    272             // run off the end of the string.
    273             break;
    274          UInt m = 0;
    275          for (ci = 0; ci < 16; ci++) {
    276             if ((validR & (1 << ci)) == 0) break;
    277             if (argR[ci] == argL[si]) { m = 1; break; }
    278          }
    279          boolRes |= (m << si);
    280       }
    281 
    282       // boolRes is "pre-invalidated"
    283       UInt intRes1 = boolRes & 0xFFFF;
    284 
    285       // generate I-format output
    286       pcmpXstrX_WRK_gen_output_fmt_I(
    287          resV, resOSZACP,
    288          intRes1, zmaskL, zmaskR, validL, pol, idx
    289       );
    290 
    291       return True;
    292    }
    293 
    294    /*----------------------------------------*/
    295    /*-- substring search on byte data      --*/
    296    /*----------------------------------------*/
    297 
    298    if (agg == 3/*equal ordered, aka substring search*/
    299        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    300        && !isSTRM) {
    301 
    302       /* argL: haystack,  argR: needle */
    303       UInt   ni, hi;
    304       UChar* argL    = (UChar*)argLV;
    305       UChar* argR    = (UChar*)argRV;
    306       UInt   boolRes = 0;
    307       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    308       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    309       for (hi = 0; hi < 16; hi++) {
    310          UInt m = 1;
    311          for (ni = 0; ni < 16; ni++) {
    312             if ((validR & (1 << ni)) == 0) break;
    313             UInt i = ni + hi;
    314             if (i >= 16) break;
    315             if (argL[i] != argR[ni]) { m = 0; break; }
    316          }
    317          boolRes |= (m << hi);
    318          if ((validL & (1 << hi)) == 0)
    319             // run off the end of the haystack
    320             break;
    321       }
    322 
    323       // boolRes is "pre-invalidated"
    324       UInt intRes1 = boolRes & 0xFFFF;
    325 
    326       // generate I-format output
    327       pcmpXstrX_WRK_gen_output_fmt_I(
    328          resV, resOSZACP,
    329          intRes1, zmaskL, zmaskR, validL, pol, idx
    330       );
    331 
    332       return True;
    333    }
    334 
    335    /*----------------------------------------*/
    336    /*-- ranges, unsigned byte data         --*/
    337    /*----------------------------------------*/
    338 
    339    if (agg == 1/*ranges*/
    340        && fmt == 0/*ub*/
    341        && !isSTRM) {
    342 
    343       /* argL: string,  argR: range-pairs */
    344       UInt   ri, si;
    345       UChar* argL    = (UChar*)argLV;
    346       UChar* argR    = (UChar*)argRV;
    347       UInt   boolRes = 0;
    348       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    349       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    350       for (si = 0; si < 16; si++) {
    351          if ((validL & (1 << si)) == 0)
    352             // run off the end of the string
    353             break;
    354          UInt m = 0;
    355          for (ri = 0; ri < 16; ri += 2) {
    356             if ((validR & (3 << ri)) != (3 << ri)) break;
    357             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    358                m = 1; break;
    359             }
    360          }
    361          boolRes |= (m << si);
    362       }
    363 
    364       // boolRes is "pre-invalidated"
    365       UInt intRes1 = boolRes & 0xFFFF;
    366 
    367       // generate I-format output
    368       pcmpXstrX_WRK_gen_output_fmt_I(
    369          resV, resOSZACP,
    370          intRes1, zmaskL, zmaskR, validL, pol, idx
    371       );
    372 
    373       return True;
    374    }
    375 
    376    /*----------------------------------------*/
    377    /*-- ranges, signed byte data           --*/
    378    /*----------------------------------------*/
    379 
    380    if (agg == 1/*ranges*/
    381        && fmt == 2/*sb*/
    382        && !isSTRM) {
    383 
    384       /* argL: string,  argR: range-pairs */
    385       UInt   ri, si;
    386       Char*  argL    = (Char*)argLV;
    387       Char*  argR    = (Char*)argRV;
    388       UInt   boolRes = 0;
    389       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    390       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    391       for (si = 0; si < 16; si++) {
    392          if ((validL & (1 << si)) == 0)
    393             // run off the end of the string
    394             break;
    395          UInt m = 0;
    396          for (ri = 0; ri < 16; ri += 2) {
    397             if ((validR & (3 << ri)) != (3 << ri)) break;
    398             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    399                m = 1; break;
    400             }
    401          }
    402          boolRes |= (m << si);
    403       }
    404 
    405       // boolRes is "pre-invalidated"
    406       UInt intRes1 = boolRes & 0xFFFF;
    407 
    408       // generate I-format output
    409       pcmpXstrX_WRK_gen_output_fmt_I(
    410          resV, resOSZACP,
    411          intRes1, zmaskL, zmaskR, validL, pol, idx
    412       );
    413 
    414       return True;
    415    }
    416 
    417    return False;
    418 }
    419 
    420 
    421 //////////////////////////////////////////////////////////
    422 //                                                      //
    423 //                       ISTRI_4A                       //
    424 //                                                      //
    425 //////////////////////////////////////////////////////////
    426 
    427 UInt h_pcmpistri_4A ( V128* argL, V128* argR )
    428 {
    429    V128 block[2];
    430    memcpy(&block[0], argL, sizeof(V128));
    431    memcpy(&block[1], argR, sizeof(V128));
    432    ULong res, flags;
    433    __asm__ __volatile__(
    434       "subq      $1024,  %%rsp"             "\n\t"
    435       "movdqu    0(%2),  %%xmm2"            "\n\t"
    436       "movdqu    16(%2), %%xmm11"           "\n\t"
    437       "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
    438       "pushfq"                              "\n\t"
    439       "popq      %%rdx"                     "\n\t"
    440       "movq      %%rcx,  %0"                "\n\t"
    441       "movq      %%rdx,  %1"                "\n\t"
    442       "addq      $1024,  %%rsp"             "\n\t"
    443       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    444       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    445    );
    446    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    447 }
    448 
    449 UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
    450 {
    451    V128 resV;
    452    UInt resOSZACP, resECX;
    453    Bool ok
    454       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    455                        zmask_from_V128(argLU),
    456                        zmask_from_V128(argRU),
    457                        0x4A, False/*!isSTRM*/
    458         );
    459    assert(ok);
    460    resECX = resV.uInt[0];
    461    return (resOSZACP << 16) | resECX;
    462 }
    463 
    464 void istri_4A ( void )
    465 {
    466    char* wot = "4A";
    467    UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
    468    UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
    469 
    470    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    471 
    472    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    473    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    474    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    475    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    476 
    477    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    478    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    479    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    480 
    481    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    482    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    483    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    484    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    485 
    486    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    487    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    488    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    489 
    490    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    491 
    492    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    493    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    494    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    495 
    496    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    497    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    498    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    499 
    500    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    501    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    502    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    503 
    504    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    505    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    506    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    507 
    508    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    509    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    510 }
    511 
    512 //////////////////////////////////////////////////////////
    513 //                                                      //
    514 //                       ISTRI_3A                       //
    515 //                                                      //
    516 //////////////////////////////////////////////////////////
    517 
    518 UInt h_pcmpistri_3A ( V128* argL, V128* argR )
    519 {
    520    V128 block[2];
    521    memcpy(&block[0], argL, sizeof(V128));
    522    memcpy(&block[1], argR, sizeof(V128));
    523    ULong res, flags;
    524    __asm__ __volatile__(
    525       "subq      $1024,  %%rsp"             "\n\t"
    526       "movdqu    0(%2),  %%xmm2"            "\n\t"
    527       "movdqu    16(%2), %%xmm11"           "\n\t"
    528       "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
    529       "pushfq"                              "\n\t"
    530       "popq      %%rdx"                     "\n\t"
    531       "movq      %%rcx,  %0"                "\n\t"
    532       "movq      %%rdx,  %1"                "\n\t"
    533       "addq      $1024,  %%rsp"             "\n\t"
    534       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    535       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    536    );
    537    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    538 }
    539 
    540 UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
    541 {
    542    V128 resV;
    543    UInt resOSZACP, resECX;
    544    Bool ok
    545       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    546                        zmask_from_V128(argLU),
    547                        zmask_from_V128(argRU),
    548                        0x3A, False/*!isSTRM*/
    549         );
    550    assert(ok);
    551    resECX = resV.uInt[0];
    552    return (resOSZACP << 16) | resECX;
    553 }
    554 
    555 void istri_3A ( void )
    556 {
    557    char* wot = "3A";
    558    UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
    559    UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
    560 
    561    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    562 
    563    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    564    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    565    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    566    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    567 
    568    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    569    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    570    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    571 
    572    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    573    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    574    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    575    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    576 
    577    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    578    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    579    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    580 
    581    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    582 
    583    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    584    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    585    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    586 
    587    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    588    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    589    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    590 
    591    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    592    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    593    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    594 
    595    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    596    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    597    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    598 
    599    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    600    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    601 }
    602 
    603 
    604 
    605 //////////////////////////////////////////////////////////
    606 //                                                      //
    607 //                       ISTRI_0C                       //
    608 //                                                      //
    609 //////////////////////////////////////////////////////////
    610 
    611 __attribute__((noinline))
    612 UInt h_pcmpistri_0C ( V128* argL, V128* argR )
    613 {
    614    V128 block[2];
    615    memcpy(&block[0], argL, sizeof(V128));
    616    memcpy(&block[1], argR, sizeof(V128));
    617    ULong res = 0, flags = 0;
    618    __asm__ __volatile__(
    619       "movdqu    0(%2),  %%xmm2"            "\n\t"
    620       "movdqu    16(%2), %%xmm11"           "\n\t"
    621       "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    622       //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    623       //"movd %%xmm0, %%ecx" "\n\t"
    624       "pushfq"                              "\n\t"
    625       "popq      %%rdx"                     "\n\t"
    626       "movq      %%rcx,  %0"                "\n\t"
    627       "movq      %%rdx,  %1"                "\n\t"
    628       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    629       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    630    );
    631    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    632 }
    633 
    634 UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
    635 {
    636    V128 resV;
    637    UInt resOSZACP, resECX;
    638    Bool ok
    639       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    640                        zmask_from_V128(argLU),
    641                        zmask_from_V128(argRU),
    642                        0x0C, False/*!isSTRM*/
    643         );
    644    assert(ok);
    645    resECX = resV.uInt[0];
    646    return (resOSZACP << 16) | resECX;
    647 }
    648 
    649 void istri_0C ( void )
    650 {
    651    char* wot = "0C";
    652    UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
    653    UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
    654 
    655    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
    656 
    657    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
    658 
    659    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
    660    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
    661    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
    662 
    663    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
    664 
    665    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
    666    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
    667    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
    668    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
    669    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
    670 
    671    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
    672    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
    673    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
    674 
    675    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
    676    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
    677 
    678    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
    679    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
    680    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
    681 
    682    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
    683    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
    684    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
    685 
    686    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
    687    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    688    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
    689    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
    690 }
    691 
    692 
    693 //////////////////////////////////////////////////////////
    694 //                                                      //
    695 //                       ISTRI_08                       //
    696 //                                                      //
    697 //////////////////////////////////////////////////////////
    698 
    699 UInt h_pcmpistri_08 ( V128* argL, V128* argR )
    700 {
    701    V128 block[2];
    702    memcpy(&block[0], argL, sizeof(V128));
    703    memcpy(&block[1], argR, sizeof(V128));
    704    ULong res, flags;
    705    __asm__ __volatile__(
    706       "subq      $1024,  %%rsp"             "\n\t"
    707       "movdqu    0(%2),  %%xmm2"            "\n\t"
    708       "movdqu    16(%2), %%xmm11"           "\n\t"
    709       "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
    710       "pushfq"                              "\n\t"
    711       "popq      %%rdx"                     "\n\t"
    712       "movq      %%rcx,  %0"                "\n\t"
    713       "movq      %%rdx,  %1"                "\n\t"
    714       "addq      $1024,  %%rsp"             "\n\t"
    715       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    716       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    717    );
    718    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    719 }
    720 
    721 UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
    722 {
    723    V128 resV;
    724    UInt resOSZACP, resECX;
    725    Bool ok
    726       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    727                        zmask_from_V128(argLU),
    728                        zmask_from_V128(argRU),
    729                        0x08, False/*!isSTRM*/
    730         );
    731    assert(ok);
    732    resECX = resV.uInt[0];
    733    return (resOSZACP << 16) | resECX;
    734 }
    735 
    736 void istri_08 ( void )
    737 {
    738    char* wot = "08";
    739    UInt(*h)(V128*,V128*) = h_pcmpistri_08;
    740    UInt(*s)(V128*,V128*) = s_pcmpistri_08;
    741 
    742    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    743 
    744    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    745    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    746    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    747    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    748 
    749    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    750    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    751    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    752 
    753    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    754    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    755    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    756    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    757 
    758    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    759    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    760    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    761 
    762    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    763 
    764    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    765    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    766    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    767 
    768    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    769    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    770    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    771 
    772    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    773    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    774    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    775 
    776    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    777    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    778    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    779 
    780    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    781    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    782 }
    783 
    784 
    785 
    786 //////////////////////////////////////////////////////////
    787 //                                                      //
    788 //                       ISTRI_1A                       //
    789 //                                                      //
    790 //////////////////////////////////////////////////////////
    791 
    792 UInt h_pcmpistri_1A ( V128* argL, V128* argR )
    793 {
    794    V128 block[2];
    795    memcpy(&block[0], argL, sizeof(V128));
    796    memcpy(&block[1], argR, sizeof(V128));
    797    ULong res, flags;
    798    __asm__ __volatile__(
    799       "subq      $1024,  %%rsp"             "\n\t"
    800       "movdqu    0(%2),  %%xmm2"            "\n\t"
    801       "movdqu    16(%2), %%xmm11"           "\n\t"
    802       "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
    803       "pushfq"                              "\n\t"
    804       "popq      %%rdx"                     "\n\t"
    805       "movq      %%rcx,  %0"                "\n\t"
    806       "movq      %%rdx,  %1"                "\n\t"
    807       "addq      $1024,  %%rsp"             "\n\t"
    808       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    809       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    810    );
    811    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    812 }
    813 
    814 UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
    815 {
    816    V128 resV;
    817    UInt resOSZACP, resECX;
    818    Bool ok
    819       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    820                        zmask_from_V128(argLU),
    821                        zmask_from_V128(argRU),
    822                        0x1A, False/*!isSTRM*/
    823         );
    824    assert(ok);
    825    resECX = resV.uInt[0];
    826    return (resOSZACP << 16) | resECX;
    827 }
    828 
    829 void istri_1A ( void )
    830 {
    831    char* wot = "1A";
    832    UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
    833    UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
    834 
    835    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    836 
    837    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    838    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    839    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    840    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    841 
    842    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    843    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    844    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    845 
    846    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    847    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    848    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    849    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    850 
    851    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    852    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    853    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    854 
    855    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    856 
    857    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    858    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    859    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    860 
    861    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    862    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    863    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    864 
    865    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    866    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    867    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    868 
    869    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    870    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    871    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    872 
    873    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    874    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    875 }
    876 
    877 
    878 
    879 //////////////////////////////////////////////////////////
    880 //                                                      //
    881 //                       ISTRI_02                       //
    882 //                                                      //
    883 //////////////////////////////////////////////////////////
    884 
    885 UInt h_pcmpistri_02 ( V128* argL, V128* argR )
    886 {
    887    V128 block[2];
    888    memcpy(&block[0], argL, sizeof(V128));
    889    memcpy(&block[1], argR, sizeof(V128));
    890    ULong res, flags;
    891    __asm__ __volatile__(
    892       "subq      $1024,  %%rsp"             "\n\t"
    893       "movdqu    0(%2),  %%xmm2"            "\n\t"
    894       "movdqu    16(%2), %%xmm11"           "\n\t"
    895       "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
    896 //"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
    897 //"movd %%xmm0, %%ecx" "\n\t"
    898       "pushfq"                              "\n\t"
    899       "popq      %%rdx"                     "\n\t"
    900       "movq      %%rcx,  %0"                "\n\t"
    901       "movq      %%rdx,  %1"                "\n\t"
    902       "addq      $1024,  %%rsp"             "\n\t"
    903       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    904       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    905    );
    906    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    907 }
    908 
    909 UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
    910 {
    911    V128 resV;
    912    UInt resOSZACP, resECX;
    913    Bool ok
    914       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    915                        zmask_from_V128(argLU),
    916                        zmask_from_V128(argRU),
    917                        0x02, False/*!isSTRM*/
    918         );
    919    assert(ok);
    920    resECX = resV.uInt[0];
    921    return (resOSZACP << 16) | resECX;
    922 }
    923 
    924 void istri_02 ( void )
    925 {
    926    char* wot = "02";
    927    UInt(*h)(V128*,V128*) = h_pcmpistri_02;
    928    UInt(*s)(V128*,V128*) = s_pcmpistri_02;
    929 
    930    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
    931    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
    932    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
    933    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    934 
    935    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    936    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
    937    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
    938    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
    939    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
    940 
    941    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    942    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
    943    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
    944    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
    945 
    946    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    947    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    948 
    949    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    950    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    951    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
    952    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
    953 
    954    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
    955 
    956    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    957    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    958 }
    959 
    960 
    961 //////////////////////////////////////////////////////////
    962 //                                                      //
    963 //                       ISTRI_12                       //
    964 //                                                      //
    965 //////////////////////////////////////////////////////////
    966 
    967 UInt h_pcmpistri_12 ( V128* argL, V128* argR )
    968 {
    969    V128 block[2];
    970    memcpy(&block[0], argL, sizeof(V128));
    971    memcpy(&block[1], argR, sizeof(V128));
    972    ULong res, flags;
    973    __asm__ __volatile__(
    974       "subq      $1024,  %%rsp"             "\n\t"
    975       "movdqu    0(%2),  %%xmm2"            "\n\t"
    976       "movdqu    16(%2), %%xmm11"           "\n\t"
    977       "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
    978 //"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
    979 //"movd %%xmm0, %%ecx" "\n\t"
    980       "pushfq"                              "\n\t"
    981       "popq      %%rdx"                     "\n\t"
    982       "movq      %%rcx,  %0"                "\n\t"
    983       "movq      %%rdx,  %1"                "\n\t"
    984       "addq      $1024,  %%rsp"             "\n\t"
    985       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    986       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    987    );
    988    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    989 }
    990 
    991 UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
    992 {
    993    V128 resV;
    994    UInt resOSZACP, resECX;
    995    Bool ok
    996       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    997                        zmask_from_V128(argLU),
    998                        zmask_from_V128(argRU),
    999                        0x12, False/*!isSTRM*/
   1000         );
   1001    assert(ok);
   1002    resECX = resV.uInt[0];
   1003    return (resOSZACP << 16) | resECX;
   1004 }
   1005 
   1006 void istri_12 ( void )
   1007 {
   1008    char* wot = "12";
   1009    UInt(*h)(V128*,V128*) = h_pcmpistri_12;
   1010    UInt(*s)(V128*,V128*) = s_pcmpistri_12;
   1011 
   1012    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1013    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1014    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1015    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1016 
   1017    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1018    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1019    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1020    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1021    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1022 
   1023    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1024    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1025    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1026    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1027 
   1028    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1029    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1030 
   1031    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1032    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1033    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1034    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1035 
   1036    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1037 
   1038    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1039    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1040 }
   1041 
   1042 
   1043 
   1044 //////////////////////////////////////////////////////////
   1045 //                                                      //
   1046 //                       ISTRI_44                       //
   1047 //                                                      //
   1048 //////////////////////////////////////////////////////////
   1049 
   1050 UInt h_pcmpistri_44 ( V128* argL, V128* argR )
   1051 {
   1052    V128 block[2];
   1053    memcpy(&block[0], argL, sizeof(V128));
   1054    memcpy(&block[1], argR, sizeof(V128));
   1055    ULong res, flags;
   1056    __asm__ __volatile__(
   1057       "subq      $1024,  %%rsp"             "\n\t"
   1058       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1059       "movdqu    16(%2), %%xmm11"           "\n\t"
   1060       "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
   1061 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   1062 //"movd %%xmm0, %%ecx" "\n\t"
   1063       "pushfq"                              "\n\t"
   1064       "popq      %%rdx"                     "\n\t"
   1065       "movq      %%rcx,  %0"                "\n\t"
   1066       "movq      %%rdx,  %1"                "\n\t"
   1067       "addq      $1024,  %%rsp"             "\n\t"
   1068       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1069       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1070    );
   1071    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1072 }
   1073 
   1074 UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
   1075 {
   1076    V128 resV;
   1077    UInt resOSZACP, resECX;
   1078    Bool ok
   1079       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1080                        zmask_from_V128(argLU),
   1081                        zmask_from_V128(argRU),
   1082                        0x44, False/*!isSTRM*/
   1083         );
   1084    assert(ok);
   1085    resECX = resV.uInt[0];
   1086    return (resOSZACP << 16) | resECX;
   1087 }
   1088 
   1089 void istri_44 ( void )
   1090 {
   1091    char* wot = "44";
   1092    UInt(*h)(V128*,V128*) = h_pcmpistri_44;
   1093    UInt(*s)(V128*,V128*) = s_pcmpistri_44;
   1094 
   1095    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1096    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1097    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1098    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1099 
   1100    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1101    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1102    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1103    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1104    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1105 
   1106    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1107 
   1108    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1109    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1110    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1111 
   1112    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1113    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1114    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1115 
   1116    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1117    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1118 
   1119    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1120    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1121 
   1122    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1123    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1124    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1125    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1126 }
   1127 
   1128 
   1129 //////////////////////////////////////////////////////////
   1130 //                                                      //
   1131 //                       ISTRI_00                       //
   1132 //                                                      //
   1133 //////////////////////////////////////////////////////////
   1134 
   1135 UInt h_pcmpistri_00 ( V128* argL, V128* argR )
   1136 {
   1137    V128 block[2];
   1138    memcpy(&block[0], argL, sizeof(V128));
   1139    memcpy(&block[1], argR, sizeof(V128));
   1140    ULong res, flags;
   1141    __asm__ __volatile__(
   1142       "subq      $1024,  %%rsp"             "\n\t"
   1143       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1144       "movdqu    16(%2), %%xmm11"           "\n\t"
   1145       "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
   1146 //"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
   1147 //"movd %%xmm0, %%ecx" "\n\t"
   1148       "pushfq"                              "\n\t"
   1149       "popq      %%rdx"                     "\n\t"
   1150       "movq      %%rcx,  %0"                "\n\t"
   1151       "movq      %%rdx,  %1"                "\n\t"
   1152       "addq      $1024,  %%rsp"             "\n\t"
   1153       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1154       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1155    );
   1156    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1157 }
   1158 
   1159 UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
   1160 {
   1161    V128 resV;
   1162    UInt resOSZACP, resECX;
   1163    Bool ok
   1164       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1165                        zmask_from_V128(argLU),
   1166                        zmask_from_V128(argRU),
   1167                        0x00, False/*!isSTRM*/
   1168         );
   1169    assert(ok);
   1170    resECX = resV.uInt[0];
   1171    return (resOSZACP << 16) | resECX;
   1172 }
   1173 
   1174 void istri_00 ( void )
   1175 {
   1176    char* wot = "00";
   1177    UInt(*h)(V128*,V128*) = h_pcmpistri_00;
   1178    UInt(*s)(V128*,V128*) = s_pcmpistri_00;
   1179 
   1180    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1181    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1182    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1183    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1184 
   1185    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1186    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1187    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1188    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1189    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1190 
   1191    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1192    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1193    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1194    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1195 
   1196    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1197    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1198 
   1199    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1200    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1201    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1202    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1203 
   1204    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1205 
   1206    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1207    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1208 }
   1209 
   1210 
   1211 //////////////////////////////////////////////////////////
   1212 //                                                      //
   1213 //                       ISTRI_38                       //
   1214 //                                                      //
   1215 //////////////////////////////////////////////////////////
   1216 
   1217 UInt h_pcmpistri_38 ( V128* argL, V128* argR )
   1218 {
   1219    V128 block[2];
   1220    memcpy(&block[0], argL, sizeof(V128));
   1221    memcpy(&block[1], argR, sizeof(V128));
   1222    ULong res, flags;
   1223    __asm__ __volatile__(
   1224       "subq      $1024,  %%rsp"             "\n\t"
   1225       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1226       "movdqu    16(%2), %%xmm11"           "\n\t"
   1227       "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
   1228       "pushfq"                              "\n\t"
   1229       "popq      %%rdx"                     "\n\t"
   1230       "movq      %%rcx,  %0"                "\n\t"
   1231       "movq      %%rdx,  %1"                "\n\t"
   1232       "addq      $1024,  %%rsp"             "\n\t"
   1233       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1234       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1235    );
   1236    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1237 }
   1238 
   1239 UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
   1240 {
   1241    V128 resV;
   1242    UInt resOSZACP, resECX;
   1243    Bool ok
   1244       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1245                        zmask_from_V128(argLU),
   1246                        zmask_from_V128(argRU),
   1247                        0x38, False/*!isSTRM*/
   1248         );
   1249    assert(ok);
   1250    resECX = resV.uInt[0];
   1251    return (resOSZACP << 16) | resECX;
   1252 }
   1253 
   1254 void istri_38 ( void )
   1255 {
   1256    char* wot = "38";
   1257    UInt(*h)(V128*,V128*) = h_pcmpistri_38;
   1258    UInt(*s)(V128*,V128*) = s_pcmpistri_38;
   1259 
   1260    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1261 
   1262    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1263    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1264    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1265    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1266 
   1267    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1268    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1269    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1270 
   1271    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1272    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1273    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1274    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1275 
   1276    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1277    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1278    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1279 
   1280    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1281 
   1282    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1283    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1284    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1285 
   1286    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1287    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1288    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1289 
   1290    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1291    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1292    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1293 
   1294    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1295    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1296    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1297 
   1298    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1299    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1300 }
   1301 
   1302 
   1303 
   1304 //////////////////////////////////////////////////////////
   1305 //                                                      //
   1306 //                       ISTRI_46                       //
   1307 //                                                      //
   1308 //////////////////////////////////////////////////////////
   1309 
   1310 UInt h_pcmpistri_46 ( V128* argL, V128* argR )
   1311 {
   1312    V128 block[2];
   1313    memcpy(&block[0], argL, sizeof(V128));
   1314    memcpy(&block[1], argR, sizeof(V128));
   1315    ULong res, flags;
   1316    __asm__ __volatile__(
   1317       "subq      $1024,  %%rsp"             "\n\t"
   1318       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1319       "movdqu    16(%2), %%xmm11"           "\n\t"
   1320       "pcmpistri $0x46,  %%xmm2, %%xmm11"   "\n\t"
   1321       "pushfq"                              "\n\t"
   1322       "popq      %%rdx"                     "\n\t"
   1323       "movq      %%rcx,  %0"                "\n\t"
   1324       "movq      %%rdx,  %1"                "\n\t"
   1325       "addq      $1024,  %%rsp"             "\n\t"
   1326       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1327       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1328    );
   1329    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1330 }
   1331 
   1332 UInt s_pcmpistri_46 ( V128* argLU, V128* argRU )
   1333 {
   1334    V128 resV;
   1335    UInt resOSZACP, resECX;
   1336    Bool ok
   1337       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1338                        zmask_from_V128(argLU),
   1339                        zmask_from_V128(argRU),
   1340                        0x46, False/*!isSTRM*/
   1341         );
   1342    assert(ok);
   1343    resECX = resV.uInt[0];
   1344    return (resOSZACP << 16) | resECX;
   1345 }
   1346 
   1347 void istri_46 ( void )
   1348 {
   1349    char* wot = "46";
   1350    UInt(*h)(V128*,V128*) = h_pcmpistri_46;
   1351    UInt(*s)(V128*,V128*) = s_pcmpistri_46;
   1352 
   1353    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1354    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1355    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1356    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1357 
   1358    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1359    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1360    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1361    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1362    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1363 
   1364    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1365 
   1366    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1367    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1368    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1369 
   1370    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1371    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1372    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1373 
   1374    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1375    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1376 
   1377    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1378    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1379 
   1380    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1381    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1382    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1383    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1384 }
   1385 
   1386 
   1387 //////////////////////////////////////////////////////////
   1388 //                                                      //
   1389 //                       ISTRI_30                       //
   1390 //                                                      //
   1391 //////////////////////////////////////////////////////////
   1392 
   1393 UInt h_pcmpistri_30 ( V128* argL, V128* argR )
   1394 {
   1395    V128 block[2];
   1396    memcpy(&block[0], argL, sizeof(V128));
   1397    memcpy(&block[1], argR, sizeof(V128));
   1398    ULong res, flags;
   1399    __asm__ __volatile__(
   1400       "subq      $1024,  %%rsp"             "\n\t"
   1401       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1402       "movdqu    16(%2), %%xmm11"           "\n\t"
   1403       "pcmpistri $0x30,  %%xmm2, %%xmm11"   "\n\t"
   1404       "pushfq"                              "\n\t"
   1405       "popq      %%rdx"                     "\n\t"
   1406       "movq      %%rcx,  %0"                "\n\t"
   1407       "movq      %%rdx,  %1"                "\n\t"
   1408       "addq      $1024,  %%rsp"             "\n\t"
   1409       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1410       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1411    );
   1412    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1413 }
   1414 
   1415 UInt s_pcmpistri_30 ( V128* argLU, V128* argRU )
   1416 {
   1417    V128 resV;
   1418    UInt resOSZACP, resECX;
   1419    Bool ok
   1420       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1421                        zmask_from_V128(argLU),
   1422                        zmask_from_V128(argRU),
   1423                        0x30, False/*!isSTRM*/
   1424         );
   1425    assert(ok);
   1426    resECX = resV.uInt[0];
   1427    return (resOSZACP << 16) | resECX;
   1428 }
   1429 
   1430 void istri_30 ( void )
   1431 {
   1432    char* wot = "30";
   1433    UInt(*h)(V128*,V128*) = h_pcmpistri_30;
   1434    UInt(*s)(V128*,V128*) = s_pcmpistri_30;
   1435 
   1436    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1437    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1438    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1439    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1440 
   1441    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1442    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1443    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1444    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1445    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1446 
   1447    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1448    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1449    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1450    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1451 
   1452    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1453    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1454 
   1455    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1456    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1457    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1458    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1459 
   1460    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1461 
   1462    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1463    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1464 }
   1465 
   1466 
   1467 //////////////////////////////////////////////////////////
   1468 //                                                      //
   1469 //                       ISTRI_40                       //
   1470 //                                                      //
   1471 //////////////////////////////////////////////////////////
   1472 
   1473 UInt h_pcmpistri_40 ( V128* argL, V128* argR )
   1474 {
   1475    V128 block[2];
   1476    memcpy(&block[0], argL, sizeof(V128));
   1477    memcpy(&block[1], argR, sizeof(V128));
   1478    ULong res, flags;
   1479    __asm__ __volatile__(
   1480       "subq      $1024,  %%rsp"             "\n\t"
   1481       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1482       "movdqu    16(%2), %%xmm11"           "\n\t"
   1483       "pcmpistri $0x40,  %%xmm2, %%xmm11"   "\n\t"
   1484       "pushfq"                              "\n\t"
   1485       "popq      %%rdx"                     "\n\t"
   1486       "movq      %%rcx,  %0"                "\n\t"
   1487       "movq      %%rdx,  %1"                "\n\t"
   1488       "addq      $1024,  %%rsp"             "\n\t"
   1489       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1490       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1491    );
   1492    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1493 }
   1494 
   1495 UInt s_pcmpistri_40 ( V128* argLU, V128* argRU )
   1496 {
   1497    V128 resV;
   1498    UInt resOSZACP, resECX;
   1499    Bool ok
   1500       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1501                        zmask_from_V128(argLU),
   1502                        zmask_from_V128(argRU),
   1503                        0x40, False/*!isSTRM*/
   1504         );
   1505    assert(ok);
   1506    resECX = resV.uInt[0];
   1507    return (resOSZACP << 16) | resECX;
   1508 }
   1509 
   1510 void istri_40 ( void )
   1511 {
   1512    char* wot = "40";
   1513    UInt(*h)(V128*,V128*) = h_pcmpistri_40;
   1514    UInt(*s)(V128*,V128*) = s_pcmpistri_40;
   1515 
   1516    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1517    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1518    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1519    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1520 
   1521    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1522    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1523    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1524    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1525    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1526 
   1527    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1528    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1529    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1530    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1531 
   1532    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1533    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1534 
   1535    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1536    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1537    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1538    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1539 
   1540    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1541 
   1542    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1543    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1544 }
   1545 
   1546 
   1547 //////////////////////////////////////////////////////////
   1548 //                                                      //
   1549 //                       ISTRI_0E                       //
   1550 //                                                      //
   1551 //////////////////////////////////////////////////////////
   1552 
   1553 __attribute__((noinline))
   1554 UInt h_pcmpistri_0E ( V128* argL, V128* argR )
   1555 {
   1556    V128 block[2];
   1557    memcpy(&block[0], argL, sizeof(V128));
   1558    memcpy(&block[1], argR, sizeof(V128));
   1559    ULong res = 0, flags = 0;
   1560    __asm__ __volatile__(
   1561       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1562       "movdqu    16(%2), %%xmm11"           "\n\t"
   1563       "pcmpistri $0x0E,  %%xmm2, %%xmm11"   "\n\t"
   1564       "pushfq"                              "\n\t"
   1565       "popq      %%rdx"                     "\n\t"
   1566       "movq      %%rcx,  %0"                "\n\t"
   1567       "movq      %%rdx,  %1"                "\n\t"
   1568       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1569       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1570    );
   1571    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1572 }
   1573 
   1574 UInt s_pcmpistri_0E ( V128* argLU, V128* argRU )
   1575 {
   1576    V128 resV;
   1577    UInt resOSZACP, resECX;
   1578    Bool ok
   1579       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1580                        zmask_from_V128(argLU),
   1581                        zmask_from_V128(argRU),
   1582                        0x0E, False/*!isSTRM*/
   1583         );
   1584    assert(ok);
   1585    resECX = resV.uInt[0];
   1586    return (resOSZACP << 16) | resECX;
   1587 }
   1588 
   1589 void istri_0E ( void )
   1590 {
   1591    char* wot = "0E";
   1592    UInt(*h)(V128*,V128*) = h_pcmpistri_0E;
   1593    UInt(*s)(V128*,V128*) = s_pcmpistri_0E;
   1594 
   1595    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
   1596 
   1597    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
   1598 
   1599    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
   1600    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
   1601    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
   1602 
   1603    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
   1604 
   1605    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
   1606    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
   1607    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
   1608    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
   1609    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
   1610 
   1611    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
   1612    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
   1613    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
   1614 
   1615    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
   1616    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
   1617 
   1618    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
   1619    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
   1620    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
   1621 
   1622    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
   1623    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
   1624    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
   1625 
   1626    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
   1627    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1628    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
   1629    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
   1630 }
   1631 
   1632 
   1633 //////////////////////////////////////////////////////////
   1634 //                                                      //
   1635 //                       ISTRI_34                       //
   1636 //                                                      //
   1637 //////////////////////////////////////////////////////////
   1638 
   1639 UInt h_pcmpistri_34 ( V128* argL, V128* argR )
   1640 {
   1641    V128 block[2];
   1642    memcpy(&block[0], argL, sizeof(V128));
   1643    memcpy(&block[1], argR, sizeof(V128));
   1644    ULong res, flags;
   1645    __asm__ __volatile__(
   1646       "subq      $1024,  %%rsp"             "\n\t"
   1647       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1648       "movdqu    16(%2), %%xmm11"           "\n\t"
   1649       "pcmpistri $0x34,  %%xmm2, %%xmm11"   "\n\t"
   1650       "pushfq"                              "\n\t"
   1651       "popq      %%rdx"                     "\n\t"
   1652       "movq      %%rcx,  %0"                "\n\t"
   1653       "movq      %%rdx,  %1"                "\n\t"
   1654       "addq      $1024,  %%rsp"             "\n\t"
   1655       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1656       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1657    );
   1658    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1659 }
   1660 
   1661 UInt s_pcmpistri_34 ( V128* argLU, V128* argRU )
   1662 {
   1663    V128 resV;
   1664    UInt resOSZACP, resECX;
   1665    Bool ok
   1666       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1667                        zmask_from_V128(argLU),
   1668                        zmask_from_V128(argRU),
   1669                        0x34, False/*!isSTRM*/
   1670         );
   1671    assert(ok);
   1672    resECX = resV.uInt[0];
   1673    return (resOSZACP << 16) | resECX;
   1674 }
   1675 
   1676 void istri_34 ( void )
   1677 {
   1678    char* wot = "34";
   1679    UInt(*h)(V128*,V128*) = h_pcmpistri_34;
   1680    UInt(*s)(V128*,V128*) = s_pcmpistri_34;
   1681 
   1682    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1683    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1684    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1685    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1686 
   1687    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1688    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1689    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1690    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1691    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1692 
   1693    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1694 
   1695    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1696    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1697    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1698 
   1699    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1700    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1701    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1702 
   1703    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1704    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1705 
   1706    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1707    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1708 
   1709    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1710    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1711    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1712    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1713 }
   1714 
   1715 
   1716 //////////////////////////////////////////////////////////
   1717 //                                                      //
   1718 //                       ISTRI_14                       //
   1719 //                                                      //
   1720 //////////////////////////////////////////////////////////
   1721 
   1722 UInt h_pcmpistri_14 ( V128* argL, V128* argR )
   1723 {
   1724    V128 block[2];
   1725    memcpy(&block[0], argL, sizeof(V128));
   1726    memcpy(&block[1], argR, sizeof(V128));
   1727    ULong res, flags;
   1728    __asm__ __volatile__(
   1729       "subq      $1024,  %%rsp"             "\n\t"
   1730       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1731       "movdqu    16(%2), %%xmm11"           "\n\t"
   1732       "pcmpistri $0x14,  %%xmm2, %%xmm11"   "\n\t"
   1733       "pushfq"                              "\n\t"
   1734       "popq      %%rdx"                     "\n\t"
   1735       "movq      %%rcx,  %0"                "\n\t"
   1736       "movq      %%rdx,  %1"                "\n\t"
   1737       "addq      $1024,  %%rsp"             "\n\t"
   1738       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1739       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1740    );
   1741    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1742 }
   1743 
   1744 UInt s_pcmpistri_14 ( V128* argLU, V128* argRU )
   1745 {
   1746    V128 resV;
   1747    UInt resOSZACP, resECX;
   1748    Bool ok
   1749       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1750                        zmask_from_V128(argLU),
   1751                        zmask_from_V128(argRU),
   1752                        0x14, False/*!isSTRM*/
   1753         );
   1754    assert(ok);
   1755    resECX = resV.uInt[0];
   1756    return (resOSZACP << 16) | resECX;
   1757 }
   1758 
   1759 void istri_14 ( void )
   1760 {
   1761    char* wot = "14";
   1762    UInt(*h)(V128*,V128*) = h_pcmpistri_14;
   1763    UInt(*s)(V128*,V128*) = s_pcmpistri_14;
   1764 
   1765    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1766    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1767    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1768    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1769 
   1770    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1771    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1772    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1773    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1774    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1775 
   1776    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1777 
   1778    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1779    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1780    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1781 
   1782    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1783    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1784    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1785 
   1786    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1787    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1788 
   1789    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1790    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1791 
   1792    try_istri(wot,h,s, "163887ec041a9b72", "fcd75adb9b3e895a");
   1793    try_istri(wot,h,s, "fc937cbfbf53f8e2", "0d136bcb024d3fb7");
   1794    try_istri(wot,h,s, "2ca34182c29a82ab", "302ebd646775ab54");
   1795    try_istri(wot,h,s, "3f2987608c11be6f", "a9ecb661f8e0a8cb");
   1796 }
   1797 
   1798 
   1799 //////////////////////////////////////////////////////////
   1800 //                                                      //
   1801 //                         main                         //
   1802 //                                                      //
   1803 //////////////////////////////////////////////////////////
   1804 
/* Program entry point: runs every istri_XX test group once, in the
   fixed order listed below, and then returns 0 unconditionally.

   The call order is not the order in which the groups are defined in
   the file (istri_34 is defined before istri_14 but runs after it).
   NOTE(review): whatever the groups print appears in call order, so
   reordering these calls would presumably change the program's
   output -- confirm against any expected-output file before
   touching the order. */
int main ( void )
{
   istri_4A();
   istri_3A();
   istri_08();
   istri_1A();
   istri_02();
   istri_0C();
   istri_12();
   istri_44();
   istri_00();
   istri_38();
   istri_46();
   istri_30();
   istri_40();
   istri_0E();
   istri_14();
   istri_34();
   return 0;
}
   1825