Home | History | Annotate | Download | only in amd64
      1 
      2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
      3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
      4    aspect. */
      5 
      6 #include <string.h>
      7 #include <stdio.h>
      8 #include <assert.h>
      9 
     10 typedef  unsigned int   UInt;
     11 typedef  signed int     Int;
     12 typedef  unsigned char  UChar;
     13 typedef  unsigned short UShort;
     14 typedef  unsigned long long int ULong;
     15 typedef  UChar          Bool;
     16 #define False ((Bool)0)
     17 #define True  ((Bool)1)
     18 
     19 //typedef  unsigned char  V128[16];
     20 typedef
     21    union {
     22       UChar  uChar[16];
     23       UShort uShort[8];
     24       UInt   uInt[4];
     25       UInt   w32[4];
     26    }
     27    V128;
     28 
     29 #define SHIFT_O   11
     30 #define SHIFT_S   7
     31 #define SHIFT_Z   6
     32 #define SHIFT_A   4
     33 #define SHIFT_C   0
     34 #define SHIFT_P   2
     35 
     36 #define MASK_O    (1ULL << SHIFT_O)
     37 #define MASK_S    (1ULL << SHIFT_S)
     38 #define MASK_Z    (1ULL << SHIFT_Z)
     39 #define MASK_A    (1ULL << SHIFT_A)
     40 #define MASK_C    (1ULL << SHIFT_C)
     41 #define MASK_P    (1ULL << SHIFT_P)
     42 
     43 
     44 UInt clz32 ( UInt x )
     45 {
     46    Int y, m, n;
     47    y = -(x >> 16);
     48    m = (y >> 16) & 16;
     49    n = 16 - m;
     50    x = x >> m;
     51    y = x - 0x100;
     52    m = (y >> 16) & 8;
     53    n = n + m;
     54    x = x << m;
     55    y = x - 0x1000;
     56    m = (y >> 16) & 4;
     57    n = n + m;
     58    x = x << m;
     59    y = x - 0x4000;
     60    m = (y >> 16) & 2;
     61    n = n + m;
     62    x = x << m;
     63    y = x >> 14;
     64    m = y & ~(y >> 1);
     65    return n + 2 - m;
     66 }
     67 
     68 UInt ctz32 ( UInt x )
     69 {
     70    return 32 - clz32((~x) & (x-1));
     71 }
     72 
     73 void expand ( V128* dst, char* summary )
     74 {
     75    Int i;
     76    assert( strlen(summary) == 16 );
     77    for (i = 0; i < 16; i++) {
     78       UChar xx = 0;
     79       UChar x = summary[15-i];
     80       if      (x >= '0' && x <= '9') { xx = x - '0'; }
     81       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
     82       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
     83       else assert(0);
     84 
     85       assert(xx < 16);
     86       xx = (xx << 4) | xx;
     87       assert(xx < 256);
     88       dst->uChar[i] = xx;
     89    }
     90 }
     91 
     92 void try_istri ( char* which,
     93                  UInt(*h_fn)(V128*,V128*),
     94                  UInt(*s_fn)(V128*,V128*),
     95                  char* summL, char* summR )
     96 {
     97    assert(strlen(which) == 2);
     98    V128 argL, argR;
     99    expand(&argL, summL);
    100    expand(&argR, summR);
    101    UInt h_res = h_fn(&argL, &argR);
    102    UInt s_res = s_fn(&argL, &argR);
    103    printf("istri %s  %s %s -> %08x %08x %s\n",
    104           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
    105 }
    106 
    107 UInt zmask_from_V128 ( V128* arg )
    108 {
    109    UInt i, res = 0;
    110    for (i = 0; i < 8; i++) {
    111       res |=  ((arg->uShort[i] == 0) ? 1 : 0) << i;
    112    }
    113    return res;
    114 }
    115 
    116 //////////////////////////////////////////////////////////
    117 //                                                      //
    118 //                       GENERAL                        //
    119 //                                                      //
    120 //////////////////////////////////////////////////////////
    121 
    122 
    123 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
    124    basically), generate an I- or M-format output value, also the new
    125    OSZACP flags.  */
    126 static
    127 void PCMPxSTRx_WRK_gen_output_fmt_I_wide ( /*OUT*/V128* resV,
    128 					   /*OUT*/UInt* resOSZACP,
    129 					   UInt intRes1,
    130 					   UInt zmaskL, UInt zmaskR,
    131 					   UInt validL,
    132 					   UInt pol, UInt idx )
    133 {
    134    assert((pol >> 2) == 0);
    135    assert((idx >> 1) == 0);
    136 
    137    UInt intRes2 = 0;
    138    switch (pol) {
    139       case 0: intRes2 = intRes1;          break; // pol +
    140       case 1: intRes2 = ~intRes1;         break; // pol -
    141       case 2: intRes2 = intRes1;          break; // pol m+
    142       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    143    }
    144    intRes2 &= 0xFF;
    145 
    146    // generate I-format output (an index in ECX)
    147    // generate ecx value
    148    UInt newECX = 0;
    149    if (idx) {
    150      // index of ms-1-bit
    151      newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
    152    } else {
    153      // index of ls-1-bit
    154      newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
    155    }
    156 
    157    resV->w32[0] = newECX;
    158    resV->w32[1] = 0;
    159    resV->w32[2] = 0;
    160    resV->w32[3] = 0;
    161 
    162    // generate new flags, common to all ISTRI and ISTRM cases
    163    *resOSZACP    // A, P are zero
    164      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    165      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    166      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    167      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    168 }
    169 
    170 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    171    variants on 16-bit characters.
    172 
    173    For xSTRI variants, the new ECX value is placed in the 32 bits
    174    pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
    175    variants, the result is a 128 bit value and is placed at *resV in
    176    the obvious way.
    177 
    178    For all variants, the new OSZACP value is placed at *resOSZACP.
    179 
    180    argLV and argRV are the vector args.  The caller must prepare a
    181    8-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
    182    must be 1 for each zero byte of of the respective arg.  For ESTRx
    183    variants this is derived from the explicit length indication, and
    184    must be 0 in all places except at the bit index corresponding to
    185    the valid length (0 .. 8).  If the valid length is 8 then the
    186    mask must be all zeroes.  In all cases, bits 31:8 must be zero.
    187 
    188    imm8 is the original immediate from the instruction.  isSTRM
    189    indicates whether this is a xSTRM or xSTRI variant, which controls
    190    how much of *res is written.
    191 
    192    If the given imm8 case can be handled, the return value is True.
    193    If not, False is returned, and neither *res not *resOSZACP are
    194    altered.
    195 */
    196 
    197 Bool pcmpXstrX_WRK_wide ( /*OUT*/V128* resV,
    198 			  /*OUT*/UInt* resOSZACP,
    199 			  V128* argLV,  V128* argRV,
    200 			  UInt zmaskL, UInt zmaskR,
    201 			  UInt imm8,   Bool isxSTRM )
    202 {
    203    assert(imm8 < 0x80);
    204    assert((zmaskL >> 8) == 0);
    205    assert((zmaskR >> 8) == 0);
    206 
    207    /* Explicitly reject any imm8 values that haven't been validated,
    208       even if they would probably work.  Life is too short to have
    209       unvalidated cases in the code base. */
    210    switch (imm8) {
    211       case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
    212                  case 0x13: case 0x19: case 0x1B:
    213                             case 0x39: case 0x3B:
    214                  case 0x45:            case 0x4B:
    215          break;
    216       default:
    217          return False;
    218    }
    219 
    220    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    221    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    222    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    223    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    224 
    225    /*----------------------------------------*/
    226    /*-- strcmp on wide data                --*/
    227    /*----------------------------------------*/
    228 
    229    if (agg == 2/*equal each, aka strcmp*/
    230        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
    231       Int    i;
    232       UShort* argL = (UShort*)argLV;
    233       UShort* argR = (UShort*)argRV;
    234       UInt boolResII = 0;
    235       for (i = 7; i >= 0; i--) {
    236          UShort cL  = argL[i];
    237          UShort cR  = argR[i];
    238          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    239       }
    240       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    241       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    242 
    243       // do invalidation, common to all equal-each cases
    244       UInt intRes1
    245          = (boolResII & validL & validR)  // if both valid, use cmpres
    246            | (~ (validL | validR));       // if both invalid, force 1
    247                                           // else force 0
    248       intRes1 &= 0xFF;
    249 
    250       // generate I-format output
    251       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    252          resV, resOSZACP,
    253          intRes1, zmaskL, zmaskR, validL, pol, idx
    254       );
    255 
    256       return True;
    257    }
    258 
    259    /*----------------------------------------*/
    260    /*-- set membership on wide data        --*/
    261    /*----------------------------------------*/
    262 
    263    if (agg == 0/*equal any, aka find chars in a set*/
    264        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
    265       /* argL: the string,  argR: charset */
    266       UInt   si, ci;
    267       UShort* argL    = (UShort*)argLV;
    268       UShort* argR    = (UShort*)argRV;
    269       UInt   boolRes = 0;
    270       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    271       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    272 
    273       for (si = 0; si < 8; si++) {
    274          if ((validL & (1 << si)) == 0)
    275             // run off the end of the string.
    276             break;
    277          UInt m = 0;
    278          for (ci = 0; ci < 8; ci++) {
    279             if ((validR & (1 << ci)) == 0) break;
    280             if (argR[ci] == argL[si]) { m = 1; break; }
    281          }
    282          boolRes |= (m << si);
    283       }
    284 
    285       // boolRes is "pre-invalidated"
    286       UInt intRes1 = boolRes & 0xFF;
    287 
    288       // generate I-format output
    289       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    290          resV, resOSZACP,
    291          intRes1, zmaskL, zmaskR, validL, pol, idx
    292       );
    293 
    294       return True;
    295    }
    296 
    297    /*----------------------------------------*/
    298    /*-- substring search on wide data      --*/
    299    /*----------------------------------------*/
    300 
    301    if (agg == 3/*equal ordered, aka substring search*/
    302        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
    303 
    304       /* argL: haystack,  argR: needle */
    305       UInt   ni, hi;
    306       UShort* argL    = (UShort*)argLV;
    307       UShort* argR    = (UShort*)argRV;
    308       UInt   boolRes = 0;
    309       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    310       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    311       for (hi = 0; hi < 8; hi++) {
    312          UInt m = 1;
    313          for (ni = 0; ni < 8; ni++) {
    314             if ((validR & (1 << ni)) == 0) break;
    315             UInt i = ni + hi;
    316             if (i >= 8) break;
    317             if (argL[i] != argR[ni]) { m = 0; break; }
    318          }
    319          boolRes |= (m << hi);
    320          if ((validL & (1 << hi)) == 0)
    321             // run off the end of the haystack
    322             break;
    323       }
    324 
    325       // boolRes is "pre-invalidated"
    326       UInt intRes1 = boolRes & 0xFF;
    327 
    328       // generate I-format output
    329       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    330          resV, resOSZACP,
    331          intRes1, zmaskL, zmaskR, validL, pol, idx
    332       );
    333 
    334       return True;
    335    }
    336 
    337    /*----------------------------------------*/
    338    /*-- ranges, unsigned wide data         --*/
    339    /*----------------------------------------*/
    340 
    341    if (agg == 1/*ranges*/
    342        && fmt == 1/*uw*/) {
    343 
    344       /* argL: string,  argR: range-pairs */
    345       UInt   ri, si;
    346       UShort* argL    = (UShort*)argLV;
    347       UShort* argR    = (UShort*)argRV;
    348       UInt   boolRes = 0;
    349       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    350       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    351       for (si = 0; si < 8; si++) {
    352          if ((validL & (1 << si)) == 0)
    353             // run off the end of the string
    354             break;
    355          UInt m = 0;
    356          for (ri = 0; ri < 8; ri += 2) {
    357             if ((validR & (3 << ri)) != (3 << ri)) break;
    358             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    359                m = 1; break;
    360             }
    361          }
    362          boolRes |= (m << si);
    363       }
    364 
    365       // boolRes is "pre-invalidated"
    366       UInt intRes1 = boolRes & 0xFF;
    367 
    368       // generate I-format output
    369       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    370          resV, resOSZACP,
    371          intRes1, zmaskL, zmaskR, validL, pol, idx
    372       );
    373 
    374       return True;
    375    }
    376 
    377    return False;
    378 }
    379 
    380 //////////////////////////////////////////////////////////
    381 //                                                      //
    382 //                       ISTRI_4B                       //
    383 //                                                      //
    384 //////////////////////////////////////////////////////////
    385 
    386 UInt h_pcmpistri_4B ( V128* argL, V128* argR )
    387 {
    388    V128 block[2];
    389    memcpy(&block[0], argL, sizeof(V128));
    390    memcpy(&block[1], argR, sizeof(V128));
    391    ULong res, flags;
    392    __asm__ __volatile__(
    393       "subq      $1024,  %%rsp"             "\n\t"
    394       "movdqu    0(%2),  %%xmm2"            "\n\t"
    395       "movdqu    16(%2), %%xmm11"           "\n\t"
    396       "pcmpistri $0x4B,  %%xmm2, %%xmm11"   "\n\t"
    397       "pushfq"                              "\n\t"
    398       "popq      %%rdx"                     "\n\t"
    399       "movq      %%rcx,  %0"                "\n\t"
    400       "movq      %%rdx,  %1"                "\n\t"
    401       "addq      $1024,  %%rsp"             "\n\t"
    402       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    403       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    404    );
    405    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    406 }
    407 
    408 UInt s_pcmpistri_4B ( V128* argLU, V128* argRU )
    409 {
    410    V128 resV;
    411    UInt resOSZACP, resECX;
    412    Bool ok
    413       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    414 			    zmask_from_V128(argLU),
    415 			    zmask_from_V128(argRU),
    416 			    0x4B, False/*!isSTRM*/
    417         );
    418    assert(ok);
    419    resECX = resV.uInt[0];
    420    return (resOSZACP << 16) | resECX;
    421 }
    422 
    423 void istri_4B ( void )
    424 {
    425    char* wot = "4B";
    426    UInt(*h)(V128*,V128*) = h_pcmpistri_4B;
    427    UInt(*s)(V128*,V128*) = s_pcmpistri_4B;
    428 
    429    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    430 
    431    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    432    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    433    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    434    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    435 
    436    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    437    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    438    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    439 
    440    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    441    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    442    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    443    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    444 
    445    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    446    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    447    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    448 
    449    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    450 
    451    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    452    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    453    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    454 
    455    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    456    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    457    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    458 
    459    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    460    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    461    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    462 
    463    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    464    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    465    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    466 
    467    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    468    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    469 }
    470 
    471 //////////////////////////////////////////////////////////
    472 //                                                      //
    473 //                       ISTRI_3B                       //
    474 //                                                      //
    475 //////////////////////////////////////////////////////////
    476 
    477 UInt h_pcmpistri_3B ( V128* argL, V128* argR )
    478 {
    479    V128 block[2];
    480    memcpy(&block[0], argL, sizeof(V128));
    481    memcpy(&block[1], argR, sizeof(V128));
    482    ULong res, flags;
    483    __asm__ __volatile__(
    484       "subq      $1024,  %%rsp"             "\n\t"
    485       "movdqu    0(%2),  %%xmm2"            "\n\t"
    486       "movdqu    16(%2), %%xmm11"           "\n\t"
    487       "pcmpistri $0x3B,  %%xmm2, %%xmm11"   "\n\t"
    488       "pushfq"                              "\n\t"
    489       "popq      %%rdx"                     "\n\t"
    490       "movq      %%rcx,  %0"                "\n\t"
    491       "movq      %%rdx,  %1"                "\n\t"
    492       "addq      $1024,  %%rsp"             "\n\t"
    493       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    494       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    495    );
    496    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    497 }
    498 
    499 UInt s_pcmpistri_3B ( V128* argLU, V128* argRU )
    500 {
    501    V128 resV;
    502    UInt resOSZACP, resECX;
    503    Bool ok
    504       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    505 			    zmask_from_V128(argLU),
    506 			    zmask_from_V128(argRU),
    507 			    0x3B, False/*!isSTRM*/
    508         );
    509    assert(ok);
    510    resECX = resV.uInt[0];
    511    return (resOSZACP << 16) | resECX;
    512 }
    513 
    514 void istri_3B ( void )
    515 {
    516    char* wot = "3B";
    517    UInt(*h)(V128*,V128*) = h_pcmpistri_3B;
    518    UInt(*s)(V128*,V128*) = s_pcmpistri_3B;
    519 
    520    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    521 
    522    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    523    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    524    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    525    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    526 
    527    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    528    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    529    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    530 
    531    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    532    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    533    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    534    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    535 
    536    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    537    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    538    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    539 
    540    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    541 
    542    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    543    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    544    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    545 
    546    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    547    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    548    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    549 
    550    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    551    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    552    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    553 
    554    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    555    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    556    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    557 
    558    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    559    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    560 }
    561 
    562 
    563 
    564 //////////////////////////////////////////////////////////
    565 //                                                      //
    566 //                       ISTRI_0D                       //
    567 //                                                      //
    568 //////////////////////////////////////////////////////////
    569 
    570 __attribute__((noinline))
    571 UInt h_pcmpistri_0D ( V128* argL, V128* argR )
    572 {
    573    V128 block[2];
    574    memcpy(&block[0], argL, sizeof(V128));
    575    memcpy(&block[1], argR, sizeof(V128));
    576    ULong res = 0, flags = 0;
    577    __asm__ __volatile__(
    578       "movdqu    0(%2),  %%xmm2"            "\n\t"
    579       "movdqu    16(%2), %%xmm11"           "\n\t"
    580       "pcmpistri $0x0D,  %%xmm2, %%xmm11"   "\n\t"
    581       //"pcmpistrm $0x0D,  %%xmm2, %%xmm11"   "\n\t"
    582       //"movd %%xmm0, %%ecx" "\n\t"
    583       "pushfq"                              "\n\t"
    584       "popq      %%rdx"                     "\n\t"
    585       "movq      %%rcx,  %0"                "\n\t"
    586       "movq      %%rdx,  %1"                "\n\t"
    587       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    588       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    589    );
    590    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    591 }
    592 
    593 UInt s_pcmpistri_0D ( V128* argLU, V128* argRU )
    594 {
    595    V128 resV;
    596    UInt resOSZACP, resECX;
    597    Bool ok
    598       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    599 			    zmask_from_V128(argLU),
    600 			    zmask_from_V128(argRU),
    601 			    0x0D, False/*!isSTRM*/
    602         );
    603    assert(ok);
    604    resECX = resV.uInt[0];
    605    return (resOSZACP << 16) | resECX;
    606 }
    607 
    608 void istri_0D ( void )
    609 {
    610    char* wot = "0D";
    611    UInt(*h)(V128*,V128*) = h_pcmpistri_0D;
    612    UInt(*s)(V128*,V128*) = s_pcmpistri_0D;
    613 
    614    try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
    615 
    616    try_istri(wot,h,s, "11111111abcdef11", "00abcdef00abcdef");
    617 
    618    try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
    619    try_istri(wot,h,s, "1111111111abcdef", "0000000000abcdef");
    620    try_istri(wot,h,s, "111111111111abcd", "0000000000abcdef");
    621 
    622    try_istri(wot,h,s, "1111abcd11abcd11", "000000000000abcd");
    623 
    624    try_istri(wot,h,s, "11abcd1111abcd11", "000000000000abcd");
    625    try_istri(wot,h,s, "abcd111111abcd11", "000000000000abcd");
    626    try_istri(wot,h,s, "cd11111111abcd11", "000000000000abcd");
    627 
    628    try_istri(wot,h,s, "01abcd11abcd1111", "000000000000abcd");
    629    try_istri(wot,h,s, "00abcd11abcd1111", "000000000000abcd");
    630    try_istri(wot,h,s, "0000cd11abcd1111", "000000000000abcd");
    631 
    632    try_istri(wot,h,s, "00abcd1100abcd11", "000000000000abcd");
    633    try_istri(wot,h,s, "00abcd110000cd11", "000000000000abcd");
    634 
    635    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
    636    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
    637    try_istri(wot,h,s, "1111111111111234", "0000000000001111");
    638 
    639    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
    640    try_istri(wot,h,s, "0a11111111111111", "000000000000000a");
    641    try_istri(wot,h,s, "0b11111111111111", "000000000000000a");
    642 
    643    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
    644    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    645    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
    646    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
    647 }
    648 
    649 
    650 //////////////////////////////////////////////////////////
    651 //                                                      //
    652 //                       ISTRI_09                       //
    653 //                                                      //
    654 //////////////////////////////////////////////////////////
    655 
    656 UInt h_pcmpistri_09 ( V128* argL, V128* argR )
    657 {
    658    V128 block[2];
    659    memcpy(&block[0], argL, sizeof(V128));
    660    memcpy(&block[1], argR, sizeof(V128));
    661    ULong res, flags;
    662    __asm__ __volatile__(
    663       "subq      $1024,  %%rsp"             "\n\t"
    664       "movdqu    0(%2),  %%xmm2"            "\n\t"
    665       "movdqu    16(%2), %%xmm11"           "\n\t"
    666       "pcmpistri $0x09,  %%xmm2, %%xmm11"   "\n\t"
    667       "pushfq"                              "\n\t"
    668       "popq      %%rdx"                     "\n\t"
    669       "movq      %%rcx,  %0"                "\n\t"
    670       "movq      %%rdx,  %1"                "\n\t"
    671       "addq      $1024,  %%rsp"             "\n\t"
    672       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    673       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    674    );
    675    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    676 }
    677 
    678 UInt s_pcmpistri_09 ( V128* argLU, V128* argRU )
    679 {
    680    V128 resV;
    681    UInt resOSZACP, resECX;
    682    Bool ok
    683       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    684 			    zmask_from_V128(argLU),
    685 			    zmask_from_V128(argRU),
    686 			    0x09, False/*!isSTRM*/
    687         );
    688    assert(ok);
    689    resECX = resV.uInt[0];
    690    return (resOSZACP << 16) | resECX;
    691 }
    692 
    693 void istri_09 ( void )
    694 {
    695    char* wot = "09";
    696    UInt(*h)(V128*,V128*) = h_pcmpistri_09;
    697    UInt(*s)(V128*,V128*) = s_pcmpistri_09;
    698 
    699    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    700 
    701    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    702    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    703    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    704    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    705 
    706    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    707    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    708    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    709 
    710    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    711    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    712    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    713    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    714 
    715    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    716    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    717    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    718 
    719    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    720 
    721    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    722    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    723    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    724 
    725    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    726    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    727    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    728 
    729    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    730    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    731    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    732 
    733    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    734    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    735    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    736 
    737    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    738    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    739 }
    740 
    741 
    742 
    743 //////////////////////////////////////////////////////////
    744 //                                                      //
    745 //                       ISTRI_1B                       //
    746 //                                                      //
    747 //////////////////////////////////////////////////////////
    748 
    749 UInt h_pcmpistri_1B ( V128* argL, V128* argR )
    750 {
    751    V128 block[2];
    752    memcpy(&block[0], argL, sizeof(V128));
    753    memcpy(&block[1], argR, sizeof(V128));
    754    ULong res, flags;
    755    __asm__ __volatile__(
    756       "subq      $1024,  %%rsp"             "\n\t"
    757       "movdqu    0(%2),  %%xmm2"            "\n\t"
    758       "movdqu    16(%2), %%xmm11"           "\n\t"
    759       "pcmpistri $0x1B,  %%xmm2, %%xmm11"   "\n\t"
    760       "pushfq"                              "\n\t"
    761       "popq      %%rdx"                     "\n\t"
    762       "movq      %%rcx,  %0"                "\n\t"
    763       "movq      %%rdx,  %1"                "\n\t"
    764       "addq      $1024,  %%rsp"             "\n\t"
    765       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    766       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    767    );
    768    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    769 }
    770 
    771 UInt s_pcmpistri_1B ( V128* argLU, V128* argRU )
    772 {
    773    V128 resV;
    774    UInt resOSZACP, resECX;
    775    Bool ok
    776       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    777 			    zmask_from_V128(argLU),
    778 			    zmask_from_V128(argRU),
    779 			    0x1B, False/*!isSTRM*/
    780         );
    781    assert(ok);
    782    resECX = resV.uInt[0];
    783    return (resOSZACP << 16) | resECX;
    784 }
    785 
    786 void istri_1B ( void )
    787 {
    788    char* wot = "1B";
    789    UInt(*h)(V128*,V128*) = h_pcmpistri_1B;
    790    UInt(*s)(V128*,V128*) = s_pcmpistri_1B;
    791 
    792    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    793 
    794    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    795    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    796    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    797    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    798 
    799    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    800    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    801    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    802 
    803    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    804    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    805    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    806    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    807 
    808    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    809    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    810    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    811 
    812    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    813 
    814    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    815    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    816    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    817 
    818    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    819    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    820    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    821 
    822    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    823    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    824    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    825 
    826    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    827    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    828    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    829 
    830    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    831    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    832 }
    833 
    834 
    835 
    836 //////////////////////////////////////////////////////////
    837 //                                                      //
    838 //                       ISTRI_03                       //
    839 //                                                      //
    840 //////////////////////////////////////////////////////////
    841 
    842 UInt h_pcmpistri_03 ( V128* argL, V128* argR )
    843 {
    844    V128 block[2];
    845    memcpy(&block[0], argL, sizeof(V128));
    846    memcpy(&block[1], argR, sizeof(V128));
    847    ULong res, flags;
    848    __asm__ __volatile__(
    849       "subq      $1024,  %%rsp"             "\n\t"
    850       "movdqu    0(%2),  %%xmm2"            "\n\t"
    851       "movdqu    16(%2), %%xmm11"           "\n\t"
    852       "pcmpistri $0x03,  %%xmm2, %%xmm11"   "\n\t"
    853 //"pcmpistrm $0x03, %%xmm2, %%xmm11"   "\n\t"
    854 //"movd %%xmm0, %%ecx" "\n\t"
    855       "pushfq"                              "\n\t"
    856       "popq      %%rdx"                     "\n\t"
    857       "movq      %%rcx,  %0"                "\n\t"
    858       "movq      %%rdx,  %1"                "\n\t"
    859       "addq      $1024,  %%rsp"             "\n\t"
    860       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    861       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    862    );
    863    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    864 }
    865 
    866 UInt s_pcmpistri_03 ( V128* argLU, V128* argRU )
    867 {
    868    V128 resV;
    869    UInt resOSZACP, resECX;
    870    Bool ok
    871       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    872 			    zmask_from_V128(argLU),
    873 			    zmask_from_V128(argRU),
    874 			    0x03, False/*!isSTRM*/
    875         );
    876    assert(ok);
    877    resECX = resV.uInt[0];
    878    return (resOSZACP << 16) | resECX;
    879 }
    880 
    881 void istri_03 ( void )
    882 {
    883    char* wot = "03";
    884    UInt(*h)(V128*,V128*) = h_pcmpistri_03;
    885    UInt(*s)(V128*,V128*) = s_pcmpistri_03;
    886 
    887    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
    888    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
    889    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
    890    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    891 
    892    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    893    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
    894    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
    895    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
    896    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
    897 
    898    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    899    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
    900    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
    901    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
    902 
    903    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    904    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    905 
    906    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    907    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    908    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
    909    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
    910 
    911    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
    912 
    913    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    914    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    915 }
    916 
    917 
    918 //////////////////////////////////////////////////////////
    919 //                                                      //
    920 //                       ISTRI_13                       //
    921 //                                                      //
    922 //////////////////////////////////////////////////////////
    923 
    924 UInt h_pcmpistri_13 ( V128* argL, V128* argR )
    925 {
    926    V128 block[2];
    927    memcpy(&block[0], argL, sizeof(V128));
    928    memcpy(&block[1], argR, sizeof(V128));
    929    ULong res, flags;
    930    __asm__ __volatile__(
    931       "subq      $1024,  %%rsp"             "\n\t"
    932       "movdqu    0(%2),  %%xmm2"            "\n\t"
    933       "movdqu    16(%2), %%xmm11"           "\n\t"
    934       "pcmpistri $0x13,  %%xmm2, %%xmm11"   "\n\t"
    935 //"pcmpistrm $0x13, %%xmm2, %%xmm11"   "\n\t"
    936 //"movd %%xmm0, %%ecx" "\n\t"
    937       "pushfq"                              "\n\t"
    938       "popq      %%rdx"                     "\n\t"
    939       "movq      %%rcx,  %0"                "\n\t"
    940       "movq      %%rdx,  %1"                "\n\t"
    941       "addq      $1024,  %%rsp"             "\n\t"
    942       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    943       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    944    );
    945    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    946 }
    947 
    948 UInt s_pcmpistri_13 ( V128* argLU, V128* argRU )
    949 {
    950    V128 resV;
    951    UInt resOSZACP, resECX;
    952    Bool ok
    953       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    954 			    zmask_from_V128(argLU),
    955 			    zmask_from_V128(argRU),
    956 			    0x13, False/*!isSTRM*/
    957         );
    958    assert(ok);
    959    resECX = resV.uInt[0];
    960    return (resOSZACP << 16) | resECX;
    961 }
    962 
    963 void istri_13 ( void )
    964 {
    965    char* wot = "13";
    966    UInt(*h)(V128*,V128*) = h_pcmpistri_13;
    967    UInt(*s)(V128*,V128*) = s_pcmpistri_13;
    968 
    969    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
    970    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
    971    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
    972    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    973 
    974    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    975    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
    976    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
    977    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
    978    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
    979 
    980    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    981    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
    982    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
    983    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
    984 
    985    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    986    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    987 
    988    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    989    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    990    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
    991    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
    992 
    993    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
    994 
    995    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    996    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    997 }
    998 
    999 
   1000 
   1001 //////////////////////////////////////////////////////////
   1002 //                                                      //
   1003 //                       ISTRI_45                       //
   1004 //                                                      //
   1005 //////////////////////////////////////////////////////////
   1006 
   1007 UInt h_pcmpistri_45 ( V128* argL, V128* argR )
   1008 {
   1009    V128 block[2];
   1010    memcpy(&block[0], argL, sizeof(V128));
   1011    memcpy(&block[1], argR, sizeof(V128));
   1012    ULong res, flags;
   1013    __asm__ __volatile__(
   1014       "subq      $1024,  %%rsp"             "\n\t"
   1015       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1016       "movdqu    16(%2), %%xmm11"           "\n\t"
   1017       "pcmpistri $0x45,  %%xmm2, %%xmm11"   "\n\t"
   1018 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   1019 //"movd %%xmm0, %%ecx" "\n\t"
   1020       "pushfq"                              "\n\t"
   1021       "popq      %%rdx"                     "\n\t"
   1022       "movq      %%rcx,  %0"                "\n\t"
   1023       "movq      %%rdx,  %1"                "\n\t"
   1024       "addq      $1024,  %%rsp"             "\n\t"
   1025       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1026       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1027    );
   1028    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1029 }
   1030 
   1031 UInt s_pcmpistri_45 ( V128* argLU, V128* argRU )
   1032 {
   1033    V128 resV;
   1034    UInt resOSZACP, resECX;
   1035    Bool ok
   1036       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1037 			    zmask_from_V128(argLU),
   1038 			    zmask_from_V128(argRU),
   1039 			    0x45, False/*!isSTRM*/
   1040         );
   1041    assert(ok);
   1042    resECX = resV.uInt[0];
   1043    return (resOSZACP << 16) | resECX;
   1044 }
   1045 
   1046 void istri_45 ( void )
   1047 {
   1048    char* wot = "45";
   1049    UInt(*h)(V128*,V128*) = h_pcmpistri_45;
   1050    UInt(*s)(V128*,V128*) = s_pcmpistri_45;
   1051 
   1052    try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000bbcc");
   1053    try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000ccbb");
   1054    try_istri(wot,h,s, "baaabbbbccccdddd", "000000000000ccbb");
   1055    try_istri(wot,h,s, "baaabbbbccccdddc", "000000000000ccbb");
   1056 
   1057    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
   1058    try_istri(wot,h,s, "bbbbbbbb00bbbbbb", "000000000000ccbb");
   1059    try_istri(wot,h,s, "bbbbbbbbbbbb00bb", "000000000000ccbb");
   1060    try_istri(wot,h,s, "bbbbbbbbbbbbbb00", "000000000000ccbb");
   1061    try_istri(wot,h,s, "0000000000000000", "000000000000ccbb");
   1062 
   1063    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1064 
   1065    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
   1066    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000bb");
   1067    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000006622ccbb");
   1068 
   1069    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000022ccbb");
   1070    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000000ccbb");
   1071    try_istri(wot,h,s, "bb44bb44bb44bb44", "00000000000000bb");
   1072 
   1073    try_istri(wot,h,s, "0011223344556677", "0000997755442211");
   1074    try_istri(wot,h,s, "1122334455667711", "0000997755442211");
   1075 
   1076    try_istri(wot,h,s, "0011223344556677", "0000aa8866553322");
   1077    try_istri(wot,h,s, "1122334455667711", "0000aa8866553322");
   1078 }
   1079 
   1080 
   1081 //////////////////////////////////////////////////////////
   1082 //                                                      //
   1083 //                       ISTRI_01                       //
   1084 //                                                      //
   1085 //////////////////////////////////////////////////////////
   1086 
   1087 UInt h_pcmpistri_01 ( V128* argL, V128* argR )
   1088 {
   1089    V128 block[2];
   1090    memcpy(&block[0], argL, sizeof(V128));
   1091    memcpy(&block[1], argR, sizeof(V128));
   1092    ULong res, flags;
   1093    __asm__ __volatile__(
   1094       "subq      $1024,  %%rsp"             "\n\t"
   1095       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1096       "movdqu    16(%2), %%xmm11"           "\n\t"
   1097       "pcmpistri $0x01,  %%xmm2, %%xmm11"   "\n\t"
   1098 //"pcmpistrm $0x01, %%xmm2, %%xmm11"   "\n\t"
   1099 //"movd %%xmm0, %%ecx" "\n\t"
   1100       "pushfq"                              "\n\t"
   1101       "popq      %%rdx"                     "\n\t"
   1102       "movq      %%rcx,  %0"                "\n\t"
   1103       "movq      %%rdx,  %1"                "\n\t"
   1104       "addq      $1024,  %%rsp"             "\n\t"
   1105       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1106       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1107    );
   1108    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1109 }
   1110 
   1111 UInt s_pcmpistri_01 ( V128* argLU, V128* argRU )
   1112 {
   1113    V128 resV;
   1114    UInt resOSZACP, resECX;
   1115    Bool ok
   1116       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1117 			    zmask_from_V128(argLU),
   1118 			    zmask_from_V128(argRU),
   1119 			    0x01, False/*!isSTRM*/
   1120         );
   1121    assert(ok);
   1122    resECX = resV.uInt[0];
   1123    return (resOSZACP << 16) | resECX;
   1124 }
   1125 
   1126 void istri_01 ( void )
   1127 {
   1128    char* wot = "01";
   1129    UInt(*h)(V128*,V128*) = h_pcmpistri_01;
   1130    UInt(*s)(V128*,V128*) = s_pcmpistri_01;
   1131 
   1132    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
   1133    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
   1134    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
   1135    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1136 
   1137    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
   1138    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
   1139    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
   1140    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
   1141    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
   1142 
   1143    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
   1144    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
   1145    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
   1146    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
   1147 
   1148    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1149    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1150 
   1151    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1152    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1153    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
   1154    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
   1155 
   1156    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
   1157 
   1158    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1159    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1160 }
   1161 
   1162 
   1163 //////////////////////////////////////////////////////////
   1164 //                                                      //
   1165 //                       ISTRI_39                       //
   1166 //                                                      //
   1167 //////////////////////////////////////////////////////////
   1168 
   1169 UInt h_pcmpistri_39 ( V128* argL, V128* argR )
   1170 {
   1171    V128 block[2];
   1172    memcpy(&block[0], argL, sizeof(V128));
   1173    memcpy(&block[1], argR, sizeof(V128));
   1174    ULong res, flags;
   1175    __asm__ __volatile__(
   1176       "subq      $1024,  %%rsp"             "\n\t"
   1177       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1178       "movdqu    16(%2), %%xmm11"           "\n\t"
   1179       "pcmpistri $0x39,  %%xmm2, %%xmm11"   "\n\t"
   1180       "pushfq"                              "\n\t"
   1181       "popq      %%rdx"                     "\n\t"
   1182       "movq      %%rcx,  %0"                "\n\t"
   1183       "movq      %%rdx,  %1"                "\n\t"
   1184       "addq      $1024,  %%rsp"             "\n\t"
   1185       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1186       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1187    );
   1188    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1189 }
   1190 
   1191 UInt s_pcmpistri_39 ( V128* argLU, V128* argRU )
   1192 {
   1193    V128 resV;
   1194    UInt resOSZACP, resECX;
   1195    Bool ok
   1196       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1197 			    zmask_from_V128(argLU),
   1198 			    zmask_from_V128(argRU),
   1199 			    0x39, False/*!isSTRM*/
   1200         );
   1201    assert(ok);
   1202    resECX = resV.uInt[0];
   1203    return (resOSZACP << 16) | resECX;
   1204 }
   1205 
   1206 void istri_39 ( void )
   1207 {
   1208    char* wot = "39";
   1209    UInt(*h)(V128*,V128*) = h_pcmpistri_39;
   1210    UInt(*s)(V128*,V128*) = s_pcmpistri_39;
   1211 
   1212    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1213 
   1214    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1215    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1216    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1217    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1218 
   1219    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1220    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1221    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1222 
   1223    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1224    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1225    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1226    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1227 
   1228    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1229    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1230    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1231 
   1232    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1233 
   1234    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
   1235    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
   1236    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
   1237 
   1238    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
   1239    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
   1240    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
   1241 
   1242    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
   1243    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
   1244    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
   1245 
   1246    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
   1247    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
   1248    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
   1249 
   1250    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1251    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1252 }
   1253 
   1254 
   1255 
   1256 //////////////////////////////////////////////////////////
   1257 //                                                      //
   1258 //                       ISTRI_19                       //
   1259 //                                                      //
   1260 //////////////////////////////////////////////////////////
   1261 
   1262 UInt h_pcmpistri_19 ( V128* argL, V128* argR )
   1263 {
   1264    V128 block[2];
   1265    memcpy(&block[0], argL, sizeof(V128));
   1266    memcpy(&block[1], argR, sizeof(V128));
   1267    ULong res, flags;
   1268    __asm__ __volatile__(
   1269       "subq      $1024,  %%rsp"             "\n\t"
   1270       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1271       "movdqu    16(%2), %%xmm11"           "\n\t"
   1272       "pcmpistri $0x19,  %%xmm2, %%xmm11"   "\n\t"
   1273       "pushfq"                              "\n\t"
   1274       "popq      %%rdx"                     "\n\t"
   1275       "movq      %%rcx,  %0"                "\n\t"
   1276       "movq      %%rdx,  %1"                "\n\t"
   1277       "addq      $1024,  %%rsp"             "\n\t"
   1278       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1279       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1280    );
   1281    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1282 }
   1283 
   1284 UInt s_pcmpistri_19 ( V128* argLU, V128* argRU )
   1285 {
   1286    V128 resV;
   1287    UInt resOSZACP, resECX;
   1288    Bool ok
   1289       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1290 			    zmask_from_V128(argLU),
   1291 			    zmask_from_V128(argRU),
   1292 			    0x19, False/*!isSTRM*/
   1293         );
   1294    assert(ok);
   1295    resECX = resV.uInt[0];
   1296    return (resOSZACP << 16) | resECX;
   1297 }
   1298 
   1299 void istri_19 ( void )
   1300 {
   1301    char* wot = "19";
   1302    UInt(*h)(V128*,V128*) = h_pcmpistri_19;
   1303    UInt(*s)(V128*,V128*) = s_pcmpistri_19;
   1304 
   1305    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1306 
   1307    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1308    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1309    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1310    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1311 
   1312    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1313    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1314    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1315 
   1316    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1317    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1318    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1319    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1320 
   1321    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1322    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1323    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1324 
   1325    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1326 
   1327    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
   1328    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
   1329    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
   1330 
   1331    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
   1332    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
   1333    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
   1334 
   1335    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
   1336    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
   1337    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
   1338 
   1339    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
   1340    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
   1341    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
   1342 
   1343    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1344    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1345 }
   1346 
   1347 
   1348 
   1349 //////////////////////////////////////////////////////////
   1350 //                                                      //
   1351 //                         main                         //
   1352 //                                                      //
   1353 //////////////////////////////////////////////////////////
   1354 
   1355 int main ( void )
   1356 {
   1357    istri_4B();
   1358    istri_3B();
   1359    istri_09();
   1360    istri_1B();
   1361    istri_03();
   1362    istri_0D();
   1363    istri_13();
   1364    istri_45();
   1365    istri_01();
   1366    istri_39();
   1367    istri_19();
   1368    return 0;
   1369 }
   1370