Home | History | Annotate | Download | only in amd64
      1 
      2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
      3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
      4    aspect. */
      5 
      6 #include <string.h>
      7 #include <stdio.h>
      8 #include <assert.h>
      9 
     10 typedef  unsigned int   UInt;
     11 typedef  signed int     Int;
     12 typedef  unsigned char  UChar;
     13 typedef  unsigned short UShort;
     14 typedef  unsigned long long int ULong;
     15 typedef  UChar          Bool;
     16 #define False ((Bool)0)
     17 #define True  ((Bool)1)
     18 
     19 //typedef  unsigned char  V128[16];
     20 typedef
     21    union {
     22       UChar  uChar[16];
     23       UShort uShort[8];
     24       UInt   uInt[4];
     25       UInt   w32[4];
     26    }
     27    V128;
     28 
     29 #define SHIFT_O   11
     30 #define SHIFT_S   7
     31 #define SHIFT_Z   6
     32 #define SHIFT_A   4
     33 #define SHIFT_C   0
     34 #define SHIFT_P   2
     35 
     36 #define MASK_O    (1ULL << SHIFT_O)
     37 #define MASK_S    (1ULL << SHIFT_S)
     38 #define MASK_Z    (1ULL << SHIFT_Z)
     39 #define MASK_A    (1ULL << SHIFT_A)
     40 #define MASK_C    (1ULL << SHIFT_C)
     41 #define MASK_P    (1ULL << SHIFT_P)
     42 
     43 
     44 UInt clz32 ( UInt x )
     45 {
     46    Int y, m, n;
     47    y = -(x >> 16);
     48    m = (y >> 16) & 16;
     49    n = 16 - m;
     50    x = x >> m;
     51    y = x - 0x100;
     52    m = (y >> 16) & 8;
     53    n = n + m;
     54    x = x << m;
     55    y = x - 0x1000;
     56    m = (y >> 16) & 4;
     57    n = n + m;
     58    x = x << m;
     59    y = x - 0x4000;
     60    m = (y >> 16) & 2;
     61    n = n + m;
     62    x = x << m;
     63    y = x >> 14;
     64    m = y & ~(y >> 1);
     65    return n + 2 - m;
     66 }
     67 
     68 UInt ctz32 ( UInt x )
     69 {
     70    return 32 - clz32((~x) & (x-1));
     71 }
     72 
     73 void expand ( V128* dst, char* summary )
     74 {
     75    Int i;
     76    assert( strlen(summary) == 16 );
     77    for (i = 0; i < 16; i++) {
     78       UChar xx = 0;
     79       UChar x = summary[15-i];
     80       if      (x >= '0' && x <= '9') { xx = x - '0'; }
     81       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
     82       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
     83       else assert(0);
     84 
     85       assert(xx < 16);
     86       xx = (xx << 4) | xx;
     87       assert(xx < 256);
     88       dst->uChar[i] = xx;
     89    }
     90 }
     91 
     92 void try_istri ( char* which,
     93                  UInt(*h_fn)(V128*,V128*),
     94                  UInt(*s_fn)(V128*,V128*),
     95                  char* summL, char* summR )
     96 {
     97    assert(strlen(which) == 2);
     98    V128 argL, argR;
     99    expand(&argL, summL);
    100    expand(&argR, summR);
    101    UInt h_res = h_fn(&argL, &argR);
    102    UInt s_res = s_fn(&argL, &argR);
    103    printf("istri %s  %s %s -> %08x %08x %s\n",
    104           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
    105 }
    106 
    107 UInt zmask_from_V128 ( V128* arg )
    108 {
    109    UInt i, res = 0;
    110    for (i = 0; i < 8; i++) {
    111       res |=  ((arg->uShort[i] == 0) ? 1 : 0) << i;
    112    }
    113    return res;
    114 }
    115 
    116 //////////////////////////////////////////////////////////
    117 //                                                      //
    118 //                       GENERAL                        //
    119 //                                                      //
    120 //////////////////////////////////////////////////////////
    121 
    122 
    123 /* Given partial results from a 16-bit pcmpXstrX operation (intRes1,
    124    basically), generate an I- or M-format output value, also the new
    125    OSZACP flags.  */
    126 static
    127 void PCMPxSTRx_WRK_gen_output_fmt_I_wide ( /*OUT*/V128* resV,
    128 					   /*OUT*/UInt* resOSZACP,
    129 					   UInt intRes1,
    130 					   UInt zmaskL, UInt zmaskR,
    131 					   UInt validL,
    132 					   UInt pol, UInt idx )
    133 {
    134    assert((pol >> 2) == 0);
    135    assert((idx >> 1) == 0);
    136 
    137    UInt intRes2 = 0;
    138    switch (pol) {
    139       case 0: intRes2 = intRes1;          break; // pol +
    140       case 1: intRes2 = ~intRes1;         break; // pol -
    141       case 2: intRes2 = intRes1;          break; // pol m+
    142       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    143    }
    144    intRes2 &= 0xFF;
    145 
    146    // generate I-format output (an index in ECX)
    147    // generate ecx value
    148    UInt newECX = 0;
    149    if (idx) {
    150      // index of ms-1-bit
    151      newECX = intRes2 == 0 ? 8 : (31 - clz32(intRes2));
    152    } else {
    153      // index of ls-1-bit
    154      newECX = intRes2 == 0 ? 8 : ctz32(intRes2);
    155    }
    156 
    157    resV->w32[0] = newECX;
    158    resV->w32[1] = 0;
    159    resV->w32[2] = 0;
    160    resV->w32[3] = 0;
    161 
    162    // generate new flags, common to all ISTRI and ISTRM cases
    163    *resOSZACP    // A, P are zero
    164      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    165      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    166      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    167      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    168 }
    169 
    170 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    171    variants on 16-bit characters.
    172 
    173    For xSTRI variants, the new ECX value is placed in the 32 bits
    174    pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
    175    variants, the result is a 128 bit value and is placed at *resV in
    176    the obvious way.
    177 
    178    For all variants, the new OSZACP value is placed at *resOSZACP.
    179 
    180    argLV and argRV are the vector args.  The caller must prepare a
    181    8-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
    182    must be 1 for each zero byte of of the respective arg.  For ESTRx
    183    variants this is derived from the explicit length indication, and
    184    must be 0 in all places except at the bit index corresponding to
    185    the valid length (0 .. 8).  If the valid length is 8 then the
    186    mask must be all zeroes.  In all cases, bits 31:8 must be zero.
    187 
    188    imm8 is the original immediate from the instruction.  isSTRM
    189    indicates whether this is a xSTRM or xSTRI variant, which controls
    190    how much of *res is written.
    191 
    192    If the given imm8 case can be handled, the return value is True.
    193    If not, False is returned, and neither *res not *resOSZACP are
    194    altered.
    195 */
    196 
    197 Bool pcmpXstrX_WRK_wide ( /*OUT*/V128* resV,
    198 			  /*OUT*/UInt* resOSZACP,
    199 			  V128* argLV,  V128* argRV,
    200 			  UInt zmaskL, UInt zmaskR,
    201 			  UInt imm8,   Bool isxSTRM )
    202 {
    203    assert(imm8 < 0x80);
    204    assert((zmaskL >> 8) == 0);
    205    assert((zmaskR >> 8) == 0);
    206 
    207    /* Explicitly reject any imm8 values that haven't been validated,
    208       even if they would probably work.  Life is too short to have
    209       unvalidated cases in the code base. */
    210    switch (imm8) {
    211       case 0x01:
    212       case 0x03: case 0x09: case 0x0B: case 0x0D: case 0x13:
    213       case 0x1B: case 0x39: case 0x3B: case 0x45: case 0x4B:
    214          break;
    215       default:
    216          return False;
    217    }
    218 
    219    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    220    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    221    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    222    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    223 
    224    /*----------------------------------------*/
    225    /*-- strcmp on wide data                --*/
    226    /*----------------------------------------*/
    227 
    228    if (agg == 2/*equal each, aka strcmp*/
    229        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
    230       Int    i;
    231       UShort* argL = (UShort*)argLV;
    232       UShort* argR = (UShort*)argRV;
    233       UInt boolResII = 0;
    234       for (i = 7; i >= 0; i--) {
    235          UShort cL  = argL[i];
    236          UShort cR  = argR[i];
    237          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    238       }
    239       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    240       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    241 
    242       // do invalidation, common to all equal-each cases
    243       UInt intRes1
    244          = (boolResII & validL & validR)  // if both valid, use cmpres
    245            | (~ (validL | validR));       // if both invalid, force 1
    246                                           // else force 0
    247       intRes1 &= 0xFF;
    248 
    249       // generate I-format output
    250       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    251          resV, resOSZACP,
    252          intRes1, zmaskL, zmaskR, validL, pol, idx
    253       );
    254 
    255       return True;
    256    }
    257 
    258    /*----------------------------------------*/
    259    /*-- set membership on wide data        --*/
    260    /*----------------------------------------*/
    261 
    262    if (agg == 0/*equal any, aka find chars in a set*/
    263        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
    264       /* argL: the string,  argR: charset */
    265       UInt   si, ci;
    266       UShort* argL    = (UShort*)argLV;
    267       UShort* argR    = (UShort*)argRV;
    268       UInt   boolRes = 0;
    269       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    270       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    271 
    272       for (si = 0; si < 8; si++) {
    273          if ((validL & (1 << si)) == 0)
    274             // run off the end of the string.
    275             break;
    276          UInt m = 0;
    277          for (ci = 0; ci < 8; ci++) {
    278             if ((validR & (1 << ci)) == 0) break;
    279             if (argR[ci] == argL[si]) { m = 1; break; }
    280          }
    281          boolRes |= (m << si);
    282       }
    283 
    284       // boolRes is "pre-invalidated"
    285       UInt intRes1 = boolRes & 0xFF;
    286 
    287       // generate I-format output
    288       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    289          resV, resOSZACP,
    290          intRes1, zmaskL, zmaskR, validL, pol, idx
    291       );
    292 
    293       return True;
    294    }
    295 
    296    /*----------------------------------------*/
    297    /*-- substring search on wide data      --*/
    298    /*----------------------------------------*/
    299 
    300    if (agg == 3/*equal ordered, aka substring search*/
    301        && (fmt == 1/*uw*/ || fmt == 3/*sw*/)) {
    302 
    303       /* argL: haystack,  argR: needle */
    304       UInt   ni, hi;
    305       UShort* argL    = (UShort*)argLV;
    306       UShort* argR    = (UShort*)argRV;
    307       UInt   boolRes = 0;
    308       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    309       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    310       for (hi = 0; hi < 8; hi++) {
    311          UInt m = 1;
    312          for (ni = 0; ni < 8; ni++) {
    313             if ((validR & (1 << ni)) == 0) break;
    314             UInt i = ni + hi;
    315             if (i >= 8) break;
    316             if (argL[i] != argR[ni]) { m = 0; break; }
    317          }
    318          boolRes |= (m << hi);
    319          if ((validL & (1 << hi)) == 0)
    320             // run off the end of the haystack
    321             break;
    322       }
    323 
    324       // boolRes is "pre-invalidated"
    325       UInt intRes1 = boolRes & 0xFF;
    326 
    327       // generate I-format output
    328       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    329          resV, resOSZACP,
    330          intRes1, zmaskL, zmaskR, validL, pol, idx
    331       );
    332 
    333       return True;
    334    }
    335 
    336    /*----------------------------------------*/
    337    /*-- ranges, unsigned wide data         --*/
    338    /*----------------------------------------*/
    339 
    340    if (agg == 1/*ranges*/
    341        && fmt == 1/*uw*/) {
    342 
    343       /* argL: string,  argR: range-pairs */
    344       UInt   ri, si;
    345       UShort* argL    = (UShort*)argLV;
    346       UShort* argR    = (UShort*)argRV;
    347       UInt   boolRes = 0;
    348       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    349       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    350       for (si = 0; si < 8; si++) {
    351          if ((validL & (1 << si)) == 0)
    352             // run off the end of the string
    353             break;
    354          UInt m = 0;
    355          for (ri = 0; ri < 8; ri += 2) {
    356             if ((validR & (3 << ri)) != (3 << ri)) break;
    357             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    358                m = 1; break;
    359             }
    360          }
    361          boolRes |= (m << si);
    362       }
    363 
    364       // boolRes is "pre-invalidated"
    365       UInt intRes1 = boolRes & 0xFF;
    366 
    367       // generate I-format output
    368       PCMPxSTRx_WRK_gen_output_fmt_I_wide(
    369          resV, resOSZACP,
    370          intRes1, zmaskL, zmaskR, validL, pol, idx
    371       );
    372 
    373       return True;
    374    }
    375 
    376    return False;
    377 }
    378 
    379 //////////////////////////////////////////////////////////
    380 //                                                      //
    381 //                       ISTRI_4B                       //
    382 //                                                      //
    383 //////////////////////////////////////////////////////////
    384 
    385 UInt h_pcmpistri_4B ( V128* argL, V128* argR )
    386 {
    387    V128 block[2];
    388    memcpy(&block[0], argL, sizeof(V128));
    389    memcpy(&block[1], argR, sizeof(V128));
    390    ULong res, flags;
    391    __asm__ __volatile__(
    392       "subq      $1024,  %%rsp"             "\n\t"
    393       "movdqu    0(%2),  %%xmm2"            "\n\t"
    394       "movdqu    16(%2), %%xmm11"           "\n\t"
    395       "pcmpistri $0x4B,  %%xmm2, %%xmm11"   "\n\t"
    396       "pushfq"                              "\n\t"
    397       "popq      %%rdx"                     "\n\t"
    398       "movq      %%rcx,  %0"                "\n\t"
    399       "movq      %%rdx,  %1"                "\n\t"
    400       "addq      $1024,  %%rsp"             "\n\t"
    401       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    402       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    403    );
    404    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    405 }
    406 
    407 UInt s_pcmpistri_4B ( V128* argLU, V128* argRU )
    408 {
    409    V128 resV;
    410    UInt resOSZACP, resECX;
    411    Bool ok
    412       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    413 			    zmask_from_V128(argLU),
    414 			    zmask_from_V128(argRU),
    415 			    0x4B, False/*!isSTRM*/
    416         );
    417    assert(ok);
    418    resECX = resV.uInt[0];
    419    return (resOSZACP << 16) | resECX;
    420 }
    421 
    422 void istri_4B ( void )
    423 {
    424    char* wot = "4B";
    425    UInt(*h)(V128*,V128*) = h_pcmpistri_4B;
    426    UInt(*s)(V128*,V128*) = s_pcmpistri_4B;
    427 
    428    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    429 
    430    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    431    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    432    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    433    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    434 
    435    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    436    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    437    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    438 
    439    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    440    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    441    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    442    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    443 
    444    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    445    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    446    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    447 
    448    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    449 
    450    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    451    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    452    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    453 
    454    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    455    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    456    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    457 
    458    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    459    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    460    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    461 
    462    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    463    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    464    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    465 
    466    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    467    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    468 }
    469 
    470 //////////////////////////////////////////////////////////
    471 //                                                      //
    472 //                       ISTRI_3B                       //
    473 //                                                      //
    474 //////////////////////////////////////////////////////////
    475 
    476 UInt h_pcmpistri_3B ( V128* argL, V128* argR )
    477 {
    478    V128 block[2];
    479    memcpy(&block[0], argL, sizeof(V128));
    480    memcpy(&block[1], argR, sizeof(V128));
    481    ULong res, flags;
    482    __asm__ __volatile__(
    483       "subq      $1024,  %%rsp"             "\n\t"
    484       "movdqu    0(%2),  %%xmm2"            "\n\t"
    485       "movdqu    16(%2), %%xmm11"           "\n\t"
    486       "pcmpistri $0x3B,  %%xmm2, %%xmm11"   "\n\t"
    487       "pushfq"                              "\n\t"
    488       "popq      %%rdx"                     "\n\t"
    489       "movq      %%rcx,  %0"                "\n\t"
    490       "movq      %%rdx,  %1"                "\n\t"
    491       "addq      $1024,  %%rsp"             "\n\t"
    492       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    493       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    494    );
    495    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    496 }
    497 
    498 UInt s_pcmpistri_3B ( V128* argLU, V128* argRU )
    499 {
    500    V128 resV;
    501    UInt resOSZACP, resECX;
    502    Bool ok
    503       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    504 			    zmask_from_V128(argLU),
    505 			    zmask_from_V128(argRU),
    506 			    0x3B, False/*!isSTRM*/
    507         );
    508    assert(ok);
    509    resECX = resV.uInt[0];
    510    return (resOSZACP << 16) | resECX;
    511 }
    512 
    513 void istri_3B ( void )
    514 {
    515    char* wot = "3B";
    516    UInt(*h)(V128*,V128*) = h_pcmpistri_3B;
    517    UInt(*s)(V128*,V128*) = s_pcmpistri_3B;
    518 
    519    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    520 
    521    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    522    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    523    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    524    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    525 
    526    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    527    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    528    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    529 
    530    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    531    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    532    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    533    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    534 
    535    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    536    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    537    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    538 
    539    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    540 
    541    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    542    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    543    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    544 
    545    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    546    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    547    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    548 
    549    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    550    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    551    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    552 
    553    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    554    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    555    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    556 
    557    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    558    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    559 }
    560 
    561 
    562 
    563 //////////////////////////////////////////////////////////
    564 //                                                      //
    565 //                       ISTRI_0D                       //
    566 //                                                      //
    567 //////////////////////////////////////////////////////////
    568 
    569 __attribute__((noinline))
    570 UInt h_pcmpistri_0D ( V128* argL, V128* argR )
    571 {
    572    V128 block[2];
    573    memcpy(&block[0], argL, sizeof(V128));
    574    memcpy(&block[1], argR, sizeof(V128));
    575    ULong res = 0, flags = 0;
    576    __asm__ __volatile__(
    577       "movdqu    0(%2),  %%xmm2"            "\n\t"
    578       "movdqu    16(%2), %%xmm11"           "\n\t"
    579       "pcmpistri $0x0D,  %%xmm2, %%xmm11"   "\n\t"
    580       //"pcmpistrm $0x0D,  %%xmm2, %%xmm11"   "\n\t"
    581       //"movd %%xmm0, %%ecx" "\n\t"
    582       "pushfq"                              "\n\t"
    583       "popq      %%rdx"                     "\n\t"
    584       "movq      %%rcx,  %0"                "\n\t"
    585       "movq      %%rdx,  %1"                "\n\t"
    586       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    587       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    588    );
    589    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    590 }
    591 
    592 UInt s_pcmpistri_0D ( V128* argLU, V128* argRU )
    593 {
    594    V128 resV;
    595    UInt resOSZACP, resECX;
    596    Bool ok
    597       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    598 			    zmask_from_V128(argLU),
    599 			    zmask_from_V128(argRU),
    600 			    0x0D, False/*!isSTRM*/
    601         );
    602    assert(ok);
    603    resECX = resV.uInt[0];
    604    return (resOSZACP << 16) | resECX;
    605 }
    606 
    607 void istri_0D ( void )
    608 {
    609    char* wot = "0D";
    610    UInt(*h)(V128*,V128*) = h_pcmpistri_0D;
    611    UInt(*s)(V128*,V128*) = s_pcmpistri_0D;
    612 
    613    try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
    614 
    615    try_istri(wot,h,s, "11111111abcdef11", "00abcdef00abcdef");
    616 
    617    try_istri(wot,h,s, "11111111abcdef11", "0000000000abcdef");
    618    try_istri(wot,h,s, "1111111111abcdef", "0000000000abcdef");
    619    try_istri(wot,h,s, "111111111111abcd", "0000000000abcdef");
    620 
    621    try_istri(wot,h,s, "1111abcd11abcd11", "000000000000abcd");
    622 
    623    try_istri(wot,h,s, "11abcd1111abcd11", "000000000000abcd");
    624    try_istri(wot,h,s, "abcd111111abcd11", "000000000000abcd");
    625    try_istri(wot,h,s, "cd11111111abcd11", "000000000000abcd");
    626 
    627    try_istri(wot,h,s, "01abcd11abcd1111", "000000000000abcd");
    628    try_istri(wot,h,s, "00abcd11abcd1111", "000000000000abcd");
    629    try_istri(wot,h,s, "0000cd11abcd1111", "000000000000abcd");
    630 
    631    try_istri(wot,h,s, "00abcd1100abcd11", "000000000000abcd");
    632    try_istri(wot,h,s, "00abcd110000cd11", "000000000000abcd");
    633 
    634    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
    635    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
    636    try_istri(wot,h,s, "1111111111111234", "0000000000001111");
    637 
    638    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
    639    try_istri(wot,h,s, "0a11111111111111", "000000000000000a");
    640    try_istri(wot,h,s, "0b11111111111111", "000000000000000a");
    641 
    642    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
    643    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    644    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
    645    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
    646 }
    647 
    648 
    649 //////////////////////////////////////////////////////////
    650 //                                                      //
    651 //                       ISTRI_09                       //
    652 //                                                      //
    653 //////////////////////////////////////////////////////////
    654 
    655 UInt h_pcmpistri_09 ( V128* argL, V128* argR )
    656 {
    657    V128 block[2];
    658    memcpy(&block[0], argL, sizeof(V128));
    659    memcpy(&block[1], argR, sizeof(V128));
    660    ULong res, flags;
    661    __asm__ __volatile__(
    662       "subq      $1024,  %%rsp"             "\n\t"
    663       "movdqu    0(%2),  %%xmm2"            "\n\t"
    664       "movdqu    16(%2), %%xmm11"           "\n\t"
    665       "pcmpistri $0x09,  %%xmm2, %%xmm11"   "\n\t"
    666       "pushfq"                              "\n\t"
    667       "popq      %%rdx"                     "\n\t"
    668       "movq      %%rcx,  %0"                "\n\t"
    669       "movq      %%rdx,  %1"                "\n\t"
    670       "addq      $1024,  %%rsp"             "\n\t"
    671       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    672       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    673    );
    674    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    675 }
    676 
    677 UInt s_pcmpistri_09 ( V128* argLU, V128* argRU )
    678 {
    679    V128 resV;
    680    UInt resOSZACP, resECX;
    681    Bool ok
    682       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    683 			    zmask_from_V128(argLU),
    684 			    zmask_from_V128(argRU),
    685 			    0x09, False/*!isSTRM*/
    686         );
    687    assert(ok);
    688    resECX = resV.uInt[0];
    689    return (resOSZACP << 16) | resECX;
    690 }
    691 
    692 void istri_09 ( void )
    693 {
    694    char* wot = "09";
    695    UInt(*h)(V128*,V128*) = h_pcmpistri_09;
    696    UInt(*s)(V128*,V128*) = s_pcmpistri_09;
    697 
    698    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    699 
    700    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    701    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    702    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    703    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    704 
    705    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    706    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    707    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    708 
    709    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    710    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    711    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    712    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    713 
    714    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    715    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    716    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    717 
    718    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    719 
    720    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    721    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    722    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    723 
    724    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    725    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    726    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    727 
    728    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    729    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    730    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    731 
    732    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    733    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    734    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    735 
    736    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    737    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    738 }
    739 
    740 
    741 
    742 //////////////////////////////////////////////////////////
    743 //                                                      //
    744 //                       ISTRI_1B                       //
    745 //                                                      //
    746 //////////////////////////////////////////////////////////
    747 
    748 UInt h_pcmpistri_1B ( V128* argL, V128* argR )
    749 {
    750    V128 block[2];
    751    memcpy(&block[0], argL, sizeof(V128));
    752    memcpy(&block[1], argR, sizeof(V128));
    753    ULong res, flags;
    754    __asm__ __volatile__(
    755       "subq      $1024,  %%rsp"             "\n\t"
    756       "movdqu    0(%2),  %%xmm2"            "\n\t"
    757       "movdqu    16(%2), %%xmm11"           "\n\t"
    758       "pcmpistri $0x1B,  %%xmm2, %%xmm11"   "\n\t"
    759       "pushfq"                              "\n\t"
    760       "popq      %%rdx"                     "\n\t"
    761       "movq      %%rcx,  %0"                "\n\t"
    762       "movq      %%rdx,  %1"                "\n\t"
    763       "addq      $1024,  %%rsp"             "\n\t"
    764       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    765       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    766    );
    767    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    768 }
    769 
    770 UInt s_pcmpistri_1B ( V128* argLU, V128* argRU )
    771 {
    772    V128 resV;
    773    UInt resOSZACP, resECX;
    774    Bool ok
    775       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    776 			    zmask_from_V128(argLU),
    777 			    zmask_from_V128(argRU),
    778 			    0x1B, False/*!isSTRM*/
    779         );
    780    assert(ok);
    781    resECX = resV.uInt[0];
    782    return (resOSZACP << 16) | resECX;
    783 }
    784 
    785 void istri_1B ( void )
    786 {
    787    char* wot = "1B";
    788    UInt(*h)(V128*,V128*) = h_pcmpistri_1B;
    789    UInt(*s)(V128*,V128*) = s_pcmpistri_1B;
    790 
    791    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    792 
    793    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    794    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    795    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    796    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    797 
    798    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    799    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    800    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    801 
    802    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    803    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    804    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    805    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    806 
    807    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    808    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    809    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    810 
    811    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    812 
    813    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    814    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    815    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
    816 
    817    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
    818    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
    819    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
    820 
    821    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
    822    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
    823    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
    824 
    825    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
    826    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
    827    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
    828 
    829    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    830    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    831 }
    832 
    833 
    834 
    835 //////////////////////////////////////////////////////////
    836 //                                                      //
    837 //                       ISTRI_03                       //
    838 //                                                      //
    839 //////////////////////////////////////////////////////////
    840 
    841 UInt h_pcmpistri_03 ( V128* argL, V128* argR )
    842 {
    843    V128 block[2];
    844    memcpy(&block[0], argL, sizeof(V128));
    845    memcpy(&block[1], argR, sizeof(V128));
    846    ULong res, flags;
    847    __asm__ __volatile__(
    848       "subq      $1024,  %%rsp"             "\n\t"
    849       "movdqu    0(%2),  %%xmm2"            "\n\t"
    850       "movdqu    16(%2), %%xmm11"           "\n\t"
    851       "pcmpistri $0x03,  %%xmm2, %%xmm11"   "\n\t"
    852 //"pcmpistrm $0x03, %%xmm2, %%xmm11"   "\n\t"
    853 //"movd %%xmm0, %%ecx" "\n\t"
    854       "pushfq"                              "\n\t"
    855       "popq      %%rdx"                     "\n\t"
    856       "movq      %%rcx,  %0"                "\n\t"
    857       "movq      %%rdx,  %1"                "\n\t"
    858       "addq      $1024,  %%rsp"             "\n\t"
    859       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    860       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    861    );
    862    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    863 }
    864 
    865 UInt s_pcmpistri_03 ( V128* argLU, V128* argRU )
    866 {
    867    V128 resV;
    868    UInt resOSZACP, resECX;
    869    Bool ok
    870       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    871 			    zmask_from_V128(argLU),
    872 			    zmask_from_V128(argRU),
    873 			    0x03, False/*!isSTRM*/
    874         );
    875    assert(ok);
    876    resECX = resV.uInt[0];
    877    return (resOSZACP << 16) | resECX;
    878 }
    879 
    880 void istri_03 ( void )
    881 {
    882    char* wot = "03";
    883    UInt(*h)(V128*,V128*) = h_pcmpistri_03;
    884    UInt(*s)(V128*,V128*) = s_pcmpistri_03;
    885 
    886    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
    887    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
    888    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
    889    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    890 
    891    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    892    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
    893    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
    894    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
    895    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
    896 
    897    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    898    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
    899    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
    900    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
    901 
    902    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    903    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    904 
    905    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    906    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    907    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
    908    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
    909 
    910    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
    911 
    912    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    913    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    914 }
    915 
    916 
    917 //////////////////////////////////////////////////////////
    918 //                                                      //
    919 //                       ISTRI_13                       //
    920 //                                                      //
    921 //////////////////////////////////////////////////////////
    922 
    923 UInt h_pcmpistri_13 ( V128* argL, V128* argR )
    924 {
    925    V128 block[2];
    926    memcpy(&block[0], argL, sizeof(V128));
    927    memcpy(&block[1], argR, sizeof(V128));
    928    ULong res, flags;
    929    __asm__ __volatile__(
    930       "subq      $1024,  %%rsp"             "\n\t"
    931       "movdqu    0(%2),  %%xmm2"            "\n\t"
    932       "movdqu    16(%2), %%xmm11"           "\n\t"
    933       "pcmpistri $0x13,  %%xmm2, %%xmm11"   "\n\t"
    934 //"pcmpistrm $0x13, %%xmm2, %%xmm11"   "\n\t"
    935 //"movd %%xmm0, %%ecx" "\n\t"
    936       "pushfq"                              "\n\t"
    937       "popq      %%rdx"                     "\n\t"
    938       "movq      %%rcx,  %0"                "\n\t"
    939       "movq      %%rdx,  %1"                "\n\t"
    940       "addq      $1024,  %%rsp"             "\n\t"
    941       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    942       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    943    );
    944    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    945 }
    946 
    947 UInt s_pcmpistri_13 ( V128* argLU, V128* argRU )
    948 {
    949    V128 resV;
    950    UInt resOSZACP, resECX;
    951    Bool ok
    952       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
    953 			    zmask_from_V128(argLU),
    954 			    zmask_from_V128(argRU),
    955 			    0x13, False/*!isSTRM*/
    956         );
    957    assert(ok);
    958    resECX = resV.uInt[0];
    959    return (resOSZACP << 16) | resECX;
    960 }
    961 
    962 void istri_13 ( void )
    963 {
    964    char* wot = "13";
    965    UInt(*h)(V128*,V128*) = h_pcmpistri_13;
    966    UInt(*s)(V128*,V128*) = s_pcmpistri_13;
    967 
    968    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
    969    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
    970    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
    971    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    972 
    973    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    974    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
    975    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
    976    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
    977    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
    978 
    979    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
    980    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
    981    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
    982    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
    983 
    984    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    985    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    986 
    987    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    988    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    989    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
    990    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
    991 
    992    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
    993 
    994    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    995    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    996 }
    997 
    998 
    999 
   1000 //////////////////////////////////////////////////////////
   1001 //                                                      //
   1002 //                       ISTRI_45                       //
   1003 //                                                      //
   1004 //////////////////////////////////////////////////////////
   1005 
   1006 UInt h_pcmpistri_45 ( V128* argL, V128* argR )
   1007 {
   1008    V128 block[2];
   1009    memcpy(&block[0], argL, sizeof(V128));
   1010    memcpy(&block[1], argR, sizeof(V128));
   1011    ULong res, flags;
   1012    __asm__ __volatile__(
   1013       "subq      $1024,  %%rsp"             "\n\t"
   1014       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1015       "movdqu    16(%2), %%xmm11"           "\n\t"
   1016       "pcmpistri $0x45,  %%xmm2, %%xmm11"   "\n\t"
   1017 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   1018 //"movd %%xmm0, %%ecx" "\n\t"
   1019       "pushfq"                              "\n\t"
   1020       "popq      %%rdx"                     "\n\t"
   1021       "movq      %%rcx,  %0"                "\n\t"
   1022       "movq      %%rdx,  %1"                "\n\t"
   1023       "addq      $1024,  %%rsp"             "\n\t"
   1024       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1025       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1026    );
   1027    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1028 }
   1029 
   1030 UInt s_pcmpistri_45 ( V128* argLU, V128* argRU )
   1031 {
   1032    V128 resV;
   1033    UInt resOSZACP, resECX;
   1034    Bool ok
   1035       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1036 			    zmask_from_V128(argLU),
   1037 			    zmask_from_V128(argRU),
   1038 			    0x45, False/*!isSTRM*/
   1039         );
   1040    assert(ok);
   1041    resECX = resV.uInt[0];
   1042    return (resOSZACP << 16) | resECX;
   1043 }
   1044 
   1045 void istri_45 ( void )
   1046 {
   1047    char* wot = "45";
   1048    UInt(*h)(V128*,V128*) = h_pcmpistri_45;
   1049    UInt(*s)(V128*,V128*) = s_pcmpistri_45;
   1050 
   1051    try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000bbcc");
   1052    try_istri(wot,h,s, "aaaabbbbccccdddd", "000000000000ccbb");
   1053    try_istri(wot,h,s, "baaabbbbccccdddd", "000000000000ccbb");
   1054    try_istri(wot,h,s, "baaabbbbccccdddc", "000000000000ccbb");
   1055 
   1056    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
   1057    try_istri(wot,h,s, "bbbbbbbb00bbbbbb", "000000000000ccbb");
   1058    try_istri(wot,h,s, "bbbbbbbbbbbb00bb", "000000000000ccbb");
   1059    try_istri(wot,h,s, "bbbbbbbbbbbbbb00", "000000000000ccbb");
   1060    try_istri(wot,h,s, "0000000000000000", "000000000000ccbb");
   1061 
   1062    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1063 
   1064    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000ccbb");
   1065    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000bb");
   1066    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000006622ccbb");
   1067 
   1068    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000022ccbb");
   1069    try_istri(wot,h,s, "bb44bb44bb44bb44", "000000000000ccbb");
   1070    try_istri(wot,h,s, "bb44bb44bb44bb44", "00000000000000bb");
   1071 
   1072    try_istri(wot,h,s, "0011223344556677", "0000997755442211");
   1073    try_istri(wot,h,s, "1122334455667711", "0000997755442211");
   1074 
   1075    try_istri(wot,h,s, "0011223344556677", "0000aa8866553322");
   1076    try_istri(wot,h,s, "1122334455667711", "0000aa8866553322");
   1077 }
   1078 
   1079 
   1080 //////////////////////////////////////////////////////////
   1081 //                                                      //
   1082 //                       ISTRI_01                       //
   1083 //                                                      //
   1084 //////////////////////////////////////////////////////////
   1085 
   1086 UInt h_pcmpistri_01 ( V128* argL, V128* argR )
   1087 {
   1088    V128 block[2];
   1089    memcpy(&block[0], argL, sizeof(V128));
   1090    memcpy(&block[1], argR, sizeof(V128));
   1091    ULong res, flags;
   1092    __asm__ __volatile__(
   1093       "subq      $1024,  %%rsp"             "\n\t"
   1094       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1095       "movdqu    16(%2), %%xmm11"           "\n\t"
   1096       "pcmpistri $0x01,  %%xmm2, %%xmm11"   "\n\t"
   1097 //"pcmpistrm $0x01, %%xmm2, %%xmm11"   "\n\t"
   1098 //"movd %%xmm0, %%ecx" "\n\t"
   1099       "pushfq"                              "\n\t"
   1100       "popq      %%rdx"                     "\n\t"
   1101       "movq      %%rcx,  %0"                "\n\t"
   1102       "movq      %%rdx,  %1"                "\n\t"
   1103       "addq      $1024,  %%rsp"             "\n\t"
   1104       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1105       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1106    );
   1107    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1108 }
   1109 
   1110 UInt s_pcmpistri_01 ( V128* argLU, V128* argRU )
   1111 {
   1112    V128 resV;
   1113    UInt resOSZACP, resECX;
   1114    Bool ok
   1115       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1116 			    zmask_from_V128(argLU),
   1117 			    zmask_from_V128(argRU),
   1118 			    0x01, False/*!isSTRM*/
   1119         );
   1120    assert(ok);
   1121    resECX = resV.uInt[0];
   1122    return (resOSZACP << 16) | resECX;
   1123 }
   1124 
   1125 void istri_01 ( void )
   1126 {
   1127    char* wot = "01";
   1128    UInt(*h)(V128*,V128*) = h_pcmpistri_01;
   1129    UInt(*s)(V128*,V128*) = s_pcmpistri_01;
   1130 
   1131    try_istri(wot,h,s, "aacdacbdaacdaacd", "00000000000000aa");
   1132    try_istri(wot,h,s, "aabbaabbaabbaabb", "00000000000000bb");
   1133    try_istri(wot,h,s, "aabbccddaabbccdd", "000000000000aabb");
   1134    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1135 
   1136    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
   1137    try_istri(wot,h,s, "00bbccddaabbccdd", "00000000aabbccdd");
   1138    try_istri(wot,h,s, "aabbccddaa00ccdd", "00000000aabbccdd");
   1139    try_istri(wot,h,s, "aabbccddaabb00dd", "00000000aabbccdd");
   1140    try_istri(wot,h,s, "aabbccddaabbcc00", "00000000aabbccdd");
   1141 
   1142    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbccdd");
   1143    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aa00ccdd");
   1144    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabb00dd");
   1145    try_istri(wot,h,s, "aabbccddaabbccdd", "00000000aabbcc00");
   1146 
   1147    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1148    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1149 
   1150    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1151    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1152    try_istri(wot,h,s, "0000aabbaabbaabb", "000000000000bbbb");
   1153    try_istri(wot,h,s, "0000ccddaabbccdd", "00000000bbaabbaa");
   1154 
   1155    try_istri(wot,h,s, "0000ccddaabbccdd", "000000bbaabbaa00");
   1156 
   1157    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1158    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1159 }
   1160 
   1161 
   1162 //////////////////////////////////////////////////////////
   1163 //                                                      //
   1164 //                       ISTRI_39                       //
   1165 //                                                      //
   1166 //////////////////////////////////////////////////////////
   1167 
   1168 UInt h_pcmpistri_39 ( V128* argL, V128* argR )
   1169 {
   1170    V128 block[2];
   1171    memcpy(&block[0], argL, sizeof(V128));
   1172    memcpy(&block[1], argR, sizeof(V128));
   1173    ULong res, flags;
   1174    __asm__ __volatile__(
   1175       "subq      $1024,  %%rsp"             "\n\t"
   1176       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1177       "movdqu    16(%2), %%xmm11"           "\n\t"
   1178       "pcmpistri $0x39,  %%xmm2, %%xmm11"   "\n\t"
   1179       "pushfq"                              "\n\t"
   1180       "popq      %%rdx"                     "\n\t"
   1181       "movq      %%rcx,  %0"                "\n\t"
   1182       "movq      %%rdx,  %1"                "\n\t"
   1183       "addq      $1024,  %%rsp"             "\n\t"
   1184       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1185       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1186    );
   1187    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1188 }
   1189 
   1190 UInt s_pcmpistri_39 ( V128* argLU, V128* argRU )
   1191 {
   1192    V128 resV;
   1193    UInt resOSZACP, resECX;
   1194    Bool ok
   1195       = pcmpXstrX_WRK_wide( &resV, &resOSZACP, argLU, argRU,
   1196 			    zmask_from_V128(argLU),
   1197 			    zmask_from_V128(argRU),
   1198 			    0x39, False/*!isSTRM*/
   1199         );
   1200    assert(ok);
   1201    resECX = resV.uInt[0];
   1202    return (resOSZACP << 16) | resECX;
   1203 }
   1204 
   1205 void istri_39 ( void )
   1206 {
   1207    char* wot = "39";
   1208    UInt(*h)(V128*,V128*) = h_pcmpistri_39;
   1209    UInt(*s)(V128*,V128*) = s_pcmpistri_39;
   1210 
   1211    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1212 
   1213    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1214    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1215    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1216    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1217 
   1218    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1219    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1220    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1221 
   1222    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1223    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1224    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1225    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1226 
   1227    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1228    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1229    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1230 
   1231    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1232 
   1233    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
   1234    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
   1235    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaa00aa");
   1236 
   1237    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaaaaaa");
   1238    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa00aa");
   1239    try_istri(wot,h,s, "aaaaaaaa00aaaaaa", "aaaaaaaaaaaa00aa");
   1240 
   1241    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaaaaaaaaaa");
   1242    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa00aaaaaa");
   1243    try_istri(wot,h,s, "aaaaaaaaaaaa00aa", "aaaaaaaa00aaaaaa");
   1244 
   1245    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa00aaaaaa");
   1246    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa00aaaaaa");
   1247    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa00aaaaaa");
   1248 
   1249    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1250    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1251 }
   1252 
   1253 
   1254 
   1255 //////////////////////////////////////////////////////////
   1256 //                                                      //
   1257 //                         main                         //
   1258 //                                                      //
   1259 //////////////////////////////////////////////////////////
   1260 
   1261 int main ( void )
   1262 {
   1263    istri_4B();
   1264    istri_3B();
   1265    istri_09();
   1266    istri_1B();
   1267    istri_03();
   1268    istri_0D();
   1269    istri_13();
   1270    istri_45();
   1271    istri_01();
   1272    istri_39();
   1273    return 0;
   1274 }
   1275