Home | History | Annotate | Download | only in amd64
      1 
      2 /* Tests in detail the core arithmetic for pcmp{e,i}str{i,m} using
      3    pcmpistri to drive it.  Does not check the e-vs-i or i-vs-m
      4    aspect. */
      5 
      6 #include <string.h>
      7 #include <stdio.h>
      8 #include <assert.h>
      9 
     10 typedef  unsigned int   UInt;
     11 typedef  signed int     Int;
     12 typedef  unsigned char  UChar;
     13 typedef  unsigned long long int ULong;
     14 typedef  UChar          Bool;
     15 #define False ((Bool)0)
     16 #define True  ((Bool)1)
     17 
     18 //typedef  unsigned char  V128[16];
     19 typedef
     20    union {
     21       UChar uChar[16];
     22       UInt  uInt[4];
     23    }
     24    V128;
     25 
     26 #define SHIFT_O   11
     27 #define SHIFT_S   7
     28 #define SHIFT_Z   6
     29 #define SHIFT_A   4
     30 #define SHIFT_C   0
     31 #define SHIFT_P   2
     32 
     33 #define MASK_O    (1ULL << SHIFT_O)
     34 #define MASK_S    (1ULL << SHIFT_S)
     35 #define MASK_Z    (1ULL << SHIFT_Z)
     36 #define MASK_A    (1ULL << SHIFT_A)
     37 #define MASK_C    (1ULL << SHIFT_C)
     38 #define MASK_P    (1ULL << SHIFT_P)
     39 
     40 
     41 UInt clz32 ( UInt x )
     42 {
     43    Int y, m, n;
     44    y = -(x >> 16);
     45    m = (y >> 16) & 16;
     46    n = 16 - m;
     47    x = x >> m;
     48    y = x - 0x100;
     49    m = (y >> 16) & 8;
     50    n = n + m;
     51    x = x << m;
     52    y = x - 0x1000;
     53    m = (y >> 16) & 4;
     54    n = n + m;
     55    x = x << m;
     56    y = x - 0x4000;
     57    m = (y >> 16) & 2;
     58    n = n + m;
     59    x = x << m;
     60    y = x >> 14;
     61    m = y & ~(y >> 1);
     62    return n + 2 - m;
     63 }
     64 
     65 UInt ctz32 ( UInt x )
     66 {
     67    return 32 - clz32((~x) & (x-1));
     68 }
     69 
     70 void expand ( V128* dst, char* summary )
     71 {
     72    Int i;
     73    assert( strlen(summary) == 16 );
     74    for (i = 0; i < 16; i++) {
     75       UChar xx = 0;
     76       UChar x = summary[15-i];
     77       if      (x >= '0' && x <= '9') { xx = x - '0'; }
     78       else if (x >= 'A' && x <= 'F') { xx = x - 'A' + 10; }
     79       else if (x >= 'a' && x <= 'f') { xx = x - 'a' + 10; }
     80       else assert(0);
     81 
     82       assert(xx < 16);
     83       xx = (xx << 4) | xx;
     84       assert(xx < 256);
     85       dst->uChar[i] = xx;
     86    }
     87 }
     88 
     89 void try_istri ( char* which,
     90                  UInt(*h_fn)(V128*,V128*),
     91                  UInt(*s_fn)(V128*,V128*),
     92                  char* summL, char* summR )
     93 {
     94    assert(strlen(which) == 2);
     95    V128 argL, argR;
     96    expand(&argL, summL);
     97    expand(&argR, summR);
     98    UInt h_res = h_fn(&argL, &argR);
     99    UInt s_res = s_fn(&argL, &argR);
    100    printf("istri %s  %s %s -> %08x %08x %s\n",
    101           which, summL, summR, h_res, s_res, h_res == s_res ? "" : "!!!!");
    102 }
    103 
    104 UInt zmask_from_V128 ( V128* arg )
    105 {
    106    UInt i, res = 0;
    107    for (i = 0; i < 16; i++) {
    108       res |=  ((arg->uChar[i] == 0) ? 1 : 0) << i;
    109    }
    110    return res;
    111 }
    112 
    113 //////////////////////////////////////////////////////////
    114 //                                                      //
    115 //                       GENERAL                        //
    116 //                                                      //
    117 //////////////////////////////////////////////////////////
    118 
    119 
    120 /* Given partial results from a pcmpXstrX operation (intRes1,
    121    basically), generate an I format (index value for ECX) output, and
    122    also the new OSZACP flags.
    123 */
    124 static
    125 void pcmpXstrX_WRK_gen_output_fmt_I(/*OUT*/V128* resV,
    126                                     /*OUT*/UInt* resOSZACP,
    127                                     UInt intRes1,
    128                                     UInt zmaskL, UInt zmaskR,
    129                                     UInt validL,
    130                                     UInt pol, UInt idx )
    131 {
    132    assert((pol >> 2) == 0);
    133    assert((idx >> 1) == 0);
    134 
    135    UInt intRes2 = 0;
    136    switch (pol) {
    137       case 0: intRes2 = intRes1;          break; // pol +
    138       case 1: intRes2 = ~intRes1;         break; // pol -
    139       case 2: intRes2 = intRes1;          break; // pol m+
    140       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
    141    }
    142    intRes2 &= 0xFFFF;
    143 
    144    // generate ecx value
    145    UInt newECX = 0;
    146    if (idx) {
    147      // index of ms-1-bit
    148      newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
    149    } else {
    150      // index of ls-1-bit
    151      newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
    152    }
    153 
    154    *(UInt*)(&resV[0]) = newECX;
    155 
    156    // generate new flags, common to all ISTRI and ISTRM cases
    157    *resOSZACP    // A, P are zero
    158      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
    159      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
    160      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
    161      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
    162 }
    163 
    164 
    165 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
    166    variants.
    167 
    168    For xSTRI variants, the new ECX value is placed in the 32 bits
    169    pointed to by *resV.  For xSTRM variants, the result is a 128 bit
    170    value and is placed at *resV in the obvious way.
    171 
    172    For all variants, the new OSZACP value is placed at *resOSZACP.
    173 
    174    argLV and argRV are the vector args.  The caller must prepare a
    175    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
    176    must be 1 for each zero byte of of the respective arg.  For ESTRx
    177    variants this is derived from the explicit length indication, and
    178    must be 0 in all places except at the bit index corresponding to
    179    the valid length (0 .. 16).  If the valid length is 16 then the
    180    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
    181 
    182    imm8 is the original immediate from the instruction.  isSTRM
    183    indicates whether this is a xSTRM or xSTRI variant, which controls
    184    how much of *res is written.
    185 
    186    If the given imm8 case can be handled, the return value is True.
    187    If not, False is returned, and neither *res not *resOSZACP are
    188    altered.
    189 */
    190 
    191 Bool pcmpXstrX_WRK ( /*OUT*/V128* resV,
    192                      /*OUT*/UInt* resOSZACP,
    193                      V128* argLV,  V128* argRV,
    194                      UInt zmaskL, UInt zmaskR,
    195                      UInt imm8,   Bool isSTRM )
    196 {
    197    assert(imm8 < 0x80);
    198    assert((zmaskL >> 16) == 0);
    199    assert((zmaskR >> 16) == 0);
    200 
    201    /* Explicitly reject any imm8 values that haven't been validated,
    202       even if they would probably work.  Life is too short to have
    203       unvalidated cases in the code base. */
    204    switch (imm8) {
    205       case 0x00:
    206       case 0x02: case 0x08: case 0x0C: case 0x12: case 0x1A:
    207       case 0x38: case 0x3A: case 0x44: case 0x4A:
    208          break;
    209       default:
    210          return False;
    211    }
    212 
    213    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
    214    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
    215    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
    216    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
    217 
    218    /*----------------------------------------*/
    219    /*-- strcmp on byte data                --*/
    220    /*----------------------------------------*/
    221 
    222    if (agg == 2/*equal each, aka strcmp*/
    223        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    224        && !isSTRM) {
    225       Int    i;
    226       UChar* argL = (UChar*)argLV;
    227       UChar* argR = (UChar*)argRV;
    228       UInt boolResII = 0;
    229       for (i = 15; i >= 0; i--) {
    230          UChar cL  = argL[i];
    231          UChar cR  = argR[i];
    232          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
    233       }
    234       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    235       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    236 
    237       // do invalidation, common to all equal-each cases
    238       UInt intRes1
    239          = (boolResII & validL & validR)  // if both valid, use cmpres
    240            | (~ (validL | validR));       // if both invalid, force 1
    241                                           // else force 0
    242       intRes1 &= 0xFFFF;
    243 
    244       // generate I-format output
    245       pcmpXstrX_WRK_gen_output_fmt_I(
    246          resV, resOSZACP,
    247          intRes1, zmaskL, zmaskR, validL, pol, idx
    248       );
    249 
    250       return True;
    251    }
    252 
    253    /*----------------------------------------*/
    254    /*-- set membership on byte data        --*/
    255    /*----------------------------------------*/
    256 
    257    if (agg == 0/*equal any, aka find chars in a set*/
    258        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    259        && !isSTRM) {
    260       /* argL: the string,  argR: charset */
    261       UInt   si, ci;
    262       UChar* argL    = (UChar*)argLV;
    263       UChar* argR    = (UChar*)argRV;
    264       UInt   boolRes = 0;
    265       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    266       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    267 
    268       for (si = 0; si < 16; si++) {
    269          if ((validL & (1 << si)) == 0)
    270             // run off the end of the string.
    271             break;
    272          UInt m = 0;
    273          for (ci = 0; ci < 16; ci++) {
    274             if ((validR & (1 << ci)) == 0) break;
    275             if (argR[ci] == argL[si]) { m = 1; break; }
    276          }
    277          boolRes |= (m << si);
    278       }
    279 
    280       // boolRes is "pre-invalidated"
    281       UInt intRes1 = boolRes & 0xFFFF;
    282 
    283       // generate I-format output
    284       pcmpXstrX_WRK_gen_output_fmt_I(
    285          resV, resOSZACP,
    286          intRes1, zmaskL, zmaskR, validL, pol, idx
    287       );
    288 
    289       return True;
    290    }
    291 
    292    /*----------------------------------------*/
    293    /*-- substring search on byte data      --*/
    294    /*----------------------------------------*/
    295 
    296    if (agg == 3/*equal ordered, aka substring search*/
    297        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)
    298        && !isSTRM) {
    299 
    300       /* argL: haystack,  argR: needle */
    301       UInt   ni, hi;
    302       UChar* argL    = (UChar*)argLV;
    303       UChar* argR    = (UChar*)argRV;
    304       UInt   boolRes = 0;
    305       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    306       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    307       for (hi = 0; hi < 16; hi++) {
    308          UInt m = 1;
    309          for (ni = 0; ni < 16; ni++) {
    310             if ((validR & (1 << ni)) == 0) break;
    311             UInt i = ni + hi;
    312             if (i >= 16) break;
    313             if (argL[i] != argR[ni]) { m = 0; break; }
    314          }
    315          boolRes |= (m << hi);
    316          if ((validL & (1 << hi)) == 0)
    317             // run off the end of the haystack
    318             break;
    319       }
    320 
    321       // boolRes is "pre-invalidated"
    322       UInt intRes1 = boolRes & 0xFFFF;
    323 
    324       // generate I-format output
    325       pcmpXstrX_WRK_gen_output_fmt_I(
    326          resV, resOSZACP,
    327          intRes1, zmaskL, zmaskR, validL, pol, idx
    328       );
    329 
    330       return True;
    331    }
    332 
    333    /*----------------------------------------*/
    334    /*-- ranges, unsigned byte data         --*/
    335    /*----------------------------------------*/
    336 
    337    if (agg == 1/*ranges*/
    338        && fmt == 0/*ub*/
    339        && !isSTRM) {
    340 
    341       /* argL: string,  argR: range-pairs */
    342       UInt   ri, si;
    343       UChar* argL    = (UChar*)argLV;
    344       UChar* argR    = (UChar*)argRV;
    345       UInt   boolRes = 0;
    346       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
    347       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
    348       for (si = 0; si < 16; si++) {
    349          if ((validL & (1 << si)) == 0)
    350             // run off the end of the string
    351             break;
    352          UInt m = 0;
    353          for (ri = 0; ri < 16; ri += 2) {
    354             if ((validR & (3 << ri)) != (3 << ri)) break;
    355             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
    356                m = 1; break;
    357             }
    358          }
    359          boolRes |= (m << si);
    360       }
    361 
    362       // boolRes is "pre-invalidated"
    363       UInt intRes1 = boolRes & 0xFFFF;
    364 
    365       // generate I-format output
    366       pcmpXstrX_WRK_gen_output_fmt_I(
    367          resV, resOSZACP,
    368          intRes1, zmaskL, zmaskR, validL, pol, idx
    369       );
    370 
    371       return True;
    372    }
    373 
    374    return False;
    375 }
    376 
    377 
    378 //////////////////////////////////////////////////////////
    379 //                                                      //
    380 //                       ISTRI_4A                       //
    381 //                                                      //
    382 //////////////////////////////////////////////////////////
    383 
    384 UInt h_pcmpistri_4A ( V128* argL, V128* argR )
    385 {
    386    V128 block[2];
    387    memcpy(&block[0], argL, sizeof(V128));
    388    memcpy(&block[1], argR, sizeof(V128));
    389    ULong res, flags;
    390    __asm__ __volatile__(
    391       "subq      $1024,  %%rsp"             "\n\t"
    392       "movdqu    0(%2),  %%xmm2"            "\n\t"
    393       "movdqu    16(%2), %%xmm11"           "\n\t"
    394       "pcmpistri $0x4A,  %%xmm2, %%xmm11"   "\n\t"
    395       "pushfq"                              "\n\t"
    396       "popq      %%rdx"                     "\n\t"
    397       "movq      %%rcx,  %0"                "\n\t"
    398       "movq      %%rdx,  %1"                "\n\t"
    399       "addq      $1024,  %%rsp"             "\n\t"
    400       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    401       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    402    );
    403    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    404 }
    405 
    406 UInt s_pcmpistri_4A ( V128* argLU, V128* argRU )
    407 {
    408    V128 resV;
    409    UInt resOSZACP, resECX;
    410    Bool ok
    411       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    412                        zmask_from_V128(argLU),
    413                        zmask_from_V128(argRU),
    414                        0x4A, False/*!isSTRM*/
    415         );
    416    assert(ok);
    417    resECX = resV.uInt[0];
    418    return (resOSZACP << 16) | resECX;
    419 }
    420 
    421 void istri_4A ( void )
    422 {
    423    char* wot = "4A";
    424    UInt(*h)(V128*,V128*) = h_pcmpistri_4A;
    425    UInt(*s)(V128*,V128*) = s_pcmpistri_4A;
    426 
    427    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    428 
    429    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    430    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    431    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    432    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    433 
    434    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    435    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    436    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    437 
    438    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    439    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    440    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    441    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    442 
    443    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    444    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    445    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    446 
    447    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    448 
    449    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    450    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    451    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    452 
    453    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    454    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    455    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    456 
    457    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    458    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    459    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    460 
    461    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    462    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    463    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    464 
    465    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    466    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    467 }
    468 
    469 //////////////////////////////////////////////////////////
    470 //                                                      //
    471 //                       ISTRI_3A                       //
    472 //                                                      //
    473 //////////////////////////////////////////////////////////
    474 
    475 UInt h_pcmpistri_3A ( V128* argL, V128* argR )
    476 {
    477    V128 block[2];
    478    memcpy(&block[0], argL, sizeof(V128));
    479    memcpy(&block[1], argR, sizeof(V128));
    480    ULong res, flags;
    481    __asm__ __volatile__(
    482       "subq      $1024,  %%rsp"             "\n\t"
    483       "movdqu    0(%2),  %%xmm2"            "\n\t"
    484       "movdqu    16(%2), %%xmm11"           "\n\t"
    485       "pcmpistri $0x3A,  %%xmm2, %%xmm11"   "\n\t"
    486       "pushfq"                              "\n\t"
    487       "popq      %%rdx"                     "\n\t"
    488       "movq      %%rcx,  %0"                "\n\t"
    489       "movq      %%rdx,  %1"                "\n\t"
    490       "addq      $1024,  %%rsp"             "\n\t"
    491       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    492       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    493    );
    494    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    495 }
    496 
    497 UInt s_pcmpistri_3A ( V128* argLU, V128* argRU )
    498 {
    499    V128 resV;
    500    UInt resOSZACP, resECX;
    501    Bool ok
    502       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    503                        zmask_from_V128(argLU),
    504                        zmask_from_V128(argRU),
    505                        0x3A, False/*!isSTRM*/
    506         );
    507    assert(ok);
    508    resECX = resV.uInt[0];
    509    return (resOSZACP << 16) | resECX;
    510 }
    511 
    512 void istri_3A ( void )
    513 {
    514    char* wot = "3A";
    515    UInt(*h)(V128*,V128*) = h_pcmpistri_3A;
    516    UInt(*s)(V128*,V128*) = s_pcmpistri_3A;
    517 
    518    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    519 
    520    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    521    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    522    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    523    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    524 
    525    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    526    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    527    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    528 
    529    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    530    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    531    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    532    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    533 
    534    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    535    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    536    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    537 
    538    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    539 
    540    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    541    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    542    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    543 
    544    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    545    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    546    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    547 
    548    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    549    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    550    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    551 
    552    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    553    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    554    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    555 
    556    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    557    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    558 }
    559 
    560 
    561 
    562 //////////////////////////////////////////////////////////
    563 //                                                      //
    564 //                       ISTRI_0C                       //
    565 //                                                      //
    566 //////////////////////////////////////////////////////////
    567 
    568 __attribute__((noinline))
    569 UInt h_pcmpistri_0C ( V128* argL, V128* argR )
    570 {
    571    V128 block[2];
    572    memcpy(&block[0], argL, sizeof(V128));
    573    memcpy(&block[1], argR, sizeof(V128));
    574    ULong res = 0, flags = 0;
    575    __asm__ __volatile__(
    576       "movdqu    0(%2),  %%xmm2"            "\n\t"
    577       "movdqu    16(%2), %%xmm11"           "\n\t"
    578       "pcmpistri $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    579       //"pcmpistrm $0x0C,  %%xmm2, %%xmm11"   "\n\t"
    580       //"movd %%xmm0, %%ecx" "\n\t"
    581       "pushfq"                              "\n\t"
    582       "popq      %%rdx"                     "\n\t"
    583       "movq      %%rcx,  %0"                "\n\t"
    584       "movq      %%rdx,  %1"                "\n\t"
    585       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    586       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    587    );
    588    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    589 }
    590 
    591 UInt s_pcmpistri_0C ( V128* argLU, V128* argRU )
    592 {
    593    V128 resV;
    594    UInt resOSZACP, resECX;
    595    Bool ok
    596       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    597                        zmask_from_V128(argLU),
    598                        zmask_from_V128(argRU),
    599                        0x0C, False/*!isSTRM*/
    600         );
    601    assert(ok);
    602    resECX = resV.uInt[0];
    603    return (resOSZACP << 16) | resECX;
    604 }
    605 
    606 void istri_0C ( void )
    607 {
    608    char* wot = "0C";
    609    UInt(*h)(V128*,V128*) = h_pcmpistri_0C;
    610    UInt(*s)(V128*,V128*) = s_pcmpistri_0C;
    611 
    612    try_istri(wot,h,s, "111111111abcde11", "00000000000abcde");
    613 
    614    try_istri(wot,h,s, "111111111abcde11", "0000abcde00abcde");
    615 
    616    try_istri(wot,h,s, "1111111111abcde1", "00000000000abcde");
    617    try_istri(wot,h,s, "11111111111abcde", "00000000000abcde");
    618    try_istri(wot,h,s, "111111111111abcd", "00000000000abcde");
    619 
    620    try_istri(wot,h,s, "111abcde1abcde11", "00000000000abcde");
    621 
    622    try_istri(wot,h,s, "11abcde11abcde11", "00000000000abcde");
    623    try_istri(wot,h,s, "1abcde111abcde11", "00000000000abcde");
    624    try_istri(wot,h,s, "abcde1111abcde11", "00000000000abcde");
    625    try_istri(wot,h,s, "bcde11111abcde11", "00000000000abcde");
    626    try_istri(wot,h,s, "cde111111abcde11", "00000000000abcde");
    627 
    628    try_istri(wot,h,s, "01abcde11abcde11", "00000000000abcde");
    629    try_istri(wot,h,s, "00abcde11abcde11", "00000000000abcde");
    630    try_istri(wot,h,s, "000bcde11abcde11", "00000000000abcde");
    631 
    632    try_istri(wot,h,s, "00abcde10abcde11", "00000000000abcde");
    633    try_istri(wot,h,s, "00abcde100bcde11", "00000000000abcde");
    634 
    635    try_istri(wot,h,s, "1111111111111234", "0000000000000000");
    636    try_istri(wot,h,s, "1111111111111234", "0000000000000001");
    637    try_istri(wot,h,s, "1111111111111234", "0000000000000011");
    638 
    639    try_istri(wot,h,s, "1111111111111234", "1111111111111234");
    640    try_istri(wot,h,s, "a111111111111111", "000000000000000a");
    641    try_istri(wot,h,s, "b111111111111111", "000000000000000a");
    642 
    643    try_istri(wot,h,s, "b111111111111111", "0000000000000000");
    644    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    645    try_istri(wot,h,s, "123456789abcdef1", "0000000000000000");
    646    try_istri(wot,h,s, "0000000000000000", "123456789abcdef1");
    647 }
    648 
    649 
    650 //////////////////////////////////////////////////////////
    651 //                                                      //
    652 //                       ISTRI_08                       //
    653 //                                                      //
    654 //////////////////////////////////////////////////////////
    655 
    656 UInt h_pcmpistri_08 ( V128* argL, V128* argR )
    657 {
    658    V128 block[2];
    659    memcpy(&block[0], argL, sizeof(V128));
    660    memcpy(&block[1], argR, sizeof(V128));
    661    ULong res, flags;
    662    __asm__ __volatile__(
    663       "subq      $1024,  %%rsp"             "\n\t"
    664       "movdqu    0(%2),  %%xmm2"            "\n\t"
    665       "movdqu    16(%2), %%xmm11"           "\n\t"
    666       "pcmpistri $0x08,  %%xmm2, %%xmm11"   "\n\t"
    667       "pushfq"                              "\n\t"
    668       "popq      %%rdx"                     "\n\t"
    669       "movq      %%rcx,  %0"                "\n\t"
    670       "movq      %%rdx,  %1"                "\n\t"
    671       "addq      $1024,  %%rsp"             "\n\t"
    672       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    673       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    674    );
    675    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    676 }
    677 
    678 UInt s_pcmpistri_08 ( V128* argLU, V128* argRU )
    679 {
    680    V128 resV;
    681    UInt resOSZACP, resECX;
    682    Bool ok
    683       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    684                        zmask_from_V128(argLU),
    685                        zmask_from_V128(argRU),
    686                        0x08, False/*!isSTRM*/
    687         );
    688    assert(ok);
    689    resECX = resV.uInt[0];
    690    return (resOSZACP << 16) | resECX;
    691 }
    692 
    693 void istri_08 ( void )
    694 {
    695    char* wot = "08";
    696    UInt(*h)(V128*,V128*) = h_pcmpistri_08;
    697    UInt(*s)(V128*,V128*) = s_pcmpistri_08;
    698 
    699    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    700 
    701    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    702    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    703    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    704    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    705 
    706    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    707    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    708    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    709 
    710    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    711    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    712    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    713    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    714 
    715    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    716    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    717    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    718 
    719    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    720 
    721    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    722    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    723    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    724 
    725    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    726    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    727    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    728 
    729    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    730    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    731    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    732 
    733    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    734    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    735    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    736 
    737    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    738    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    739 }
    740 
    741 
    742 
    743 //////////////////////////////////////////////////////////
    744 //                                                      //
    745 //                       ISTRI_1A                       //
    746 //                                                      //
    747 //////////////////////////////////////////////////////////
    748 
    749 UInt h_pcmpistri_1A ( V128* argL, V128* argR )
    750 {
    751    V128 block[2];
    752    memcpy(&block[0], argL, sizeof(V128));
    753    memcpy(&block[1], argR, sizeof(V128));
    754    ULong res, flags;
    755    __asm__ __volatile__(
    756       "subq      $1024,  %%rsp"             "\n\t"
    757       "movdqu    0(%2),  %%xmm2"            "\n\t"
    758       "movdqu    16(%2), %%xmm11"           "\n\t"
    759       "pcmpistri $0x1A,  %%xmm2, %%xmm11"   "\n\t"
    760       "pushfq"                              "\n\t"
    761       "popq      %%rdx"                     "\n\t"
    762       "movq      %%rcx,  %0"                "\n\t"
    763       "movq      %%rdx,  %1"                "\n\t"
    764       "addq      $1024,  %%rsp"             "\n\t"
    765       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    766       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    767    );
    768    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    769 }
    770 
    771 UInt s_pcmpistri_1A ( V128* argLU, V128* argRU )
    772 {
    773    V128 resV;
    774    UInt resOSZACP, resECX;
    775    Bool ok
    776       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    777                        zmask_from_V128(argLU),
    778                        zmask_from_V128(argRU),
    779                        0x1A, False/*!isSTRM*/
    780         );
    781    assert(ok);
    782    resECX = resV.uInt[0];
    783    return (resOSZACP << 16) | resECX;
    784 }
    785 
    786 void istri_1A ( void )
    787 {
    788    char* wot = "1A";
    789    UInt(*h)(V128*,V128*) = h_pcmpistri_1A;
    790    UInt(*s)(V128*,V128*) = s_pcmpistri_1A;
    791 
    792    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    793 
    794    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    795    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    796    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
    797    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
    798 
    799    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
    800    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
    801    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
    802 
    803    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    804    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    805    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    806    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    807 
    808    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    809    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
    810    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
    811 
    812    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    813 
    814    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    815    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    816    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
    817 
    818    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
    819    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
    820    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
    821 
    822    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
    823    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
    824    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
    825 
    826    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
    827    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
    828    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
    829 
    830    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
    831    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
    832 }
    833 
    834 
    835 
    836 //////////////////////////////////////////////////////////
    837 //                                                      //
    838 //                       ISTRI_02                       //
    839 //                                                      //
    840 //////////////////////////////////////////////////////////
    841 
    842 UInt h_pcmpistri_02 ( V128* argL, V128* argR )
    843 {
    844    V128 block[2];
    845    memcpy(&block[0], argL, sizeof(V128));
    846    memcpy(&block[1], argR, sizeof(V128));
    847    ULong res, flags;
    848    __asm__ __volatile__(
    849       "subq      $1024,  %%rsp"             "\n\t"
    850       "movdqu    0(%2),  %%xmm2"            "\n\t"
    851       "movdqu    16(%2), %%xmm11"           "\n\t"
    852       "pcmpistri $0x02,  %%xmm2, %%xmm11"   "\n\t"
    853 //"pcmpistrm $0x02, %%xmm2, %%xmm11"   "\n\t"
    854 //"movd %%xmm0, %%ecx" "\n\t"
    855       "pushfq"                              "\n\t"
    856       "popq      %%rdx"                     "\n\t"
    857       "movq      %%rcx,  %0"                "\n\t"
    858       "movq      %%rdx,  %1"                "\n\t"
    859       "addq      $1024,  %%rsp"             "\n\t"
    860       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    861       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    862    );
    863    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    864 }
    865 
    866 UInt s_pcmpistri_02 ( V128* argLU, V128* argRU )
    867 {
    868    V128 resV;
    869    UInt resOSZACP, resECX;
    870    Bool ok
    871       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    872                        zmask_from_V128(argLU),
    873                        zmask_from_V128(argRU),
    874                        0x02, False/*!isSTRM*/
    875         );
    876    assert(ok);
    877    resECX = resV.uInt[0];
    878    return (resOSZACP << 16) | resECX;
    879 }
    880 
    881 void istri_02 ( void )
    882 {
    883    char* wot = "02";
    884    UInt(*h)(V128*,V128*) = h_pcmpistri_02;
    885    UInt(*s)(V128*,V128*) = s_pcmpistri_02;
    886 
    887    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
    888    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
    889    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
    890    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    891 
    892    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    893    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
    894    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
    895    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
    896    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
    897 
    898    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    899    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
    900    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
    901    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
    902 
    903    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    904    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    905 
    906    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    907    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    908    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
    909    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
    910 
    911    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
    912 
    913    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    914    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    915 }
    916 
    917 
    918 //////////////////////////////////////////////////////////
    919 //                                                      //
    920 //                       ISTRI_12                       //
    921 //                                                      //
    922 //////////////////////////////////////////////////////////
    923 
    924 UInt h_pcmpistri_12 ( V128* argL, V128* argR )
    925 {
    926    V128 block[2];
    927    memcpy(&block[0], argL, sizeof(V128));
    928    memcpy(&block[1], argR, sizeof(V128));
    929    ULong res, flags;
    930    __asm__ __volatile__(
    931       "subq      $1024,  %%rsp"             "\n\t"
    932       "movdqu    0(%2),  %%xmm2"            "\n\t"
    933       "movdqu    16(%2), %%xmm11"           "\n\t"
    934       "pcmpistri $0x12,  %%xmm2, %%xmm11"   "\n\t"
    935 //"pcmpistrm $0x12, %%xmm2, %%xmm11"   "\n\t"
    936 //"movd %%xmm0, %%ecx" "\n\t"
    937       "pushfq"                              "\n\t"
    938       "popq      %%rdx"                     "\n\t"
    939       "movq      %%rcx,  %0"                "\n\t"
    940       "movq      %%rdx,  %1"                "\n\t"
    941       "addq      $1024,  %%rsp"             "\n\t"
    942       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
    943       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
    944    );
    945    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
    946 }
    947 
    948 UInt s_pcmpistri_12 ( V128* argLU, V128* argRU )
    949 {
    950    V128 resV;
    951    UInt resOSZACP, resECX;
    952    Bool ok
    953       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
    954                        zmask_from_V128(argLU),
    955                        zmask_from_V128(argRU),
    956                        0x12, False/*!isSTRM*/
    957         );
    958    assert(ok);
    959    resECX = resV.uInt[0];
    960    return (resOSZACP << 16) | resECX;
    961 }
    962 
    963 void istri_12 ( void )
    964 {
    965    char* wot = "12";
    966    UInt(*h)(V128*,V128*) = h_pcmpistri_12;
    967    UInt(*s)(V128*,V128*) = s_pcmpistri_12;
    968 
    969    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
    970    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
    971    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
    972    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
    973 
    974    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    975    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
    976    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
    977    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
    978    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
    979 
    980    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
    981    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
    982    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
    983    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
    984 
    985    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
    986    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
    987 
    988    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
    989    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
    990    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
    991    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
    992 
    993    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
    994 
    995    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
    996    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
    997 }
    998 
    999 
   1000 
   1001 //////////////////////////////////////////////////////////
   1002 //                                                      //
   1003 //                       ISTRI_44                       //
   1004 //                                                      //
   1005 //////////////////////////////////////////////////////////
   1006 
   1007 UInt h_pcmpistri_44 ( V128* argL, V128* argR )
   1008 {
   1009    V128 block[2];
   1010    memcpy(&block[0], argL, sizeof(V128));
   1011    memcpy(&block[1], argR, sizeof(V128));
   1012    ULong res, flags;
   1013    __asm__ __volatile__(
   1014       "subq      $1024,  %%rsp"             "\n\t"
   1015       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1016       "movdqu    16(%2), %%xmm11"           "\n\t"
   1017       "pcmpistri $0x44,  %%xmm2, %%xmm11"   "\n\t"
   1018 //"pcmpistrm $0x04, %%xmm2, %%xmm11"   "\n\t"
   1019 //"movd %%xmm0, %%ecx" "\n\t"
   1020       "pushfq"                              "\n\t"
   1021       "popq      %%rdx"                     "\n\t"
   1022       "movq      %%rcx,  %0"                "\n\t"
   1023       "movq      %%rdx,  %1"                "\n\t"
   1024       "addq      $1024,  %%rsp"             "\n\t"
   1025       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1026       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1027    );
   1028    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1029 }
   1030 
   1031 UInt s_pcmpistri_44 ( V128* argLU, V128* argRU )
   1032 {
   1033    V128 resV;
   1034    UInt resOSZACP, resECX;
   1035    Bool ok
   1036       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1037                        zmask_from_V128(argLU),
   1038                        zmask_from_V128(argRU),
   1039                        0x44, False/*!isSTRM*/
   1040         );
   1041    assert(ok);
   1042    resECX = resV.uInt[0];
   1043    return (resOSZACP << 16) | resECX;
   1044 }
   1045 
   1046 void istri_44 ( void )
   1047 {
   1048    char* wot = "44";
   1049    UInt(*h)(V128*,V128*) = h_pcmpistri_44;
   1050    UInt(*s)(V128*,V128*) = s_pcmpistri_44;
   1051 
   1052    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000bc");
   1053    try_istri(wot,h,s, "aaaabbbbccccdddd", "00000000000000cb");
   1054    try_istri(wot,h,s, "baaabbbbccccdddd", "00000000000000cb");
   1055    try_istri(wot,h,s, "baaabbbbccccdddc", "00000000000000cb");
   1056 
   1057    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1058    try_istri(wot,h,s, "bbbbbbbb0bbbbbbb", "00000000000000cb");
   1059    try_istri(wot,h,s, "bbbbbbbbbbbbbb0b", "00000000000000cb");
   1060    try_istri(wot,h,s, "bbbbbbbbbbbbbbb0", "00000000000000cb");
   1061    try_istri(wot,h,s, "0000000000000000", "00000000000000cb");
   1062 
   1063    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1064 
   1065    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "00000000000000cb");
   1066    try_istri(wot,h,s, "bbbbbbbbbbbbbbbb", "000000000000000b");
   1067    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000062cb");
   1068 
   1069    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000002cb");
   1070    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "00000000000000cb");
   1071    try_istri(wot,h,s, "b4b4b4b4b4b4b4b4", "000000000000000b");
   1072 
   1073    try_istri(wot,h,s, "0123456789abcdef", "000000fecb975421");
   1074    try_istri(wot,h,s, "123456789abcdef1", "000000fecb975421");
   1075 
   1076    try_istri(wot,h,s, "0123456789abcdef", "00000000dca86532");
   1077    try_istri(wot,h,s, "123456789abcdef1", "00000000dca86532");
   1078 }
   1079 
   1080 
   1081 //////////////////////////////////////////////////////////
   1082 //                                                      //
   1083 //                       ISTRI_00                       //
   1084 //                                                      //
   1085 //////////////////////////////////////////////////////////
   1086 
   1087 UInt h_pcmpistri_00 ( V128* argL, V128* argR )
   1088 {
   1089    V128 block[2];
   1090    memcpy(&block[0], argL, sizeof(V128));
   1091    memcpy(&block[1], argR, sizeof(V128));
   1092    ULong res, flags;
   1093    __asm__ __volatile__(
   1094       "subq      $1024,  %%rsp"             "\n\t"
   1095       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1096       "movdqu    16(%2), %%xmm11"           "\n\t"
   1097       "pcmpistri $0x00,  %%xmm2, %%xmm11"   "\n\t"
   1098 //"pcmpistrm $0x00, %%xmm2, %%xmm11"   "\n\t"
   1099 //"movd %%xmm0, %%ecx" "\n\t"
   1100       "pushfq"                              "\n\t"
   1101       "popq      %%rdx"                     "\n\t"
   1102       "movq      %%rcx,  %0"                "\n\t"
   1103       "movq      %%rdx,  %1"                "\n\t"
   1104       "addq      $1024,  %%rsp"             "\n\t"
   1105       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1106       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1107    );
   1108    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1109 }
   1110 
   1111 UInt s_pcmpistri_00 ( V128* argLU, V128* argRU )
   1112 {
   1113    V128 resV;
   1114    UInt resOSZACP, resECX;
   1115    Bool ok
   1116       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1117                        zmask_from_V128(argLU),
   1118                        zmask_from_V128(argRU),
   1119                        0x00, False/*!isSTRM*/
   1120         );
   1121    assert(ok);
   1122    resECX = resV.uInt[0];
   1123    return (resOSZACP << 16) | resECX;
   1124 }
   1125 
   1126 void istri_00 ( void )
   1127 {
   1128    char* wot = "00";
   1129    UInt(*h)(V128*,V128*) = h_pcmpistri_00;
   1130    UInt(*s)(V128*,V128*) = s_pcmpistri_00;
   1131 
   1132    try_istri(wot,h,s, "abcdacbdabcdabcd", "000000000000000a");
   1133    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000000b");
   1134    try_istri(wot,h,s, "abcdabcdabcdabcd", "00000000000000ab");
   1135    try_istri(wot,h,s, "abcdabc0abcdabcd", "000000000000abcd");
   1136 
   1137    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1138    try_istri(wot,h,s, "0bcdabcdabcdabcd", "000000000000abcd");
   1139    try_istri(wot,h,s, "abcdabcdabcda0cd", "000000000000abcd");
   1140    try_istri(wot,h,s, "abcdabcdabcdab0d", "000000000000abcd");
   1141    try_istri(wot,h,s, "abcdabcdabcdabc0", "000000000000abcd");
   1142 
   1143    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abcd");
   1144    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000a0cd");
   1145    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000ab0d");
   1146    try_istri(wot,h,s, "abcdabcdabcdabcd", "000000000000abc0");
   1147 
   1148    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1149    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1150 
   1151    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000abcd");
   1152    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000dcba");
   1153    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000bbbb");
   1154    try_istri(wot,h,s, "0000abcdabcdabcd", "000000000000baba");
   1155 
   1156    try_istri(wot,h,s, "0000abcdabcdabcd", "00000000000baba0");
   1157 
   1158    try_istri(wot,h,s, "0ddc0ffeebadf00d", "00000000cafebabe");
   1159    try_istri(wot,h,s, "0ddc0ffeebadfeed", "00000000cafebabe");
   1160 }
   1161 
   1162 
   1163 //////////////////////////////////////////////////////////
   1164 //                                                      //
   1165 //                       ISTRI_38                       //
   1166 //                                                      //
   1167 //////////////////////////////////////////////////////////
   1168 
   1169 UInt h_pcmpistri_38 ( V128* argL, V128* argR )
   1170 {
   1171    V128 block[2];
   1172    memcpy(&block[0], argL, sizeof(V128));
   1173    memcpy(&block[1], argR, sizeof(V128));
   1174    ULong res, flags;
   1175    __asm__ __volatile__(
   1176       "subq      $1024,  %%rsp"             "\n\t"
   1177       "movdqu    0(%2),  %%xmm2"            "\n\t"
   1178       "movdqu    16(%2), %%xmm11"           "\n\t"
   1179       "pcmpistri $0x38,  %%xmm2, %%xmm11"   "\n\t"
   1180       "pushfq"                              "\n\t"
   1181       "popq      %%rdx"                     "\n\t"
   1182       "movq      %%rcx,  %0"                "\n\t"
   1183       "movq      %%rdx,  %1"                "\n\t"
   1184       "addq      $1024,  %%rsp"             "\n\t"
   1185       : /*out*/ "=r"(res), "=r"(flags) : "r"/*in*/(&block[0])
   1186       : "rcx","rdx","xmm0","xmm2","xmm11","cc","memory"
   1187    );
   1188    return ((flags & 0x8D5) << 16) | (res & 0xFFFF);
   1189 }
   1190 
   1191 UInt s_pcmpistri_38 ( V128* argLU, V128* argRU )
   1192 {
   1193    V128 resV;
   1194    UInt resOSZACP, resECX;
   1195    Bool ok
   1196       = pcmpXstrX_WRK( &resV, &resOSZACP, argLU, argRU,
   1197                        zmask_from_V128(argLU),
   1198                        zmask_from_V128(argRU),
   1199                        0x38, False/*!isSTRM*/
   1200         );
   1201    assert(ok);
   1202    resECX = resV.uInt[0];
   1203    return (resOSZACP << 16) | resECX;
   1204 }
   1205 
   1206 void istri_38 ( void )
   1207 {
   1208    char* wot = "38";
   1209    UInt(*h)(V128*,V128*) = h_pcmpistri_38;
   1210    UInt(*s)(V128*,V128*) = s_pcmpistri_38;
   1211 
   1212    try_istri(wot,h,s, "0000000000000000", "0000000000000000");
   1213 
   1214    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1215    try_istri(wot,h,s, "aaaa2aaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1216    try_istri(wot,h,s, "aaaaaaaaa2aaaaaa", "aaaaaaaaaaaaaaaa");
   1217    try_istri(wot,h,s, "aaaaaaaaaaaaa2aa", "aaaaaaaaaaaaaaaa");
   1218 
   1219    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaa2aaaaaaaaaaa");
   1220    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaa2aaaaaa");
   1221    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaa2a");
   1222 
   1223    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1224    try_istri(wot,h,s, "baaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1225    try_istri(wot,h,s, "b9aaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1226    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1227 
   1228    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1229    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaaaaaa7aaa");
   1230    try_istri(wot,h,s, "b9baaaaaaaaaaaaa", "aaaaaaaa2aaa4aaa");
   1231 
   1232    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaa");
   1233 
   1234    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1235    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1236    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaa0aaa");
   1237 
   1238    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaaaaaa");
   1239    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaaaaaa0aaa");
   1240    try_istri(wot,h,s, "aaaaaaaa0aaaaaaa", "aaaaaaaaaaaa0aaa");
   1241 
   1242    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaaaaaaaaaa");
   1243    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "aaaaaaaa0aaaaaaa");
   1244    try_istri(wot,h,s, "aaaaaaaaaaaa0aaa", "aaaaaaaa0aaaaaaa");
   1245 
   1246    try_istri(wot,h,s, "0000000000000000", "aaaaaaaa0aaaaaaa");
   1247    try_istri(wot,h,s, "8000000000000000", "aaaaaaaa0aaaaaaa");
   1248    try_istri(wot,h,s, "0000000000000001", "aaaaaaaa0aaaaaaa");
   1249 
   1250    try_istri(wot,h,s, "0000000000000000", "aaaaaaaaaaaaaaaa");
   1251    try_istri(wot,h,s, "aaaaaaaaaaaaaaaa", "0000000000000000");
   1252 }
   1253 
   1254 
   1255 
   1256 //////////////////////////////////////////////////////////
   1257 //                                                      //
   1258 //                         main                         //
   1259 //                                                      //
   1260 //////////////////////////////////////////////////////////
   1261 
   1262 int main ( void )
   1263 {
   1264    istri_4A();
   1265    istri_3A();
   1266    istri_08();
   1267    istri_1A();
   1268    istri_02();
   1269    istri_0C();
   1270    istri_12();
   1271    istri_44();
   1272    istri_00();
   1273    istri_38();
   1274    return 0;
   1275 }
   1276