      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Instrument IR to perform memory checking operations.         ---*/
      4 /*---                                               mc_translate.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of MemCheck, a heavyweight Valgrind tool for
      9    detecting memory errors.
     10 
     11    Copyright (C) 2000-2017 Julian Seward
     12       jseward@acm.org
     13 
     14    This program is free software; you can redistribute it and/or
     15    modify it under the terms of the GNU General Public License as
     16    published by the Free Software Foundation; either version 2 of the
     17    License, or (at your option) any later version.
     18 
     19    This program is distributed in the hope that it will be useful, but
     20    WITHOUT ANY WARRANTY; without even the implied warranty of
     21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     22    General Public License for more details.
     23 
     24    You should have received a copy of the GNU General Public License
     25    along with this program; if not, write to the Free Software
     26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     27    02111-1307, USA.
     28 
     29    The GNU General Public License is contained in the file COPYING.
     30 */
     31 
     32 #include "pub_tool_basics.h"
     33 #include "pub_tool_poolalloc.h"     // For mc_include.h
     34 #include "pub_tool_hashtable.h"     // For mc_include.h
     35 #include "pub_tool_libcassert.h"
     36 #include "pub_tool_libcprint.h"
     37 #include "pub_tool_tooliface.h"
     38 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
     39 #include "pub_tool_xarray.h"
     40 #include "pub_tool_mallocfree.h"
     41 #include "pub_tool_libcbase.h"
     42 
     43 #include "mc_include.h"
     44 
     45 
     46 /* FIXMEs JRS 2011-June-16.
     47 
     48    Check the interpretation for vector narrowing and widening ops,
     49    particularly the saturating ones.  I suspect they are overly
     50    pessimistic and/or wrong.
     51 
     52    Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
     53    saturating shifts): the interpretation is overly pessimistic.
     54    See comments on the relevant cases below for details.
     55 
     56    Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
     57    both rounding and non-rounding variants): ditto
     58 */
     59 
     60 /* This file implements the Memcheck instrumentation, and in
     61    particular contains the core of its undefined value detection
     62    machinery.  For a comprehensive background of the terminology,
     63    algorithms and rationale used herein, read:
     64 
     65      Using Valgrind to detect undefined value errors with
     66      bit-precision
     67 
     68      Julian Seward and Nicholas Nethercote
     69 
     70      2005 USENIX Annual Technical Conference (General Track),
     71      Anaheim, CA, USA, April 10-15, 2005.
     72 
     73    ----
     74 
     75    Here is as good a place as any to record exactly when V bits are and
     76    should be checked, why, and what function is responsible.
     77 
     78 
     79    Memcheck complains when an undefined value is used:
     80 
     81    1. In the condition of a conditional branch.  Because it could cause
     82       incorrect control flow, and thus cause incorrect externally-visible
     83       behaviour.  [mc_translate.c:complainIfUndefined]
     84 
     85    2. As an argument to a system call, or as the value that specifies
     86       the system call number.  Because it could cause an incorrect
     87       externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
     88 
     89    3. As the address in a load or store.  Because it could cause an
     90       incorrect value to be used later, which could cause externally-visible
     91       behaviour (eg. via incorrect control flow or an incorrect system call
     92       argument)  [complainIfUndefined]
     93 
     94    4. As the target address of a branch.  Because it could cause incorrect
     95       control flow.  [complainIfUndefined]
     96 
     97    5. As an argument to setenv, unsetenv, or putenv.  Because it could put
     98       an incorrect value into the external environment.
     99       [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
    100 
    101    6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
    102       [complainIfUndefined]
    103 
    104    7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
    105       VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
    106       requested it.  [in memcheck.h]
    107 
    108 
    109    Memcheck also complains, but should not, when an undefined value is used:
    110 
    111    8. As the shift value in certain SIMD shift operations (but not in the
    112       standard integer shift operations).  This inconsistency is due to
    113       historical reasons.  [complainIfUndefined]
    114 
    115 
    116    Memcheck does not complain, but should, when an undefined value is used:
    117 
    118    9. As an input to a client request.  Because the client request may
    119       affect the visible behaviour -- see bug #144362 for an example
    120       involving the malloc replacements in vg_replace_malloc.c and
    121       VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
    122       isn't identified.  That bug report also has some info on how to solve
    123       the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
    124 
    125 
    126    In practice, 1 and 2 account for the vast majority of cases.
    127 */
    128 
    129 /* Generation of addr-definedness, addr-validity and
    130    guard-definedness checks pertaining to loads and stores (Iex_Load,
    131    Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
    132    loads/stores) was re-checked 11 May 2013. */
    133 
    134 /*------------------------------------------------------------*/
    135 /*--- Forward decls                                        ---*/
    136 /*------------------------------------------------------------*/
    137 
    138 struct _MCEnv;
    139 
    140 static IRType  shadowTypeV ( IRType ty );
    141 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
    142 static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
    143 
    144 static IRExpr *i128_const_zero(void);
    145 
    146 /*------------------------------------------------------------*/
    147 /*--- Memcheck running state, and tmp management.          ---*/
    148 /*------------------------------------------------------------*/
    149 
    150 /* Carries info about a particular tmp.  The tmp's number is not
    151    recorded, as this is implied by (equal to) its index in the tmpMap
    152    in MCEnv.  The tmp's type is also not recorded, as this is present
    153    in MCEnv.sb->tyenv.
    154 
    155    When .kind is Orig, .shadowV and .shadowB may give the identities
    156    of the temps currently holding the associated definedness (shadowV)
    157    and origin (shadowB) values, or these may be IRTemp_INVALID if code
    158    to compute such values has not yet been emitted.
    159 
    160    When .kind is VSh or BSh then the tmp holds a V- or B- value,
    161    and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
    162    illogical for a shadow tmp itself to be shadowed.
    163 */
    164 typedef
    165    enum { Orig=1, VSh=2, BSh=3 }
    166    TempKind;
    167 
    168 typedef
    169    struct {
    170       TempKind kind;
    171       IRTemp   shadowV;
    172       IRTemp   shadowB;
    173    }
    174    TempMapEnt;
    175 
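        /* Illustrative example (tmp numbers made up): if the IRSB contains an
           original tmp t3 whose V-shadow has already been allocated as t9, the
           map would hold

              tmpMap[3] = { Orig, .shadowV = t9,             .shadowB = IRTemp_INVALID }
              tmpMap[9] = { VSh,  .shadowV = IRTemp_INVALID, .shadowB = IRTemp_INVALID }

           that is, only Orig entries ever point at shadow tmps; VSh and BSh
           entries never carry shadows of their own. */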
    176 
    177 /* Carries around state during memcheck instrumentation. */
    178 typedef
    179    struct _MCEnv {
    180       /* MODIFIED: the superblock being constructed.  IRStmts are
    181          added. */
    182       IRSB* sb;
    183       Bool  trace;
    184 
    185       /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
    186          current kind and possibly shadow temps for each temp in the
    187          IRSB being constructed.  Note that it does not contain the
    188          type of each tmp.  If you want to know the type, look at the
    189          relevant entry in sb->tyenv.  It follows that at all times
    190          during the instrumentation process, the valid indices for
    191          tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
    192          total number of Orig, V- and B- temps allocated so far.
    193 
    194          The reason for this strange split (types in one place, all
    195          other info in another) is that we need the types to be
    196          attached to sb so as to make it possible to do
    197          "typeOfIRExpr(mce->sb->tyenv, ...)" at various places in the
    198          instrumentation process. */
    199       XArray* /* of TempMapEnt */ tmpMap;
    200 
    201       /* MODIFIED: indicates whether "bogus" literals have so far been
    202          found.  Starts off False, and may change to True. */
    203       Bool bogusLiterals;
    204 
    205       /* READONLY: indicates whether we should use expensive
    206          interpretations of integer adds, since unfortunately LLVM
    207          uses them to do ORs in some circumstances.  Defaulted to True
    208          on MacOS and False everywhere else. */
    209       Bool useLLVMworkarounds;
    210 
    211       /* READONLY: the guest layout.  This indicates which parts of
    212          the guest state should be regarded as 'always defined'. */
    213       const VexGuestLayout* layout;
    214 
    215       /* READONLY: the host word type.  Needed for constructing
    216          arguments of type 'HWord' to be passed to helper functions.
    217          Ity_I32 or Ity_I64 only. */
    218       IRType hWordTy;
    219    }
    220    MCEnv;
    221 
    222 /* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
    223    demand), as they are encountered.  This is for two reasons.
    224 
    225    (1) (less important reason): Many original tmps are unused due to
    226    initial IR optimisation, and we do not want to waste space in tables
    227    tracking them.
    228 
    229    Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
    230    table indexed [0 .. n_temps-1], which gives the current shadow for
    231    each original tmp, or IRTemp_INVALID if none is so far assigned.
    232    It is necessary to support making multiple assignments to a shadow
    233    -- specifically, after testing a shadow for definedness, it needs
    234    to be made defined.  But IR's SSA property disallows this.
    235 
    236    (2) (more important reason): Therefore, when a shadow needs to get
    237    a new value, a new temporary is created, the value is assigned to
    238    that, and the tmpMap is updated to reflect the new binding.
    239 
    240    A corollary is that if the tmpMap maps a given tmp to
    241    IRTemp_INVALID and we are hoping to read that shadow tmp, it means
    242    there's a read-before-write error in the original tmps.  The IR
    243    sanity checker should catch all such anomalies, however.
    244 */
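        /* Illustrative sketch of the re-shadowing pattern just described (tmp
           numbers made up): suppose orig tmp t2 is currently shadowed by t7.
           Once t2 has been tested for definedness, its shadow must be forced
           to 'defined', but IR's SSA rule forbids assigning to t7 again.  So
           the instrumenter does, in effect:

              newShadowTmpV(mce, t2);  // fresh shadow, say t11; tmpMap updated
              assign('V', mce, findShadowTmpV(mce, t2),        // t11, not t7
                     definedOfType(shadowTypeV(mce->sb->tyenv->types[t2])));

           and all later reads of t2's shadow pick up t11 via tmpMap. */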
    245 
    246 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
    247    both the table in mce->sb and to our auxiliary mapping.  Note that
    248    newTemp may cause mce->tmpMap to resize, hence previous results
    249    from VG_(indexXA)(mce->tmpMap) are invalidated. */
    250 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
    251 {
    252    Word       newIx;
    253    TempMapEnt ent;
    254    IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
    255    ent.kind    = kind;
    256    ent.shadowV = IRTemp_INVALID;
    257    ent.shadowB = IRTemp_INVALID;
    258    newIx = VG_(addToXA)( mce->tmpMap, &ent );
    259    tl_assert(newIx == (Word)tmp);
    260    return tmp;
    261 }
    262 
    263 
    264 /* Find the tmp currently shadowing the given original tmp.  If none
    265    so far exists, allocate one.  */
    266 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
    267 {
    268    TempMapEnt* ent;
    269    /* VG_(indexXA) range-checks 'orig', hence no need to check
    270       here. */
    271    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    272    tl_assert(ent->kind == Orig);
    273    if (ent->shadowV == IRTemp_INVALID) {
    274       IRTemp tmpV
    275         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
    276       /* newTemp may cause mce->tmpMap to resize, hence previous results
    277          from VG_(indexXA) are invalid. */
    278       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    279       tl_assert(ent->kind == Orig);
    280       tl_assert(ent->shadowV == IRTemp_INVALID);
    281       ent->shadowV = tmpV;
    282    }
    283    return ent->shadowV;
    284 }
    285 
    286 /* Allocate a new shadow for the given original tmp.  This means any
    287    previous shadow is abandoned.  This is needed because it is
    288    necessary to give a new value to a shadow once it has been tested
    289    for undefinedness, but unfortunately IR's SSA property disallows
    290    this.  Instead we must abandon the old shadow, allocate a new one
    291    and use that instead.
    292 
    293    This is the same as findShadowTmpV, except we don't bother to see
    294    if a shadow temp already existed -- we simply allocate a new one
    295    regardless. */
    296 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
    297 {
    298    TempMapEnt* ent;
    299    /* VG_(indexXA) range-checks 'orig', hence no need to check
    300       here. */
    301    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    302    tl_assert(ent->kind == Orig);
    303    if (1) {
    304       IRTemp tmpV
    305         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
    306       /* newTemp may cause mce->tmpMap to resize, hence previous results
    307          from VG_(indexXA) are invalid. */
    308       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    309       tl_assert(ent->kind == Orig);
    310       ent->shadowV = tmpV;
    311    }
    312 }
    313 
    314 
    315 /*------------------------------------------------------------*/
    316 /*--- IRAtoms -- a subset of IRExprs                       ---*/
    317 /*------------------------------------------------------------*/
    318 
    319 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
    320    isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
    321    input, most of this code deals in atoms.  Usefully, a value atom
    322    always has a V-value which is also an atom: constants are shadowed
    323    by constants, and temps are shadowed by the corresponding shadow
    324    temporary. */
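        /* For example (illustrative): the original atom RdTmp(t5) is shadowed
           by RdTmp(findShadowTmpV(mce,t5)), itself an atom, while the original
           atom Const(0x27:I32) is shadowed by the atom Const(0x0:I32), i.e.
           "all bits defined". */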
    325 
    326 typedef  IRExpr  IRAtom;
    327 
    328 /* (used for sanity checks only): is this an atom which looks
    329    like it's from original code? */
    330 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
    331 {
    332    if (a1->tag == Iex_Const)
    333       return True;
    334    if (a1->tag == Iex_RdTmp) {
    335       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
    336       return ent->kind == Orig;
    337    }
    338    return False;
    339 }
    340 
    341 /* (used for sanity checks only): is this an atom which looks
    342    like it's from shadow code? */
    343 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
    344 {
    345    if (a1->tag == Iex_Const)
    346       return True;
    347    if (a1->tag == Iex_RdTmp) {
    348       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
    349       return ent->kind == VSh || ent->kind == BSh;
    350    }
    351    return False;
    352 }
    353 
    354 /* (used for sanity checks only): check that both args are atoms and
    355    are identically-kinded. */
    356 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
    357 {
    358    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
    359       return True;
    360    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
    361       return True;
    362    return False;
    363 }
    364 
    365 
    366 /*------------------------------------------------------------*/
    367 /*--- Type management                                      ---*/
    368 /*------------------------------------------------------------*/
    369 
    370 /* Shadow state is always accessed using integer types.  This returns
    371    an integer type with the same size (as per sizeofIRType) as the
    372    given type.  The only valid shadow types are Bit, I8, I16, I32,
    373    I64, I128, V128, V256. */
    374 
    375 static IRType shadowTypeV ( IRType ty )
    376 {
    377    switch (ty) {
    378       case Ity_I1:
    379       case Ity_I8:
    380       case Ity_I16:
    381       case Ity_I32:
    382       case Ity_I64:
    383       case Ity_I128: return ty;
    384       case Ity_F16:  return Ity_I16;
    385       case Ity_F32:  return Ity_I32;
    386       case Ity_D32:  return Ity_I32;
    387       case Ity_F64:  return Ity_I64;
    388       case Ity_D64:  return Ity_I64;
    389       case Ity_F128: return Ity_I128;
    390       case Ity_D128: return Ity_I128;
    391       case Ity_V128: return Ity_V128;
    392       case Ity_V256: return Ity_V256;
    393       default: ppIRType(ty);
    394                VG_(tool_panic)("memcheck:shadowTypeV");
    395    }
    396 }
    397 
    398 /* Produce a 'defined' value of the given shadow type.  Should only be
    399    supplied shadow types (I1/I8/I16/I32/I64/I128/V128/V256). */
    400 static IRExpr* definedOfType ( IRType ty ) {
    401    switch (ty) {
    402       case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
    403       case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
    404       case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
    405       case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
    406       case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
    407       case Ity_I128: return i128_const_zero();
    408       case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
    409       case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
    410       default:       VG_(tool_panic)("memcheck:definedOfType");
    411    }
    412 }
    413 
    414 
    415 /*------------------------------------------------------------*/
    416 /*--- Constructing IR fragments                            ---*/
    417 /*------------------------------------------------------------*/
    418 
    419 /* add stmt to a bb */
    420 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
    421    if (mce->trace) {
    422       VG_(printf)("  %c: ", cat);
    423       ppIRStmt(st);
    424       VG_(printf)("\n");
    425    }
    426    addStmtToIRSB(mce->sb, st);
    427 }
    428 
    429 /* assign value to tmp */
    430 static inline
    431 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
    432    stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
    433 }
    434 
    435 /* build various kinds of expressions */
    436 #define triop(_op, _arg1, _arg2, _arg3) \
    437                                  IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
    438 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
    439 #define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
    440 #define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
    441 #define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
    442 #define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
    443 #define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
    444 #define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
    445 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
    446 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
    447 
    448 /* Bind the given expression to a new temporary, and return the
    449    temporary.  This effectively converts an arbitrary expression into
    450    an atom.
    451 
    452    'ty' is the type of 'e' and hence the type that the new temporary
    453    needs to be.  But passing it in is redundant, since we can deduce
    454    the type merely by inspecting 'e'.  So at least use that fact to
    455    assert that the two types agree. */
    456 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
    457 {
    458    TempKind k;
    459    IRTemp   t;
    460    IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
    461 
    462    tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
    463    switch (cat) {
    464       case 'V': k = VSh;  break;
    465       case 'B': k = BSh;  break;
    466       case 'C': k = Orig; break;
    467                 /* happens when we are making up new "orig"
    468                    expressions, for IRCAS handling */
    469       default: tl_assert(0);
    470    }
    471    t = newTemp(mce, ty, k);
    472    assign(cat, mce, t, e);
    473    return mkexpr(t);
    474 }
    475 
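        /* Usage sketch (illustrative): to OR two I32 shadow atoms va and vb
           and get the result back as an atom, one writes

              IRAtom* vres = assignNew('V', mce, Ity_I32, binop(Iop_Or32, va, vb));

           which appends "tNN = Or32(va,vb)" to mce->sb and returns RdTmp(tNN),
           keeping the superblock flat.  This is exactly the pattern used by
           the mkUifU and mkDifD helpers below. */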
    476 
    477 /*------------------------------------------------------------*/
    478 /*--- Helper functions for 128-bit ops                     ---*/
    479 /*------------------------------------------------------------*/
    480 
    481 static IRExpr *i128_const_zero(void)
    482 {
    483    IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
    484    return binop(Iop_64HLto128, z64, z64);
    485 }
    486 
    487 /* There are no I128 loads or stores [as generated by any current
    488    front ends].  So we do not need to worry about that in
    489    expr2vbits_Load. */
    490 
    491 
    492 /*------------------------------------------------------------*/
    493 /*--- Constructing definedness primitive ops               ---*/
    494 /*------------------------------------------------------------*/
    495 
    496 /* --------- Defined-if-either-defined --------- */
    497 
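        /* V bits use 0 = defined, 1 = undefined, so a result bit is defined if
           *either* argument bit is, which is simply bitwise AND of the shadows.
           Worked example (illustrative, 8 bits):

              a1 = 0b00001111   (low 4 bits undefined)
              a2 = 0b11110000   (high 4 bits undefined)
              DifD8(a1,a2) = a1 & a2 = 0b00000000   (every bit defined somewhere) */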
    498 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    499    tl_assert(isShadowAtom(mce,a1));
    500    tl_assert(isShadowAtom(mce,a2));
    501    return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
    502 }
    503 
    504 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    505    tl_assert(isShadowAtom(mce,a1));
    506    tl_assert(isShadowAtom(mce,a2));
    507    return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
    508 }
    509 
    510 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    511    tl_assert(isShadowAtom(mce,a1));
    512    tl_assert(isShadowAtom(mce,a2));
    513    return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
    514 }
    515 
    516 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    517    tl_assert(isShadowAtom(mce,a1));
    518    tl_assert(isShadowAtom(mce,a2));
    519    return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
    520 }
    521 
    522 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    523    tl_assert(isShadowAtom(mce,a1));
    524    tl_assert(isShadowAtom(mce,a2));
    525    return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
    526 }
    527 
    528 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    529    tl_assert(isShadowAtom(mce,a1));
    530    tl_assert(isShadowAtom(mce,a2));
    531    return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
    532 }
    533 
    534 /* --------- Undefined-if-either-undefined --------- */
    535 
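        /* Dually, a result bit is undefined if *either* argument bit is, which
           is bitwise OR of the shadows.  Worked example (illustrative, 8 bits):

              a1 = 0b00001111   (low 4 bits undefined)
              a2 = 0b11110000   (high 4 bits undefined)
              UifU8(a1,a2) = a1 | a2 = 0b11111111   (no bit is defined in both) */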
    536 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    537    tl_assert(isShadowAtom(mce,a1));
    538    tl_assert(isShadowAtom(mce,a2));
    539    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
    540 }
    541 
    542 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    543    tl_assert(isShadowAtom(mce,a1));
    544    tl_assert(isShadowAtom(mce,a2));
    545    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
    546 }
    547 
    548 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    549    tl_assert(isShadowAtom(mce,a1));
    550    tl_assert(isShadowAtom(mce,a2));
    551    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
    552 }
    553 
    554 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    555    tl_assert(isShadowAtom(mce,a1));
    556    tl_assert(isShadowAtom(mce,a2));
    557    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
    558 }
    559 
    560 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    561    IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
    562    tl_assert(isShadowAtom(mce,a1));
    563    tl_assert(isShadowAtom(mce,a2));
    564    tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
    565    tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
    566    tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
    567    tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
    568    tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
    569    tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
    570 
    571    return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
    572 }
    573 
    574 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    575    tl_assert(isShadowAtom(mce,a1));
    576    tl_assert(isShadowAtom(mce,a2));
    577    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
    578 }
    579 
    580 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    581    tl_assert(isShadowAtom(mce,a1));
    582    tl_assert(isShadowAtom(mce,a2));
    583    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
    584 }
    585 
    586 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
    587    switch (vty) {
    588       case Ity_I8:   return mkUifU8(mce, a1, a2);
    589       case Ity_I16:  return mkUifU16(mce, a1, a2);
    590       case Ity_I32:  return mkUifU32(mce, a1, a2);
    591       case Ity_I64:  return mkUifU64(mce, a1, a2);
    592       case Ity_I128: return mkUifU128(mce, a1, a2);
    593       case Ity_V128: return mkUifUV128(mce, a1, a2);
    594       case Ity_V256: return mkUifUV256(mce, a1, a2);
    595       default:
    596          VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
    597          VG_(tool_panic)("memcheck:mkUifU");
    598    }
    599 }
    600 
    601 /* --------- The Left-family of operations. --------- */
    602 
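        /* Assuming Iop_LeftN has its usual VEX meaning, Left(x) = x | -x, this
           smears the lowest undefined (1) bit of a shadow value leftwards.
           Worked example (illustrative, 8 bits):

              vbits        = 0b00000100                  (only bit 2 undefined)
              Left8(vbits) = 0b00000100 | 0b11111100 = 0b11111100

           so e.g. an addition is treated as undefined from bit 2 upwards
           (where the carry chain may be disturbed) but defined below it. */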
    603 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
    604    tl_assert(isShadowAtom(mce,a1));
    605    return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
    606 }
    607 
    608 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
    609    tl_assert(isShadowAtom(mce,a1));
    610    return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
    611 }
    612 
    613 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
    614    tl_assert(isShadowAtom(mce,a1));
    615    return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
    616 }
    617 
    618 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    619    tl_assert(isShadowAtom(mce,a1));
    620    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
    621 }
    622 
    623 /* --------- 'Improvement' functions for AND/OR. --------- */
    624 
    625 /* ImproveAND(data, vbits) = data OR vbits.  Bit positions where data
    626    is a defined 0 give defined (0); all others give undefined (1).
    627 */
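        /* Worked example (illustrative, 8 bits) of the AND improvement term:

              data = 0b11110000,  vbits = 0b00001100
              ImproveAND8(data,vbits) = data | vbits = 0b11111100

           Bits 0 and 1, which are defined 0s in data, come out as 0 (defined):
           the AND result there is certainly 0 whatever the other operand
           holds.  DifD'ing this term onto the naive UifU interpretation of
           And8 recovers that precision. */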
    628 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    629 {
    630    tl_assert(isOriginalAtom(mce, data));
    631    tl_assert(isShadowAtom(mce, vbits));
    632    tl_assert(sameKindedAtoms(data, vbits));
    633    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
    634 }
    635 
    636 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    637 {
    638    tl_assert(isOriginalAtom(mce, data));
    639    tl_assert(isShadowAtom(mce, vbits));
    640    tl_assert(sameKindedAtoms(data, vbits));
    641    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
    642 }
    643 
    644 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    645 {
    646    tl_assert(isOriginalAtom(mce, data));
    647    tl_assert(isShadowAtom(mce, vbits));
    648    tl_assert(sameKindedAtoms(data, vbits));
    649    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
    650 }
    651 
    652 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    653 {
    654    tl_assert(isOriginalAtom(mce, data));
    655    tl_assert(isShadowAtom(mce, vbits));
    656    tl_assert(sameKindedAtoms(data, vbits));
    657    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
    658 }
    659 
    660 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    661 {
    662    tl_assert(isOriginalAtom(mce, data));
    663    tl_assert(isShadowAtom(mce, vbits));
    664    tl_assert(sameKindedAtoms(data, vbits));
    665    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
    666 }
    667 
    668 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    669 {
    670    tl_assert(isOriginalAtom(mce, data));
    671    tl_assert(isShadowAtom(mce, vbits));
    672    tl_assert(sameKindedAtoms(data, vbits));
    673    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
    674 }
    675 
    676 /* ImproveOR(data, vbits) = ~data OR vbits.  Bit positions where data
    677    is a defined 1 give defined (0); all others give undefined (1).
    678 */
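        /* Worked example (illustrative, 8 bits) of the OR improvement term:

              data = 0b00000011,  vbits = 0b00001100
              ImproveOR8(data,vbits) = ~data | vbits = 0b11111100

           Bits 0 and 1, which are defined 1s in data, come out as 0 (defined):
           the OR result there is certainly 1 whatever the other operand holds. */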
    679 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    680 {
    681    tl_assert(isOriginalAtom(mce, data));
    682    tl_assert(isShadowAtom(mce, vbits));
    683    tl_assert(sameKindedAtoms(data, vbits));
    684    return assignNew(
    685              'V', mce, Ity_I8,
    686              binop(Iop_Or8,
    687                    assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
    688                    vbits) );
    689 }
    690 
    691 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    692 {
    693    tl_assert(isOriginalAtom(mce, data));
    694    tl_assert(isShadowAtom(mce, vbits));
    695    tl_assert(sameKindedAtoms(data, vbits));
    696    return assignNew(
    697              'V', mce, Ity_I16,
    698              binop(Iop_Or16,
    699                    assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
    700                    vbits) );
    701 }
    702 
    703 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    704 {
    705    tl_assert(isOriginalAtom(mce, data));
    706    tl_assert(isShadowAtom(mce, vbits));
    707    tl_assert(sameKindedAtoms(data, vbits));
    708    return assignNew(
    709              'V', mce, Ity_I32,
    710              binop(Iop_Or32,
    711                    assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
    712                    vbits) );
    713 }
    714 
    715 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    716 {
    717    tl_assert(isOriginalAtom(mce, data));
    718    tl_assert(isShadowAtom(mce, vbits));
    719    tl_assert(sameKindedAtoms(data, vbits));
    720    return assignNew(
    721              'V', mce, Ity_I64,
    722              binop(Iop_Or64,
    723                    assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
    724                    vbits) );
    725 }
    726 
    727 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    728 {
    729    tl_assert(isOriginalAtom(mce, data));
    730    tl_assert(isShadowAtom(mce, vbits));
    731    tl_assert(sameKindedAtoms(data, vbits));
    732    return assignNew(
    733              'V', mce, Ity_V128,
    734              binop(Iop_OrV128,
    735                    assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
    736                    vbits) );
    737 }
    738 
    739 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    740 {
    741    tl_assert(isOriginalAtom(mce, data));
    742    tl_assert(isShadowAtom(mce, vbits));
    743    tl_assert(sameKindedAtoms(data, vbits));
    744    return assignNew(
    745              'V', mce, Ity_V256,
    746              binop(Iop_OrV256,
    747                    assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
    748                    vbits) );
    749 }
    750 
    751 /* --------- Pessimising casts. --------- */
    752 
    753 /* The function returns an expression of type DST_TY.  If any bit of
    754    VBITS is undefined (== 1), the resulting expression has all bits
    755    set to 1.  Otherwise, all bits are 0. */
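        /* Worked example (illustrative): if vbits:I32 == 0x0000FF00 (bits 8..15
           undefined), then

              mkPCastTo(mce, Ity_I32, vbits)  -->  0xFFFFFFFF   (all undefined)
              mkPCastTo(mce, Ity_I1,  vbits)  -->  1            (undefined)

           whereas a fully defined input (vbits == 0) yields 0 in both cases.
           The cast deliberately loses precision: any undefined bit anywhere
           makes the whole result undefined. */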
    756 
    757 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
    758 {
    759    IRType  src_ty;
    760    IRAtom* tmp1;
    761 
    762    /* Note, dst_ty is a shadow type, not an original type. */
    763    tl_assert(isShadowAtom(mce,vbits));
    764    src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
    765 
    766    /* Fast-track some common cases */
    767    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
    768       return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    769 
    770    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
    771       return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
    772 
    773    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
    774       /* PCast the arg, then clone it. */
    775       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    776       return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    777    }
    778 
    779    if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
    780       /* PCast the arg, then clone it 4 times. */
    781       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    782       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    783       return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
    784    }
    785 
    786    if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
    787       /* PCast the arg, then clone it 8 times. */
    788       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    789       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    790       tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
    791       return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
    792    }
    793 
    794    if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
    795       /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
    796          the top half. */
    797       IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
    798       return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
    799    }
    800 
    801    if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
    802       /* Use InterleaveHI64x2 to copy the top half of the vector into
    803          the bottom half.  Then we can UifU it with the original, throw
    804          away the upper half of the result, and PCast-I64-to-I64
    805          the lower half. */
    806       // Generates vbits[127:64] : vbits[127:64]
    807       IRAtom* hi64hi64
    808          = assignNew('V', mce, Ity_V128,
    809                      binop(Iop_InterleaveHI64x2, vbits, vbits));
    810       // Generates
    811       //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
    812       //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
    813       IRAtom* lohi64
    814          = mkUifUV128(mce, hi64hi64, vbits);
    815       // Generates UifU(vbits[127:64],vbits[63:0])
    816       IRAtom* lo64
    817          = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
    818       // Generates
    819       //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
    820       //   == PCast-to-I64( vbits[127:0] )
    821       IRAtom* res
    822          = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
    823       return res;
    824    }
    825 
    826    /* Else do it the slow way .. */
    827    /* First of all, collapse vbits down to a single bit. */
    828    tmp1   = NULL;
    829    switch (src_ty) {
    830       case Ity_I1:
    831          tmp1 = vbits;
    832          break;
    833       case Ity_I8:
    834          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
    835          break;
    836       case Ity_I16:
    837          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
    838          break;
    839       case Ity_I32:
    840          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
    841          break;
    842       case Ity_I64:
    843          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
    844          break;
    845       case Ity_I128: {
    846          /* Gah.  Chop it in half, OR the halves together, and compare
    847             that with zero. */
    848          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
    849          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
    850          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
    851          tmp1         = assignNew('V', mce, Ity_I1,
    852                                        unop(Iop_CmpNEZ64, tmp4));
    853          break;
    854       }
    855       case Ity_V128: {
    856          /* Chop it in half, OR the halves together, and compare that
    857           * with zero.
    858           */
    859          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vbits));
    860          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vbits));
    861          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
    862          tmp1         = assignNew('V', mce, Ity_I1,
    863                                        unop(Iop_CmpNEZ64, tmp4));
    864          break;
    865       }
    866       default:
    867          ppIRType(src_ty);
    868          VG_(tool_panic)("mkPCastTo(1)");
    869    }
    870    tl_assert(tmp1);
    871    /* Now widen up to the dst type. */
    872    switch (dst_ty) {
    873       case Ity_I1:
    874          return tmp1;
    875       case Ity_I8:
    876          return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
    877       case Ity_I16:
    878          return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
    879       case Ity_I32:
    880          return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
    881       case Ity_I64:
    882          return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
    883       case Ity_V128:
    884          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    885          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
    886          return tmp1;
    887       case Ity_I128:
    888          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    889          tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
    890          return tmp1;
    891       case Ity_V256:
    892          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    893          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
    894                                                     tmp1, tmp1));
    895          tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
    896                                                     tmp1, tmp1));
    897          return tmp1;
    898       default:
    899          ppIRType(dst_ty);
    900          VG_(tool_panic)("mkPCastTo(2)");
    901    }
    902 }
    903 
    904 /* This is a minor variant.  It takes an arg of some type and returns
    905    a value of the same type.  The result consists entirely of Defined
    906    (zero) bits except its least significant bit, which is a PCast of
    907    the entire argument down to a single bit. */
    908 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
    909 {
    910    if (ty == Ity_V128) {
    911       /* --- Case for V128 --- */
    912       IRAtom* varg128 = varg;
    913       // generates: PCast-to-I64(varg128)
    914       IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
    915       // Now introduce zeros (defined bits) in the top 63 places
    916       // generates: Def--(63)--Def PCast-to-I1(varg128)
    917       IRAtom* d63pc
    918          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
    919       // generates: Def--(64)--Def
    920       IRAtom* d64
    921          = definedOfType(Ity_I64);
    922       // generates: Def--(127)--Def PCast-to-I1(varg128)
    923       IRAtom* res
    924          = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
    925       return res;
    926    }
    927    if (ty == Ity_I64) {
    928       /* --- Case for I64 --- */
    929       // PCast to 64
    930       IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
    931       // Zero (Def) out the top 63 bits
    932       IRAtom* res
    933          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
    934       return res;
    935    }
    936    /*NOTREACHED*/
    937    tl_assert(0);
    938 }
    939 
    940 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
    941 /*
    942    Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
    943    PCasting to Ity_I1.  However, sometimes it is necessary to be more
    944    accurate.  The insight is that the result is defined if two
    945    corresponding bits can be found, one from each argument, so that
    946    both bits are defined but are different -- that makes EQ say "No"
    947    and NE say "Yes".  Hence, we compute an improvement term and DifD
    948    it onto the "normal" (UifU) result.
    949 
    950    The result is:
    951 
    952    PCastTo<1> (
    953       -- naive version
    954       PCastTo<sz>( UifU<sz>(vxx, vyy) )
    955 
    956       `DifD<sz>`
    957 
    958       -- improvement term
    959       PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
    960    )
    961 
    962    where
    963      vec contains 0 (defined) bits where the corresponding arg bits
    964      are defined but different, and 1 bits otherwise.
    965 
    966      vec = Or<sz>( vxx,   // 0 iff bit defined
    967                    vyy,   // 0 iff bit defined
    968                    Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
    969                  )
    970 
    971      If any bit of vec is 0, the result is defined and so the
    972      improvement term should produce 0...0, else it should produce
    973      1...1.
    974 
    975      Hence require for the improvement term:
    976 
    977         if vec == 1...1 then 1...1 else 0...0
    978      ->
    979         PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )
    980 
    981    This was extensively re-analysed and checked on 6 July 05.
    982 */
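        /* Worked example (illustrative, 16 bits): let

              xx = 0x00FF   vxx = 0xFF00   (low byte defined, high byte not)
              yy = 0x0000   vyy = 0x0000   (fully defined)

           The naive term PCast16(UifU16(vxx,vyy)) = PCast16(0xFF00) = 0xFFFF
           says "undefined".  But bit 0 is defined in both args and differs
           (1 vs 0), so the comparison's outcome is actually known:

              vec         = vxx | vyy | ~(xx ^ yy) = 0xFF00 | 0x0000 | 0xFF00 = 0xFF00
              improvement = PCast16(CmpEQ16(vec, 0xFFFF)) = PCast16(0:I1) = 0x0000

           and DifD16(0xFFFF, 0x0000) = 0x0000, so the final PCast-to-I1
           reports the comparison result as defined. */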
    983 static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
    984                                     IRType  ty,
    985                                     IRAtom* vxx, IRAtom* vyy,
    986                                     IRAtom* xx,  IRAtom* yy )
    987 {
    988    IRAtom *naive, *vec, *improvement_term;
    989    IRAtom *improved, *final_cast, *top;
    990    IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;
    991 
    992    tl_assert(isShadowAtom(mce,vxx));
    993    tl_assert(isShadowAtom(mce,vyy));
    994    tl_assert(isOriginalAtom(mce,xx));
    995    tl_assert(isOriginalAtom(mce,yy));
    996    tl_assert(sameKindedAtoms(vxx,xx));
    997    tl_assert(sameKindedAtoms(vyy,yy));
    998 
    999    switch (ty) {
   1000       case Ity_I16:
   1001          opOR   = Iop_Or16;
   1002          opDIFD = Iop_And16;
   1003          opUIFU = Iop_Or16;
   1004          opNOT  = Iop_Not16;
   1005          opXOR  = Iop_Xor16;
   1006          opCMP  = Iop_CmpEQ16;
   1007          top    = mkU16(0xFFFF);
   1008          break;
   1009       case Ity_I32:
   1010          opOR   = Iop_Or32;
   1011          opDIFD = Iop_And32;
   1012          opUIFU = Iop_Or32;
   1013          opNOT  = Iop_Not32;
   1014          opXOR  = Iop_Xor32;
   1015          opCMP  = Iop_CmpEQ32;
   1016          top    = mkU32(0xFFFFFFFF);
   1017          break;
   1018       case Ity_I64:
   1019          opOR   = Iop_Or64;
   1020          opDIFD = Iop_And64;
   1021          opUIFU = Iop_Or64;
   1022          opNOT  = Iop_Not64;
   1023          opXOR  = Iop_Xor64;
   1024          opCMP  = Iop_CmpEQ64;
   1025          top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
   1026          break;
   1027       default:
   1028          VG_(tool_panic)("expensiveCmpEQorNE");
   1029    }
   1030 
   1031    naive
   1032       = mkPCastTo(mce,ty,
   1033                   assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
   1034 
   1035    vec
   1036       = assignNew(
   1037            'V', mce,ty,
   1038            binop( opOR,
   1039                   assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
   1040                   assignNew(
   1041                      'V', mce,ty,
   1042                      unop( opNOT,
   1043                            assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
   1044 
   1045    improvement_term
   1046       = mkPCastTo( mce,ty,
   1047                    assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
   1048 
   1049    improved
   1050       = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
   1051 
   1052    final_cast
   1053       = mkPCastTo( mce, Ity_I1, improved );
   1054 
   1055    return final_cast;
   1056 }
   1057 
   1058 
   1059 /* --------- Semi-accurate interpretation of CmpORD. --------- */
   1060 
   1061 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
   1062 
   1063       CmpORD32S(x,y) = 1<<3   if  x <s y
   1064                      = 1<<2   if  x >s y
   1065                      = 1<<1   if  x == y
   1066 
   1067    and similarly the unsigned variant.  The default interpretation is:
   1068 
   1069       CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
   1070                                   & (7<<1)
   1071 
   1072    The "& (7<<1)" reflects the fact that all result bits except 3,2,1
   1073    are zero and therefore defined (viz, zero).
   1074 
   1075    Also deal with a special case better:
   1076 
   1077       CmpORD32S(x,0)
   1078 
   1079    Here, bit 3 (LT) of the result is a copy of the top bit of x and
   1080    will be defined even if the rest of x isn't.  In which case we do:
   1081 
   1082       CmpORD32S#(x,x#,0,{impliedly 0}#)
   1083          = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
   1084            | (x# >>u 31) << 3      -- LT# = x#[31]
   1085 
   1086    Analogous handling for CmpORD64{S,U}.
   1087 */
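        /* Worked example (illustrative) of the CmpORD32S(x,0) special case: if
           only x's sign bit is defined, say vx = 0x7FFFFFFF, then

              standard interp: PCast32(UifU32(vx, 0x0)) & (7<<1)
                             = 0xFFFFFFFF & 0xE = 0xE       (all 3 bits undefined)
              special interp : (PCast32(vx) & (3<<1)) | ((vx >>u 31) << 3)
                             = (0xFFFFFFFF & 0x6) | (0 << 3) = 0x6

           i.e. bit 3 (the "x <s 0" outcome) is reported as defined, since it
           depends only on x's defined sign bit, while the GT and EQ bits stay
           undefined. */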
   1088 static Bool isZeroU32 ( IRAtom* e )
   1089 {
   1090    return
   1091       toBool( e->tag == Iex_Const
   1092               && e->Iex.Const.con->tag == Ico_U32
   1093               && e->Iex.Const.con->Ico.U32 == 0 );
   1094 }
   1095 
   1096 static Bool isZeroU64 ( IRAtom* e )
   1097 {
   1098    return
   1099       toBool( e->tag == Iex_Const
   1100               && e->Iex.Const.con->tag == Ico_U64
   1101               && e->Iex.Const.con->Ico.U64 == 0 );
   1102 }
   1103 
   1104 static IRAtom* doCmpORD ( MCEnv*  mce,
   1105                           IROp    cmp_op,
   1106                           IRAtom* xxhash, IRAtom* yyhash,
   1107                           IRAtom* xx,     IRAtom* yy )
   1108 {
   1109    Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
   1110    Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
   1111    IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
   1112    IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
   1113    IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
   1114    IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
   1115    IRType ty     = m64 ? Ity_I64   : Ity_I32;
   1116    Int    width  = m64 ? 64        : 32;
   1117 
   1118    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
   1119 
   1120    IRAtom* threeLeft1 = NULL;
   1121    IRAtom* sevenLeft1 = NULL;
   1122 
   1123    tl_assert(isShadowAtom(mce,xxhash));
   1124    tl_assert(isShadowAtom(mce,yyhash));
   1125    tl_assert(isOriginalAtom(mce,xx));
   1126    tl_assert(isOriginalAtom(mce,yy));
   1127    tl_assert(sameKindedAtoms(xxhash,xx));
   1128    tl_assert(sameKindedAtoms(yyhash,yy));
   1129    tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
   1130              || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
   1131 
   1132    if (0) {
   1133       ppIROp(cmp_op); VG_(printf)(" ");
   1134       ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
   1135    }
   1136 
   1137    if (syned && isZero(yy)) {
   1138       /* fancy interpretation */
   1139       /* if yy is zero, then it must be fully defined (zero#). */
   1140       tl_assert(isZero(yyhash));
   1141       threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
   1142       return
   1143          binop(
   1144             opOR,
   1145             assignNew(
   1146                'V', mce,ty,
   1147                binop(
   1148                   opAND,
   1149                   mkPCastTo(mce,ty, xxhash),
   1150                   threeLeft1
   1151                )),
   1152             assignNew(
   1153                'V', mce,ty,
   1154                binop(
   1155                   opSHL,
   1156                   assignNew(
   1157                      'V', mce,ty,
   1158                      binop(opSHR, xxhash, mkU8(width-1))),
   1159                   mkU8(3)
   1160                ))
   1161 	 );
   1162    } else {
   1163       /* standard interpretation */
   1164       sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
   1165       return
   1166          binop(
   1167             opAND,
   1168             mkPCastTo( mce,ty,
   1169                        mkUifU(mce,ty, xxhash,yyhash)),
   1170             sevenLeft1
   1171          );
   1172    }
   1173 }
   1174 
   1175 
   1176 /*------------------------------------------------------------*/
   1177 /*--- Emit a test and complaint if something is undefined. ---*/
   1178 /*------------------------------------------------------------*/
   1179 
   1180 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
   1181 
   1182 
   1183 /* Set the annotations on a dirty helper to indicate that the stack
   1184    pointer and instruction pointers might be read.  This is the
   1185    behaviour of all 'emit-a-complaint' style functions we might
   1186    call. */
   1187 
   1188 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
   1189    di->nFxState = 2;
   1190    di->fxState[0].fx        = Ifx_Read;
   1191    di->fxState[0].offset    = mce->layout->offset_SP;
   1192    di->fxState[0].size      = mce->layout->sizeof_SP;
   1193    di->fxState[0].nRepeats  = 0;
   1194    di->fxState[0].repeatLen = 0;
   1195    di->fxState[1].fx        = Ifx_Read;
   1196    di->fxState[1].offset    = mce->layout->offset_IP;
   1197    di->fxState[1].size      = mce->layout->sizeof_IP;
   1198    di->fxState[1].nRepeats  = 0;
   1199    di->fxState[1].repeatLen = 0;
   1200 }
   1201 
   1202 
   1203 /* Check the supplied *original* |atom| for undefinedness, and emit a
   1204    complaint if so.  Once that happens, mark it as defined.  This is
   1205    possible because the atom is either a tmp or literal.  If it's a
   1206    tmp, it will be shadowed by a tmp, and so we can set the shadow to
   1207    be defined.  In fact as mentioned above, we will have to allocate a
   1208    new tmp to carry the new 'defined' shadow value, and update the
   1209    original->tmp mapping accordingly; we cannot simply assign a new
   1210    value to an existing shadow tmp as this breaks SSAness.
   1211 
   1212    The checks are performed, any resulting complaint emitted, and
   1213    |atom|'s shadow temp set to 'defined', ONLY in the case that
   1214    |guard| evaluates to True at run-time.  If it evaluates to False
   1215    then no action is performed.  If |guard| is NULL (the usual case)
   1216    then it is assumed to be always-true, and hence these actions are
   1217    performed unconditionally.
   1218 
   1219    This routine does not generate code to check the definedness of
   1220    |guard|.  The caller is assumed to have taken care of that already.
   1221 */
   1222 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
   1223 {
   1224    IRAtom*  vatom;
   1225    IRType   ty;
   1226    Int      sz;
   1227    IRDirty* di;
   1228    IRAtom*  cond;
   1229    IRAtom*  origin;
   1230    void*    fn;
   1231    const HChar* nm;
   1232    IRExpr** args;
   1233    Int      nargs;
   1234 
   1235    // Don't do V bit tests if we're not reporting undefined value errors.
   1236    if (MC_(clo_mc_level) == 1)
   1237       return;
   1238 
   1239    if (guard)
   1240       tl_assert(isOriginalAtom(mce, guard));
   1241 
   1242    /* Since the original expression is atomic, there's no duplicated
   1243       work generated by making multiple V-expressions for it.  So we
   1244       don't really care about the possibility that someone else may
   1245       also create a V-interpretation for it. */
   1246    tl_assert(isOriginalAtom(mce, atom));
   1247    vatom = expr2vbits( mce, atom );
   1248    tl_assert(isShadowAtom(mce, vatom));
   1249    tl_assert(sameKindedAtoms(atom, vatom));
   1250 
   1251    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1252 
   1253    /* sz is only used for constructing the error message */
   1254    sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
   1255 
   1256    cond = mkPCastTo( mce, Ity_I1, vatom );
   1257    /* cond will be 0 if all defined, and 1 if any not defined. */
   1258 
   1259    /* Get the origin info for the value we are about to check.  At
   1260       least, if we are doing origin tracking.  If not, use a dummy
   1261       zero origin. */
   1262    if (MC_(clo_mc_level) == 3) {
   1263       origin = schemeE( mce, atom );
   1264       if (mce->hWordTy == Ity_I64) {
   1265          origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
   1266       }
   1267    } else {
   1268       origin = NULL;
   1269    }
   1270 
   1271    fn    = NULL;
   1272    nm    = NULL;
   1273    args  = NULL;
   1274    nargs = -1;
   1275 
   1276    switch (sz) {
   1277       case 0:
   1278          if (origin) {
   1279             fn    = &MC_(helperc_value_check0_fail_w_o);
   1280             nm    = "MC_(helperc_value_check0_fail_w_o)";
   1281             args  = mkIRExprVec_1(origin);
   1282             nargs = 1;
   1283          } else {
   1284             fn    = &MC_(helperc_value_check0_fail_no_o);
   1285             nm    = "MC_(helperc_value_check0_fail_no_o)";
   1286             args  = mkIRExprVec_0();
   1287             nargs = 0;
   1288          }
   1289          break;
   1290       case 1:
   1291          if (origin) {
   1292             fn    = &MC_(helperc_value_check1_fail_w_o);
   1293             nm    = "MC_(helperc_value_check1_fail_w_o)";
   1294             args  = mkIRExprVec_1(origin);
   1295             nargs = 1;
   1296          } else {
   1297             fn    = &MC_(helperc_value_check1_fail_no_o);
   1298             nm    = "MC_(helperc_value_check1_fail_no_o)";
   1299             args  = mkIRExprVec_0();
   1300             nargs = 0;
   1301          }
   1302          break;
   1303       case 4:
   1304          if (origin) {
   1305             fn    = &MC_(helperc_value_check4_fail_w_o);
   1306             nm    = "MC_(helperc_value_check4_fail_w_o)";
   1307             args  = mkIRExprVec_1(origin);
   1308             nargs = 1;
   1309          } else {
   1310             fn    = &MC_(helperc_value_check4_fail_no_o);
   1311             nm    = "MC_(helperc_value_check4_fail_no_o)";
   1312             args  = mkIRExprVec_0();
   1313             nargs = 0;
   1314          }
   1315          break;
   1316       case 8:
   1317          if (origin) {
   1318             fn    = &MC_(helperc_value_check8_fail_w_o);
   1319             nm    = "MC_(helperc_value_check8_fail_w_o)";
   1320             args  = mkIRExprVec_1(origin);
   1321             nargs = 1;
   1322          } else {
   1323             fn    = &MC_(helperc_value_check8_fail_no_o);
   1324             nm    = "MC_(helperc_value_check8_fail_no_o)";
   1325             args  = mkIRExprVec_0();
   1326             nargs = 0;
   1327          }
   1328          break;
   1329       case 2:
   1330       case 16:
   1331          if (origin) {
   1332             fn    = &MC_(helperc_value_checkN_fail_w_o);
   1333             nm    = "MC_(helperc_value_checkN_fail_w_o)";
   1334             args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
   1335             nargs = 2;
   1336          } else {
   1337             fn    = &MC_(helperc_value_checkN_fail_no_o);
   1338             nm    = "MC_(helperc_value_checkN_fail_no_o)";
   1339             args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
   1340             nargs = 1;
   1341          }
   1342          break;
   1343       default:
   1344          VG_(tool_panic)("unexpected szB");
   1345    }
   1346 
   1347    tl_assert(fn);
   1348    tl_assert(nm);
   1349    tl_assert(args);
   1350    tl_assert(nargs >= 0 && nargs <= 2);
   1351    tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
   1352               || (MC_(clo_mc_level) == 2 && origin == NULL) );
   1353 
   1354    di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
   1355                            VG_(fnptr_to_fnentry)( fn ), args );
   1356    di->guard = cond; // and cond is PCast-to-1(atom#)
   1357 
   1358    /* If the complaint is to be issued under a guard condition, AND
   1359       that into the guard condition for the helper call. */
   1360    if (guard) {
   1361       IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
   1362       IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
   1363       IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
   1364       di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
   1365    }
   1366 
   1367    setHelperAnns( mce, di );
   1368    stmt( 'V', mce, IRStmt_Dirty(di));
   1369 
   1370    /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
   1371       defined -- but only in the case where the guard evaluates to
   1372       True at run-time.  Do the update by setting the orig->shadow
   1373       mapping for tmp to reflect the fact that this shadow is getting
   1374       a new value. */
   1375    tl_assert(isIRAtom(vatom));
   1376    /* sameKindedAtoms ... */
   1377    if (vatom->tag == Iex_RdTmp) {
   1378       tl_assert(atom->tag == Iex_RdTmp);
   1379       if (guard == NULL) {
   1380          // guard is 'always True', hence update unconditionally
   1381          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1382          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
   1383                           definedOfType(ty));
   1384       } else {
   1385          // update the temp only conditionally.  Do this by copying
   1386          // its old value when the guard is False.
   1387          // The old value ..
   1388          IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1389          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1390          IRAtom* new_tmpV
   1391             = assignNew('V', mce, shadowTypeV(ty),
   1392                         IRExpr_ITE(guard, definedOfType(ty),
   1393                                           mkexpr(old_tmpV)));
   1394          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
   1395       }
   1396    }
   1397 }
   1398 
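/* Illustrative sketch only (the exact IR differs by type and by origin
   tracking): for a 4-byte atom t5, with --track-origins=no, the statements
   emitted above amount to roughly

      cond  = PCast-to-1(t5#)        -- 1 iff any bit of t5 is undefined
      DIRTY cond ::: MC_(helperc_value_check4_fail_no_o)()
      t5#   = 0x0:I32                -- shadow reset to 'all defined'

   so the helper (and hence the error report) fires only when some bit of
   t5 is undefined, and afterwards the value is treated as defined so the
   same undefined value does not keep generating complaints further down
   the block. */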
   1399 
   1400 /*------------------------------------------------------------*/
   1401 /*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
   1402 /*------------------------------------------------------------*/
   1403 
   1404 /* Examine the always-defined sections declared in layout to see if
   1405    the (offset,size) section is within one.  Note, it is an error to
   1406    partially fall into such a region: (offset,size) should either be
   1407    completely in such a region or completely not-in such a region.
   1408 */
   1409 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
   1410 {
   1411    Int minoffD, maxoffD, i;
   1412    Int minoff = offset;
   1413    Int maxoff = minoff + size - 1;
   1414    tl_assert((minoff & ~0xFFFF) == 0);
   1415    tl_assert((maxoff & ~0xFFFF) == 0);
   1416 
   1417    for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
   1418       minoffD = mce->layout->alwaysDefd[i].offset;
   1419       maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
   1420       tl_assert((minoffD & ~0xFFFF) == 0);
   1421       tl_assert((maxoffD & ~0xFFFF) == 0);
   1422 
   1423       if (maxoff < minoffD || maxoffD < minoff)
   1424          continue; /* no overlap */
   1425       if (minoff >= minoffD && maxoff <= maxoffD)
   1426          return True; /* completely contained in an always-defd section */
   1427 
   1428       VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
   1429    }
   1430    return False; /* could not find any containing section */
   1431 }
   1432 
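/* Worked example (the offsets are hypothetical): if layout->alwaysDefd
   contains a single section { .offset = 168, .size = 8 }, then

      isAlwaysDefd(mce, 168, 8)  -> True    (exactly the section)
      isAlwaysDefd(mce, 170, 4)  -> True    (strictly inside it)
      isAlwaysDefd(mce, 100, 4)  -> False   (no overlap at all)
      isAlwaysDefd(mce, 164, 8)  -> panic   (straddles the boundary)
*/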
   1433 
   1434 /* Generate into bb suitable actions to shadow this Put.  If the state
   1435    slice is marked 'always defined', do nothing.  Otherwise, write the
   1436    supplied V bits to the shadow state.  We can pass in either an
   1437    original atom or a V-atom, but not both.  In the former case the
   1438    relevant V-bits are then generated from the original.
   1439    We assume here that the definedness of GUARD has already been checked.
   1440 */
   1441 static
   1442 void do_shadow_PUT ( MCEnv* mce,  Int offset,
   1443                      IRAtom* atom, IRAtom* vatom, IRExpr *guard )
   1444 {
   1445    IRType ty;
   1446 
   1447    // Don't do shadow PUTs if we're not doing undefined value checking.
   1448    // Their absence lets Vex's optimiser remove all the shadow computation
   1449    // that they depend on, which includes GETs of the shadow registers.
   1450    if (MC_(clo_mc_level) == 1)
   1451       return;
   1452 
   1453    if (atom) {
   1454       tl_assert(!vatom);
   1455       tl_assert(isOriginalAtom(mce, atom));
   1456       vatom = expr2vbits( mce, atom );
   1457    } else {
   1458       tl_assert(vatom);
   1459       tl_assert(isShadowAtom(mce, vatom));
   1460    }
   1461 
   1462    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1463    tl_assert(ty != Ity_I1);
   1464    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1465       /* later: no ... */
   1466       /* emit code to emit a complaint if any of the vbits are 1. */
   1467       /* complainIfUndefined(mce, atom); */
   1468    } else {
   1469       /* Do a plain shadow Put. */
   1470       if (guard) {
   1471          /* If the guard expression evaluates to false we simply Put the value
   1472             that is already stored in the guest state slot */
   1473          IRAtom *cond, *iffalse;
   1474 
   1475          cond    = assignNew('V', mce, Ity_I1, guard);
   1476          iffalse = assignNew('V', mce, ty,
   1477                              IRExpr_Get(offset + mce->layout->total_sizeB, ty));
   1478          vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
   1479       }
   1480       stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
   1481    }
   1482 }
   1483 
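/* For the guarded case above, the shadow store is made unconditional by
   merging in the old shadow value; writing shadowBase for
   mce->layout->total_sizeB, the generated IR is essentially

      PUT(offset + shadowBase) = ITE(guard, vatom,
                                            GET:ty(offset + shadowBase))

   i.e. when the guard is false the shadow slot is rewritten with the value
   it already holds, leaving the shadow state unchanged. */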
   1484 
   1485 /* Generate into bb suitable actions to shadow this PutI.  If the state
   1486    slice is marked 'always defined', do nothing; otherwise write the
   1487    supplied V bits to the shadow state (analogous to do_shadow_PUT). */
   1488 static
   1489 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
   1490 {
   1491    IRAtom* vatom;
   1492    IRType  ty, tyS;
   1493    Int     arrSize;
   1494    IRRegArray* descr = puti->descr;
   1495    IRAtom*     ix    = puti->ix;
   1496    Int         bias  = puti->bias;
   1497    IRAtom*     atom  = puti->data;
   1498 
   1499    // Don't do shadow PUTIs if we're not doing undefined value checking.
   1500    // Their absence lets Vex's optimiser remove all the shadow computation
   1501    // that they depend on, which includes GETIs of the shadow registers.
   1502    if (MC_(clo_mc_level) == 1)
   1503       return;
   1504 
   1505    tl_assert(isOriginalAtom(mce,atom));
   1506    vatom = expr2vbits( mce, atom );
   1507    tl_assert(sameKindedAtoms(atom, vatom));
   1508    ty   = descr->elemTy;
   1509    tyS  = shadowTypeV(ty);
   1510    arrSize = descr->nElems * sizeofIRType(ty);
   1511    tl_assert(ty != Ity_I1);
   1512    tl_assert(isOriginalAtom(mce,ix));
   1513    complainIfUndefined(mce, ix, NULL);
   1514    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1515       /* later: no ... */
   1516       /* emit code to emit a complaint if any of the vbits are 1. */
   1517       /* complainIfUndefined(mce, atom); */
   1518    } else {
   1519       /* Do a cloned version of the Put that refers to the shadow
   1520          area. */
   1521       IRRegArray* new_descr
   1522          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1523                          tyS, descr->nElems);
   1524       stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
   1525    }
   1526 }
   1527 
   1528 
   1529 /* Return an expression which contains the V bits corresponding to the
   1530    given GET (passed in in pieces).
   1531 */
   1532 static
   1533 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
   1534 {
   1535    IRType tyS = shadowTypeV(ty);
   1536    tl_assert(ty != Ity_I1);
   1537    tl_assert(ty != Ity_I128);
   1538    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1539       /* Always defined, return all zeroes of the relevant type */
   1540       return definedOfType(tyS);
   1541    } else {
   1542       /* return a cloned version of the Get that refers to the shadow
   1543          area. */
   1544       /* FIXME: this isn't an atom! */
   1545       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
   1546    }
   1547 }
   1548 
   1549 
   1550 /* Return an expression which contains the V bits corresponding to the
   1551    given GETI (passed in in pieces).
   1552 */
   1553 static
   1554 IRExpr* shadow_GETI ( MCEnv* mce,
   1555                       IRRegArray* descr, IRAtom* ix, Int bias )
   1556 {
   1557    IRType ty   = descr->elemTy;
   1558    IRType tyS  = shadowTypeV(ty);
   1559    Int arrSize = descr->nElems * sizeofIRType(ty);
   1560    tl_assert(ty != Ity_I1);
   1561    tl_assert(isOriginalAtom(mce,ix));
   1562    complainIfUndefined(mce, ix, NULL);
   1563    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1564       /* Always defined, return all zeroes of the relevant type */
   1565       return definedOfType(tyS);
   1566    } else {
   1567       /* return a cloned version of the Get that refers to the shadow
   1568          area. */
   1569       IRRegArray* new_descr
   1570          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1571                          tyS, descr->nElems);
   1572       return IRExpr_GetI( new_descr, ix, bias );
   1573    }
   1574 }
   1575 
   1576 
   1577 /*------------------------------------------------------------*/
   1578 /*--- Generating approximations for unknown operations,    ---*/
   1579 /*--- using lazy-propagate semantics                       ---*/
   1580 /*------------------------------------------------------------*/
   1581 
   1582 /* Lazy propagation of undefinedness from two values, resulting in the
   1583    specified shadow type.
   1584 */
   1585 static
   1586 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
   1587 {
   1588    IRAtom* at;
   1589    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1590    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1591    tl_assert(isShadowAtom(mce,va1));
   1592    tl_assert(isShadowAtom(mce,va2));
   1593 
   1594    /* The general case is inefficient because PCast is an expensive
   1595       operation.  Here are some special cases which use PCast only
   1596       once rather than twice. */
   1597 
   1598    /* I64 x I64 -> I64 */
   1599    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
   1600       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
   1601       at = mkUifU(mce, Ity_I64, va1, va2);
   1602       at = mkPCastTo(mce, Ity_I64, at);
   1603       return at;
   1604    }
   1605 
   1606    /* I64 x I64 -> I32 */
   1607    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
   1608       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
   1609       at = mkUifU(mce, Ity_I64, va1, va2);
   1610       at = mkPCastTo(mce, Ity_I32, at);
   1611       return at;
   1612    }
   1613 
   1614    /* I32 x I32 -> I32 */
   1615    if (t1 == Ity_I32 && t2 == Ity_I32 && finalVty == Ity_I32) {
   1616       if (0) VG_(printf)("mkLazy2: I32 x I32 -> I32\n");
   1617       at = mkUifU(mce, Ity_I32, va1, va2);
   1618       at = mkPCastTo(mce, Ity_I32, at);
   1619       return at;
   1620    }
   1621 
   1622    if (0) {
   1623       VG_(printf)("mkLazy2 ");
   1624       ppIRType(t1);
   1625       VG_(printf)("_");
   1626       ppIRType(t2);
   1627       VG_(printf)("_");
   1628       ppIRType(finalVty);
   1629       VG_(printf)("\n");
   1630    }
   1631 
   1632    /* General case: force everything via 32-bit intermediaries. */
   1633    at = mkPCastTo(mce, Ity_I32, va1);
   1634    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1635    at = mkPCastTo(mce, finalVty, at);
   1636    return at;
   1637 }
   1638 
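/* Example of the lazy (worst-case) semantics: for an I64 x I64 -> I64 op
   the special case above gives

      mkLazy2(I64, va1, va2) = PCast-to-I64( UifU64(va1, va2) )

   so if even one bit of either argument is undefined, every bit of the
   result is marked undefined.  This is deliberately imprecise and is only
   used for ops which have no exact or per-lane interpretation. */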
   1639 
   1640 /* 3-arg version of the above. */
   1641 static
   1642 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
   1643                   IRAtom* va1, IRAtom* va2, IRAtom* va3 )
   1644 {
   1645    IRAtom* at;
   1646    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1647    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1648    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1649    tl_assert(isShadowAtom(mce,va1));
   1650    tl_assert(isShadowAtom(mce,va2));
   1651    tl_assert(isShadowAtom(mce,va3));
   1652 
   1653    /* The general case is inefficient because PCast is an expensive
   1654       operation.  Here are some special cases which use PCast only
   1655       twice rather than three times. */
   1656 
   1657    /* I32 x I64 x I64 -> I64 */
   1658    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1659    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1660        && finalVty == Ity_I64) {
   1661       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
   1662       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1663          mode indication which is fully defined, this should get
   1664          folded out later. */
   1665       at = mkPCastTo(mce, Ity_I64, va1);
   1666       /* Now fold in 2nd and 3rd args. */
   1667       at = mkUifU(mce, Ity_I64, at, va2);
   1668       at = mkUifU(mce, Ity_I64, at, va3);
   1669       /* and PCast once again. */
   1670       at = mkPCastTo(mce, Ity_I64, at);
   1671       return at;
   1672    }
   1673 
   1674    /* I32 x I8 x I64 -> I64 */
   1675    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
   1676        && finalVty == Ity_I64) {
   1677       if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
   1678       /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
   1679        * rounding mode indication which is fully defined, this should
   1680        * get folded out later.
   1681       */
   1682       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1683       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1684       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1685       at = mkUifU(mce, Ity_I64, at, va3);
   1686       /* and PCast once again. */
   1687       at = mkPCastTo(mce, Ity_I64, at);
   1688       return at;
   1689    }
   1690 
   1691    /* I32 x I64 x I64 -> I32 */
   1692    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1693        && finalVty == Ity_I32) {
   1694       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
   1695       at = mkPCastTo(mce, Ity_I64, va1);
   1696       at = mkUifU(mce, Ity_I64, at, va2);
   1697       at = mkUifU(mce, Ity_I64, at, va3);
   1698       at = mkPCastTo(mce, Ity_I32, at);
   1699       return at;
   1700    }
   1701 
   1702    /* I32 x I32 x I32 -> I32 */
   1703    /* 32-bit FP idiom, as (eg) happens on ARM */
   1704    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
   1705        && finalVty == Ity_I32) {
   1706       if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
   1707       at = va1;
   1708       at = mkUifU(mce, Ity_I32, at, va2);
   1709       at = mkUifU(mce, Ity_I32, at, va3);
   1710       at = mkPCastTo(mce, Ity_I32, at);
   1711       return at;
   1712    }
   1713 
   1714    /* I32 x I128 x I128 -> I128 */
   1715    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1716    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
   1717        && finalVty == Ity_I128) {
   1718       if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
   1719       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
   1720          mode indication which is fully defined, this should get
   1721          folded out later. */
   1722       at = mkPCastTo(mce, Ity_I128, va1);
   1723       /* Now fold in 2nd and 3rd args. */
   1724       at = mkUifU(mce, Ity_I128, at, va2);
   1725       at = mkUifU(mce, Ity_I128, at, va3);
   1726       /* and PCast once again. */
   1727       at = mkPCastTo(mce, Ity_I128, at);
   1728       return at;
   1729    }
   1730 
   1731    /* I32 x I8 x I128 -> I128 */
   1732    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1733    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
   1734        && finalVty == Ity_I128) {
   1735       if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
   1736       /* Use I64 as an intermediate type, which means PCasting all 3
   1737          args to I64 to start with. 1st arg is typically a rounding
   1738          mode indication which is fully defined, so we hope that it
   1739          will get folded out later. */
   1740       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1741       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1742       IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
   1743       /* Now UifU all three together. */
   1744       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1745       at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
   1746       /* and PCast once again. */
   1747       at = mkPCastTo(mce, Ity_I128, at);
   1748       return at;
   1749    }
   1750    if (1) {
   1751       VG_(printf)("mkLazy3: ");
   1752       ppIRType(t1);
   1753       VG_(printf)(" x ");
   1754       ppIRType(t2);
   1755       VG_(printf)(" x ");
   1756       ppIRType(t3);
   1757       VG_(printf)(" -> ");
   1758       ppIRType(finalVty);
   1759       VG_(printf)("\n");
   1760    }
   1761 
   1762    tl_assert(0);
   1763    /* General case: force everything via 32-bit intermediaries. */
   1764    /*
   1765    at = mkPCastTo(mce, Ity_I32, va1);
   1766    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1767    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
   1768    at = mkPCastTo(mce, finalVty, at);
   1769    return at;
   1770    */
   1771 }
   1772 
   1773 
   1774 /* 4-arg version of the above. */
   1775 static
   1776 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
   1777                   IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
   1778 {
   1779    IRAtom* at;
   1780    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1781    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1782    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1783    IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
   1784    tl_assert(isShadowAtom(mce,va1));
   1785    tl_assert(isShadowAtom(mce,va2));
   1786    tl_assert(isShadowAtom(mce,va3));
   1787    tl_assert(isShadowAtom(mce,va4));
   1788 
   1789    /* The general case is inefficient because PCast is an expensive
   1790       operation.  Here are some special cases which use PCast only
   1791       twice rather than three times. */
   1792 
   1793    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1794 
   1795    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128 && t4 == Ity_I128
   1796        && finalVty == Ity_I128) {
   1797       if (0) VG_(printf)("mkLazy4: I32 x I128 x I128 x I128 -> I128\n");
   1798       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
   1799          mode indication which is fully defined, this should get
   1800          folded out later. */
   1801       at = mkPCastTo(mce, Ity_I128, va1);
   1802       /* Now fold in 2nd, 3rd, 4th args. */
   1803       at = mkUifU(mce, Ity_I128, at, va2);
   1804       at = mkUifU(mce, Ity_I128, at, va3);
   1805       at = mkUifU(mce, Ity_I128, at, va4);
   1806       /* and PCast once again. */
   1807       at = mkPCastTo(mce, Ity_I128, at);
   1808       return at;
   1809    }
   1810 
   1811    /* I32 x I64 x I64 x I64 -> I64 */
   1812    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
   1813        && finalVty == Ity_I64) {
   1814       if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
   1815       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1816          mode indication which is fully defined, this should get
   1817          folded out later. */
   1818       at = mkPCastTo(mce, Ity_I64, va1);
   1819       /* Now fold in 2nd, 3rd, 4th args. */
   1820       at = mkUifU(mce, Ity_I64, at, va2);
   1821       at = mkUifU(mce, Ity_I64, at, va3);
   1822       at = mkUifU(mce, Ity_I64, at, va4);
   1823       /* and PCast once again. */
   1824       at = mkPCastTo(mce, Ity_I64, at);
   1825       return at;
   1826    }
   1827    /* I32 x I32 x I32 x I32 -> I32 */
   1828    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1829    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
   1830        && finalVty == Ity_I32) {
   1831       if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
   1832       at = va1;
   1833       /* Now fold in 2nd, 3rd, 4th args. */
   1834       at = mkUifU(mce, Ity_I32, at, va2);
   1835       at = mkUifU(mce, Ity_I32, at, va3);
   1836       at = mkUifU(mce, Ity_I32, at, va4);
   1837       at = mkPCastTo(mce, Ity_I32, at);
   1838       return at;
   1839    }
   1840 
   1841    if (1) {
   1842       VG_(printf)("mkLazy4: ");
   1843       ppIRType(t1);
   1844       VG_(printf)(" x ");
   1845       ppIRType(t2);
   1846       VG_(printf)(" x ");
   1847       ppIRType(t3);
   1848       VG_(printf)(" x ");
   1849       ppIRType(t4);
   1850       VG_(printf)(" -> ");
   1851       ppIRType(finalVty);
   1852       VG_(printf)("\n");
   1853    }
   1854 
   1855    tl_assert(0);
   1856 }
   1857 
   1858 
   1859 /* Do the lazy propagation game from a null-terminated vector of
   1860    atoms.  This is presumably the arguments to a helper call, so the
   1861    IRCallee info is also supplied in order that we can know which
   1862    arguments should be ignored (via the .mcx_mask field).
   1863 */
   1864 static
   1865 IRAtom* mkLazyN ( MCEnv* mce,
   1866                   IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
   1867 {
   1868    Int     i;
   1869    IRAtom* here;
   1870    IRAtom* curr;
   1871    IRType  mergeTy;
   1872    Bool    mergeTy64 = True;
   1873 
   1874    /* Decide on the type of the merge intermediary.  If all relevant
   1875       args are I64, then it's I64.  In all other circumstances, use
   1876       I32. */
   1877    for (i = 0; exprvec[i]; i++) {
   1878       tl_assert(i < 32);
   1879       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1880       if (cee->mcx_mask & (1<<i))
   1881          continue;
   1882       if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
   1883          mergeTy64 = False;
   1884    }
   1885 
   1886    mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
   1887    curr    = definedOfType(mergeTy);
   1888 
   1889    for (i = 0; exprvec[i]; i++) {
   1890       tl_assert(i < 32);
   1891       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1892       /* Only take notice of this arg if the callee's mc-exclusion
   1893          mask does not say it is to be excluded. */
   1894       if (cee->mcx_mask & (1<<i)) {
   1895          /* the arg is to be excluded from definedness checking.  Do
   1896             nothing. */
   1897          if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
   1898       } else {
   1899          /* calculate the arg's definedness, and pessimistically merge
   1900             it in. */
   1901          here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
   1902          curr = mergeTy64
   1903                    ? mkUifU64(mce, here, curr)
   1904                    : mkUifU32(mce, here, curr);
   1905       }
   1906    }
   1907    return mkPCastTo(mce, finalVtype, curr );
   1908 }
   1909 
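/* Example: for a helper call whose IRCallee has mcx_mask == 5 (binary 101),
   args 0 and 2 are skipped by both loops above, so only the definedness of
   arg 1 (and of any further unmasked args) is merged into the result's
   shadow value. */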
   1910 
   1911 /*------------------------------------------------------------*/
   1912 /*--- Generating expensive sequences for exact carry-chain ---*/
   1913 /*--- propagation in add/sub and related operations.       ---*/
   1914 /*------------------------------------------------------------*/
   1915 
   1916 static
   1917 IRAtom* expensiveAddSub ( MCEnv*  mce,
   1918                           Bool    add,
   1919                           IRType  ty,
   1920                           IRAtom* qaa, IRAtom* qbb,
   1921                           IRAtom* aa,  IRAtom* bb )
   1922 {
   1923    IRAtom *a_min, *b_min, *a_max, *b_max;
   1924    IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
   1925 
   1926    tl_assert(isShadowAtom(mce,qaa));
   1927    tl_assert(isShadowAtom(mce,qbb));
   1928    tl_assert(isOriginalAtom(mce,aa));
   1929    tl_assert(isOriginalAtom(mce,bb));
   1930    tl_assert(sameKindedAtoms(qaa,aa));
   1931    tl_assert(sameKindedAtoms(qbb,bb));
   1932 
   1933    switch (ty) {
   1934       case Ity_I32:
   1935          opAND = Iop_And32;
   1936          opOR  = Iop_Or32;
   1937          opXOR = Iop_Xor32;
   1938          opNOT = Iop_Not32;
   1939          opADD = Iop_Add32;
   1940          opSUB = Iop_Sub32;
   1941          break;
   1942       case Ity_I64:
   1943          opAND = Iop_And64;
   1944          opOR  = Iop_Or64;
   1945          opXOR = Iop_Xor64;
   1946          opNOT = Iop_Not64;
   1947          opADD = Iop_Add64;
   1948          opSUB = Iop_Sub64;
   1949          break;
   1950       default:
   1951          VG_(tool_panic)("expensiveAddSub");
   1952    }
   1953 
   1954    // a_min = aa & ~qaa
   1955    a_min = assignNew('V', mce,ty,
   1956                      binop(opAND, aa,
   1957                                   assignNew('V', mce,ty, unop(opNOT, qaa))));
   1958 
   1959    // b_min = bb & ~qbb
   1960    b_min = assignNew('V', mce,ty,
   1961                      binop(opAND, bb,
   1962                                   assignNew('V', mce,ty, unop(opNOT, qbb))));
   1963 
   1964    // a_max = aa | qaa
   1965    a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
   1966 
   1967    // b_max = bb | qbb
   1968    b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
   1969 
   1970    if (add) {
   1971       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
   1972       return
   1973       assignNew('V', mce,ty,
   1974          binop( opOR,
   1975                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1976                 assignNew('V', mce,ty,
   1977                    binop( opXOR,
   1978                           assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
   1979                           assignNew('V', mce,ty, binop(opADD, a_max, b_max))
   1980                    )
   1981                 )
   1982          )
   1983       );
   1984    } else {
   1985       // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
   1986       return
   1987       assignNew('V', mce,ty,
   1988          binop( opOR,
   1989                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1990                 assignNew('V', mce,ty,
   1991                    binop( opXOR,
   1992                           assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
   1993                           assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
   1994                    )
   1995                 )
   1996          )
   1997       );
   1998    }
   1999 
   2000 }
   2001 
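/* The add case above can be cross-checked against a plain scalar model.
   The function below is an illustrative sketch only -- it is not used by
   Memcheck and is compiled out -- mirroring the 32-bit add case, with a
   1 bit in a shadow word meaning "undefined". */
#if 0
static UInt model_expensiveAdd32 ( UInt qaa, UInt qbb, UInt aa, UInt bb )
{
   /* Interval bounds for each operand: undefined bits forced to 0 / 1. */
   UInt a_min = aa & ~qaa;
   UInt b_min = bb & ~qbb;
   UInt a_max = aa | qaa;
   UInt b_max = bb | qbb;
   /* A result bit is undefined if it was undefined in either input, or if
      the two extreme sums disagree at that position, which is exactly
      where carry propagation is uncertain. */
   return (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max));
}
#endif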
   2002 
   2003 static
   2004 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
   2005                                        IRAtom* atom, IRAtom* vatom )
   2006 {
   2007    IRType ty;
   2008    IROp xorOp, subOp, andOp;
   2009    IRExpr *one;
   2010    IRAtom *improver, *improved;
   2011    tl_assert(isShadowAtom(mce,vatom));
   2012    tl_assert(isOriginalAtom(mce,atom));
   2013    tl_assert(sameKindedAtoms(atom,vatom));
   2014 
   2015    switch (czop) {
   2016       case Iop_Ctz32:
   2017          ty = Ity_I32;
   2018          xorOp = Iop_Xor32;
   2019          subOp = Iop_Sub32;
   2020          andOp = Iop_And32;
   2021          one = mkU32(1);
   2022          break;
   2023       case Iop_Ctz64:
   2024          ty = Ity_I64;
   2025          xorOp = Iop_Xor64;
   2026          subOp = Iop_Sub64;
   2027          andOp = Iop_And64;
   2028          one = mkU64(1);
   2029          break;
   2030       default:
   2031          ppIROp(czop);
   2032          VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
   2033    }
   2034 
   2035    // improver = atom ^ (atom - 1)
   2036    //
   2037    // That is, improver has its low ctz(atom)+1 bits equal to one;
   2038    // higher bits (if any) equal to zero.
   2039    improver = assignNew('V', mce,ty,
   2040                         binop(xorOp,
   2041                               atom,
   2042                               assignNew('V', mce, ty,
   2043                                         binop(subOp, atom, one))));
   2044 
   2045    // improved = vatom & improver
   2046    //
   2047    // That is, treat any V bits above the lowest ctz(atom)+1 bits as
   2048    // "defined".
   2049    improved = assignNew('V', mce, ty,
   2050                         binop(andOp, vatom, improver));
   2051 
   2052    // Return pessimizing cast of improved.
   2053    return mkPCastTo(mce, ty, improved);
   2054 }
   2055 
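/* Worked example (hypothetical 8-bit values for brevity): suppose
   atom = 01101000b, so ctz(atom) = 3, and only the top bit of atom is
   undefined, i.e. vatom = 10000000b.  Then

      atom - 1  = 01100111b
      improver  = atom ^ (atom - 1)  = 00001111b
      improved  = vatom & improver   = 00000000b

   and the pessimising cast of 'improved' is all-defined: the undefined top
   bit cannot change the count of trailing zeroes, which depends only on
   the low ctz(atom)+1 bits. */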
   2056 
   2057 /*------------------------------------------------------------*/
   2058 /*--- Scalar shifts.                                       ---*/
   2059 /*------------------------------------------------------------*/
   2060 
   2061 /* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
   2062    idea is to shift the definedness bits by the original shift amount.
   2063    This introduces 0s ("defined") in new positions for left shifts and
   2064    unsigned right shifts, and copies the top definedness bit for
   2065    signed right shifts.  So, conveniently, applying the original shift
   2066    operator to the definedness bits for the left arg is exactly the
   2067    right thing to do:
   2068 
   2069       (qaa << bb)
   2070 
   2071    However if the shift amount is undefined then the whole result
   2072    is undefined.  Hence need:
   2073 
   2074       (qaa << bb) `UifU` PCast(qbb)
   2075 
   2076    If the shift amount bb is a literal then qbb will say 'all defined'
   2077    and the UifU and PCast will get folded out by post-instrumentation
   2078    optimisation.
   2079 */
   2080 static IRAtom* scalarShift ( MCEnv*  mce,
   2081                              IRType  ty,
   2082                              IROp    original_op,
   2083                              IRAtom* qaa, IRAtom* qbb,
   2084                              IRAtom* aa,  IRAtom* bb )
   2085 {
   2086    tl_assert(isShadowAtom(mce,qaa));
   2087    tl_assert(isShadowAtom(mce,qbb));
   2088    tl_assert(isOriginalAtom(mce,aa));
   2089    tl_assert(isOriginalAtom(mce,bb));
   2090    tl_assert(sameKindedAtoms(qaa,aa));
   2091    tl_assert(sameKindedAtoms(qbb,bb));
   2092    return
   2093       assignNew(
   2094          'V', mce, ty,
   2095          mkUifU( mce, ty,
   2096                  assignNew('V', mce, ty, binop(original_op, qaa, bb)),
   2097                  mkPCastTo(mce, ty, qbb)
   2098          )
   2099    );
   2100 }
   2101 
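/* Worked example (hypothetical 8-bit values): for aa << 2 with
   qaa = 00000101b (bits 0 and 2 of aa undefined) and bb = 2 fully defined
   (qbb = 0), the result shadow is

      (qaa << 2) `UifU` PCast(qbb)  =  00010100b | 00000000b  =  00010100b

   i.e. the two undefined bits just move left along with the data and the
   vacated low bits are defined.  Had any bit of bb been undefined,
   PCast(qbb) would be all ones and the entire result would be marked
   undefined. */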
   2102 
   2103 /*------------------------------------------------------------*/
   2104 /*--- Helpers for dealing with vector primops.             ---*/
   2105 /*------------------------------------------------------------*/
   2106 
   2107 /* Vector pessimisation -- pessimise within each lane individually. */
   2108 
   2109 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
   2110 {
   2111    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
   2112 }
   2113 
   2114 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
   2115 {
   2116    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
   2117 }
   2118 
   2119 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
   2120 {
   2121    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
   2122 }
   2123 
   2124 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
   2125 {
   2126    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
   2127 }
   2128 
   2129 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
   2130 {
   2131    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
   2132 }
   2133 
   2134 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
   2135 {
   2136    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
   2137 }
   2138 
   2139 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
   2140 {
   2141    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
   2142 }
   2143 
   2144 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
   2145 {
   2146    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
   2147 }
   2148 
   2149 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
   2150 {
   2151    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
   2152 }
   2153 
   2154 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
   2155 {
   2156    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
   2157 }
   2158 
   2159 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
   2160 {
   2161    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
   2162 }
   2163 
   2164 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
   2165 {
   2166    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
   2167 }
   2168 
   2169 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
   2170 {
   2171    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
   2172 }
   2173 
   2174 
   2175 /* Here's a simple scheme capable of handling ops derived from SSE1
   2176    code, while only generating ops that can be efficiently
   2177    implemented in SSE1. */
   2178 
   2179 /* All-lanes versions are straightforward:
   2180 
   2181    binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
   2182 
   2183    unary32Fx4(x)      ==> PCast32x4(x#)
   2184 
   2185    Lowest-lane-only versions are more complex:
   2186 
   2187    binary32F0x4(x,y)  ==> SetV128lo32(
   2188                              x#,
   2189                              PCast32(V128to32(UifUV128(x#,y#)))
   2190                           )
   2191 
   2192    This is perhaps not so obvious.  In particular, it's faster to
   2193    do a V128-bit UifU and then take the bottom 32 bits than the more
   2194    obvious scheme of taking the bottom 32 bits of each operand
   2195    and doing a 32-bit UifU.  Basically since UifU is fast and
   2196    chopping lanes off vector values is slow.
   2197 
   2198    Finally:
   2199 
   2200    unary32F0x4(x)     ==> SetV128lo32(
   2201                              x#,
   2202                              PCast32(V128to32(x#))
   2203                           )
   2204 
   2205    Where:
   2206 
   2207    PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
   2208    PCast32x4(v#) = CmpNEZ32x4(v#)
   2209 */
   2210 
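/* Concretely, for binary32F0x4: if any bit in the low 32-bit lane of
   UifUV128(x#,y#) is set, PCast32 expands it to 0xFFFFFFFF, so the bottom
   lane of the result is wholly undefined, while the three upper lanes are
   taken unchanged from x# by SetV128lo32. */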
   2211 static
   2212 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2213 {
   2214    IRAtom* at;
   2215    tl_assert(isShadowAtom(mce, vatomX));
   2216    tl_assert(isShadowAtom(mce, vatomY));
   2217    at = mkUifUV128(mce, vatomX, vatomY);
   2218    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
   2219    return at;
   2220 }
   2221 
   2222 static
   2223 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2224 {
   2225    IRAtom* at;
   2226    tl_assert(isShadowAtom(mce, vatomX));
   2227    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
   2228    return at;
   2229 }
   2230 
   2231 static
   2232 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2233 {
   2234    IRAtom* at;
   2235    tl_assert(isShadowAtom(mce, vatomX));
   2236    tl_assert(isShadowAtom(mce, vatomY));
   2237    at = mkUifUV128(mce, vatomX, vatomY);
   2238    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
   2239    at = mkPCastTo(mce, Ity_I32, at);
   2240    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2241    return at;
   2242 }
   2243 
   2244 static
   2245 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
   2246 {
   2247    IRAtom* at;
   2248    tl_assert(isShadowAtom(mce, vatomX));
   2249    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
   2250    at = mkPCastTo(mce, Ity_I32, at);
   2251    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2252    return at;
   2253 }
   2254 
   2255 /* --- ... and ... 64Fx2 versions of the same ... --- */
   2256 
   2257 static
   2258 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2259 {
   2260    IRAtom* at;
   2261    tl_assert(isShadowAtom(mce, vatomX));
   2262    tl_assert(isShadowAtom(mce, vatomY));
   2263    at = mkUifUV128(mce, vatomX, vatomY);
   2264    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
   2265    return at;
   2266 }
   2267 
   2268 static
   2269 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2270 {
   2271    IRAtom* at;
   2272    tl_assert(isShadowAtom(mce, vatomX));
   2273    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
   2274    return at;
   2275 }
   2276 
   2277 static
   2278 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2279 {
   2280    IRAtom* at;
   2281    tl_assert(isShadowAtom(mce, vatomX));
   2282    tl_assert(isShadowAtom(mce, vatomY));
   2283    at = mkUifUV128(mce, vatomX, vatomY);
   2284    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
   2285    at = mkPCastTo(mce, Ity_I64, at);
   2286    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2287    return at;
   2288 }
   2289 
   2290 static
   2291 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
   2292 {
   2293    IRAtom* at;
   2294    tl_assert(isShadowAtom(mce, vatomX));
   2295    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
   2296    at = mkPCastTo(mce, Ity_I64, at);
   2297    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2298    return at;
   2299 }
   2300 
   2301 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
   2302 
   2303 static
   2304 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2305 {
   2306    IRAtom* at;
   2307    tl_assert(isShadowAtom(mce, vatomX));
   2308    tl_assert(isShadowAtom(mce, vatomY));
   2309    at = mkUifU64(mce, vatomX, vatomY);
   2310    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
   2311    return at;
   2312 }
   2313 
   2314 static
   2315 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2316 {
   2317    IRAtom* at;
   2318    tl_assert(isShadowAtom(mce, vatomX));
   2319    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
   2320    return at;
   2321 }
   2322 
   2323 /* --- ... and ... 64Fx4 versions of the same ... --- */
   2324 
   2325 static
   2326 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2327 {
   2328    IRAtom* at;
   2329    tl_assert(isShadowAtom(mce, vatomX));
   2330    tl_assert(isShadowAtom(mce, vatomY));
   2331    at = mkUifUV256(mce, vatomX, vatomY);
   2332    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
   2333    return at;
   2334 }
   2335 
   2336 static
   2337 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2338 {
   2339    IRAtom* at;
   2340    tl_assert(isShadowAtom(mce, vatomX));
   2341    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
   2342    return at;
   2343 }
   2344 
   2345 /* --- ... and ... 32Fx8 versions of the same ... --- */
   2346 
   2347 static
   2348 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2349 {
   2350    IRAtom* at;
   2351    tl_assert(isShadowAtom(mce, vatomX));
   2352    tl_assert(isShadowAtom(mce, vatomY));
   2353    at = mkUifUV256(mce, vatomX, vatomY);
   2354    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
   2355    return at;
   2356 }
   2357 
   2358 static
   2359 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
   2360 {
   2361    IRAtom* at;
   2362    tl_assert(isShadowAtom(mce, vatomX));
   2363    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
   2364    return at;
   2365 }
   2366 
   2367 /* --- 64Fx2 binary FP ops, with rounding mode --- */
   2368 
   2369 static
   2370 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
   2371                                        IRAtom* vatomX, IRAtom* vatomY )
   2372 {
   2373    /* This is the same as binary64Fx2, except that we subsequently
   2374       pessimise vRM (definedness of the rounding mode), widen to 128
   2375       bits and UifU it into the result.  As with the scalar cases, if
   2376       the RM is a constant then it is defined and so this extra bit
   2377       will get constant-folded out later. */
   2378    // "do" the vector args
   2379    IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
   2380    // PCast the RM, and widen it to 128 bits
   2381    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2382    // Roll it into the result
   2383    t1 = mkUifUV128(mce, t1, t2);
   2384    return t1;
   2385 }
   2386 
   2387 /* --- ... and ... 32Fx4 versions of the same --- */
   2388 
   2389 static
   2390 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2391                                        IRAtom* vatomX, IRAtom* vatomY )
   2392 {
   2393    IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
   2394    // PCast the RM, and widen it to 128 bits
   2395    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2396    // Roll it into the result
   2397    t1 = mkUifUV128(mce, t1, t2);
   2398    return t1;
   2399 }
   2400 
   2401 /* --- ... and ... 64Fx4 versions of the same --- */
   2402 
   2403 static
   2404 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2405                                        IRAtom* vatomX, IRAtom* vatomY )
   2406 {
   2407    IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
   2408    // PCast the RM, and widen it to 256 bits
   2409    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2410    // Roll it into the result
   2411    t1 = mkUifUV256(mce, t1, t2);
   2412    return t1;
   2413 }
   2414 
   2415 /* --- ... and ... 32Fx8 versions of the same --- */
   2416 
   2417 static
   2418 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
   2419                                        IRAtom* vatomX, IRAtom* vatomY )
   2420 {
   2421    IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
   2422    // PCast the RM, and widen it to 256 bits
   2423    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2424    // Roll it into the result
   2425    t1 = mkUifUV256(mce, t1, t2);
   2426    return t1;
   2427 }
   2428 
   2429 /* --- 64Fx2 unary FP ops, with rounding mode --- */
   2430 
   2431 static
   2432 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
   2433 {
   2434    /* Same scheme as binary64Fx2_w_rm. */
   2435    // "do" the vector arg
   2436    IRAtom* t1 = unary64Fx2(mce, vatomX);
   2437    // PCast the RM, and widen it to 128 bits
   2438    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2439    // Roll it into the result
   2440    t1 = mkUifUV128(mce, t1, t2);
   2441    return t1;
   2442 }
   2443 
   2444 /* --- ... and ... 32Fx4 versions of the same --- */
   2445 
   2446 static
   2447 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
   2448 {
   2449    /* Same scheme as unary64Fx2_w_rm. */
   2450    IRAtom* t1 = unary32Fx4(mce, vatomX);
   2451    // PCast the RM, and widen it to 128 bits
   2452    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2453    // Roll it into the result
   2454    t1 = mkUifUV128(mce, t1, t2);
   2455    return t1;
   2456 }
   2457 
   2458 
   2459 /* --- --- Vector saturated narrowing --- --- */
   2460 
   2461 /* We used to do something very clever here, but on closer inspection
   2462    (2011-Jun-15), and in particular bug #279698, it turns out to be
   2463    wrong.  Part of the problem came from the fact that for a long
   2464    time, the IR primops to do with saturated narrowing were
   2465    underspecified and managed to confuse multiple cases which needed
   2466    to be separate: the op names had a signedness qualifier, but in
   2467    fact the source and destination signednesses needed to be specified
   2468    independently, so the op names really need two independent
   2469    signedness specifiers.
   2470 
   2471    As of 2011-Jun-15 (ish) the underspecification was sorted out
   2472    properly.  The incorrect instrumentation remained, though.  That
   2473    has now (2011-Oct-22) been fixed.
   2474 
   2475    What we now do is simple:
   2476 
   2477    Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
   2478    number of lanes, X is the source lane width and signedness, and Y
   2479    is the destination lane width and signedness.  In all cases the
   2480    destination lane width is half the source lane width, so the names
   2481    have a bit of redundancy, but are at least easy to read.
   2482 
   2483    For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
   2484    to unsigned 16s.
   2485 
   2486    Let Vanilla(OP) be a function that takes OP, one of these
   2487    saturating narrowing ops, and produces the same "shaped" narrowing
   2488    op which is not saturating, but merely dumps the most significant
   2489    bits.  "same shape" means that the lane numbers and widths are the
   2490    same as with OP.
   2491 
   2492    For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
   2493                   = Iop_NarrowBin32to16x8,
   2494    that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
   2495    dumping the top half of each lane.
   2496 
   2497    So, with that in place, the scheme is simple, and it is simple to
   2498    pessimise each lane individually and then apply Vanilla(OP) so as
   2499    to get the result in the right "shape".  If the original OP is
   2500    QNarrowBinXtoYxZ then we produce
   2501 
   2502    Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
   2503 
   2504    or for the case when OP is unary (Iop_QNarrowUn*)
   2505 
   2506    Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
   2507 */
   2508 static
   2509 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
   2510 {
   2511    switch (qnarrowOp) {
   2512       /* Binary: (128, 128) -> 128 */
   2513       case Iop_QNarrowBin16Sto8Ux16:
   2514       case Iop_QNarrowBin16Sto8Sx16:
   2515       case Iop_QNarrowBin16Uto8Ux16:
   2516       case Iop_QNarrowBin64Sto32Sx4:
   2517       case Iop_QNarrowBin64Uto32Ux4:
   2518          return Iop_NarrowBin16to8x16;
   2519       case Iop_QNarrowBin32Sto16Ux8:
   2520       case Iop_QNarrowBin32Sto16Sx8:
   2521       case Iop_QNarrowBin32Uto16Ux8:
   2522          return Iop_NarrowBin32to16x8;
   2523       /* Binary: (64, 64) -> 64 */
   2524       case Iop_QNarrowBin32Sto16Sx4:
   2525          return Iop_NarrowBin32to16x4;
   2526       case Iop_QNarrowBin16Sto8Ux8:
   2527       case Iop_QNarrowBin16Sto8Sx8:
   2528          return Iop_NarrowBin16to8x8;
   2529       /* Unary: 128 -> 64 */
   2530       case Iop_QNarrowUn64Uto32Ux2:
   2531       case Iop_QNarrowUn64Sto32Sx2:
   2532       case Iop_QNarrowUn64Sto32Ux2:
   2533          return Iop_NarrowUn64to32x2;
   2534       case Iop_QNarrowUn32Uto16Ux4:
   2535       case Iop_QNarrowUn32Sto16Sx4:
   2536       case Iop_QNarrowUn32Sto16Ux4:
   2537       case Iop_F32toF16x4:
   2538          return Iop_NarrowUn32to16x4;
   2539       case Iop_QNarrowUn16Uto8Ux8:
   2540       case Iop_QNarrowUn16Sto8Sx8:
   2541       case Iop_QNarrowUn16Sto8Ux8:
   2542          return Iop_NarrowUn16to8x8;
   2543       default:
   2544          ppIROp(qnarrowOp);
   2545          VG_(tool_panic)("vanillaNarrowOpOfShape");
   2546    }
   2547 }
   2548 
   2549 static
   2550 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
   2551                               IRAtom* vatom1, IRAtom* vatom2)
   2552 {
   2553    IRAtom *at1, *at2, *at3;
   2554    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2555    switch (narrow_op) {
   2556       case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
   2557       case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
   2558       case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
   2559       case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
   2560       case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
   2561       case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
   2562       case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
   2563       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
   2564       default: VG_(tool_panic)("vectorNarrowBinV128");
   2565    }
   2566    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2567    tl_assert(isShadowAtom(mce,vatom1));
   2568    tl_assert(isShadowAtom(mce,vatom2));
   2569    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2570    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
   2571    at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
   2572    return at3;
   2573 }
   2574 
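/* Example: for Iop_QNarrowBin32Sto16Ux8 the code above produces

      NarrowBin32to16x8( CmpNEZ32x4(vatom1), CmpNEZ32x4(vatom2) )

   Each source 32-bit lane is first pessimised to all-0s or all-1s, and the
   vanilla narrowing then keeps the (uniform) low half of every lane, so
   each 16-bit output lane ends up all-defined or all-undefined according
   to its source lane. */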
   2575 static
   2576 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
   2577                             IRAtom* vatom1, IRAtom* vatom2)
   2578 {
   2579    IRAtom *at1, *at2, *at3;
   2580    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2581    switch (narrow_op) {
   2582       case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
   2583       case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
   2584       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
   2585       default: VG_(tool_panic)("vectorNarrowBin64");
   2586    }
   2587    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2588    tl_assert(isShadowAtom(mce,vatom1));
   2589    tl_assert(isShadowAtom(mce,vatom2));
   2590    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
   2591    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
   2592    at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
   2593    return at3;
   2594 }
   2595 
   2596 static
   2597 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
   2598                              IRAtom* vatom1)
   2599 {
   2600    IRAtom *at1, *at2;
   2601    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2602    tl_assert(isShadowAtom(mce,vatom1));
   2603    /* For vanilla narrowing (non-saturating), we can just apply
   2604       the op directly to the V bits. */
   2605    switch (narrow_op) {
   2606       case Iop_NarrowUn16to8x8:
   2607       case Iop_NarrowUn32to16x4:
   2608       case Iop_NarrowUn64to32x2:
   2609       case Iop_F32toF16x4:
   2610          at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
   2611          return at1;
   2612       default:
   2613          break; /* Do Plan B */
   2614    }
   2615    /* Plan B: for ops that involve a saturation operation on the args,
   2616       we must PCast before the vanilla narrow. */
   2617    switch (narrow_op) {
   2618       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
   2619       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
   2620       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
   2621       case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
   2622       case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
   2623       case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
   2624       case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
   2625       case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
   2626       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
   2627       default: VG_(tool_panic)("vectorNarrowUnV128");
   2628    }
   2629    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2630    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2631    at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
   2632    return at2;
   2633 }
   2634 
   2635 static
   2636 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
   2637                          IRAtom* vatom1)
   2638 {
   2639    IRAtom *at1, *at2;
   2640    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2641    switch (longen_op) {
   2642       case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
   2643       case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
   2644       case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
   2645       case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
   2646       case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
   2647       case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
   2648       case Iop_F16toF32x4:     pcast = mkPCast32x4; break;
   2649       default: VG_(tool_panic)("vectorWidenI64");
   2650    }
   2651    tl_assert(isShadowAtom(mce,vatom1));
   2652    at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
   2653    at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
   2654    return at2;
   2655 }
   2656 
   2657 
   2658 /* --- --- Vector integer arithmetic --- --- */
   2659 
   2660 /* Simple ... UifU the args and per-lane pessimise the results. */
   2661 
   2662 /* --- V256-bit versions --- */
   2663 
   2664 static
   2665 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2666 {
   2667    IRAtom* at;
   2668    at = mkUifUV256(mce, vatom1, vatom2);
   2669    at = mkPCast8x32(mce, at);
   2670    return at;
   2671 }
   2672 
   2673 static
   2674 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2675 {
   2676    IRAtom* at;
   2677    at = mkUifUV256(mce, vatom1, vatom2);
   2678    at = mkPCast16x16(mce, at);
   2679    return at;
   2680 }
   2681 
   2682 static
   2683 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2684 {
   2685    IRAtom* at;
   2686    at = mkUifUV256(mce, vatom1, vatom2);
   2687    at = mkPCast32x8(mce, at);
   2688    return at;
   2689 }
   2690 
   2691 static
   2692 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2693 {
   2694    IRAtom* at;
   2695    at = mkUifUV256(mce, vatom1, vatom2);
   2696    at = mkPCast64x4(mce, at);
   2697    return at;
   2698 }
   2699 
   2700 /* --- V128-bit versions --- */
   2701 
   2702 static
   2703 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2704 {
   2705    IRAtom* at;
   2706    at = mkUifUV128(mce, vatom1, vatom2);
   2707    at = mkPCast8x16(mce, at);
   2708    return at;
   2709 }
   2710 
   2711 static
   2712 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2713 {
   2714    IRAtom* at;
   2715    at = mkUifUV128(mce, vatom1, vatom2);
   2716    at = mkPCast16x8(mce, at);
   2717    return at;
   2718 }
   2719 
   2720 static
   2721 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2722 {
   2723    IRAtom* at;
   2724    at = mkUifUV128(mce, vatom1, vatom2);
   2725    at = mkPCast32x4(mce, at);
   2726    return at;
   2727 }
   2728 
   2729 static
   2730 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2731 {
   2732    IRAtom* at;
   2733    at = mkUifUV128(mce, vatom1, vatom2);
   2734    at = mkPCast64x2(mce, at);
   2735    return at;
   2736 }
   2737 
   2738 /* --- 64-bit versions --- */
   2739 
   2740 static
   2741 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2742 {
   2743    IRAtom* at;
   2744    at = mkUifU64(mce, vatom1, vatom2);
   2745    at = mkPCast8x8(mce, at);
   2746    return at;
   2747 }
   2748 
   2749 static
   2750 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2751 {
   2752    IRAtom* at;
   2753    at = mkUifU64(mce, vatom1, vatom2);
   2754    at = mkPCast16x4(mce, at);
   2755    return at;
   2756 }
   2757 
   2758 static
   2759 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2760 {
   2761    IRAtom* at;
   2762    at = mkUifU64(mce, vatom1, vatom2);
   2763    at = mkPCast32x2(mce, at);
   2764    return at;
   2765 }
   2766 
   2767 static
   2768 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2769 {
   2770    IRAtom* at;
   2771    at = mkUifU64(mce, vatom1, vatom2);
   2772    at = mkPCastTo(mce, Ity_I64, at);
   2773    return at;
   2774 }
   2775 
   2776 /* --- 32-bit versions --- */
   2777 
   2778 static
   2779 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2780 {
   2781    IRAtom* at;
   2782    at = mkUifU32(mce, vatom1, vatom2);
   2783    at = mkPCast8x4(mce, at);
   2784    return at;
   2785 }
   2786 
   2787 static
   2788 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2789 {
   2790    IRAtom* at;
   2791    at = mkUifU32(mce, vatom1, vatom2);
   2792    at = mkPCast16x2(mce, at);
   2793    return at;
   2794 }
   2795 
   2796 
   2797 /*------------------------------------------------------------*/
   2798 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
   2799 /*------------------------------------------------------------*/
   2800 
   2801 static
   2802 IRAtom* expr2vbits_Qop ( MCEnv* mce,
   2803                          IROp op,
   2804                          IRAtom* atom1, IRAtom* atom2,
   2805                          IRAtom* atom3, IRAtom* atom4 )
   2806 {
   2807    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2808    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2809    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2810    IRAtom* vatom4 = expr2vbits( mce, atom4 );
   2811 
   2812    tl_assert(isOriginalAtom(mce,atom1));
   2813    tl_assert(isOriginalAtom(mce,atom2));
   2814    tl_assert(isOriginalAtom(mce,atom3));
   2815    tl_assert(isOriginalAtom(mce,atom4));
   2816    tl_assert(isShadowAtom(mce,vatom1));
   2817    tl_assert(isShadowAtom(mce,vatom2));
   2818    tl_assert(isShadowAtom(mce,vatom3));
   2819    tl_assert(isShadowAtom(mce,vatom4));
   2820    tl_assert(sameKindedAtoms(atom1,vatom1));
   2821    tl_assert(sameKindedAtoms(atom2,vatom2));
   2822    tl_assert(sameKindedAtoms(atom3,vatom3));
   2823    tl_assert(sameKindedAtoms(atom4,vatom4));
   2824    switch (op) {
   2825       case Iop_MAddF64:
   2826       case Iop_MAddF64r32:
   2827       case Iop_MSubF64:
   2828       case Iop_MSubF64r32:
   2829          /* I32(rm) x F64 x F64 x F64 -> F64 */
   2830          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
   2831 
   2832       case Iop_MAddF32:
   2833       case Iop_MSubF32:
   2834          /* I32(rm) x F32 x F32 x F32 -> F32 */
   2835          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
   2836 
   2837       case Iop_MAddF128:
   2838       case Iop_MSubF128:
   2839       case Iop_NegMAddF128:
   2840       case Iop_NegMSubF128:
   2841          /* I32(rm) x F128 x F128 x F128 -> F128 */
   2842          return mkLazy4(mce, Ity_I128, vatom1, vatom2, vatom3, vatom4);
   2843 
   2844       /* V256-bit data-steering */
   2845       case Iop_64x4toV256:
   2846          return assignNew('V', mce, Ity_V256,
   2847                           IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
   2848 
   2849       default:
   2850          ppIROp(op);
   2851          VG_(tool_panic)("memcheck:expr2vbits_Qop");
   2852    }
   2853 }
   2854 
   2855 
   2856 static
   2857 IRAtom* expr2vbits_Triop ( MCEnv* mce,
   2858                            IROp op,
   2859                            IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
   2860 {
   2861    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2862    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2863    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2864 
   2865    tl_assert(isOriginalAtom(mce,atom1));
   2866    tl_assert(isOriginalAtom(mce,atom2));
   2867    tl_assert(isOriginalAtom(mce,atom3));
   2868    tl_assert(isShadowAtom(mce,vatom1));
   2869    tl_assert(isShadowAtom(mce,vatom2));
   2870    tl_assert(isShadowAtom(mce,vatom3));
   2871    tl_assert(sameKindedAtoms(atom1,vatom1));
   2872    tl_assert(sameKindedAtoms(atom2,vatom2));
   2873    tl_assert(sameKindedAtoms(atom3,vatom3));
   2874    switch (op) {
   2875       case Iop_AddF128:
   2876       case Iop_SubF128:
   2877       case Iop_MulF128:
   2878       case Iop_DivF128:
   2879       case Iop_AddD128:
   2880       case Iop_SubD128:
   2881       case Iop_MulD128:
   2882       case Iop_DivD128:
   2883       case Iop_QuantizeD128:
   2884          /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
   2885          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2886       case Iop_AddF64:
   2887       case Iop_AddD64:
   2888       case Iop_AddF64r32:
   2889       case Iop_SubF64:
   2890       case Iop_SubD64:
   2891       case Iop_SubF64r32:
   2892       case Iop_MulF64:
   2893       case Iop_MulD64:
   2894       case Iop_MulF64r32:
   2895       case Iop_DivF64:
   2896       case Iop_DivD64:
   2897       case Iop_DivF64r32:
   2898       case Iop_ScaleF64:
   2899       case Iop_Yl2xF64:
   2900       case Iop_Yl2xp1F64:
   2901       case Iop_AtanF64:
   2902       case Iop_PRemF64:
   2903       case Iop_PRem1F64:
   2904       case Iop_QuantizeD64:
   2905          /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
   2906          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2907       case Iop_PRemC3210F64:
   2908       case Iop_PRem1C3210F64:
   2909          /* I32(rm) x F64 x F64 -> I32 */
   2910          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
   2911       case Iop_AddF32:
   2912       case Iop_SubF32:
   2913       case Iop_MulF32:
   2914       case Iop_DivF32:
    2915          /* I32(rm) x F32 x F32 -> F32 */
   2916          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
   2917       case Iop_SignificanceRoundD64:
   2918          /* IRRoundingMode(I32) x I8 x D64 -> D64 */
   2919          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2920       case Iop_SignificanceRoundD128:
   2921          /* IRRoundingMode(I32) x I8 x D128 -> D128 */
   2922          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2923       case Iop_SliceV128:
   2924          /* (V128, V128, I8) -> V128 */
   2925          complainIfUndefined(mce, atom3, NULL);
   2926          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2927       case Iop_Slice64:
   2928          /* (I64, I64, I8) -> I64 */
   2929          complainIfUndefined(mce, atom3, NULL);
   2930          return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
   2931       case Iop_SetElem8x8:
   2932       case Iop_SetElem16x4:
   2933       case Iop_SetElem32x2:
   2934          complainIfUndefined(mce, atom2, NULL);
   2935          return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
   2936 
   2937       /* Vector FP with rounding mode as the first arg */
   2938       case Iop_Add64Fx2:
   2939       case Iop_Sub64Fx2:
   2940       case Iop_Mul64Fx2:
   2941       case Iop_Div64Fx2:
   2942          return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
   2943 
   2944       case Iop_Add32Fx4:
   2945       case Iop_Sub32Fx4:
   2946       case Iop_Mul32Fx4:
   2947       case Iop_Div32Fx4:
    2948          return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2949 
   2950       case Iop_Add64Fx4:
   2951       case Iop_Sub64Fx4:
   2952       case Iop_Mul64Fx4:
   2953       case Iop_Div64Fx4:
   2954          return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2955 
   2956       case Iop_Add32Fx8:
   2957       case Iop_Sub32Fx8:
   2958       case Iop_Mul32Fx8:
   2959       case Iop_Div32Fx8:
   2960          return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
   2961 
   2962       default:
   2963          ppIROp(op);
   2964          VG_(tool_panic)("memcheck:expr2vbits_Triop");
   2965    }
   2966 }
   2967 
   2968 
   2969 static
   2970 IRAtom* expr2vbits_Binop ( MCEnv* mce,
   2971                            IROp op,
   2972                            IRAtom* atom1, IRAtom* atom2 )
   2973 {
   2974    IRType  and_or_ty;
   2975    IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
   2976    IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
   2977    IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
   2978 
   2979    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2980    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2981 
   2982    tl_assert(isOriginalAtom(mce,atom1));
   2983    tl_assert(isOriginalAtom(mce,atom2));
   2984    tl_assert(isShadowAtom(mce,vatom1));
   2985    tl_assert(isShadowAtom(mce,vatom2));
   2986    tl_assert(sameKindedAtoms(atom1,vatom1));
   2987    tl_assert(sameKindedAtoms(atom2,vatom2));
   2988    switch (op) {
   2989 
   2990       /* 32-bit SIMD */
   2991 
   2992       case Iop_Add16x2:
   2993       case Iop_HAdd16Ux2:
   2994       case Iop_HAdd16Sx2:
   2995       case Iop_Sub16x2:
   2996       case Iop_HSub16Ux2:
   2997       case Iop_HSub16Sx2:
   2998       case Iop_QAdd16Sx2:
   2999       case Iop_QSub16Sx2:
   3000       case Iop_QSub16Ux2:
   3001       case Iop_QAdd16Ux2:
   3002          return binary16Ix2(mce, vatom1, vatom2);
   3003 
   3004       case Iop_Add8x4:
   3005       case Iop_HAdd8Ux4:
   3006       case Iop_HAdd8Sx4:
   3007       case Iop_Sub8x4:
   3008       case Iop_HSub8Ux4:
   3009       case Iop_HSub8Sx4:
   3010       case Iop_QSub8Ux4:
   3011       case Iop_QAdd8Ux4:
   3012       case Iop_QSub8Sx4:
   3013       case Iop_QAdd8Sx4:
   3014          return binary8Ix4(mce, vatom1, vatom2);
   3015 
   3016       /* 64-bit SIMD */
   3017 
   3018       case Iop_ShrN8x8:
   3019       case Iop_ShrN16x4:
   3020       case Iop_ShrN32x2:
   3021       case Iop_SarN8x8:
   3022       case Iop_SarN16x4:
   3023       case Iop_SarN32x2:
   3024       case Iop_ShlN16x4:
   3025       case Iop_ShlN32x2:
   3026       case Iop_ShlN8x8:
   3027          /* Same scheme as with all other shifts. */
   3028          complainIfUndefined(mce, atom2, NULL);
   3029          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   3030 
   3031       case Iop_QNarrowBin32Sto16Sx4:
   3032       case Iop_QNarrowBin16Sto8Sx8:
   3033       case Iop_QNarrowBin16Sto8Ux8:
   3034          return vectorNarrowBin64(mce, op, vatom1, vatom2);
   3035 
   3036       case Iop_Min8Ux8:
   3037       case Iop_Min8Sx8:
   3038       case Iop_Max8Ux8:
   3039       case Iop_Max8Sx8:
   3040       case Iop_Avg8Ux8:
   3041       case Iop_QSub8Sx8:
   3042       case Iop_QSub8Ux8:
   3043       case Iop_Sub8x8:
   3044       case Iop_CmpGT8Sx8:
   3045       case Iop_CmpGT8Ux8:
   3046       case Iop_CmpEQ8x8:
   3047       case Iop_QAdd8Sx8:
   3048       case Iop_QAdd8Ux8:
   3049       case Iop_QSal8x8:
   3050       case Iop_QShl8x8:
   3051       case Iop_Add8x8:
   3052       case Iop_Mul8x8:
   3053       case Iop_PolynomialMul8x8:
   3054          return binary8Ix8(mce, vatom1, vatom2);
   3055 
   3056       case Iop_Min16Sx4:
   3057       case Iop_Min16Ux4:
   3058       case Iop_Max16Sx4:
   3059       case Iop_Max16Ux4:
   3060       case Iop_Avg16Ux4:
   3061       case Iop_QSub16Ux4:
   3062       case Iop_QSub16Sx4:
   3063       case Iop_Sub16x4:
   3064       case Iop_Mul16x4:
   3065       case Iop_MulHi16Sx4:
   3066       case Iop_MulHi16Ux4:
   3067       case Iop_CmpGT16Sx4:
   3068       case Iop_CmpGT16Ux4:
   3069       case Iop_CmpEQ16x4:
   3070       case Iop_QAdd16Sx4:
   3071       case Iop_QAdd16Ux4:
   3072       case Iop_QSal16x4:
   3073       case Iop_QShl16x4:
   3074       case Iop_Add16x4:
   3075       case Iop_QDMulHi16Sx4:
   3076       case Iop_QRDMulHi16Sx4:
   3077          return binary16Ix4(mce, vatom1, vatom2);
   3078 
   3079       case Iop_Sub32x2:
   3080       case Iop_Mul32x2:
   3081       case Iop_Max32Sx2:
   3082       case Iop_Max32Ux2:
   3083       case Iop_Min32Sx2:
   3084       case Iop_Min32Ux2:
   3085       case Iop_CmpGT32Sx2:
   3086       case Iop_CmpGT32Ux2:
   3087       case Iop_CmpEQ32x2:
   3088       case Iop_Add32x2:
   3089       case Iop_QAdd32Ux2:
   3090       case Iop_QAdd32Sx2:
   3091       case Iop_QSub32Ux2:
   3092       case Iop_QSub32Sx2:
   3093       case Iop_QSal32x2:
   3094       case Iop_QShl32x2:
   3095       case Iop_QDMulHi32Sx2:
   3096       case Iop_QRDMulHi32Sx2:
   3097          return binary32Ix2(mce, vatom1, vatom2);
   3098 
   3099       case Iop_QSub64Ux1:
   3100       case Iop_QSub64Sx1:
   3101       case Iop_QAdd64Ux1:
   3102       case Iop_QAdd64Sx1:
   3103       case Iop_QSal64x1:
   3104       case Iop_QShl64x1:
   3105       case Iop_Sal64x1:
   3106          return binary64Ix1(mce, vatom1, vatom2);
   3107 
   3108       case Iop_QShlNsatSU8x8:
   3109       case Iop_QShlNsatUU8x8:
   3110       case Iop_QShlNsatSS8x8:
   3111          complainIfUndefined(mce, atom2, NULL);
   3112          return mkPCast8x8(mce, vatom1);
   3113 
   3114       case Iop_QShlNsatSU16x4:
   3115       case Iop_QShlNsatUU16x4:
   3116       case Iop_QShlNsatSS16x4:
   3117          complainIfUndefined(mce, atom2, NULL);
   3118          return mkPCast16x4(mce, vatom1);
   3119 
   3120       case Iop_QShlNsatSU32x2:
   3121       case Iop_QShlNsatUU32x2:
   3122       case Iop_QShlNsatSS32x2:
   3123          complainIfUndefined(mce, atom2, NULL);
   3124          return mkPCast32x2(mce, vatom1);
   3125 
   3126       case Iop_QShlNsatSU64x1:
   3127       case Iop_QShlNsatUU64x1:
   3128       case Iop_QShlNsatSS64x1:
   3129          complainIfUndefined(mce, atom2, NULL);
    3130          return mkPCastTo(mce, Ity_I64, vatom1);
   3131 
   3132       case Iop_PwMax32Sx2:
   3133       case Iop_PwMax32Ux2:
   3134       case Iop_PwMin32Sx2:
   3135       case Iop_PwMin32Ux2:
   3136       case Iop_PwMax32Fx2:
   3137       case Iop_PwMin32Fx2:
   3138          return assignNew('V', mce, Ity_I64,
   3139                           binop(Iop_PwMax32Ux2,
   3140                                 mkPCast32x2(mce, vatom1),
   3141                                 mkPCast32x2(mce, vatom2)));
   3142 
   3143       case Iop_PwMax16Sx4:
   3144       case Iop_PwMax16Ux4:
   3145       case Iop_PwMin16Sx4:
   3146       case Iop_PwMin16Ux4:
   3147          return assignNew('V', mce, Ity_I64,
   3148                           binop(Iop_PwMax16Ux4,
   3149                                 mkPCast16x4(mce, vatom1),
   3150                                 mkPCast16x4(mce, vatom2)));
   3151 
   3152       case Iop_PwMax8Sx8:
   3153       case Iop_PwMax8Ux8:
   3154       case Iop_PwMin8Sx8:
   3155       case Iop_PwMin8Ux8:
   3156          return assignNew('V', mce, Ity_I64,
   3157                           binop(Iop_PwMax8Ux8,
   3158                                 mkPCast8x8(mce, vatom1),
   3159                                 mkPCast8x8(mce, vatom2)));
   3160 
   3161       case Iop_PwAdd32x2:
   3162       case Iop_PwAdd32Fx2:
   3163          return mkPCast32x2(mce,
   3164                assignNew('V', mce, Ity_I64,
   3165                          binop(Iop_PwAdd32x2,
   3166                                mkPCast32x2(mce, vatom1),
   3167                                mkPCast32x2(mce, vatom2))));
   3168 
   3169       case Iop_PwAdd16x4:
   3170          return mkPCast16x4(mce,
   3171                assignNew('V', mce, Ity_I64,
   3172                          binop(op, mkPCast16x4(mce, vatom1),
   3173                                    mkPCast16x4(mce, vatom2))));
   3174 
   3175       case Iop_PwAdd8x8:
   3176          return mkPCast8x8(mce,
   3177                assignNew('V', mce, Ity_I64,
   3178                          binop(op, mkPCast8x8(mce, vatom1),
   3179                                    mkPCast8x8(mce, vatom2))));
   3180 
   3181       case Iop_Shl8x8:
   3182       case Iop_Shr8x8:
   3183       case Iop_Sar8x8:
   3184       case Iop_Sal8x8:
   3185          return mkUifU64(mce,
   3186                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3187                    mkPCast8x8(mce,vatom2)
   3188                 );
   3189 
   3190       case Iop_Shl16x4:
   3191       case Iop_Shr16x4:
   3192       case Iop_Sar16x4:
   3193       case Iop_Sal16x4:
   3194          return mkUifU64(mce,
   3195                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3196                    mkPCast16x4(mce,vatom2)
   3197                 );
   3198 
   3199       case Iop_Shl32x2:
   3200       case Iop_Shr32x2:
   3201       case Iop_Sar32x2:
   3202       case Iop_Sal32x2:
   3203          return mkUifU64(mce,
   3204                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3205                    mkPCast32x2(mce,vatom2)
   3206                 );
   3207 
   3208       /* 64-bit data-steering */
   3209       case Iop_InterleaveLO32x2:
   3210       case Iop_InterleaveLO16x4:
   3211       case Iop_InterleaveLO8x8:
   3212       case Iop_InterleaveHI32x2:
   3213       case Iop_InterleaveHI16x4:
   3214       case Iop_InterleaveHI8x8:
   3215       case Iop_CatOddLanes8x8:
   3216       case Iop_CatEvenLanes8x8:
   3217       case Iop_CatOddLanes16x4:
   3218       case Iop_CatEvenLanes16x4:
   3219       case Iop_InterleaveOddLanes8x8:
   3220       case Iop_InterleaveEvenLanes8x8:
   3221       case Iop_InterleaveOddLanes16x4:
   3222       case Iop_InterleaveEvenLanes16x4:
   3223          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3224 
   3225       case Iop_GetElem8x8:
   3226          complainIfUndefined(mce, atom2, NULL);
   3227          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3228       case Iop_GetElem16x4:
   3229          complainIfUndefined(mce, atom2, NULL);
   3230          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3231       case Iop_GetElem32x2:
   3232          complainIfUndefined(mce, atom2, NULL);
   3233          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3234 
   3235       /* Perm8x8: rearrange values in left arg using steering values
   3236         from right arg.  So rearrange the vbits in the same way but
   3237         pessimise wrt steering values. */
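               /* Concretely: lane k of the result shadow is the data-shadow byte
                  selected by the concrete steering byte k, UifU'd with the PCast
                  of steering-shadow byte k; any partially-undefined steering
                  byte therefore makes the whole result lane undefined. */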
   3238       case Iop_Perm8x8:
   3239          return mkUifU64(
   3240                    mce,
   3241                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3242                    mkPCast8x8(mce, vatom2)
   3243                 );
   3244 
   3245       /* V128-bit SIMD */
   3246 
   3247       case Iop_Sqrt32Fx4:
   3248          return unary32Fx4_w_rm(mce, vatom1, vatom2);
   3249       case Iop_Sqrt64Fx2:
   3250          return unary64Fx2_w_rm(mce, vatom1, vatom2);
   3251 
   3252       case Iop_ShrN8x16:
   3253       case Iop_ShrN16x8:
   3254       case Iop_ShrN32x4:
   3255       case Iop_ShrN64x2:
   3256       case Iop_SarN8x16:
   3257       case Iop_SarN16x8:
   3258       case Iop_SarN32x4:
   3259       case Iop_SarN64x2:
   3260       case Iop_ShlN8x16:
   3261       case Iop_ShlN16x8:
   3262       case Iop_ShlN32x4:
   3263       case Iop_ShlN64x2:
   3264          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
   3265             this is wrong now, scalar shifts are done properly lazily.
   3266             Vector shifts should be fixed too. */
   3267          complainIfUndefined(mce, atom2, NULL);
   3268          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3269 
   3270       /* V x V shifts/rotates are done using the standard lazy scheme. */
   3271       /* For the non-rounding variants of bi-di vector x vector
   3272          shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
   3273          But note that this is overly pessimistic, because in fact only
   3274          the bottom 8 bits of each lane of the second argument are taken
   3275          into account when shifting.  So really we ought to ignore
   3276          undefinedness in bits 8 and above of each lane in the
   3277          second argument. */
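               /* Concretely, for each group below the result shadow is the UifU
                  of (i) the op applied to the data shadow and the original
                  (concrete) shift amounts, and (ii) the per-lane PCast of the
                  shift-amount shadow, so any lane whose shift amount contains an
                  undefined bit comes out all-undefined. */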
   3278       case Iop_Shl8x16:
   3279       case Iop_Shr8x16:
   3280       case Iop_Sar8x16:
   3281       case Iop_Sal8x16:
   3282       case Iop_Rol8x16:
   3283       case Iop_Sh8Sx16:
   3284       case Iop_Sh8Ux16:
   3285          return mkUifUV128(mce,
   3286                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3287                    mkPCast8x16(mce,vatom2)
   3288                 );
   3289 
   3290       case Iop_Shl16x8:
   3291       case Iop_Shr16x8:
   3292       case Iop_Sar16x8:
   3293       case Iop_Sal16x8:
   3294       case Iop_Rol16x8:
   3295       case Iop_Sh16Sx8:
   3296       case Iop_Sh16Ux8:
   3297          return mkUifUV128(mce,
   3298                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3299                    mkPCast16x8(mce,vatom2)
   3300                 );
   3301 
   3302       case Iop_Shl32x4:
   3303       case Iop_Shr32x4:
   3304       case Iop_Sar32x4:
   3305       case Iop_Sal32x4:
   3306       case Iop_Rol32x4:
   3307       case Iop_Sh32Sx4:
   3308       case Iop_Sh32Ux4:
   3309          return mkUifUV128(mce,
   3310                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3311                    mkPCast32x4(mce,vatom2)
   3312                 );
   3313 
   3314       case Iop_Shl64x2:
   3315       case Iop_Shr64x2:
   3316       case Iop_Sar64x2:
   3317       case Iop_Sal64x2:
   3318       case Iop_Rol64x2:
   3319       case Iop_Sh64Sx2:
   3320       case Iop_Sh64Ux2:
   3321          return mkUifUV128(mce,
   3322                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3323                    mkPCast64x2(mce,vatom2)
   3324                 );
   3325 
   3326       /* For the rounding variants of bi-di vector x vector shifts, the
   3327          rounding adjustment can cause undefinedness to propagate through
   3328          the entire lane, in the worst case.  Too complex to handle
   3329          properly .. just UifU the arguments and then PCast them.
   3330          Suboptimal but safe. */
   3331       case Iop_Rsh8Sx16:
   3332       case Iop_Rsh8Ux16:
   3333          return binary8Ix16(mce, vatom1, vatom2);
   3334       case Iop_Rsh16Sx8:
   3335       case Iop_Rsh16Ux8:
   3336          return binary16Ix8(mce, vatom1, vatom2);
   3337       case Iop_Rsh32Sx4:
   3338       case Iop_Rsh32Ux4:
   3339          return binary32Ix4(mce, vatom1, vatom2);
   3340       case Iop_Rsh64Sx2:
   3341       case Iop_Rsh64Ux2:
   3342          return binary64Ix2(mce, vatom1, vatom2);
   3343 
   3344       case Iop_F32ToFixed32Ux4_RZ:
   3345       case Iop_F32ToFixed32Sx4_RZ:
   3346       case Iop_Fixed32UToF32x4_RN:
   3347       case Iop_Fixed32SToF32x4_RN:
   3348          complainIfUndefined(mce, atom2, NULL);
   3349          return mkPCast32x4(mce, vatom1);
   3350 
   3351       case Iop_F32ToFixed32Ux2_RZ:
   3352       case Iop_F32ToFixed32Sx2_RZ:
   3353       case Iop_Fixed32UToF32x2_RN:
   3354       case Iop_Fixed32SToF32x2_RN:
   3355          complainIfUndefined(mce, atom2, NULL);
   3356          return mkPCast32x2(mce, vatom1);
   3357 
   3358       case Iop_QSub8Ux16:
   3359       case Iop_QSub8Sx16:
   3360       case Iop_Sub8x16:
   3361       case Iop_Min8Ux16:
   3362       case Iop_Min8Sx16:
   3363       case Iop_Max8Ux16:
   3364       case Iop_Max8Sx16:
   3365       case Iop_CmpGT8Sx16:
   3366       case Iop_CmpGT8Ux16:
   3367       case Iop_CmpEQ8x16:
   3368       case Iop_Avg8Ux16:
   3369       case Iop_Avg8Sx16:
   3370       case Iop_QAdd8Ux16:
   3371       case Iop_QAdd8Sx16:
   3372       case Iop_QAddExtUSsatSS8x16:
   3373       case Iop_QAddExtSUsatUU8x16:
   3374       case Iop_QSal8x16:
   3375       case Iop_QShl8x16:
   3376       case Iop_Add8x16:
   3377       case Iop_Mul8x16:
   3378       case Iop_PolynomialMul8x16:
   3379       case Iop_PolynomialMulAdd8x16:
   3380          return binary8Ix16(mce, vatom1, vatom2);
   3381 
   3382       case Iop_QSub16Ux8:
   3383       case Iop_QSub16Sx8:
   3384       case Iop_Sub16x8:
   3385       case Iop_Mul16x8:
   3386       case Iop_MulHi16Sx8:
   3387       case Iop_MulHi16Ux8:
   3388       case Iop_Min16Sx8:
   3389       case Iop_Min16Ux8:
   3390       case Iop_Max16Sx8:
   3391       case Iop_Max16Ux8:
   3392       case Iop_CmpGT16Sx8:
   3393       case Iop_CmpGT16Ux8:
   3394       case Iop_CmpEQ16x8:
   3395       case Iop_Avg16Ux8:
   3396       case Iop_Avg16Sx8:
   3397       case Iop_QAdd16Ux8:
   3398       case Iop_QAdd16Sx8:
   3399       case Iop_QAddExtUSsatSS16x8:
   3400       case Iop_QAddExtSUsatUU16x8:
   3401       case Iop_QSal16x8:
   3402       case Iop_QShl16x8:
   3403       case Iop_Add16x8:
   3404       case Iop_QDMulHi16Sx8:
   3405       case Iop_QRDMulHi16Sx8:
   3406       case Iop_PolynomialMulAdd16x8:
   3407          return binary16Ix8(mce, vatom1, vatom2);
   3408 
   3409       case Iop_Sub32x4:
   3410       case Iop_CmpGT32Sx4:
   3411       case Iop_CmpGT32Ux4:
   3412       case Iop_CmpEQ32x4:
   3413       case Iop_QAdd32Sx4:
   3414       case Iop_QAdd32Ux4:
   3415       case Iop_QSub32Sx4:
   3416       case Iop_QSub32Ux4:
   3417       case Iop_QAddExtUSsatSS32x4:
   3418       case Iop_QAddExtSUsatUU32x4:
   3419       case Iop_QSal32x4:
   3420       case Iop_QShl32x4:
   3421       case Iop_Avg32Ux4:
   3422       case Iop_Avg32Sx4:
   3423       case Iop_Add32x4:
   3424       case Iop_Max32Ux4:
   3425       case Iop_Max32Sx4:
   3426       case Iop_Min32Ux4:
   3427       case Iop_Min32Sx4:
   3428       case Iop_Mul32x4:
   3429       case Iop_QDMulHi32Sx4:
   3430       case Iop_QRDMulHi32Sx4:
   3431       case Iop_PolynomialMulAdd32x4:
   3432          return binary32Ix4(mce, vatom1, vatom2);
   3433 
   3434       case Iop_Sub64x2:
   3435       case Iop_Add64x2:
   3436       case Iop_Max64Sx2:
   3437       case Iop_Max64Ux2:
   3438       case Iop_Min64Sx2:
   3439       case Iop_Min64Ux2:
   3440       case Iop_CmpEQ64x2:
   3441       case Iop_CmpGT64Sx2:
   3442       case Iop_CmpGT64Ux2:
   3443       case Iop_QSal64x2:
   3444       case Iop_QShl64x2:
   3445       case Iop_QAdd64Ux2:
   3446       case Iop_QAdd64Sx2:
   3447       case Iop_QSub64Ux2:
   3448       case Iop_QSub64Sx2:
   3449       case Iop_QAddExtUSsatSS64x2:
   3450       case Iop_QAddExtSUsatUU64x2:
   3451       case Iop_PolynomialMulAdd64x2:
   3452       case Iop_CipherV128:
   3453       case Iop_CipherLV128:
   3454       case Iop_NCipherV128:
   3455       case Iop_NCipherLV128:
   3456       case Iop_MulI128by10E:
   3457       case Iop_MulI128by10ECarry:
    3458          return binary64Ix2(mce, vatom1, vatom2);
   3459 
   3460       case Iop_QNarrowBin64Sto32Sx4:
   3461       case Iop_QNarrowBin64Uto32Ux4:
   3462       case Iop_QNarrowBin32Sto16Sx8:
   3463       case Iop_QNarrowBin32Uto16Ux8:
   3464       case Iop_QNarrowBin32Sto16Ux8:
   3465       case Iop_QNarrowBin16Sto8Sx16:
   3466       case Iop_QNarrowBin16Uto8Ux16:
   3467       case Iop_QNarrowBin16Sto8Ux16:
   3468          return vectorNarrowBinV128(mce, op, vatom1, vatom2);
   3469 
   3470       case Iop_Min64Fx2:
   3471       case Iop_Max64Fx2:
   3472       case Iop_CmpLT64Fx2:
   3473       case Iop_CmpLE64Fx2:
   3474       case Iop_CmpEQ64Fx2:
   3475       case Iop_CmpUN64Fx2:
   3476       case Iop_RecipStep64Fx2:
   3477       case Iop_RSqrtStep64Fx2:
   3478          return binary64Fx2(mce, vatom1, vatom2);
   3479 
   3480       case Iop_Sub64F0x2:
   3481       case Iop_Mul64F0x2:
   3482       case Iop_Min64F0x2:
   3483       case Iop_Max64F0x2:
   3484       case Iop_Div64F0x2:
   3485       case Iop_CmpLT64F0x2:
   3486       case Iop_CmpLE64F0x2:
   3487       case Iop_CmpEQ64F0x2:
   3488       case Iop_CmpUN64F0x2:
   3489       case Iop_Add64F0x2:
   3490          return binary64F0x2(mce, vatom1, vatom2);
   3491 
   3492       case Iop_Min32Fx4:
   3493       case Iop_Max32Fx4:
   3494       case Iop_CmpLT32Fx4:
   3495       case Iop_CmpLE32Fx4:
   3496       case Iop_CmpEQ32Fx4:
   3497       case Iop_CmpUN32Fx4:
   3498       case Iop_CmpGT32Fx4:
   3499       case Iop_CmpGE32Fx4:
   3500       case Iop_RecipStep32Fx4:
   3501       case Iop_RSqrtStep32Fx4:
   3502          return binary32Fx4(mce, vatom1, vatom2);
   3503 
   3504       case Iop_Sub32Fx2:
   3505       case Iop_Mul32Fx2:
   3506       case Iop_Min32Fx2:
   3507       case Iop_Max32Fx2:
   3508       case Iop_CmpEQ32Fx2:
   3509       case Iop_CmpGT32Fx2:
   3510       case Iop_CmpGE32Fx2:
   3511       case Iop_Add32Fx2:
   3512       case Iop_RecipStep32Fx2:
   3513       case Iop_RSqrtStep32Fx2:
   3514          return binary32Fx2(mce, vatom1, vatom2);
   3515 
   3516       case Iop_Sub32F0x4:
   3517       case Iop_Mul32F0x4:
   3518       case Iop_Min32F0x4:
   3519       case Iop_Max32F0x4:
   3520       case Iop_Div32F0x4:
   3521       case Iop_CmpLT32F0x4:
   3522       case Iop_CmpLE32F0x4:
   3523       case Iop_CmpEQ32F0x4:
   3524       case Iop_CmpUN32F0x4:
   3525       case Iop_Add32F0x4:
   3526          return binary32F0x4(mce, vatom1, vatom2);
   3527 
   3528       case Iop_QShlNsatSU8x16:
   3529       case Iop_QShlNsatUU8x16:
   3530       case Iop_QShlNsatSS8x16:
   3531          complainIfUndefined(mce, atom2, NULL);
   3532          return mkPCast8x16(mce, vatom1);
   3533 
   3534       case Iop_QShlNsatSU16x8:
   3535       case Iop_QShlNsatUU16x8:
   3536       case Iop_QShlNsatSS16x8:
   3537          complainIfUndefined(mce, atom2, NULL);
   3538          return mkPCast16x8(mce, vatom1);
   3539 
   3540       case Iop_QShlNsatSU32x4:
   3541       case Iop_QShlNsatUU32x4:
   3542       case Iop_QShlNsatSS32x4:
   3543          complainIfUndefined(mce, atom2, NULL);
   3544          return mkPCast32x4(mce, vatom1);
   3545 
   3546       case Iop_QShlNsatSU64x2:
   3547       case Iop_QShlNsatUU64x2:
   3548       case Iop_QShlNsatSS64x2:
   3549          complainIfUndefined(mce, atom2, NULL);
    3550          return mkPCast64x2(mce, vatom1);
   3551 
   3552       /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
   3553          To make this simpler, do the following:
   3554          * complain if the shift amount (the I8) is undefined
   3555          * pcast each lane at the wide width
   3556          * truncate each lane to half width
   3557          * pcast the resulting 64-bit value to a single bit and use
   3558            that as the least significant bit of the upper half of the
   3559            result. */
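               /* E.g. for Iop_QandQShrNnarrow32Uto16Ux4 below: vatom1 is smeared
                  with mkPCast32x4 and narrowed with Iop_NarrowUn32to16x4 to an
                  I64 whose 16-bit lanes are each all-0s or all-1s; that I64 is
                  the lower half of the result, and the upper half is all-defined
                  except for its least significant bit, the PCast-to-I1 of the
                  narrowed value, which shadows the saturation ("Q") part. */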
   3560       case Iop_QandQShrNnarrow64Uto32Ux2:
   3561       case Iop_QandQSarNnarrow64Sto32Sx2:
   3562       case Iop_QandQSarNnarrow64Sto32Ux2:
   3563       case Iop_QandQRShrNnarrow64Uto32Ux2:
   3564       case Iop_QandQRSarNnarrow64Sto32Sx2:
   3565       case Iop_QandQRSarNnarrow64Sto32Ux2:
   3566       case Iop_QandQShrNnarrow32Uto16Ux4:
   3567       case Iop_QandQSarNnarrow32Sto16Sx4:
   3568       case Iop_QandQSarNnarrow32Sto16Ux4:
   3569       case Iop_QandQRShrNnarrow32Uto16Ux4:
   3570       case Iop_QandQRSarNnarrow32Sto16Sx4:
   3571       case Iop_QandQRSarNnarrow32Sto16Ux4:
   3572       case Iop_QandQShrNnarrow16Uto8Ux8:
   3573       case Iop_QandQSarNnarrow16Sto8Sx8:
   3574       case Iop_QandQSarNnarrow16Sto8Ux8:
   3575       case Iop_QandQRShrNnarrow16Uto8Ux8:
   3576       case Iop_QandQRSarNnarrow16Sto8Sx8:
   3577       case Iop_QandQRSarNnarrow16Sto8Ux8:
   3578       {
   3579          IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
   3580          IROp opNarrow = Iop_INVALID;
   3581          switch (op) {
   3582             case Iop_QandQShrNnarrow64Uto32Ux2:
   3583             case Iop_QandQSarNnarrow64Sto32Sx2:
   3584             case Iop_QandQSarNnarrow64Sto32Ux2:
   3585             case Iop_QandQRShrNnarrow64Uto32Ux2:
   3586             case Iop_QandQRSarNnarrow64Sto32Sx2:
   3587             case Iop_QandQRSarNnarrow64Sto32Ux2:
   3588                fnPessim = mkPCast64x2;
   3589                opNarrow = Iop_NarrowUn64to32x2;
   3590                break;
   3591             case Iop_QandQShrNnarrow32Uto16Ux4:
   3592             case Iop_QandQSarNnarrow32Sto16Sx4:
   3593             case Iop_QandQSarNnarrow32Sto16Ux4:
   3594             case Iop_QandQRShrNnarrow32Uto16Ux4:
   3595             case Iop_QandQRSarNnarrow32Sto16Sx4:
   3596             case Iop_QandQRSarNnarrow32Sto16Ux4:
   3597                fnPessim = mkPCast32x4;
   3598                opNarrow = Iop_NarrowUn32to16x4;
   3599                break;
   3600             case Iop_QandQShrNnarrow16Uto8Ux8:
   3601             case Iop_QandQSarNnarrow16Sto8Sx8:
   3602             case Iop_QandQSarNnarrow16Sto8Ux8:
   3603             case Iop_QandQRShrNnarrow16Uto8Ux8:
   3604             case Iop_QandQRSarNnarrow16Sto8Sx8:
   3605             case Iop_QandQRSarNnarrow16Sto8Ux8:
   3606                fnPessim = mkPCast16x8;
   3607                opNarrow = Iop_NarrowUn16to8x8;
   3608                break;
   3609             default:
   3610                tl_assert(0);
   3611          }
   3612          complainIfUndefined(mce, atom2, NULL);
   3613          // Pessimised shift result
   3614          IRAtom* shV
   3615             = fnPessim(mce, vatom1);
   3616          // Narrowed, pessimised shift result
   3617          IRAtom* shVnarrowed
   3618             = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
   3619          // Generates: Def--(63)--Def PCast-to-I1(narrowed)
   3620          IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
   3621          // and assemble the result
   3622          return assignNew('V', mce, Ity_V128,
   3623                           binop(Iop_64HLtoV128, qV, shVnarrowed));
   3624       }
   3625 
   3626       case Iop_Mull32Sx2:
   3627       case Iop_Mull32Ux2:
   3628       case Iop_QDMull32Sx2:
   3629          return vectorWidenI64(mce, Iop_Widen32Sto64x2,
   3630                                     mkUifU64(mce, vatom1, vatom2));
   3631 
   3632       case Iop_Mull16Sx4:
   3633       case Iop_Mull16Ux4:
   3634       case Iop_QDMull16Sx4:
   3635          return vectorWidenI64(mce, Iop_Widen16Sto32x4,
   3636                                     mkUifU64(mce, vatom1, vatom2));
   3637 
   3638       case Iop_Mull8Sx8:
   3639       case Iop_Mull8Ux8:
   3640       case Iop_PolynomialMull8x8:
   3641          return vectorWidenI64(mce, Iop_Widen8Sto16x8,
   3642                                     mkUifU64(mce, vatom1, vatom2));
   3643 
   3644       case Iop_PwAdd32x4:
   3645          return mkPCast32x4(mce,
   3646                assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
   3647                      mkPCast32x4(mce, vatom2))));
   3648 
   3649       case Iop_PwAdd16x8:
   3650          return mkPCast16x8(mce,
   3651                assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
   3652                      mkPCast16x8(mce, vatom2))));
   3653 
   3654       case Iop_PwAdd8x16:
   3655          return mkPCast8x16(mce,
   3656                assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
   3657                      mkPCast8x16(mce, vatom2))));
   3658 
   3659       /* V128-bit data-steering */
   3660       case Iop_SetV128lo32:
   3661       case Iop_SetV128lo64:
   3662       case Iop_64HLtoV128:
   3663       case Iop_InterleaveLO64x2:
   3664       case Iop_InterleaveLO32x4:
   3665       case Iop_InterleaveLO16x8:
   3666       case Iop_InterleaveLO8x16:
   3667       case Iop_InterleaveHI64x2:
   3668       case Iop_InterleaveHI32x4:
   3669       case Iop_InterleaveHI16x8:
   3670       case Iop_InterleaveHI8x16:
   3671       case Iop_CatOddLanes8x16:
   3672       case Iop_CatOddLanes16x8:
   3673       case Iop_CatOddLanes32x4:
   3674       case Iop_CatEvenLanes8x16:
   3675       case Iop_CatEvenLanes16x8:
   3676       case Iop_CatEvenLanes32x4:
   3677       case Iop_InterleaveOddLanes8x16:
   3678       case Iop_InterleaveOddLanes16x8:
   3679       case Iop_InterleaveOddLanes32x4:
   3680       case Iop_InterleaveEvenLanes8x16:
   3681       case Iop_InterleaveEvenLanes16x8:
   3682       case Iop_InterleaveEvenLanes32x4:
   3683          return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
   3684 
   3685       case Iop_GetElem8x16:
   3686          complainIfUndefined(mce, atom2, NULL);
   3687          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3688       case Iop_GetElem16x8:
   3689          complainIfUndefined(mce, atom2, NULL);
   3690          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3691       case Iop_GetElem32x4:
   3692          complainIfUndefined(mce, atom2, NULL);
   3693          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3694       case Iop_GetElem64x2:
   3695          complainIfUndefined(mce, atom2, NULL);
   3696          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   3697 
   3698      /* Perm8x16: rearrange values in left arg using steering values
   3699         from right arg.  So rearrange the vbits in the same way but
   3700         pessimise wrt steering values.  Perm32x4 ditto. */
   3701       case Iop_Perm8x16:
   3702          return mkUifUV128(
   3703                    mce,
   3704                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3705                    mkPCast8x16(mce, vatom2)
   3706                 );
   3707       case Iop_Perm32x4:
   3708          return mkUifUV128(
   3709                    mce,
   3710                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3711                    mkPCast32x4(mce, vatom2)
   3712                 );
   3713 
    3714       /* These two take the lower 16-bit half of each 32-bit lane of the
    3715          args, sign/zero extend it to 32, and multiply the halves together,
    3716          producing a 32x4 result (and implicitly ignoring half of the
    3717          operand bits).  So treat it as a bunch of independent 16x8
    3718          operations, but then do 32-bit shifts left then right to copy the
    3719          lower half results (which are all 0s or all 1s due to PCasting in
    3720          binary16Ix8) into the upper half of each result lane. */
   3721       case Iop_MullEven16Ux8:
   3722       case Iop_MullEven16Sx8: {
   3723          IRAtom* at;
   3724          at = binary16Ix8(mce,vatom1,vatom2);
   3725          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
   3726          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
    3727          return at;
   3728       }
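               /* Worked example for the shift trick: if binary16Ix8 leaves a
                  32-bit result lane as 0x0000FFFF (some low operand half
                  undefined, both high halves defined), ShlN32x4 by 16 gives
                  0xFFFF0000 and SarN32x4 by 16 then gives 0xFFFFFFFF: the whole
                  lane is undefined, as required, since only the lower 16-bit
                  halves of the operands feed the multiply. */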
   3729 
   3730       /* Same deal as Iop_MullEven16{S,U}x8 */
   3731       case Iop_MullEven8Ux16:
   3732       case Iop_MullEven8Sx16: {
   3733          IRAtom* at;
   3734          at = binary8Ix16(mce,vatom1,vatom2);
   3735          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
   3736          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
    3737          return at;
   3738       }
   3739 
   3740       /* Same deal as Iop_MullEven16{S,U}x8 */
   3741       case Iop_MullEven32Ux4:
   3742       case Iop_MullEven32Sx4: {
   3743          IRAtom* at;
   3744          at = binary32Ix4(mce,vatom1,vatom2);
   3745          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
   3746          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
   3747          return at;
   3748       }
   3749 
   3750       /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
   3751          32x4 -> 16x8 laneage, discarding the upper half of each lane.
    3752         Simply apply the same op to the V bits, since this is really no
    3753         more than a data-steering operation. */
   3754       case Iop_NarrowBin32to16x8:
   3755       case Iop_NarrowBin16to8x16:
   3756       case Iop_NarrowBin64to32x4:
   3757          return assignNew('V', mce, Ity_V128,
   3758                                     binop(op, vatom1, vatom2));
   3759 
   3760       case Iop_ShrV128:
   3761       case Iop_ShlV128:
   3762       case Iop_I128StoBCD128:
   3763          /* Same scheme as with all other shifts.  Note: 10 Nov 05:
   3764             this is wrong now, scalar shifts are done properly lazily.
   3765             Vector shifts should be fixed too. */
   3766          complainIfUndefined(mce, atom2, NULL);
   3767          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3768 
   3769       case Iop_BCDAdd:
   3770       case Iop_BCDSub:
   3771          return mkLazy2(mce, Ity_V128, vatom1, vatom2);
   3772 
   3773       /* SHA Iops */
   3774       case Iop_SHA256:
   3775       case Iop_SHA512:
   3776          complainIfUndefined(mce, atom2, NULL);
   3777          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3778 
   3779       /* I128-bit data-steering */
   3780       case Iop_64HLto128:
   3781          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
   3782 
   3783       /* V256-bit SIMD */
   3784 
   3785       case Iop_Max64Fx4:
   3786       case Iop_Min64Fx4:
   3787          return binary64Fx4(mce, vatom1, vatom2);
   3788 
   3789       case Iop_Max32Fx8:
   3790       case Iop_Min32Fx8:
   3791          return binary32Fx8(mce, vatom1, vatom2);
   3792 
   3793       /* V256-bit data-steering */
   3794       case Iop_V128HLtoV256:
   3795          return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
   3796 
   3797       /* Scalar floating point */
   3798 
   3799       case Iop_F32toI64S:
   3800       case Iop_F32toI64U:
   3801          /* I32(rm) x F32 -> I64 */
   3802          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3803 
   3804       case Iop_I64StoF32:
   3805          /* I32(rm) x I64 -> F32 */
   3806          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3807 
   3808       case Iop_RoundF64toInt:
   3809       case Iop_RoundF64toF32:
   3810       case Iop_F64toI64S:
   3811       case Iop_F64toI64U:
   3812       case Iop_I64StoF64:
   3813       case Iop_I64UtoF64:
   3814       case Iop_SinF64:
   3815       case Iop_CosF64:
   3816       case Iop_TanF64:
   3817       case Iop_2xm1F64:
   3818       case Iop_SqrtF64:
   3819       case Iop_RecpExpF64:
   3820          /* I32(rm) x I64/F64 -> I64/F64 */
   3821          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3822 
   3823       case Iop_ShlD64:
   3824       case Iop_ShrD64:
   3825       case Iop_RoundD64toInt:
   3826          /* I32(rm) x D64 -> D64 */
   3827          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3828 
   3829       case Iop_ShlD128:
   3830       case Iop_ShrD128:
   3831       case Iop_RoundD128toInt:
   3832          /* I32(rm) x D128 -> D128 */
   3833          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3834 
   3835       case Iop_RoundF128toInt:
   3836          /* I32(rm) x F128 -> F128 */
   3837          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3838 
   3839       case Iop_D64toI64S:
   3840       case Iop_D64toI64U:
   3841       case Iop_I64StoD64:
   3842       case Iop_I64UtoD64:
   3843          /* I32(rm) x I64/D64 -> D64/I64 */
   3844          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3845 
   3846       case Iop_F32toD32:
   3847       case Iop_F64toD32:
   3848       case Iop_F128toD32:
   3849       case Iop_D32toF32:
   3850       case Iop_D64toF32:
   3851       case Iop_D128toF32:
   3852          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
   3853          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3854 
   3855       case Iop_F32toD64:
   3856       case Iop_F64toD64:
   3857       case Iop_F128toD64:
   3858       case Iop_D32toF64:
   3859       case Iop_D64toF64:
   3860       case Iop_D128toF64:
   3861          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
   3862          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3863 
   3864       case Iop_F32toD128:
   3865       case Iop_F64toD128:
   3866       case Iop_F128toD128:
   3867       case Iop_D32toF128:
   3868       case Iop_D64toF128:
   3869       case Iop_D128toF128:
   3870          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
   3871          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3872 
   3873       case Iop_RoundF32toInt:
   3874       case Iop_SqrtF32:
   3875       case Iop_RecpExpF32:
   3876          /* I32(rm) x I32/F32 -> I32/F32 */
   3877          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3878 
   3879       case Iop_SqrtF128:
   3880          /* I32(rm) x F128 -> F128 */
   3881          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3882 
   3883       case Iop_I32StoF32:
   3884       case Iop_I32UtoF32:
   3885       case Iop_F32toI32S:
   3886       case Iop_F32toI32U:
   3887          /* First arg is I32 (rounding mode), second is F32/I32 (data). */
   3888          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3889 
   3890       case Iop_F64toF16:
   3891       case Iop_F32toF16:
   3892          /* First arg is I32 (rounding mode), second is F64/F32 (data). */
   3893          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3894 
   3895       case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
   3896       case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
   3897       case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
   3898       case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
   3899       case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
   3900          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3901 
   3902       case Iop_F128toI128S:   /* IRRoundingMode(I32) x F128 -> signed I128 */
   3903       case Iop_RndF128:       /* IRRoundingMode(I32) x F128 -> F128 */
   3904          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3905 
   3906       case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
   3907       case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
   3908       case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
   3909       case Iop_D128toD64:  /* IRRoundingMode(I64) x D128 -> D64 */
   3910       case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64  */
   3911       case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
   3912          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3913 
   3914       case Iop_F64HLtoF128:
   3915       case Iop_D64HLtoD128:
   3916          return assignNew('V', mce, Ity_I128,
   3917                           binop(Iop_64HLto128, vatom1, vatom2));
   3918 
   3919       case Iop_F64toI32U:
   3920       case Iop_F64toI32S:
   3921       case Iop_F64toF32:
   3922       case Iop_I64UtoF32:
   3923       case Iop_D64toI32U:
   3924       case Iop_D64toI32S:
   3925          /* First arg is I32 (rounding mode), second is F64/D64 (data). */
    3926          /* First arg is I32 (rounding mode), second is F64/D64/I64 (data). */
   3927 
   3928       case Iop_D64toD32:
   3929          /* First arg is I32 (rounding mode), second is D64 (data). */
   3930          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3931 
   3932       case Iop_F64toI16S:
   3933          /* First arg is I32 (rounding mode), second is F64 (data). */
   3934          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3935 
   3936       case Iop_InsertExpD64:
   3937          /*  I64 x I64 -> D64 */
   3938          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3939 
   3940       case Iop_InsertExpD128:
   3941          /*  I64 x I128 -> D128 */
   3942          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3943 
   3944       case Iop_CmpF32:
   3945       case Iop_CmpF64:
   3946       case Iop_CmpF128:
   3947       case Iop_CmpD64:
   3948       case Iop_CmpD128:
   3949       case Iop_CmpExpD64:
   3950       case Iop_CmpExpD128:
   3951          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3952 
   3953       case Iop_MaxNumF32:
   3954       case Iop_MinNumF32:
   3955          /* F32 x F32 -> F32 */
   3956          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3957 
   3958       case Iop_MaxNumF64:
   3959       case Iop_MinNumF64:
   3960          /* F64 x F64 -> F64 */
   3961          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3962 
   3963       /* non-FP after here */
   3964 
   3965       case Iop_DivModU64to32:
   3966       case Iop_DivModS64to32:
   3967          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3968 
   3969       case Iop_DivModU128to64:
   3970       case Iop_DivModS128to64:
   3971          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3972 
   3973       case Iop_8HLto16:
   3974          return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
   3975       case Iop_16HLto32:
   3976          return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
   3977       case Iop_32HLto64:
   3978          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3979 
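               /* For the widening multiplies and DivMod below, the low half of
                  the shadow result is the left-smeared UifU of the argument
                  shadows, and the high half is the all-or-nothing PCast of that
                  low half. */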
   3980       case Iop_DivModS64to64:
   3981       case Iop_MullS64:
   3982       case Iop_MullU64: {
   3983          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   3984          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
   3985          return assignNew('V', mce, Ity_I128,
   3986                           binop(Iop_64HLto128, vHi64, vLo64));
   3987       }
   3988 
   3989       case Iop_MullS32:
   3990       case Iop_MullU32: {
   3991          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3992          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
   3993          return assignNew('V', mce, Ity_I64,
   3994                           binop(Iop_32HLto64, vHi32, vLo32));
   3995       }
   3996 
   3997       case Iop_MullS16:
   3998       case Iop_MullU16: {
   3999          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   4000          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
   4001          return assignNew('V', mce, Ity_I32,
   4002                           binop(Iop_16HLto32, vHi16, vLo16));
   4003       }
   4004 
   4005       case Iop_MullS8:
   4006       case Iop_MullU8: {
   4007          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   4008          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
   4009          return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
   4010       }
   4011 
   4012       case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
   4013       case Iop_DivS32:
   4014       case Iop_DivU32:
   4015       case Iop_DivU32E:
   4016       case Iop_DivS32E:
   4017       case Iop_QAdd32S: /* could probably do better */
   4018       case Iop_QSub32S: /* could probably do better */
   4019          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   4020 
   4021       case Iop_DivS64:
   4022       case Iop_DivU64:
   4023       case Iop_DivS64E:
   4024       case Iop_DivU64E:
   4025          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   4026 
   4027       case Iop_Add32:
   4028          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   4029             return expensiveAddSub(mce,True,Ity_I32,
   4030                                    vatom1,vatom2, atom1,atom2);
   4031          else
   4032             goto cheap_AddSub32;
   4033       case Iop_Sub32:
   4034          if (mce->bogusLiterals)
   4035             return expensiveAddSub(mce,False,Ity_I32,
   4036                                    vatom1,vatom2, atom1,atom2);
   4037          else
   4038             goto cheap_AddSub32;
   4039 
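               /* The cheap scheme: UifU the argument shadows and smear any
                  undefined bit leftwards (mkLeft), since for add/sub/mul an
                  undefined input bit can only influence its own and higher
                  result bit positions (via carries). */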
   4040       cheap_AddSub32:
   4041       case Iop_Mul32:
   4042          return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   4043 
   4044       case Iop_CmpORD32S:
   4045       case Iop_CmpORD32U:
   4046       case Iop_CmpORD64S:
   4047       case Iop_CmpORD64U:
   4048          return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
   4049 
   4050       case Iop_Add64:
   4051          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   4052             return expensiveAddSub(mce,True,Ity_I64,
   4053                                    vatom1,vatom2, atom1,atom2);
   4054          else
   4055             goto cheap_AddSub64;
   4056       case Iop_Sub64:
   4057          if (mce->bogusLiterals)
   4058             return expensiveAddSub(mce,False,Ity_I64,
   4059                                    vatom1,vatom2, atom1,atom2);
   4060          else
   4061             goto cheap_AddSub64;
   4062 
   4063       cheap_AddSub64:
   4064       case Iop_Mul64:
   4065          return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   4066 
   4067       case Iop_Mul16:
   4068       case Iop_Add16:
   4069       case Iop_Sub16:
   4070          return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   4071 
   4072       case Iop_Mul8:
   4073       case Iop_Sub8:
   4074       case Iop_Add8:
   4075          return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   4076 
   4077       case Iop_CmpEQ64:
   4078       case Iop_CmpNE64:
   4079          if (mce->bogusLiterals)
   4080             goto expensive_cmp64;
   4081          else
   4082             goto cheap_cmp64;
   4083 
   4084       expensive_cmp64:
   4085       case Iop_ExpCmpNE64:
   4086          return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
   4087 
   4088       cheap_cmp64:
   4089       case Iop_CmpLE64S: case Iop_CmpLE64U:
   4090       case Iop_CmpLT64U: case Iop_CmpLT64S:
   4091          return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
   4092 
   4093       case Iop_CmpEQ32:
   4094       case Iop_CmpNE32:
   4095          if (mce->bogusLiterals)
   4096             goto expensive_cmp32;
   4097          else
   4098             goto cheap_cmp32;
   4099 
   4100       expensive_cmp32:
   4101       case Iop_ExpCmpNE32:
   4102          return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
   4103 
   4104       cheap_cmp32:
   4105       case Iop_CmpLE32S: case Iop_CmpLE32U:
   4106       case Iop_CmpLT32U: case Iop_CmpLT32S:
   4107          return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
   4108 
   4109       case Iop_CmpEQ16: case Iop_CmpNE16:
   4110          return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
   4111 
   4112       case Iop_ExpCmpNE16:
   4113          return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
   4114 
   4115       case Iop_CmpEQ8: case Iop_CmpNE8:
   4116          return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
   4117 
   4118       case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   4119       case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   4120       case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   4121       case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   4122          /* Just say these all produce a defined result, regardless
   4123             of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
   4124          return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
   4125 
   4126       case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
   4127          return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
   4128 
   4129       case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
   4130          return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
   4131 
   4132       case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
   4133          return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
   4134 
   4135       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
   4136          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
   4137 
   4138       case Iop_AndV256:
   4139          uifu = mkUifUV256; difd = mkDifDV256;
   4140          and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
   4141       case Iop_AndV128:
   4142          uifu = mkUifUV128; difd = mkDifDV128;
   4143          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
   4144       case Iop_And64:
   4145          uifu = mkUifU64; difd = mkDifD64;
   4146          and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
   4147       case Iop_And32:
   4148          uifu = mkUifU32; difd = mkDifD32;
   4149          and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
   4150       case Iop_And16:
   4151          uifu = mkUifU16; difd = mkDifD16;
   4152          and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
   4153       case Iop_And8:
   4154          uifu = mkUifU8; difd = mkDifD8;
   4155          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
   4156 
   4157       case Iop_OrV256:
   4158          uifu = mkUifUV256; difd = mkDifDV256;
   4159          and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
   4160       case Iop_OrV128:
   4161          uifu = mkUifUV128; difd = mkDifDV128;
   4162          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
   4163       case Iop_Or64:
   4164          uifu = mkUifU64; difd = mkDifD64;
   4165          and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
   4166       case Iop_Or32:
   4167          uifu = mkUifU32; difd = mkDifD32;
   4168          and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
   4169       case Iop_Or16:
   4170          uifu = mkUifU16; difd = mkDifD16;
   4171          and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
   4172       case Iop_Or8:
   4173          uifu = mkUifU8; difd = mkDifD8;
   4174          and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
   4175 
   4176       do_And_Or:
   4177          return
   4178          assignNew(
   4179             'V', mce,
   4180             and_or_ty,
   4181             difd(mce, uifu(mce, vatom1, vatom2),
   4182                       difd(mce, improve(mce, atom1, vatom1),
   4183                                 improve(mce, atom2, vatom2) ) ) );
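               /* Editorial worked example (assuming the definitions used
                  elsewhere in this file: UifU = OR of V bits, DifD = AND of
                  V bits, ImproveAND(data,vbits) = data | vbits,
                  ImproveOR(data,vbits) = ~data | vbits; V-bit convention:
                  1 = undefined).  For And8 with atom1 = 0x0F fully defined
                  (vatom1 = 0x00) and atom2 wholly undefined (vatom2 = 0xFF):
                  uifu = 0xFF, improve(atom1,vatom1) = 0x0F,
                  improve(atom2,vatom2) = 0xFF, so the result V bits are
                  0xFF & (0x0F & 0xFF) = 0x0F.  The top nibble comes out
                  defined because a defined zero in atom1 forces those AND
                  result bits to zero regardless of atom2. */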
   4184 
   4185       case Iop_Xor8:
   4186          return mkUifU8(mce, vatom1, vatom2);
   4187       case Iop_Xor16:
   4188          return mkUifU16(mce, vatom1, vatom2);
   4189       case Iop_Xor32:
   4190          return mkUifU32(mce, vatom1, vatom2);
   4191       case Iop_Xor64:
   4192          return mkUifU64(mce, vatom1, vatom2);
   4193       case Iop_XorV128:
   4194          return mkUifUV128(mce, vatom1, vatom2);
   4195       case Iop_XorV256:
   4196          return mkUifUV256(mce, vatom1, vatom2);
   4197 
   4198       /* V256-bit SIMD */
   4199 
   4200       case Iop_ShrN16x16:
   4201       case Iop_ShrN32x8:
   4202       case Iop_ShrN64x4:
   4203       case Iop_SarN16x16:
   4204       case Iop_SarN32x8:
   4205       case Iop_ShlN16x16:
   4206       case Iop_ShlN32x8:
   4207       case Iop_ShlN64x4:
    4208          /* Same scheme as with all other shifts.  Note (22 Oct 05):
    4209             this is no longer accurate -- scalar shifts are now done
    4210             properly (lazily); vector shifts should be fixed likewise. */
   4211          complainIfUndefined(mce, atom2, NULL);
   4212          return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
   4213 
   4214       case Iop_QSub8Ux32:
   4215       case Iop_QSub8Sx32:
   4216       case Iop_Sub8x32:
   4217       case Iop_Min8Ux32:
   4218       case Iop_Min8Sx32:
   4219       case Iop_Max8Ux32:
   4220       case Iop_Max8Sx32:
   4221       case Iop_CmpGT8Sx32:
   4222       case Iop_CmpEQ8x32:
   4223       case Iop_Avg8Ux32:
   4224       case Iop_QAdd8Ux32:
   4225       case Iop_QAdd8Sx32:
   4226       case Iop_Add8x32:
   4227          return binary8Ix32(mce, vatom1, vatom2);
   4228 
   4229       case Iop_QSub16Ux16:
   4230       case Iop_QSub16Sx16:
   4231       case Iop_Sub16x16:
   4232       case Iop_Mul16x16:
   4233       case Iop_MulHi16Sx16:
   4234       case Iop_MulHi16Ux16:
   4235       case Iop_Min16Sx16:
   4236       case Iop_Min16Ux16:
   4237       case Iop_Max16Sx16:
   4238       case Iop_Max16Ux16:
   4239       case Iop_CmpGT16Sx16:
   4240       case Iop_CmpEQ16x16:
   4241       case Iop_Avg16Ux16:
   4242       case Iop_QAdd16Ux16:
   4243       case Iop_QAdd16Sx16:
   4244       case Iop_Add16x16:
   4245          return binary16Ix16(mce, vatom1, vatom2);
   4246 
   4247       case Iop_Sub32x8:
   4248       case Iop_CmpGT32Sx8:
   4249       case Iop_CmpEQ32x8:
   4250       case Iop_Add32x8:
   4251       case Iop_Max32Ux8:
   4252       case Iop_Max32Sx8:
   4253       case Iop_Min32Ux8:
   4254       case Iop_Min32Sx8:
   4255       case Iop_Mul32x8:
   4256          return binary32Ix8(mce, vatom1, vatom2);
   4257 
   4258       case Iop_Sub64x4:
   4259       case Iop_Add64x4:
   4260       case Iop_CmpEQ64x4:
   4261       case Iop_CmpGT64Sx4:
   4262          return binary64Ix4(mce, vatom1, vatom2);
   4263 
    4264       /* Perm32x8: rearrange values in left arg using steering values
    4265          from right arg.  So rearrange the vbits in the same way but
    4266          pessimise wrt steering values. */
   4267       case Iop_Perm32x8:
   4268          return mkUifUV256(
   4269                    mce,
   4270                    assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
   4271                    mkPCast32x8(mce, vatom2)
   4272                 );
   4273 
   4274       /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
   4275          Handle the shifted results in the same way that other
   4276          binary Q ops are handled, eg QSub: UifU the two args,
   4277          then pessimise -- which is binaryNIxM.  But for the upper
   4278          V128, we require to generate just 1 bit which is the
   4279          pessimised shift result, with 127 defined zeroes above it.
   4280 
    4281          Note that this is overly pessimistic in that in fact only the
   4282          bottom 8 bits of each lane of the second arg determine the shift
   4283          amount.  Really we ought to ignore any undefinedness in the
   4284          rest of the lanes of the second arg. */
   4285       case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
   4286       case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
   4287       case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
   4288       case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
   4289       case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
   4290       case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
   4291       case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
   4292       case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
   4293       {
   4294          // The function to generate the pessimised shift result
   4295          IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
   4296          switch (op) {
   4297             case Iop_QandSQsh64x2:
   4298             case Iop_QandUQsh64x2:
   4299             case Iop_QandSQRsh64x2:
   4300             case Iop_QandUQRsh64x2:
   4301                binaryNIxM = binary64Ix2;
   4302                break;
   4303             case Iop_QandSQsh32x4:
   4304             case Iop_QandUQsh32x4:
   4305             case Iop_QandSQRsh32x4:
   4306             case Iop_QandUQRsh32x4:
   4307                binaryNIxM = binary32Ix4;
   4308                break;
   4309             case Iop_QandSQsh16x8:
   4310             case Iop_QandUQsh16x8:
   4311             case Iop_QandSQRsh16x8:
   4312             case Iop_QandUQRsh16x8:
   4313                binaryNIxM = binary16Ix8;
   4314                break;
   4315             case Iop_QandSQsh8x16:
   4316             case Iop_QandUQsh8x16:
   4317             case Iop_QandSQRsh8x16:
   4318             case Iop_QandUQRsh8x16:
   4319                binaryNIxM = binary8Ix16;
   4320                break;
   4321             default:
   4322                tl_assert(0);
   4323          }
   4324          tl_assert(binaryNIxM);
   4325          // Pessimised shift result, shV[127:0]
   4326          IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
   4327          // Generates: Def--(127)--Def PCast-to-I1(shV)
   4328          IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
   4329          // and assemble the result
   4330          return assignNew('V', mce, Ity_V256,
   4331                           binop(Iop_V128HLtoV256, qV, shV));
   4332       }
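               /* Editorial note (an informal restatement of the assembly just
                  above): shV holds the pessimised V bits of the shifted data,
                  qV is 127 defined zeroes above one bit summarising shV, and
                  V128HLtoV256(qV, shV) places qV as the shadow of the upper
                  V128 of the result and shV as the shadow of the lower V128. */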
   4333 
   4334       default:
   4335          ppIROp(op);
   4336          VG_(tool_panic)("memcheck:expr2vbits_Binop");
   4337    }
   4338 }
   4339 
   4340 
   4341 static
   4342 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
   4343 {
   4344    /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
   4345       selection of shadow operation implicitly duplicates the logic in
   4346       do_shadow_LoadG and should be kept in sync (in the very unlikely
   4347       event that the interpretation of such widening ops changes in
   4348       future).  See comment in do_shadow_LoadG. */
   4349    IRAtom* vatom = expr2vbits( mce, atom );
   4350    tl_assert(isOriginalAtom(mce,atom));
   4351    switch (op) {
   4352 
   4353       case Iop_Abs64Fx2:
   4354       case Iop_Neg64Fx2:
   4355       case Iop_RSqrtEst64Fx2:
   4356       case Iop_RecipEst64Fx2:
   4357          return unary64Fx2(mce, vatom);
   4358 
   4359       case Iop_Sqrt64F0x2:
   4360          return unary64F0x2(mce, vatom);
   4361 
   4362       case Iop_Sqrt32Fx8:
   4363       case Iop_RSqrtEst32Fx8:
   4364       case Iop_RecipEst32Fx8:
   4365          return unary32Fx8(mce, vatom);
   4366 
   4367       case Iop_Sqrt64Fx4:
   4368          return unary64Fx4(mce, vatom);
   4369 
   4370       case Iop_RecipEst32Fx4:
   4371       case Iop_I32UtoFx4:
   4372       case Iop_I32StoFx4:
   4373       case Iop_QFtoI32Ux4_RZ:
   4374       case Iop_QFtoI32Sx4_RZ:
   4375       case Iop_RoundF32x4_RM:
   4376       case Iop_RoundF32x4_RP:
   4377       case Iop_RoundF32x4_RN:
   4378       case Iop_RoundF32x4_RZ:
   4379       case Iop_RecipEst32Ux4:
   4380       case Iop_Abs32Fx4:
   4381       case Iop_Neg32Fx4:
   4382       case Iop_RSqrtEst32Fx4:
   4383          return unary32Fx4(mce, vatom);
   4384 
   4385       case Iop_I32UtoFx2:
   4386       case Iop_I32StoFx2:
   4387       case Iop_RecipEst32Fx2:
   4388       case Iop_RecipEst32Ux2:
   4389       case Iop_Abs32Fx2:
   4390       case Iop_Neg32Fx2:
   4391       case Iop_RSqrtEst32Fx2:
   4392          return unary32Fx2(mce, vatom);
   4393 
   4394       case Iop_Sqrt32F0x4:
   4395       case Iop_RSqrtEst32F0x4:
   4396       case Iop_RecipEst32F0x4:
   4397          return unary32F0x4(mce, vatom);
   4398 
   4399       case Iop_32UtoV128:
   4400       case Iop_64UtoV128:
   4401       case Iop_Dup8x16:
   4402       case Iop_Dup16x8:
   4403       case Iop_Dup32x4:
   4404       case Iop_Reverse1sIn8_x16:
   4405       case Iop_Reverse8sIn16_x8:
   4406       case Iop_Reverse8sIn32_x4:
   4407       case Iop_Reverse16sIn32_x4:
   4408       case Iop_Reverse8sIn64_x2:
   4409       case Iop_Reverse16sIn64_x2:
   4410       case Iop_Reverse32sIn64_x2:
   4411       case Iop_V256toV128_1: case Iop_V256toV128_0:
   4412       case Iop_ZeroHI64ofV128:
   4413       case Iop_ZeroHI96ofV128:
   4414       case Iop_ZeroHI112ofV128:
   4415       case Iop_ZeroHI120ofV128:
   4416          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4417 
   4418       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
   4419       case Iop_D128HItoD64:  /* D128 -> high half of D128 */
   4420          return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
   4421       case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
   4422       case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
   4423          return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
   4424 
   4425       case Iop_NegF128:
   4426       case Iop_AbsF128:
   4427       case Iop_RndF128:
   4428       case Iop_TruncF128toI64S: /* F128 -> I64S */
   4429       case Iop_TruncF128toI32S: /* F128 -> I32S (result stored in 64-bits) */
   4430       case Iop_TruncF128toI64U: /* F128 -> I64U */
   4431       case Iop_TruncF128toI32U: /* F128 -> I32U (result stored in 64-bits) */
   4432          return mkPCastTo(mce, Ity_I128, vatom);
   4433 
   4434       case Iop_BCD128toI128S:
   4435       case Iop_MulI128by10:
   4436       case Iop_MulI128by10Carry:
   4437       case Iop_F16toF64x2:
   4438       case Iop_F64toF16x2:
   4439          return vatom;
   4440 
   4441       case Iop_I32StoF128: /* signed I32 -> F128 */
   4442       case Iop_I64StoF128: /* signed I64 -> F128 */
   4443       case Iop_I32UtoF128: /* unsigned I32 -> F128 */
   4444       case Iop_I64UtoF128: /* unsigned I64 -> F128 */
   4445       case Iop_F32toF128:  /* F32 -> F128 */
   4446       case Iop_F64toF128:  /* F64 -> F128 */
    4447       case Iop_I32StoD128: /* signed I32 -> D128 */
   4448       case Iop_I64StoD128: /* signed I64 -> D128 */
   4449       case Iop_I32UtoD128: /* unsigned I32 -> D128 */
   4450       case Iop_I64UtoD128: /* unsigned I64 -> D128 */
   4451          return mkPCastTo(mce, Ity_I128, vatom);
   4452 
   4453       case Iop_F16toF64:
   4454       case Iop_F32toF64:
   4455       case Iop_I32StoF64:
   4456       case Iop_I32UtoF64:
   4457       case Iop_NegF64:
   4458       case Iop_AbsF64:
   4459       case Iop_RSqrtEst5GoodF64:
   4460       case Iop_RoundF64toF64_NEAREST:
   4461       case Iop_RoundF64toF64_NegINF:
   4462       case Iop_RoundF64toF64_PosINF:
   4463       case Iop_RoundF64toF64_ZERO:
   4464       case Iop_Clz64:
   4465       case Iop_D32toD64:
   4466       case Iop_I32StoD64:
   4467       case Iop_I32UtoD64:
   4468       case Iop_ExtractExpD64:    /* D64  -> I64 */
   4469       case Iop_ExtractExpD128:   /* D128 -> I64 */
   4470       case Iop_ExtractSigD64:    /* D64  -> I64 */
   4471       case Iop_ExtractSigD128:   /* D128 -> I64 */
   4472       case Iop_DPBtoBCD:
   4473       case Iop_BCDtoDPB:
   4474          return mkPCastTo(mce, Ity_I64, vatom);
   4475 
   4476       case Iop_D64toD128:
   4477          return mkPCastTo(mce, Ity_I128, vatom);
   4478 
   4479       case Iop_Clz32:
   4480       case Iop_TruncF64asF32:
   4481       case Iop_NegF32:
   4482       case Iop_AbsF32:
   4483       case Iop_F16toF32:
   4484          return mkPCastTo(mce, Ity_I32, vatom);
   4485 
   4486       case Iop_Ctz32:
   4487       case Iop_Ctz64:
   4488          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
   4489 
   4490       case Iop_1Uto64:
   4491       case Iop_1Sto64:
   4492       case Iop_8Uto64:
   4493       case Iop_8Sto64:
   4494       case Iop_16Uto64:
   4495       case Iop_16Sto64:
   4496       case Iop_32Sto64:
   4497       case Iop_32Uto64:
   4498       case Iop_V128to64:
   4499       case Iop_V128HIto64:
   4500       case Iop_128HIto64:
   4501       case Iop_128to64:
   4502       case Iop_Dup8x8:
   4503       case Iop_Dup16x4:
   4504       case Iop_Dup32x2:
   4505       case Iop_Reverse8sIn16_x4:
   4506       case Iop_Reverse8sIn32_x2:
   4507       case Iop_Reverse16sIn32_x2:
   4508       case Iop_Reverse8sIn64_x1:
   4509       case Iop_Reverse16sIn64_x1:
   4510       case Iop_Reverse32sIn64_x1:
   4511       case Iop_V256to64_0: case Iop_V256to64_1:
   4512       case Iop_V256to64_2: case Iop_V256to64_3:
   4513          return assignNew('V', mce, Ity_I64, unop(op, vatom));
   4514 
   4515       case Iop_64to32:
   4516       case Iop_64HIto32:
   4517       case Iop_1Uto32:
   4518       case Iop_1Sto32:
   4519       case Iop_8Uto32:
   4520       case Iop_16Uto32:
   4521       case Iop_16Sto32:
   4522       case Iop_8Sto32:
   4523       case Iop_V128to32:
   4524          return assignNew('V', mce, Ity_I32, unop(op, vatom));
   4525 
   4526       case Iop_8Sto16:
   4527       case Iop_8Uto16:
   4528       case Iop_32to16:
   4529       case Iop_32HIto16:
   4530       case Iop_64to16:
   4531       case Iop_GetMSBs8x16:
   4532          return assignNew('V', mce, Ity_I16, unop(op, vatom));
   4533 
   4534       case Iop_1Uto8:
   4535       case Iop_1Sto8:
   4536       case Iop_16to8:
   4537       case Iop_16HIto8:
   4538       case Iop_32to8:
   4539       case Iop_64to8:
   4540       case Iop_GetMSBs8x8:
   4541          return assignNew('V', mce, Ity_I8, unop(op, vatom));
   4542 
   4543       case Iop_32to1:
   4544          return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
   4545 
   4546       case Iop_64to1:
   4547          return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
   4548 
   4549       case Iop_ReinterpF64asI64:
   4550       case Iop_ReinterpI64asF64:
   4551       case Iop_ReinterpI32asF32:
   4552       case Iop_ReinterpF32asI32:
   4553       case Iop_ReinterpI64asD64:
   4554       case Iop_ReinterpD64asI64:
   4555       case Iop_NotV256:
   4556       case Iop_NotV128:
   4557       case Iop_Not64:
   4558       case Iop_Not32:
   4559       case Iop_Not16:
   4560       case Iop_Not8:
   4561       case Iop_Not1:
   4562          return vatom;
   4563 
   4564       case Iop_CmpNEZ8x8:
   4565       case Iop_Cnt8x8:
   4566       case Iop_Clz8x8:
   4567       case Iop_Cls8x8:
   4568       case Iop_Abs8x8:
   4569          return mkPCast8x8(mce, vatom);
   4570 
   4571       case Iop_CmpNEZ8x16:
   4572       case Iop_Cnt8x16:
   4573       case Iop_Clz8x16:
   4574       case Iop_Cls8x16:
   4575       case Iop_Abs8x16:
   4576       case Iop_Ctz8x16:
   4577          return mkPCast8x16(mce, vatom);
   4578 
   4579       case Iop_CmpNEZ16x4:
   4580       case Iop_Clz16x4:
   4581       case Iop_Cls16x4:
   4582       case Iop_Abs16x4:
   4583          return mkPCast16x4(mce, vatom);
   4584 
   4585       case Iop_CmpNEZ16x8:
   4586       case Iop_Clz16x8:
   4587       case Iop_Cls16x8:
   4588       case Iop_Abs16x8:
   4589       case Iop_Ctz16x8:
   4590          return mkPCast16x8(mce, vatom);
   4591 
   4592       case Iop_CmpNEZ32x2:
   4593       case Iop_Clz32x2:
   4594       case Iop_Cls32x2:
   4595       case Iop_FtoI32Ux2_RZ:
   4596       case Iop_FtoI32Sx2_RZ:
   4597       case Iop_Abs32x2:
   4598          return mkPCast32x2(mce, vatom);
   4599 
   4600       case Iop_CmpNEZ32x4:
   4601       case Iop_Clz32x4:
   4602       case Iop_Cls32x4:
   4603       case Iop_FtoI32Ux4_RZ:
   4604       case Iop_FtoI32Sx4_RZ:
   4605       case Iop_Abs32x4:
   4606       case Iop_RSqrtEst32Ux4:
   4607       case Iop_Ctz32x4:
   4608          return mkPCast32x4(mce, vatom);
   4609 
   4610       case Iop_CmpwNEZ32:
   4611          return mkPCastTo(mce, Ity_I32, vatom);
   4612 
   4613       case Iop_CmpwNEZ64:
   4614          return mkPCastTo(mce, Ity_I64, vatom);
   4615 
   4616       case Iop_CmpNEZ64x2:
   4617       case Iop_CipherSV128:
   4618       case Iop_Clz64x2:
   4619       case Iop_Abs64x2:
   4620       case Iop_Ctz64x2:
   4621          return mkPCast64x2(mce, vatom);
   4622 
   4623       case Iop_PwBitMtxXpose64x2:
   4624          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4625 
   4626       case Iop_NarrowUn16to8x8:
   4627       case Iop_NarrowUn32to16x4:
   4628       case Iop_NarrowUn64to32x2:
   4629       case Iop_QNarrowUn16Sto8Sx8:
   4630       case Iop_QNarrowUn16Sto8Ux8:
   4631       case Iop_QNarrowUn16Uto8Ux8:
   4632       case Iop_QNarrowUn32Sto16Sx4:
   4633       case Iop_QNarrowUn32Sto16Ux4:
   4634       case Iop_QNarrowUn32Uto16Ux4:
   4635       case Iop_QNarrowUn64Sto32Sx2:
   4636       case Iop_QNarrowUn64Sto32Ux2:
   4637       case Iop_QNarrowUn64Uto32Ux2:
   4638       case Iop_F32toF16x4:
   4639          return vectorNarrowUnV128(mce, op, vatom);
   4640 
   4641       case Iop_Widen8Sto16x8:
   4642       case Iop_Widen8Uto16x8:
   4643       case Iop_Widen16Sto32x4:
   4644       case Iop_Widen16Uto32x4:
   4645       case Iop_Widen32Sto64x2:
   4646       case Iop_Widen32Uto64x2:
   4647       case Iop_F16toF32x4:
   4648          return vectorWidenI64(mce, op, vatom);
   4649 
   4650       case Iop_PwAddL32Ux2:
   4651       case Iop_PwAddL32Sx2:
   4652          return mkPCastTo(mce, Ity_I64,
   4653                assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
   4654 
   4655       case Iop_PwAddL16Ux4:
   4656       case Iop_PwAddL16Sx4:
   4657          return mkPCast32x2(mce,
   4658                assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
   4659 
   4660       case Iop_PwAddL8Ux8:
   4661       case Iop_PwAddL8Sx8:
   4662          return mkPCast16x4(mce,
   4663                assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
   4664 
   4665       case Iop_PwAddL32Ux4:
   4666       case Iop_PwAddL32Sx4:
   4667          return mkPCast64x2(mce,
   4668                assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
   4669 
   4670       case Iop_PwAddL16Ux8:
   4671       case Iop_PwAddL16Sx8:
   4672          return mkPCast32x4(mce,
   4673                assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
   4674 
   4675       case Iop_PwAddL8Ux16:
   4676       case Iop_PwAddL8Sx16:
   4677          return mkPCast16x8(mce,
   4678                assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
   4679 
   4680       case Iop_I64UtoF32:
   4681       default:
   4682          ppIROp(op);
   4683          VG_(tool_panic)("memcheck:expr2vbits_Unop");
   4684    }
   4685 }
   4686 
   4687 
   4688 /* Worker function -- do not call directly.  See comments on
   4689    expr2vbits_Load for the meaning of |guard|.
   4690 
   4691    Generates IR to (1) perform a definedness test of |addr|, (2)
   4692    perform a validity test of |addr|, and (3) return the Vbits for the
   4693    location indicated by |addr|.  All of this only happens when
   4694    |guard| is NULL or |guard| evaluates to True at run time.
   4695 
   4696    If |guard| evaluates to False at run time, the returned value is
    4697    the IR-mandated 0x55..55 value, and no checks or shadow loads are
   4698    performed.
   4699 
   4700    The definedness of |guard| itself is not checked.  That is assumed
   4701    to have been done before this point, by the caller. */
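         /* Editorial example (restating the code below): a little-endian
            Ity_I32 load is shadowed by a dirty call
            datavbits = MC_(helperc_LOADV32le)(addrAct), whereas V128 and
            V256 loads return their V bits through a vector out-parameter
            (IRExpr_VECRET), hence the ret_via_outparam flag. */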
   4702 static
   4703 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
   4704                               IREndness end, IRType ty,
   4705                               IRAtom* addr, UInt bias, IRAtom* guard )
   4706 {
   4707    tl_assert(isOriginalAtom(mce,addr));
   4708    tl_assert(end == Iend_LE || end == Iend_BE);
   4709 
   4710    /* First, emit a definedness test for the address.  This also sets
   4711       the address (shadow) to 'defined' following the test. */
   4712    complainIfUndefined( mce, addr, guard );
   4713 
   4714    /* Now cook up a call to the relevant helper function, to read the
   4715       data V bits from shadow memory. */
   4716    ty = shadowTypeV(ty);
   4717 
   4718    void*        helper           = NULL;
   4719    const HChar* hname            = NULL;
   4720    Bool         ret_via_outparam = False;
   4721 
   4722    if (end == Iend_LE) {
   4723       switch (ty) {
   4724          case Ity_V256: helper = &MC_(helperc_LOADV256le);
   4725                         hname = "MC_(helperc_LOADV256le)";
   4726                         ret_via_outparam = True;
   4727                         break;
   4728          case Ity_V128: helper = &MC_(helperc_LOADV128le);
   4729                         hname = "MC_(helperc_LOADV128le)";
   4730                         ret_via_outparam = True;
   4731                         break;
   4732          case Ity_I64:  helper = &MC_(helperc_LOADV64le);
   4733                         hname = "MC_(helperc_LOADV64le)";
   4734                         break;
   4735          case Ity_I32:  helper = &MC_(helperc_LOADV32le);
   4736                         hname = "MC_(helperc_LOADV32le)";
   4737                         break;
   4738          case Ity_I16:  helper = &MC_(helperc_LOADV16le);
   4739                         hname = "MC_(helperc_LOADV16le)";
   4740                         break;
   4741          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4742                         hname = "MC_(helperc_LOADV8)";
   4743                         break;
   4744          default:       ppIRType(ty);
   4745                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
   4746       }
   4747    } else {
   4748       switch (ty) {
   4749          case Ity_V256: helper = &MC_(helperc_LOADV256be);
   4750                         hname = "MC_(helperc_LOADV256be)";
   4751                         ret_via_outparam = True;
   4752                         break;
   4753          case Ity_V128: helper = &MC_(helperc_LOADV128be);
   4754                         hname = "MC_(helperc_LOADV128be)";
   4755                         ret_via_outparam = True;
   4756                         break;
   4757          case Ity_I64:  helper = &MC_(helperc_LOADV64be);
   4758                         hname = "MC_(helperc_LOADV64be)";
   4759                         break;
   4760          case Ity_I32:  helper = &MC_(helperc_LOADV32be);
   4761                         hname = "MC_(helperc_LOADV32be)";
   4762                         break;
   4763          case Ity_I16:  helper = &MC_(helperc_LOADV16be);
   4764                         hname = "MC_(helperc_LOADV16be)";
   4765                         break;
   4766          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4767                         hname = "MC_(helperc_LOADV8)";
   4768                         break;
   4769          default:       ppIRType(ty);
   4770                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
   4771       }
   4772    }
   4773 
   4774    tl_assert(helper);
   4775    tl_assert(hname);
   4776 
   4777    /* Generate the actual address into addrAct. */
   4778    IRAtom* addrAct;
   4779    if (bias == 0) {
   4780       addrAct = addr;
   4781    } else {
   4782       IROp    mkAdd;
   4783       IRAtom* eBias;
   4784       IRType  tyAddr  = mce->hWordTy;
   4785       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   4786       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   4787       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   4788       addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
   4789    }
   4790 
   4791    /* We need to have a place to park the V bits we're just about to
   4792       read. */
   4793    IRTemp datavbits = newTemp(mce, ty, VSh);
   4794 
   4795    /* Here's the call. */
   4796    IRDirty* di;
   4797    if (ret_via_outparam) {
   4798       di = unsafeIRDirty_1_N( datavbits,
   4799                               2/*regparms*/,
   4800                               hname, VG_(fnptr_to_fnentry)( helper ),
   4801                               mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
   4802    } else {
   4803       di = unsafeIRDirty_1_N( datavbits,
   4804                               1/*regparms*/,
   4805                               hname, VG_(fnptr_to_fnentry)( helper ),
   4806                               mkIRExprVec_1( addrAct ) );
   4807    }
   4808 
   4809    setHelperAnns( mce, di );
   4810    if (guard) {
   4811       di->guard = guard;
   4812       /* Ideally the didn't-happen return value here would be all-ones
   4813          (all-undefined), so it'd be obvious if it got used
   4814          inadvertently.  We can get by with the IR-mandated default
   4815          value (0b01 repeating, 0x55 etc) as that'll still look pretty
   4816          undefined if it ever leaks out. */
   4817    }
   4818    stmt( 'V', mce, IRStmt_Dirty(di) );
   4819 
   4820    return mkexpr(datavbits);
   4821 }
   4822 
   4823 
   4824 /* Generate IR to do a shadow load.  The helper is expected to check
   4825    the validity of the address and return the V bits for that address.
   4826    This can optionally be controlled by a guard, which is assumed to
   4827    be True if NULL.  In the case where the guard is False at runtime,
   4828    the helper will return the didn't-do-the-call value of 0x55..55.
   4829    Since that means "completely undefined result", the caller of
   4830    this function will need to fix up the result somehow in that
   4831    case.
   4832 
   4833    Caller of this function is also expected to have checked the
   4834    definedness of |guard| before this point.
   4835 */
   4836 static
   4837 IRAtom* expr2vbits_Load ( MCEnv* mce,
   4838                           IREndness end, IRType ty,
   4839                           IRAtom* addr, UInt bias,
   4840                           IRAtom* guard )
   4841 {
   4842    tl_assert(end == Iend_LE || end == Iend_BE);
   4843    switch (shadowTypeV(ty)) {
   4844       case Ity_I8:
   4845       case Ity_I16:
   4846       case Ity_I32:
   4847       case Ity_I64:
   4848       case Ity_V128:
   4849       case Ity_V256:
   4850          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
   4851       default:
   4852          VG_(tool_panic)("expr2vbits_Load");
   4853    }
   4854 }
   4855 
   4856 
   4857 /* The most general handler for guarded loads.  Assumes the
   4858    definedness of GUARD has already been checked by the caller.  A
   4859    GUARD of NULL is assumed to mean "always True".  Generates code to
   4860    check the definedness and validity of ADDR.
   4861 
   4862    Generate IR to do a shadow load from ADDR and return the V bits.
   4863    The loaded type is TY.  The loaded data is then (shadow) widened by
   4864    using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
   4865    evaluates to False at run time then the returned Vbits are simply
   4866    VALT instead.  Note therefore that the argument type of VWIDEN must
   4867    be TY and the result type of VWIDEN must equal the type of VALT.
   4868 */
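         /* Editorial example (informal, derived from the comment above): a
            guarded 8-bit load zero-extended to 32 bits would be handled by
            passing ty = Ity_I8, vwiden = Iop_8Uto32, and, as VALT, the
            Ity_I32 V bits to use when GUARD is False; the ITE generated
            below then selects between the widened loaded V bits and VALT. */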
   4869 static
   4870 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
   4871                                           IREndness end, IRType ty,
   4872                                           IRAtom* addr, UInt bias,
   4873                                           IRAtom* guard,
   4874                                           IROp vwiden, IRAtom* valt )
   4875 {
   4876    /* Sanity check the conversion operation, and also set TYWIDE. */
   4877    IRType tyWide = Ity_INVALID;
   4878    switch (vwiden) {
   4879       case Iop_INVALID:
   4880          tyWide = ty;
   4881          break;
   4882       case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
   4883          tyWide = Ity_I32;
   4884          break;
   4885       default:
   4886          VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
   4887    }
   4888 
   4889    /* If the guard evaluates to True, this will hold the loaded V bits
    4890       at TY.  If the guard evaluates to False, this will be the IR-mandated
    4891       default of 0x55..55 (mostly undefined), in which case we will have to
   4892       replace it using an ITE below. */
   4893    IRAtom* iftrue1
   4894       = assignNew('V', mce, ty,
   4895                   expr2vbits_Load(mce, end, ty, addr, bias, guard));
   4896    /* Now (shadow-) widen the loaded V bits to the desired width.  In
   4897       the guard-is-False case, the allowable widening operators will
   4898       in the worst case (unsigned widening) at least leave the
   4899       pre-widened part as being marked all-undefined, and in the best
   4900       case (signed widening) mark the whole widened result as
   4901       undefined.  Anyway, it doesn't matter really, since in this case
   4902       we will replace said value with the default value |valt| using an
   4903       ITE. */
   4904    IRAtom* iftrue2
   4905       = vwiden == Iop_INVALID
   4906            ? iftrue1
   4907            : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
   4908    /* These are the V bits we will return if the load doesn't take
   4909       place. */
   4910    IRAtom* iffalse
   4911       = valt;
   4912    /* Prepare the cond for the ITE.  Convert a NULL cond into
   4913       something that iropt knows how to fold out later. */
   4914    IRAtom* cond
   4915       = guard == NULL  ? mkU1(1)  : guard;
   4916    /* And assemble the final result. */
   4917    return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
   4918 }
   4919 
   4920 
   4921 /* A simpler handler for guarded loads, in which there is no
   4922    conversion operation, and the default V bit return (when the guard
   4923    evaluates to False at runtime) is "all defined".  If there is no
   4924    guard expression or the guard is always TRUE this function behaves
   4925    like expr2vbits_Load.  It is assumed that definedness of GUARD has
   4926    already been checked at the call site. */
   4927 static
   4928 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
   4929                                          IREndness end, IRType ty,
   4930                                          IRAtom* addr, UInt bias,
   4931                                          IRAtom *guard )
   4932 {
   4933    return expr2vbits_Load_guarded_General(
   4934              mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
   4935           );
   4936 }
   4937 
   4938 
   4939 static
   4940 IRAtom* expr2vbits_ITE ( MCEnv* mce,
   4941                          IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
   4942 {
   4943    IRAtom *vbitsC, *vbits0, *vbits1;
   4944    IRType ty;
   4945    /* Given ITE(cond, iftrue,  iffalse),  generate
   4946             ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
   4947       That is, steer the V bits like the originals, but trash the
   4948       result if the steering value is undefined.  This gives
   4949       lazy propagation. */
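            /* Editorial illustration: if the condition is undefined, the
               PCast of cond# is all ones (all undefined) at type ty, so
               the UifU marks the entire result undefined; if the condition
               is fully defined, the PCast term is all zeroes and the
               result V bits are simply those of the selected arm. */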
   4950    tl_assert(isOriginalAtom(mce, cond));
   4951    tl_assert(isOriginalAtom(mce, iftrue));
   4952    tl_assert(isOriginalAtom(mce, iffalse));
   4953 
   4954    vbitsC = expr2vbits(mce, cond);
   4955    vbits1 = expr2vbits(mce, iftrue);
   4956    vbits0 = expr2vbits(mce, iffalse);
   4957    ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
   4958 
   4959    return
   4960       mkUifU(mce, ty, assignNew('V', mce, ty,
   4961                                      IRExpr_ITE(cond, vbits1, vbits0)),
   4962                       mkPCastTo(mce, ty, vbitsC) );
   4963 }
   4964 
   4965 /* --------- This is the main expression-handling function. --------- */
   4966 
   4967 static
   4968 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
   4969 {
   4970    switch (e->tag) {
   4971 
   4972       case Iex_Get:
   4973          return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
   4974 
   4975       case Iex_GetI:
   4976          return shadow_GETI( mce, e->Iex.GetI.descr,
   4977                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
   4978 
   4979       case Iex_RdTmp:
   4980          return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
   4981 
   4982       case Iex_Const:
   4983          return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
   4984 
   4985       case Iex_Qop:
   4986          return expr2vbits_Qop(
   4987                    mce,
   4988                    e->Iex.Qop.details->op,
   4989                    e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
   4990                    e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
   4991                 );
   4992 
   4993       case Iex_Triop:
   4994          return expr2vbits_Triop(
   4995                    mce,
   4996                    e->Iex.Triop.details->op,
   4997                    e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
   4998                    e->Iex.Triop.details->arg3
   4999                 );
   5000 
   5001       case Iex_Binop:
   5002          return expr2vbits_Binop(
   5003                    mce,
   5004                    e->Iex.Binop.op,
   5005                    e->Iex.Binop.arg1, e->Iex.Binop.arg2
   5006                 );
   5007 
   5008       case Iex_Unop:
   5009          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
   5010 
   5011       case Iex_Load:
   5012          return expr2vbits_Load( mce, e->Iex.Load.end,
   5013                                       e->Iex.Load.ty,
   5014                                       e->Iex.Load.addr, 0/*addr bias*/,
   5015                                       NULL/* guard == "always True"*/ );
   5016 
   5017       case Iex_CCall:
   5018          return mkLazyN( mce, e->Iex.CCall.args,
   5019                               e->Iex.CCall.retty,
   5020                               e->Iex.CCall.cee );
   5021 
   5022       case Iex_ITE:
   5023          return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
   5024                                      e->Iex.ITE.iffalse);
   5025 
   5026       default:
   5027          VG_(printf)("\n");
   5028          ppIRExpr(e);
   5029          VG_(printf)("\n");
   5030          VG_(tool_panic)("memcheck: expr2vbits");
   5031    }
   5032 }
   5033 
   5034 /*------------------------------------------------------------*/
   5035 /*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
   5036 /*------------------------------------------------------------*/
   5037 
   5038 /* Widen a value to the host word size. */
   5039 
   5040 static
   5041 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
   5042 {
   5043    IRType ty, tyH;
   5044 
    5045    /* vatom is a vbits-value and as such can only have a shadow type. */
   5046    tl_assert(isShadowAtom(mce,vatom));
   5047 
   5048    ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
   5049    tyH = mce->hWordTy;
   5050 
   5051    if (tyH == Ity_I32) {
   5052       switch (ty) {
   5053          case Ity_I32:
   5054             return vatom;
   5055          case Ity_I16:
   5056             return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
   5057          case Ity_I8:
   5058             return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
   5059          default:
   5060             goto unhandled;
   5061       }
   5062    } else
   5063    if (tyH == Ity_I64) {
   5064       switch (ty) {
   5065          case Ity_I32:
   5066             return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
   5067          case Ity_I16:
   5068             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   5069                    assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
   5070          case Ity_I8:
   5071             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   5072                    assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
   5073          default:
   5074             goto unhandled;
   5075       }
   5076    } else {
   5077       goto unhandled;
   5078    }
   5079   unhandled:
   5080    VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
   5081    VG_(tool_panic)("zwidenToHostWord");
   5082 }
   5083 
   5084 
   5085 /* Generate a shadow store.  |addr| is always the original address
   5086    atom.  You can pass in either originals or V-bits for the data
   5087    atom, but obviously not both.  This function generates a check for
   5088    the definedness and (indirectly) the validity of |addr|, but only
   5089    when |guard| evaluates to True at run time (or is NULL).
   5090 
   5091    |guard| :: Ity_I1 controls whether the store really happens; NULL
   5092    means it unconditionally does.  Note that |guard| itself is not
   5093    checked for definedness; the caller of this function must do that
   5094    if necessary.
   5095 */
   5096 static
   5097 void do_shadow_Store ( MCEnv* mce,
   5098                        IREndness end,
   5099                        IRAtom* addr, UInt bias,
   5100                        IRAtom* data, IRAtom* vdata,
   5101                        IRAtom* guard )
   5102 {
   5103    IROp     mkAdd;
   5104    IRType   ty, tyAddr;
   5105    void*    helper = NULL;
   5106    const HChar* hname = NULL;
   5107    IRConst* c;
   5108 
   5109    tyAddr = mce->hWordTy;
   5110    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   5111    tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   5112    tl_assert( end == Iend_LE || end == Iend_BE );
   5113 
   5114    if (data) {
   5115       tl_assert(!vdata);
   5116       tl_assert(isOriginalAtom(mce, data));
   5117       tl_assert(bias == 0);
   5118       vdata = expr2vbits( mce, data );
   5119    } else {
   5120       tl_assert(vdata);
   5121    }
   5122 
   5123    tl_assert(isOriginalAtom(mce,addr));
   5124    tl_assert(isShadowAtom(mce,vdata));
   5125 
   5126    if (guard) {
   5127       tl_assert(isOriginalAtom(mce, guard));
   5128       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   5129    }
   5130 
   5131    ty = typeOfIRExpr(mce->sb->tyenv, vdata);
   5132 
   5133    // If we're not doing undefined value checking, pretend that this value
   5134    // is "all valid".  That lets Vex's optimiser remove some of the V bit
   5135    // shadow computation ops that precede it.
   5136    if (MC_(clo_mc_level) == 1) {
   5137       switch (ty) {
   5138          case Ity_V256: // V256 weirdness -- used four times
   5139                         c = IRConst_V256(V_BITS32_DEFINED); break;
   5140          case Ity_V128: // V128 weirdness -- used twice
   5141                         c = IRConst_V128(V_BITS16_DEFINED); break;
   5142          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
   5143          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
   5144          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
   5145          case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
   5146          default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   5147       }
   5148       vdata = IRExpr_Const( c );
   5149    }
   5150 
   5151    /* First, emit a definedness test for the address.  This also sets
   5152       the address (shadow) to 'defined' following the test.  Both of
   5153       those actions are gated on |guard|. */
   5154    complainIfUndefined( mce, addr, guard );
   5155 
   5156    /* Now decide which helper function to call to write the data V
   5157       bits into shadow memory. */
   5158    if (end == Iend_LE) {
   5159       switch (ty) {
   5160          case Ity_V256: /* we'll use the helper four times */
   5161          case Ity_V128: /* we'll use the helper twice */
   5162          case Ity_I64: helper = &MC_(helperc_STOREV64le);
   5163                        hname = "MC_(helperc_STOREV64le)";
   5164                        break;
   5165          case Ity_I32: helper = &MC_(helperc_STOREV32le);
   5166                        hname = "MC_(helperc_STOREV32le)";
   5167                        break;
   5168          case Ity_I16: helper = &MC_(helperc_STOREV16le);
   5169                        hname = "MC_(helperc_STOREV16le)";
   5170                        break;
   5171          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   5172                        hname = "MC_(helperc_STOREV8)";
   5173                        break;
   5174          default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   5175       }
   5176    } else {
   5177       switch (ty) {
   5178          case Ity_V128: /* we'll use the helper twice */
   5179          case Ity_I64: helper = &MC_(helperc_STOREV64be);
   5180                        hname = "MC_(helperc_STOREV64be)";
   5181                        break;
   5182          case Ity_I32: helper = &MC_(helperc_STOREV32be);
   5183                        hname = "MC_(helperc_STOREV32be)";
   5184                        break;
   5185          case Ity_I16: helper = &MC_(helperc_STOREV16be);
   5186                        hname = "MC_(helperc_STOREV16be)";
   5187                        break;
   5188          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   5189                        hname = "MC_(helperc_STOREV8)";
   5190                        break;
    5191          /* Note: no V256 case here, because no big-endian target that
    5192             we support has 256-bit vectors. */
   5193          default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
   5194       }
   5195    }
   5196 
   5197    if (UNLIKELY(ty == Ity_V256)) {
   5198 
   5199       /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
   5200          Q3 being the most significant lane. */
   5201       /* These are the offsets of the Qs in memory. */
   5202       Int     offQ0, offQ1, offQ2, offQ3;
   5203 
   5204       /* Various bits for constructing the 4 lane helper calls */
   5205       IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
   5206       IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
   5207       IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
   5208       IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
   5209 
   5210       if (end == Iend_LE) {
   5211          offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
   5212       } else {
   5213          offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
   5214       }
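               /* Editorial illustration: with a little-endian store the V
                  bits of the least significant 64-bit lane (Iop_V256to64_0)
                  are written at addr+bias+0 and those of the most
                  significant lane (Iop_V256to64_3) at addr+bias+24; a
                  big-endian store reverses the offsets, as set up above. */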
   5215 
   5216       eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
   5217       addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
   5218       vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
   5219       diQ0    = unsafeIRDirty_0_N(
   5220                    1/*regparms*/,
   5221                    hname, VG_(fnptr_to_fnentry)( helper ),
   5222                    mkIRExprVec_2( addrQ0, vdataQ0 )
   5223                 );
   5224 
   5225       eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
   5226       addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
   5227       vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
   5228       diQ1    = unsafeIRDirty_0_N(
   5229                    1/*regparms*/,
   5230                    hname, VG_(fnptr_to_fnentry)( helper ),
   5231                    mkIRExprVec_2( addrQ1, vdataQ1 )
   5232                 );
   5233 
   5234       eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
   5235       addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
   5236       vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
   5237       diQ2    = unsafeIRDirty_0_N(
   5238                    1/*regparms*/,
   5239                    hname, VG_(fnptr_to_fnentry)( helper ),
   5240                    mkIRExprVec_2( addrQ2, vdataQ2 )
   5241                 );
   5242 
   5243       eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
   5244       addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
   5245       vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
   5246       diQ3    = unsafeIRDirty_0_N(
   5247                    1/*regparms*/,
   5248                    hname, VG_(fnptr_to_fnentry)( helper ),
   5249                    mkIRExprVec_2( addrQ3, vdataQ3 )
   5250                 );
   5251 
   5252       if (guard)
   5253          diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
   5254 
   5255       setHelperAnns( mce, diQ0 );
   5256       setHelperAnns( mce, diQ1 );
   5257       setHelperAnns( mce, diQ2 );
   5258       setHelperAnns( mce, diQ3 );
   5259       stmt( 'V', mce, IRStmt_Dirty(diQ0) );
   5260       stmt( 'V', mce, IRStmt_Dirty(diQ1) );
   5261       stmt( 'V', mce, IRStmt_Dirty(diQ2) );
   5262       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
   5263 
   5264    }
   5265    else if (UNLIKELY(ty == Ity_V128)) {
   5266 
   5267       /* V128-bit case */
   5268       /* See comment in next clause re 64-bit regparms */
   5269       /* also, need to be careful about endianness */
   5270 
   5271       Int     offLo64, offHi64;
   5272       IRDirty *diLo64, *diHi64;
   5273       IRAtom  *addrLo64, *addrHi64;
   5274       IRAtom  *vdataLo64, *vdataHi64;
   5275       IRAtom  *eBiasLo64, *eBiasHi64;
   5276 
   5277       if (end == Iend_LE) {
   5278          offLo64 = 0;
   5279          offHi64 = 8;
   5280       } else {
   5281          offLo64 = 8;
   5282          offHi64 = 0;
   5283       }
   5284 
   5285       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
   5286       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
   5287       vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
   5288       diLo64    = unsafeIRDirty_0_N(
   5289                      1/*regparms*/,
   5290                      hname, VG_(fnptr_to_fnentry)( helper ),
   5291                      mkIRExprVec_2( addrLo64, vdataLo64 )
   5292                   );
   5293       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
   5294       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
   5295       vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
   5296       diHi64    = unsafeIRDirty_0_N(
   5297                      1/*regparms*/,
   5298                      hname, VG_(fnptr_to_fnentry)( helper ),
   5299                      mkIRExprVec_2( addrHi64, vdataHi64 )
   5300                   );
   5301       if (guard) diLo64->guard = guard;
   5302       if (guard) diHi64->guard = guard;
   5303       setHelperAnns( mce, diLo64 );
   5304       setHelperAnns( mce, diHi64 );
   5305       stmt( 'V', mce, IRStmt_Dirty(diLo64) );
   5306       stmt( 'V', mce, IRStmt_Dirty(diHi64) );
   5307 
   5308    } else {
   5309 
   5310       IRDirty *di;
   5311       IRAtom  *addrAct;
   5312 
   5313       /* 8/16/32/64-bit cases */
   5314       /* Generate the actual address into addrAct. */
   5315       if (bias == 0) {
   5316          addrAct = addr;
   5317       } else {
   5318          IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   5319          addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
   5320       }
   5321 
   5322       if (ty == Ity_I64) {
   5323          /* We can't do this with regparm 2 on 32-bit platforms, since
   5324             the back ends aren't clever enough to handle 64-bit
    5325             regparm args.  Therefore handle this case differently. */
   5326          di = unsafeIRDirty_0_N(
   5327                  1/*regparms*/,
   5328                  hname, VG_(fnptr_to_fnentry)( helper ),
   5329                  mkIRExprVec_2( addrAct, vdata )
   5330               );
   5331       } else {
   5332          di = unsafeIRDirty_0_N(
   5333                  2/*regparms*/,
   5334                  hname, VG_(fnptr_to_fnentry)( helper ),
   5335                  mkIRExprVec_2( addrAct,
   5336                                 zwidenToHostWord( mce, vdata ))
   5337               );
   5338       }
   5339       if (guard) di->guard = guard;
   5340       setHelperAnns( mce, di );
   5341       stmt( 'V', mce, IRStmt_Dirty(di) );
   5342    }
   5343 
   5344 }
   5345 
   5346 
   5347 /* Do lazy pessimistic propagation through a dirty helper call, by
   5348    looking at the annotations on it.  This is the most complex part of
   5349    Memcheck. */
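         /* Editorial overview (a summary of the scheme implemented below,
            not new behaviour): every input to the call -- each non-masked
            argument, each piece of guest state read, and each chunk of
            memory read -- is PCast-ed to Ity_I32 and UifU-ed into a single
            accumulator 'curr'; 'curr' is then PCast-ed back out to every
            output the annotations describe, so undefinedness in any input
            pessimistically taints every output. */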
   5350 
   5351 static IRType szToITy ( Int n )
   5352 {
   5353    switch (n) {
   5354       case 1: return Ity_I8;
   5355       case 2: return Ity_I16;
   5356       case 4: return Ity_I32;
   5357       case 8: return Ity_I64;
   5358       default: VG_(tool_panic)("szToITy(memcheck)");
   5359    }
   5360 }
   5361 
   5362 static
   5363 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
   5364 {
   5365    Int       i, k, n, toDo, gSz, gOff;
   5366    IRAtom    *src, *here, *curr;
   5367    IRType    tySrc, tyDst;
   5368    IRTemp    dst;
   5369    IREndness end;
   5370 
   5371    /* What's the native endianness?  We need to know this. */
   5372 #  if defined(VG_BIGENDIAN)
   5373    end = Iend_BE;
   5374 #  elif defined(VG_LITTLEENDIAN)
   5375    end = Iend_LE;
   5376 #  else
   5377 #    error "Unknown endianness"
   5378 #  endif
   5379 
   5380    /* First check the guard. */
   5381    complainIfUndefined(mce, d->guard, NULL);
   5382 
   5383    /* Now round up all inputs and PCast over them. */
   5384    curr = definedOfType(Ity_I32);
   5385 
   5386    /* Inputs: unmasked args
   5387       Note: arguments are evaluated REGARDLESS of the guard expression */
   5388    for (i = 0; d->args[i]; i++) {
   5389       IRAtom* arg = d->args[i];
   5390       if ( (d->cee->mcx_mask & (1<<i))
   5391            || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
   5392          /* ignore this arg */
   5393       } else {
   5394          here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
   5395          curr = mkUifU32(mce, here, curr);
   5396       }
   5397    }
   5398 
   5399    /* Inputs: guest state that we read. */
   5400    for (i = 0; i < d->nFxState; i++) {
   5401       tl_assert(d->fxState[i].fx != Ifx_None);
   5402       if (d->fxState[i].fx == Ifx_Write)
   5403          continue;
   5404 
   5405       /* Enumerate the described state segments */
   5406       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5407          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5408          gSz  = d->fxState[i].size;
   5409 
   5410          /* Ignore any sections marked as 'always defined'. */
   5411          if (isAlwaysDefd(mce, gOff, gSz)) {
   5412             if (0)
   5413             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   5414                         gOff, gSz);
   5415             continue;
   5416          }
   5417 
   5418          /* This state element is read or modified.  So we need to
   5419             consider it.  If larger than 8 bytes, deal with it in
   5420             8-byte chunks. */
   5421          while (True) {
   5422             tl_assert(gSz >= 0);
   5423             if (gSz == 0) break;
   5424             n = gSz <= 8 ? gSz : 8;
   5425             /* update 'curr' with UifU of the state slice
   5426                gOff .. gOff+n-1 */
   5427             tySrc = szToITy( n );
   5428 
    5429             /* Observe the guard expression.  If it is false, use an
    5430                all-bits-defined bit pattern. */
   5431             IRAtom *cond, *iffalse, *iftrue;
   5432 
   5433             cond    = assignNew('V', mce, Ity_I1, d->guard);
   5434             iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
   5435             iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
   5436             src     = assignNew('V', mce, tySrc,
   5437                                 IRExpr_ITE(cond, iftrue, iffalse));
   5438 
   5439             here = mkPCastTo( mce, Ity_I32, src );
   5440             curr = mkUifU32(mce, here, curr);
   5441             gSz -= n;
   5442             gOff += n;
   5443          }
   5444       }
   5445    }
   5446 
   5447    /* Inputs: memory.  First set up some info needed regardless of
   5448       whether we're doing reads or writes. */
   5449 
   5450    if (d->mFx != Ifx_None) {
   5451       /* Because we may do multiple shadow loads/stores from the same
   5452          base address, it's best to do a single test of its
   5453          definedness right now.  Post-instrumentation optimisation
   5454          should remove all but this test. */
   5455       IRType tyAddr;
   5456       tl_assert(d->mAddr);
   5457       complainIfUndefined(mce, d->mAddr, d->guard);
   5458 
   5459       tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
   5460       tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
   5461       tl_assert(tyAddr == mce->hWordTy); /* not really right */
   5462    }
   5463 
   5464    /* Deal with memory inputs (reads or modifies) */
   5465    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   5466       toDo   = d->mSize;
   5467       /* chew off 32-bit chunks.  We don't care about the endianness
   5468          since it's all going to be condensed down to a single bit,
   5469          but nevertheless choose an endianness which is hopefully
   5470          native to the platform. */
   5471       while (toDo >= 4) {
   5472          here = mkPCastTo(
   5473                    mce, Ity_I32,
   5474                    expr2vbits_Load_guarded_Simple(
   5475                       mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
   5476                 );
   5477          curr = mkUifU32(mce, here, curr);
   5478          toDo -= 4;
   5479       }
   5480       /* chew off 16-bit chunks */
   5481       while (toDo >= 2) {
   5482          here = mkPCastTo(
   5483                    mce, Ity_I32,
   5484                    expr2vbits_Load_guarded_Simple(
   5485                       mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
   5486                 );
   5487          curr = mkUifU32(mce, here, curr);
   5488          toDo -= 2;
   5489       }
   5490       /* chew off the remaining 8-bit chunk, if any */
   5491       if (toDo == 1) {
   5492          here = mkPCastTo(
   5493                    mce, Ity_I32,
   5494                    expr2vbits_Load_guarded_Simple(
   5495                       mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
   5496                 );
   5497          curr = mkUifU32(mce, here, curr);
   5498          toDo -= 1;
   5499       }
   5500       tl_assert(toDo == 0);
   5501    }
   5502 
   5503    /* Whew!  So curr is a 32-bit V-value summarising pessimistically
   5504       all the inputs to the helper.  Now we need to re-distribute the
   5505       results to all destinations. */
   5506 
   5507    /* Outputs: the destination temporary, if there is one. */
   5508    if (d->tmp != IRTemp_INVALID) {
   5509       dst   = findShadowTmpV(mce, d->tmp);
   5510       tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
   5511       assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
   5512    }
   5513 
   5514    /* Outputs: guest state that we write or modify. */
   5515    for (i = 0; i < d->nFxState; i++) {
   5516       tl_assert(d->fxState[i].fx != Ifx_None);
   5517       if (d->fxState[i].fx == Ifx_Read)
   5518          continue;
   5519 
   5520       /* Enumerate the described state segments */
   5521       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5522          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5523          gSz  = d->fxState[i].size;
   5524 
   5525          /* Ignore any sections marked as 'always defined'. */
   5526          if (isAlwaysDefd(mce, gOff, gSz))
   5527             continue;
   5528 
   5529          /* This state element is written or modified.  So we need to
   5530             consider it.  If larger than 8 bytes, deal with it in
   5531             8-byte chunks. */
   5532          while (True) {
   5533             tl_assert(gSz >= 0);
   5534             if (gSz == 0) break;
   5535             n = gSz <= 8 ? gSz : 8;
   5536             /* Write suitably-casted 'curr' to the state slice
   5537                gOff .. gOff+n-1 */
   5538             tyDst = szToITy( n );
   5539             do_shadow_PUT( mce, gOff,
   5540                                 NULL, /* original atom */
   5541                                 mkPCastTo( mce, tyDst, curr ), d->guard );
   5542             gSz -= n;
   5543             gOff += n;
   5544          }
   5545       }
   5546    }
   5547 
   5548    /* Outputs: memory that we write or modify.  Same comments about
   5549       endianness as above apply. */
   5550    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   5551       toDo   = d->mSize;
   5552       /* chew off 32-bit chunks */
   5553       while (toDo >= 4) {
   5554          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5555                           NULL, /* original data */
   5556                           mkPCastTo( mce, Ity_I32, curr ),
   5557                           d->guard );
   5558          toDo -= 4;
   5559       }
   5560       /* chew off 16-bit chunks */
   5561       while (toDo >= 2) {
   5562          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5563                           NULL, /* original data */
   5564                           mkPCastTo( mce, Ity_I16, curr ),
   5565                           d->guard );
   5566          toDo -= 2;
   5567       }
   5568       /* chew off the remaining 8-bit chunk, if any */
   5569       if (toDo == 1) {
   5570          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5571                           NULL, /* original data */
   5572                           mkPCastTo( mce, Ity_I8, curr ),
   5573                           d->guard );
   5574          toDo -= 1;
   5575       }
   5576       tl_assert(toDo == 0);
   5577    }
   5578 
   5579 }
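
/* A minimal illustrative sketch of what the above produces, added only
   as an aside and not itself part of the instrumentation.  For a dirty
   helper call with two unmasked argument atoms a1 and a2 (names invented
   for the example), a result temporary d->tmp, and no guest-state or
   memory effects, the generated shadow computation is roughly:

      curr = definedOfType(Ity_I32);
      curr = mkUifU32(mce, mkPCastTo(mce, Ity_I32, expr2vbits(mce, a1)), curr);
      curr = mkUifU32(mce, mkPCastTo(mce, Ity_I32, expr2vbits(mce, a2)), curr);
      assign('V', mce, findShadowTmpV(mce, d->tmp),
             mkPCastTo(mce, typeOfIRTemp(mce->sb->tyenv, d->tmp), curr));

   That is: every input is pessimistically condensed into one 32-bit
   defined/undefined summary, the summaries are UifU'd together, and the
   result is widened back out to every output. */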
   5580 
   5581 
   5582 /* We have an ABI hint telling us that [base .. base+len-1] is to
   5583    become undefined ("writable").  Generate code to call a helper to
   5584    notify the A/V bit machinery of this fact.
   5585 
   5586    We call
   5587    void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
   5588                                                     Addr nia );
   5589 */
   5590 static
   5591 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
   5592 {
   5593    IRDirty* di;
   5594 
   5595    if (MC_(clo_mc_level) == 3) {
   5596       di = unsafeIRDirty_0_N(
   5597               3/*regparms*/,
   5598               "MC_(helperc_MAKE_STACK_UNINIT_w_o)",
   5599               VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_w_o) ),
   5600               mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
   5601            );
   5602    } else {
   5603       /* We ignore the supplied nia, since it is irrelevant. */
   5604       tl_assert(MC_(clo_mc_level) == 2 || MC_(clo_mc_level) == 1);
   5605       /* Special-case the len==128 case, since that is for amd64-ELF,
   5606          which is a very common target. */
   5607       if (len == 128) {
   5608          di = unsafeIRDirty_0_N(
   5609                  1/*regparms*/,
   5610                  "MC_(helperc_MAKE_STACK_UNINIT_128_no_o)",
   5611                  VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_128_no_o)),
   5612                  mkIRExprVec_1( base )
   5613               );
   5614       } else {
   5615          di = unsafeIRDirty_0_N(
   5616                  2/*regparms*/,
   5617                  "MC_(helperc_MAKE_STACK_UNINIT_no_o)",
   5618                  VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT_no_o) ),
   5619                  mkIRExprVec_2( base, mkIRExpr_HWord( (UInt)len) )
   5620               );
   5621       }
   5622    }
   5623 
   5624    stmt( 'V', mce, IRStmt_Dirty(di) );
   5625 }
   5626 
   5627 
   5628 /* ------ Dealing with IRCAS (big and complex) ------ */
   5629 
   5630 /* FWDS */
   5631 static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
   5632                              IRAtom* baseaddr, Int offset );
   5633 static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
   5634 static void    gen_store_b ( MCEnv* mce, Int szB,
   5635                              IRAtom* baseaddr, Int offset, IRAtom* dataB,
   5636                              IRAtom* guard );
   5637 
   5638 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
   5639 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
   5640 
   5641 
   5642 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
   5643    IRExpr.Consts, else this asserts.  If they are both Consts, it
   5644    doesn't do anything.  So that just leaves the RdTmp case.
   5645 
   5646    In which case: this assigns the shadow value SHADOW to the IR
   5647    shadow temporary associated with ORIG.  That is, ORIG, being an
   5648    original temporary, will have a shadow temporary associated with
   5649    it.  However, in the case envisaged here, there will so far have
   5650    been no IR emitted to actually write a shadow value into that
   5651    temporary.  What this routine does is to (emit IR to) copy the
   5652    value in SHADOW into said temporary, so that after this call,
   5653    IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
   5654    value in SHADOW.
   5655 
   5656    Point is to allow callers to compute "by hand" a shadow value for
   5657    ORIG, and force it to be associated with ORIG.
   5658 
   5659    How do we know that the shadow associated with ORIG has not so far
   5660    been assigned to?  Well, we don't know that per se, but suppose it
   5661    had been.  Then this routine would create a second assignment to it,
   5662    and later the IR sanity checker would barf.  But that never
   5663    happens.  QED.
   5664 */
   5665 static void bind_shadow_tmp_to_orig ( UChar how,
   5666                                       MCEnv* mce,
   5667                                       IRAtom* orig, IRAtom* shadow )
   5668 {
   5669    tl_assert(isOriginalAtom(mce, orig));
   5670    tl_assert(isShadowAtom(mce, shadow));
   5671    switch (orig->tag) {
   5672       case Iex_Const:
   5673          tl_assert(shadow->tag == Iex_Const);
   5674          break;
   5675       case Iex_RdTmp:
   5676          tl_assert(shadow->tag == Iex_RdTmp);
   5677          if (how == 'V') {
   5678             assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
   5679                    shadow);
   5680          } else {
   5681             tl_assert(how == 'B');
   5682             assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
   5683                    shadow);
   5684          }
   5685          break;
   5686       default:
   5687          tl_assert(0);
   5688    }
   5689 }
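
/* Usage sketch (just restating the pattern used by do_shadow_CAS_single
   and do_shadow_CAS_double below, not new functionality): having computed
   a V-bits value voldLo for the loaded old value by hand, the caller
   forces it to become the shadow of cas->oldLo with

      bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);

   after which any later IRExpr.RdTmp of cas->oldLo's V shadow temp picks
   up voldLo. */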
   5690 
   5691 
   5692 static
   5693 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
   5694 {
   5695    /* Scheme is (both single- and double- cases):
   5696 
   5697       1. fetch data#,dataB (the proposed new value)
   5698 
   5699       2. fetch expd#,expdB (what we expect to see at the address)
   5700 
   5701       3. check definedness of address
   5702 
   5703       4. load old#,oldB from shadow memory; this also checks
   5704          addressability of the address
   5705 
   5706       5. the CAS itself
   5707 
   5708       6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
   5709 
   5710       7. if "expected == old" (as computed by (6))
   5711             store data#,dataB to shadow memory
   5712 
   5713       Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
   5714       'data' but 7 stores 'data#'.  Hence it is possible for the
   5715       shadow data to be incorrectly checked and/or updated:
   5716 
   5717       * 7 is at least gated correctly, since the 'expected == old'
   5718         condition is derived from outputs of 5.  However, the shadow
   5719         write could happen too late: imagine after 5 we are
   5720         descheduled, a different thread runs, writes a different
   5721         (shadow) value at the address, and then we resume, hence
   5722         overwriting the shadow value written by the other thread.
   5723 
   5724       Because the original memory access is atomic, there's no way to
   5725       make both the original and shadow accesses into a single atomic
   5726       thing, hence this is unavoidable.
   5727 
   5728       At least as Valgrind stands, I don't think it's a problem, since
   5729       we're single threaded *and* we guarantee that there are no
   5730       context switches during the execution of any specific superblock
   5731       -- context switches can only happen at superblock boundaries.
   5732 
   5733       If Valgrind ever becomes MT in the future, then it might be more
   5734       of a problem.  A possible kludge would be to artificially
   5735       associate with the location, a lock, which we must acquire and
   5736       release around the transaction as a whole.  Hmm, that probably
   5737       wouldn't work properly since it only guards us against other
   5738       threads doing CASs on the same location, not against other
   5739       threads doing normal reads and writes.
   5740 
   5741       ------------------------------------------------------------
   5742 
   5743       COMMENT_ON_CasCmpEQ:
   5744 
   5745       Note two things.  Firstly, in the sequence above, we compute
   5746       "expected == old", but we don't check definedness of it.  Why
   5747       not?  Also, the x86 and amd64 front ends use
   5748       Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
   5749       determination (expected == old ?) for themselves, and we also
   5750       don't check definedness for those primops; we just say that the
   5751       result is defined.  Why?  Details follow.
   5752 
   5753       x86/amd64 contains various forms of locked insns:
   5754       * lock prefix before all basic arithmetic insns;
   5755         eg lock xorl %reg1,(%reg2)
   5756       * atomic exchange reg-mem
   5757       * compare-and-swaps
   5758 
   5759       Rather than attempt to represent them all, which would be a
   5760       royal PITA, I used a result from Maurice Herlihy
   5761       (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
   5762       demonstrates that compare-and-swap is a primitive more general
   5763       than the other two, and so can be used to represent all of them.
   5764       So the translation scheme for (eg) lock incl (%reg) is as
   5765       follows:
   5766 
   5767         again:
   5768          old = * %reg
   5769          new = old + 1
   5770          atomically { if (* %reg == old) { * %reg = new } else { goto again } }
   5771 
   5772       The "atomically" is the CAS bit.  The scheme is always the same:
   5773       get old value from memory, compute new value, atomically stuff
   5774       new value back in memory iff the old value has not changed (iow,
   5775       no other thread modified it in the meantime).  If it has changed
   5776       then we've been out-raced and we have to start over.
   5777 
   5778       Now that's all very neat, but it has the bad side effect of
   5779       introducing an explicit equality test into the translation.
   5780       Consider the behaviour of said code on a memory location which
   5781       is uninitialised.  We will wind up doing a comparison on
   5782       uninitialised data, and mc duly complains.
   5783 
   5784       What's difficult about this is that, in the common case, the
   5785       location is uncontended, so we're usually comparing the same
   5786       value (* %reg) with itself.  So we shouldn't complain even if it
   5787       is undefined.  But mc doesn't know that.
   5788 
   5789       My solution is to mark the == in the IR specially, so as to tell
   5790       mc that it almost certainly compares a value with itself, and we
   5791       should just regard the result as always defined.  Rather than
   5792       add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
   5793       Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
   5794 
   5795       So there's always the question of, can this give a false
   5796       negative?  eg, imagine that initially, * %reg is defined; and we
   5797       read that; but then in the gap between the read and the CAS, a
   5798       different thread writes an undefined (and different) value at
   5799       the location.  Then the CAS in this thread will fail and we will
   5800       go back to "again:", but without knowing that the trip back
   5801       there was based on an undefined comparison.  No matter; at least
   5802       the other thread won the race and the location is correctly
   5803       marked as undefined.  What if it wrote an uninitialised version
   5804       of the same value that was there originally, though?
   5805 
   5806       etc etc.  Seems like there's a small corner case in which we
   5807       might lose the fact that something's defined -- we're out-raced
   5808       in between the "old = * reg" and the "atomically {", _and_ the
   5809       other thread is writing in an undefined version of what's
   5810       already there.  Well, that seems pretty unlikely.
   5811 
   5812       ---
   5813 
   5814       If we ever need to reinstate it .. code which generates a
   5815       definedness test for "expected == old" was removed at r10432 of
   5816       this file.
   5817    */
   5818    if (cas->oldHi == IRTemp_INVALID) {
   5819       do_shadow_CAS_single( mce, cas );
   5820    } else {
   5821       do_shadow_CAS_double( mce, cas );
   5822    }
   5823 }
   5824 
   5825 
   5826 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
   5827 {
   5828    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5829    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5830    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5831    IRAtom *expd_eq_old = NULL;
   5832    IROp   opCasCmpEQ;
   5833    Int    elemSzB;
   5834    IRType elemTy;
   5835    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5836 
   5837    /* single CAS */
   5838    tl_assert(cas->oldHi == IRTemp_INVALID);
   5839    tl_assert(cas->expdHi == NULL);
   5840    tl_assert(cas->dataHi == NULL);
   5841 
   5842    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5843    switch (elemTy) {
   5844       case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
   5845       case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
   5846       case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
   5847       case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
   5848       default: tl_assert(0); /* IR defn disallows any other types */
   5849    }
   5850 
   5851    /* 1. fetch data# (the proposed new value) */
   5852    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5853    vdataLo
   5854       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5855    tl_assert(isShadowAtom(mce, vdataLo));
   5856    if (otrak) {
   5857       bdataLo
   5858          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5859       tl_assert(isShadowAtom(mce, bdataLo));
   5860    }
   5861 
   5862    /* 2. fetch expected# (what we expect to see at the address) */
   5863    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5864    vexpdLo
   5865       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5866    tl_assert(isShadowAtom(mce, vexpdLo));
   5867    if (otrak) {
   5868       bexpdLo
   5869          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5870       tl_assert(isShadowAtom(mce, bexpdLo));
   5871    }
   5872 
   5873    /* 3. check definedness of address */
   5874    /* 4. fetch old# from shadow memory; this also checks
   5875          addressability of the address */
   5876    voldLo
   5877       = assignNew(
   5878            'V', mce, elemTy,
   5879            expr2vbits_Load(
   5880               mce,
   5881               cas->end, elemTy, cas->addr, 0/*Addr bias*/,
   5882               NULL/*always happens*/
   5883         ));
   5884    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5885    if (otrak) {
   5886       boldLo
   5887          = assignNew('B', mce, Ity_I32,
   5888                      gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
   5889       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5890    }
   5891 
   5892    /* 5. the CAS itself */
   5893    stmt( 'C', mce, IRStmt_CAS(cas) );
   5894 
   5895    /* 6. compute "expected == old" */
   5896    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5897    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5898       tree, but it's not copied from the input block. */
   5899    expd_eq_old
   5900       = assignNew('C', mce, Ity_I1,
   5901                   binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
   5902 
   5903    /* 7. if "expected == old"
   5904             store data# to shadow memory */
   5905    do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
   5906                     NULL/*data*/, vdataLo/*vdata*/,
   5907                     expd_eq_old/*guard for store*/ );
   5908    if (otrak) {
   5909       gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
   5910                    bdataLo/*bdata*/,
   5911                    expd_eq_old/*guard for store*/ );
   5912    }
   5913 }
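
/* A rough sketch of the statement sequence that do_shadow_CAS_single
   emits, assuming a single 32-bit CAS with no origin tracking (the step
   numbers refer to the scheme described above do_shadow_CAS):

      vdataLo     = V-bits of cas->dataLo                       (step 1)
      vexpdLo     = V-bits of cas->expdLo                       (step 2)
      voldLo      = shadow load of [cas->addr]                  (steps 3,4)
      bind voldLo as the V shadow of cas->oldLo
      the original CAS statement itself                         (step 5)
      expd_eq_old = CasCmpEQ32(cas->expdLo, oldLo)              (step 6)
      if (expd_eq_old) { shadow-store vdataLo at [cas->addr] }  (step 7)
*/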
   5914 
   5915 
   5916 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
   5917 {
   5918    IRAtom *vdataHi = NULL, *bdataHi = NULL;
   5919    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5920    IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
   5921    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5922    IRAtom *voldHi  = NULL, *boldHi  = NULL;
   5923    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5924    IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
   5925    IRAtom *expd_eq_old = NULL, *zero = NULL;
   5926    IROp   opCasCmpEQ, opOr, opXor;
   5927    Int    elemSzB, memOffsLo, memOffsHi;
   5928    IRType elemTy;
   5929    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5930 
   5931    /* double CAS */
   5932    tl_assert(cas->oldHi != IRTemp_INVALID);
   5933    tl_assert(cas->expdHi != NULL);
   5934    tl_assert(cas->dataHi != NULL);
   5935 
   5936    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5937    switch (elemTy) {
   5938       case Ity_I8:
   5939          opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
   5940          elemSzB = 1; zero = mkU8(0);
   5941          break;
   5942       case Ity_I16:
   5943          opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
   5944          elemSzB = 2; zero = mkU16(0);
   5945          break;
   5946       case Ity_I32:
   5947          opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
   5948          elemSzB = 4; zero = mkU32(0);
   5949          break;
   5950       case Ity_I64:
   5951          opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
   5952          elemSzB = 8; zero = mkU64(0);
   5953          break;
   5954       default:
   5955          tl_assert(0); /* IR defn disallows any other types */
   5956    }
   5957 
   5958    /* 1. fetch data# (the proposed new value) */
   5959    tl_assert(isOriginalAtom(mce, cas->dataHi));
   5960    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5961    vdataHi
   5962       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
   5963    vdataLo
   5964       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5965    tl_assert(isShadowAtom(mce, vdataHi));
   5966    tl_assert(isShadowAtom(mce, vdataLo));
   5967    if (otrak) {
   5968       bdataHi
   5969          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
   5970       bdataLo
   5971          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5972       tl_assert(isShadowAtom(mce, bdataHi));
   5973       tl_assert(isShadowAtom(mce, bdataLo));
   5974    }
   5975 
   5976    /* 2. fetch expected# (what we expect to see at the address) */
   5977    tl_assert(isOriginalAtom(mce, cas->expdHi));
   5978    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5979    vexpdHi
   5980       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
   5981    vexpdLo
   5982       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5983    tl_assert(isShadowAtom(mce, vexpdHi));
   5984    tl_assert(isShadowAtom(mce, vexpdLo));
   5985    if (otrak) {
   5986       bexpdHi
   5987          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
   5988       bexpdLo
   5989          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5990       tl_assert(isShadowAtom(mce, bexpdHi));
   5991       tl_assert(isShadowAtom(mce, bexpdLo));
   5992    }
   5993 
   5994    /* 3. check definedness of address */
   5995    /* 4. fetch old# from shadow memory; this also checks
   5996          addressability of the address */
   5997    if (cas->end == Iend_LE) {
   5998       memOffsLo = 0;
   5999       memOffsHi = elemSzB;
   6000    } else {
   6001       tl_assert(cas->end == Iend_BE);
   6002       memOffsLo = elemSzB;
   6003       memOffsHi = 0;
   6004    }
   6005    voldHi
   6006       = assignNew(
   6007            'V', mce, elemTy,
   6008            expr2vbits_Load(
   6009               mce,
   6010               cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
   6011               NULL/*always happens*/
   6012         ));
   6013    voldLo
   6014       = assignNew(
   6015            'V', mce, elemTy,
   6016            expr2vbits_Load(
   6017               mce,
   6018               cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
   6019               NULL/*always happens*/
   6020         ));
   6021    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
   6022    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   6023    if (otrak) {
   6024       boldHi
   6025          = assignNew('B', mce, Ity_I32,
   6026                      gen_load_b(mce, elemSzB, cas->addr,
   6027                                 memOffsHi/*addr bias*/));
   6028       boldLo
   6029          = assignNew('B', mce, Ity_I32,
   6030                      gen_load_b(mce, elemSzB, cas->addr,
   6031                                 memOffsLo/*addr bias*/));
   6032       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
   6033       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   6034    }
   6035 
   6036    /* 5. the CAS itself */
   6037    stmt( 'C', mce, IRStmt_CAS(cas) );
   6038 
   6039    /* 6. compute "expected == old" */
   6040    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   6041    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   6042       tree, but it's not copied from the input block. */
   6043    /*
   6044       xHi = oldHi ^ expdHi;
   6045       xLo = oldLo ^ expdLo;
   6046       xHL = xHi | xLo;
   6047       expd_eq_old = xHL == 0;
   6048    */
   6049    xHi = assignNew('C', mce, elemTy,
   6050                    binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
   6051    xLo = assignNew('C', mce, elemTy,
   6052                    binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
   6053    xHL = assignNew('C', mce, elemTy,
   6054                    binop(opOr, xHi, xLo));
   6055    expd_eq_old
   6056       = assignNew('C', mce, Ity_I1,
   6057                   binop(opCasCmpEQ, xHL, zero));
   6058 
   6059    /* 7. if "expected == old"
   6060             store data# to shadow memory */
   6061    do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
   6062                     NULL/*data*/, vdataHi/*vdata*/,
   6063                     expd_eq_old/*guard for store*/ );
   6064    do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
   6065                     NULL/*data*/, vdataLo/*vdata*/,
   6066                     expd_eq_old/*guard for store*/ );
   6067    if (otrak) {
   6068       gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
   6069                    bdataHi/*bdata*/,
   6070                    expd_eq_old/*guard for store*/ );
   6071       gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
   6072                    bdataLo/*bdata*/,
   6073                    expd_eq_old/*guard for store*/ );
   6074    }
   6075 }
   6076 
   6077 
   6078 /* ------ Dealing with LL/SC (not difficult) ------ */
   6079 
   6080 static void do_shadow_LLSC ( MCEnv*    mce,
   6081                              IREndness stEnd,
   6082                              IRTemp    stResult,
   6083                              IRExpr*   stAddr,
   6084                              IRExpr*   stStoredata )
   6085 {
   6086    /* In short: treat a load-linked like a normal load followed by an
   6087       assignment of the loaded (shadow) data to the result temporary.
   6088       Treat a store-conditional like a normal store, and mark the
   6089       result temporary as defined. */
   6090    IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
   6091    IRTemp resTmp = findShadowTmpV(mce, stResult);
   6092 
   6093    tl_assert(isIRAtom(stAddr));
   6094    if (stStoredata)
   6095       tl_assert(isIRAtom(stStoredata));
   6096 
   6097    if (stStoredata == NULL) {
   6098       /* Load Linked */
   6099       /* Just treat this as a normal load, followed by an assignment of
   6100          the value to .result. */
   6101       /* Stay sane */
   6102       tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   6103                 || resTy == Ity_I16 || resTy == Ity_I8);
   6104       assign( 'V', mce, resTmp,
   6105                    expr2vbits_Load(
   6106                       mce, stEnd, resTy, stAddr, 0/*addr bias*/,
   6107                       NULL/*always happens*/) );
   6108    } else {
   6109       /* Store Conditional */
   6110       /* Stay sane */
   6111       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
   6112                                    stStoredata);
   6113       tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
   6114                 || dataTy == Ity_I16 || dataTy == Ity_I8);
   6115       do_shadow_Store( mce, stEnd,
   6116                             stAddr, 0/* addr bias */,
   6117                             stStoredata,
   6118                             NULL /* shadow data */,
   6119                             NULL/*guard*/ );
   6120       /* This is a store conditional, so it writes to .result a value
   6121          indicating whether or not the store succeeded.  Just claim
   6122          this value is always defined.  In the PowerPC interpretation
   6123          of store-conditional, definedness of the success indication
   6124          depends on whether the address of the store matches the
   6125          reservation address.  But we can't tell that here (and
   6126          anyway, we're not being PowerPC-specific).  At least we are
   6127          guaranteed that the definedness of the store address, and its
   6128          addressability, will be checked as per normal.  So it seems
   6129          pretty safe to just say that the success indication is always
   6130          defined.
   6131 
   6132          In schemeS, for origin tracking, we must correspondingly set
   6133          a no-origin value for the origin shadow of .result.
   6134       */
   6135       tl_assert(resTy == Ity_I1);
   6136       assign( 'V', mce, resTmp, definedOfType(resTy) );
   6137    }
   6138 }
   6139 
   6140 
   6141 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   6142 
   6143 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
   6144 {
   6145    complainIfUndefined(mce, sg->guard, NULL);
   6146    /* do_shadow_Store will generate code to check the definedness and
   6147       validity of sg->addr, in the case where sg->guard evaluates to
   6148       True at run-time. */
   6149    do_shadow_Store( mce, sg->end,
   6150                     sg->addr, 0/* addr bias */,
   6151                     sg->data,
   6152                     NULL /* shadow data */,
   6153                     sg->guard );
   6154 }
   6155 
   6156 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
   6157 {
   6158    complainIfUndefined(mce, lg->guard, NULL);
   6159    /* expr2vbits_Load_guarded_General will generate code to check the
   6160       definedness and validity of lg->addr, in the case where
   6161       lg->guard evaluates to True at run-time. */
   6162 
   6163    /* Look at the LoadG's built-in conversion operation, to determine
   6164       the source (actual loaded data) type, and the equivalent IROp.
   6165       NOTE that implicitly we are taking a widening operation to be
   6166       applied to original atoms and producing one that applies to V
   6167       bits.  Since signed and unsigned widening are self-shadowing,
   6168       this is a straight copy of the op (modulo swapping from the
   6169       IRLoadGOp form to the IROp form).  Note also therefore that this
   6170       implicitly duplicates the logic to do with said widening ops in
   6171       expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
   6172    IROp   vwiden   = Iop_INVALID;
   6173    IRType loadedTy = Ity_INVALID;
   6174    switch (lg->cvt) {
   6175       case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
   6176       case ILGop_Ident64:   loadedTy = Ity_I64;  vwiden = Iop_INVALID; break;
   6177       case ILGop_Ident32:   loadedTy = Ity_I32;  vwiden = Iop_INVALID; break;
   6178       case ILGop_16Uto32:   loadedTy = Ity_I16;  vwiden = Iop_16Uto32; break;
   6179       case ILGop_16Sto32:   loadedTy = Ity_I16;  vwiden = Iop_16Sto32; break;
   6180       case ILGop_8Uto32:    loadedTy = Ity_I8;   vwiden = Iop_8Uto32;  break;
   6181       case ILGop_8Sto32:    loadedTy = Ity_I8;   vwiden = Iop_8Sto32;  break;
   6182       default: VG_(tool_panic)("do_shadow_LoadG");
   6183    }
   6184 
   6185    IRAtom* vbits_alt
   6186       = expr2vbits( mce, lg->alt );
   6187    IRAtom* vbits_final
   6188       = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
   6189                                         lg->addr, 0/*addr bias*/,
   6190                                         lg->guard, vwiden, vbits_alt );
   6191    /* And finally, bind the V bits to the destination temporary. */
   6192    assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
   6193 }
   6194 
   6195 
   6196 /*------------------------------------------------------------*/
   6197 /*--- Memcheck main                                        ---*/
   6198 /*------------------------------------------------------------*/
   6199 
   6200 static void schemeS ( MCEnv* mce, IRStmt* st );
   6201 
   6202 static Bool isBogusAtom ( IRAtom* at )
   6203 {
   6204    ULong n = 0;
   6205    IRConst* con;
   6206    tl_assert(isIRAtom(at));
   6207    if (at->tag == Iex_RdTmp)
   6208       return False;
   6209    tl_assert(at->tag == Iex_Const);
   6210    con = at->Iex.Const.con;
   6211    switch (con->tag) {
   6212       case Ico_U1:   return False;
   6213       case Ico_U8:   n = (ULong)con->Ico.U8; break;
   6214       case Ico_U16:  n = (ULong)con->Ico.U16; break;
   6215       case Ico_U32:  n = (ULong)con->Ico.U32; break;
   6216       case Ico_U64:  n = (ULong)con->Ico.U64; break;
   6217       case Ico_F32:  return False;
   6218       case Ico_F64:  return False;
   6219       case Ico_F32i: return False;
   6220       case Ico_F64i: return False;
   6221       case Ico_V128: return False;
   6222       case Ico_V256: return False;
   6223       default: ppIRExpr(at); tl_assert(0);
   6224    }
   6225    /* VG_(printf)("%llx\n", n); */
   6226    return (/*32*/    n == 0xFEFEFEFFULL
   6227            /*32*/ || n == 0x80808080ULL
   6228            /*32*/ || n == 0x7F7F7F7FULL
   6229            /*32*/ || n == 0x7EFEFEFFULL
   6230            /*32*/ || n == 0x81010100ULL
   6231            /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
   6232            /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
   6233            /*64*/ || n == 0x0000000000008080ULL
   6234            /*64*/ || n == 0x8080808080808080ULL
   6235            /*64*/ || n == 0x0101010101010101ULL
   6236           );
   6237 }
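
/* Aside on the constants above (an observation, not something stated
   elsewhere in this file): several of them look like the masks used by
   word-at-a-time zero-byte scans, for example the classic trick

      has_zero_byte = ((w - 0x01010101UL) & ~w & 0x80808080UL) != 0;

   Code built on such tricks deliberately reads and partially masks
   undefined bytes, which is presumably why spotting these literals in a
   block is used to trigger the more expensive, more precise
   instrumentation. */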
   6238 
   6239 static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
   6240 {
   6241    Int      i;
   6242    IRExpr*  e;
   6243    IRDirty* d;
   6244    IRCAS*   cas;
   6245    switch (st->tag) {
   6246       case Ist_WrTmp:
   6247          e = st->Ist.WrTmp.data;
   6248          switch (e->tag) {
   6249             case Iex_Get:
   6250             case Iex_RdTmp:
   6251                return False;
   6252             case Iex_Const:
   6253                return isBogusAtom(e);
   6254             case Iex_Unop:
   6255                return isBogusAtom(e->Iex.Unop.arg)
   6256                       || e->Iex.Unop.op == Iop_GetMSBs8x16;
   6257             case Iex_GetI:
   6258                return isBogusAtom(e->Iex.GetI.ix);
   6259             case Iex_Binop:
   6260                return isBogusAtom(e->Iex.Binop.arg1)
   6261                       || isBogusAtom(e->Iex.Binop.arg2);
   6262             case Iex_Triop:
   6263                return isBogusAtom(e->Iex.Triop.details->arg1)
   6264                       || isBogusAtom(e->Iex.Triop.details->arg2)
   6265                       || isBogusAtom(e->Iex.Triop.details->arg3);
   6266             case Iex_Qop:
   6267                return isBogusAtom(e->Iex.Qop.details->arg1)
   6268                       || isBogusAtom(e->Iex.Qop.details->arg2)
   6269                       || isBogusAtom(e->Iex.Qop.details->arg3)
   6270                       || isBogusAtom(e->Iex.Qop.details->arg4);
   6271             case Iex_ITE:
   6272                return isBogusAtom(e->Iex.ITE.cond)
   6273                       || isBogusAtom(e->Iex.ITE.iftrue)
   6274                       || isBogusAtom(e->Iex.ITE.iffalse);
   6275             case Iex_Load:
   6276                return isBogusAtom(e->Iex.Load.addr);
   6277             case Iex_CCall:
   6278                for (i = 0; e->Iex.CCall.args[i]; i++)
   6279                   if (isBogusAtom(e->Iex.CCall.args[i]))
   6280                      return True;
   6281                return False;
   6282             default:
   6283                goto unhandled;
   6284          }
   6285       case Ist_Dirty:
   6286          d = st->Ist.Dirty.details;
   6287          for (i = 0; d->args[i]; i++) {
   6288             IRAtom* atom = d->args[i];
   6289             if (LIKELY(!is_IRExpr_VECRET_or_GSPTR(atom))) {
   6290                if (isBogusAtom(atom))
   6291                   return True;
   6292             }
   6293          }
   6294          if (isBogusAtom(d->guard))
   6295             return True;
   6296          if (d->mAddr && isBogusAtom(d->mAddr))
   6297             return True;
   6298          return False;
   6299       case Ist_Put:
   6300          return isBogusAtom(st->Ist.Put.data);
   6301       case Ist_PutI:
   6302          return isBogusAtom(st->Ist.PutI.details->ix)
   6303                 || isBogusAtom(st->Ist.PutI.details->data);
   6304       case Ist_Store:
   6305          return isBogusAtom(st->Ist.Store.addr)
   6306                 || isBogusAtom(st->Ist.Store.data);
   6307       case Ist_StoreG: {
   6308          IRStoreG* sg = st->Ist.StoreG.details;
   6309          return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
   6310                 || isBogusAtom(sg->guard);
   6311       }
   6312       case Ist_LoadG: {
   6313          IRLoadG* lg = st->Ist.LoadG.details;
   6314          return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
   6315                 || isBogusAtom(lg->guard);
   6316       }
   6317       case Ist_Exit:
   6318          return isBogusAtom(st->Ist.Exit.guard);
   6319       case Ist_AbiHint:
   6320          return isBogusAtom(st->Ist.AbiHint.base)
   6321                 || isBogusAtom(st->Ist.AbiHint.nia);
   6322       case Ist_NoOp:
   6323       case Ist_IMark:
   6324       case Ist_MBE:
   6325          return False;
   6326       case Ist_CAS:
   6327          cas = st->Ist.CAS.details;
   6328          return isBogusAtom(cas->addr)
   6329                 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
   6330                 || isBogusAtom(cas->expdLo)
   6331                 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
   6332                 || isBogusAtom(cas->dataLo);
   6333       case Ist_LLSC:
   6334          return isBogusAtom(st->Ist.LLSC.addr)
   6335                 || (st->Ist.LLSC.storedata
   6336                        ? isBogusAtom(st->Ist.LLSC.storedata)
   6337                        : False);
   6338       default:
   6339       unhandled:
   6340          ppIRStmt(st);
   6341          VG_(tool_panic)("hasBogusLiterals");
   6342    }
   6343 }
   6344 
   6345 
   6346 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
   6347                         IRSB* sb_in,
   6348                         const VexGuestLayout* layout,
   6349                         const VexGuestExtents* vge,
   6350                         const VexArchInfo* archinfo_host,
   6351                         IRType gWordTy, IRType hWordTy )
   6352 {
   6353    Bool    verboze = 0||False;
   6354    Int     i, j, first_stmt;
   6355    IRStmt* st;
   6356    MCEnv   mce;
   6357    IRSB*   sb_out;
   6358 
   6359    if (gWordTy != hWordTy) {
   6360       /* We don't currently support this case. */
   6361       VG_(tool_panic)("host/guest word size mismatch");
   6362    }
   6363 
   6364    /* Check we're not completely nuts */
   6365    tl_assert(sizeof(UWord)  == sizeof(void*));
   6366    tl_assert(sizeof(Word)   == sizeof(void*));
   6367    tl_assert(sizeof(Addr)   == sizeof(void*));
   6368    tl_assert(sizeof(ULong)  == 8);
   6369    tl_assert(sizeof(Long)   == 8);
   6370    tl_assert(sizeof(UInt)   == 4);
   6371    tl_assert(sizeof(Int)    == 4);
   6372 
   6373    tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
   6374 
   6375    /* Set up SB */
   6376    sb_out = deepCopyIRSBExceptStmts(sb_in);
   6377 
   6378    /* Set up the running environment.  Both .sb and .tmpMap are
   6379       modified as we go along.  Note that tmps are added to both
   6380       .sb->tyenv and .tmpMap together, so the valid index-set for
   6381       those two arrays should always be identical. */
   6382    VG_(memset)(&mce, 0, sizeof(mce));
   6383    mce.sb             = sb_out;
   6384    mce.trace          = verboze;
   6385    mce.layout         = layout;
   6386    mce.hWordTy        = hWordTy;
   6387    mce.bogusLiterals  = False;
   6388 
   6389    /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
   6390       Darwin.  10.7 is mostly built with LLVM, which uses these for
   6391       bitfield inserts, and we get a lot of false errors if the cheap
   6392       interpretation is used, alas.  Could solve this much better if
   6393       we knew which of such adds came from x86/amd64 LEA instructions,
   6394       since these are the only ones really needing the expensive
   6395       interpretation, but that would require some way to tag them in
   6396       the _toIR.c front ends, which is a lot of faffing around.  So
   6397       for now just use the slow and blunt-instrument solution. */
   6398    mce.useLLVMworkarounds = False;
   6399 #  if defined(VGO_darwin)
   6400    mce.useLLVMworkarounds = True;
   6401 #  endif
   6402 
   6403    mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
   6404                             sizeof(TempMapEnt));
   6405    VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   6406    for (i = 0; i < sb_in->tyenv->types_used; i++) {
   6407       TempMapEnt ent;
   6408       ent.kind    = Orig;
   6409       ent.shadowV = IRTemp_INVALID;
   6410       ent.shadowB = IRTemp_INVALID;
   6411       VG_(addToXA)( mce.tmpMap, &ent );
   6412    }
   6413    tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
   6414 
   6415    if (MC_(clo_expensive_definedness_checks)) {
   6416       /* For expensive definedness checking skip looking for bogus
   6417          literals. */
   6418       mce.bogusLiterals = True;
   6419    } else {
   6420       /* Make a preliminary inspection of the statements, to see if there
   6421          are any dodgy-looking literals.  If there are, we generate
   6422          extra-detailed (hence extra-expensive) instrumentation in
   6423          places.  Scan the whole bb even if dodginess is found earlier,
   6424          so that the flatness assertion is applied to all stmts. */
   6425       Bool bogus = False;
   6426 
   6427       for (i = 0; i < sb_in->stmts_used; i++) {
   6428          st = sb_in->stmts[i];
   6429          tl_assert(st);
   6430          tl_assert(isFlatIRStmt(st));
   6431 
   6432          if (!bogus) {
   6433             bogus = checkForBogusLiterals(st);
   6434             if (0 && bogus) {
   6435                VG_(printf)("bogus: ");
   6436                ppIRStmt(st);
   6437                VG_(printf)("\n");
   6438             }
   6439             if (bogus) break;
   6440          }
   6441       }
   6442       mce.bogusLiterals = bogus;
   6443    }
   6444 
   6445    /* Copy verbatim any IR preamble preceding the first IMark */
   6446 
   6447    tl_assert(mce.sb == sb_out);
   6448    tl_assert(mce.sb != sb_in);
   6449 
   6450    i = 0;
   6451    while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
   6452 
   6453       st = sb_in->stmts[i];
   6454       tl_assert(st);
   6455       tl_assert(isFlatIRStmt(st));
   6456 
   6457       stmt( 'C', &mce, sb_in->stmts[i] );
   6458       i++;
   6459    }
   6460 
   6461    /* Nasty problem.  IR optimisation of the pre-instrumented IR may
   6462       cause the IR following the preamble to contain references to IR
   6463       temporaries defined in the preamble.  Because the preamble isn't
   6464       instrumented, these temporaries don't have any shadows.
   6465       Nevertheless uses of them following the preamble will cause
   6466       memcheck to generate references to their shadows.  End effect is
   6467       to cause IR sanity check failures, due to references to
   6468       non-existent shadows.  This is only evident for the complex
   6469       preambles used for function wrapping on TOC-afflicted platforms
   6470       (ppc64-linux).
   6471 
   6472       The following loop therefore scans the preamble looking for
   6473       assignments to temporaries.  For each one found it creates an
   6474       assignment to the corresponding (V) shadow temp, marking it as
   6475       'defined'.  This is the same resulting IR as if the main
   6476       instrumentation loop before had been applied to the statement
   6477       'tmp = CONSTANT'.
   6478 
   6479       Similarly, if origin tracking is enabled, we must generate an
   6480       assignment for the corresponding origin (B) shadow, claiming
   6481       no-origin, as appropriate for a defined value.
   6482    */
   6483    for (j = 0; j < i; j++) {
   6484       if (sb_in->stmts[j]->tag == Ist_WrTmp) {
   6485          /* findShadowTmpV checks its arg is an original tmp;
   6486             no need to assert that here. */
   6487          IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
   6488          IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
   6489          IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
   6490          assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
   6491          if (MC_(clo_mc_level) == 3) {
   6492             IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
   6493             tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
   6494             assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
   6495          }
   6496          if (0) {
   6497             VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
   6498             ppIRType( ty_v );
   6499             VG_(printf)("\n");
   6500          }
   6501       }
   6502    }
   6503 
   6504    /* Iterate over the remaining stmts to generate instrumentation. */
   6505 
   6506    tl_assert(sb_in->stmts_used > 0);
   6507    tl_assert(i >= 0);
   6508    tl_assert(i < sb_in->stmts_used);
   6509    tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
   6510 
   6511    for (/* use current i*/; i < sb_in->stmts_used; i++) {
   6512 
   6513       st = sb_in->stmts[i];
   6514       first_stmt = sb_out->stmts_used;
   6515 
   6516       if (verboze) {
   6517          VG_(printf)("\n");
   6518          ppIRStmt(st);
   6519          VG_(printf)("\n");
   6520       }
   6521 
   6522       if (MC_(clo_mc_level) == 3) {
   6523          /* See comments on case Ist_CAS below. */
   6524          if (st->tag != Ist_CAS)
   6525             schemeS( &mce, st );
   6526       }
   6527 
   6528       /* Generate instrumentation code for each stmt ... */
   6529 
   6530       switch (st->tag) {
   6531 
   6532          case Ist_WrTmp:
   6533             assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
   6534                                expr2vbits( &mce, st->Ist.WrTmp.data) );
   6535             break;
   6536 
   6537          case Ist_Put:
   6538             do_shadow_PUT( &mce,
   6539                            st->Ist.Put.offset,
   6540                            st->Ist.Put.data,
   6541                            NULL /* shadow atom */, NULL /* guard */ );
   6542             break;
   6543 
   6544          case Ist_PutI:
   6545             do_shadow_PUTI( &mce, st->Ist.PutI.details);
   6546             break;
   6547 
   6548          case Ist_Store:
   6549             do_shadow_Store( &mce, st->Ist.Store.end,
   6550                                    st->Ist.Store.addr, 0/* addr bias */,
   6551                                    st->Ist.Store.data,
   6552                                    NULL /* shadow data */,
   6553                                    NULL/*guard*/ );
   6554             break;
   6555 
   6556          case Ist_StoreG:
   6557             do_shadow_StoreG( &mce, st->Ist.StoreG.details );
   6558             break;
   6559 
   6560          case Ist_LoadG:
   6561             do_shadow_LoadG( &mce, st->Ist.LoadG.details );
   6562             break;
   6563 
   6564          case Ist_Exit:
   6565             complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
   6566             break;
   6567 
   6568          case Ist_IMark:
   6569             break;
   6570 
   6571          case Ist_NoOp:
   6572          case Ist_MBE:
   6573             break;
   6574 
   6575          case Ist_Dirty:
   6576             do_shadow_Dirty( &mce, st->Ist.Dirty.details );
   6577             break;
   6578 
   6579          case Ist_AbiHint:
   6580             do_AbiHint( &mce, st->Ist.AbiHint.base,
   6581                               st->Ist.AbiHint.len,
   6582                               st->Ist.AbiHint.nia );
   6583             break;
   6584 
   6585          case Ist_CAS:
   6586             do_shadow_CAS( &mce, st->Ist.CAS.details );
   6587             /* Note, do_shadow_CAS copies the CAS itself to the output
   6588                block, because it needs to add instrumentation both
   6589                before and after it.  Hence skip the copy below.  Also
   6590                skip the origin-tracking stuff (call to schemeS) above,
   6591                since that's all tangled up with it too; do_shadow_CAS
   6592                does it all. */
   6593             break;
   6594 
   6595          case Ist_LLSC:
   6596             do_shadow_LLSC( &mce,
   6597                             st->Ist.LLSC.end,
   6598                             st->Ist.LLSC.result,
   6599                             st->Ist.LLSC.addr,
   6600                             st->Ist.LLSC.storedata );
   6601             break;
   6602 
   6603          default:
   6604             VG_(printf)("\n");
   6605             ppIRStmt(st);
   6606             VG_(printf)("\n");
   6607             VG_(tool_panic)("memcheck: unhandled IRStmt");
   6608 
   6609       } /* switch (st->tag) */
   6610 
   6611       if (0 && verboze) {
   6612          for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6613             VG_(printf)("   ");
   6614             ppIRStmt(sb_out->stmts[j]);
   6615             VG_(printf)("\n");
   6616          }
   6617          VG_(printf)("\n");
   6618       }
   6619 
   6620       /* ... and finally copy the stmt itself to the output.  Except,
   6621          skip the copy of IRCASs; see comments on case Ist_CAS
   6622          above. */
   6623       if (st->tag != Ist_CAS)
   6624          stmt('C', &mce, st);
   6625    }
   6626 
   6627    /* Now we need to complain if the jump target is undefined. */
   6628    first_stmt = sb_out->stmts_used;
   6629 
   6630    if (verboze) {
   6631       VG_(printf)("sb_in->next = ");
   6632       ppIRExpr(sb_in->next);
   6633       VG_(printf)("\n\n");
   6634    }
   6635 
   6636    complainIfUndefined( &mce, sb_in->next, NULL );
   6637 
   6638    if (0 && verboze) {
   6639       for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6640          VG_(printf)("   ");
   6641          ppIRStmt(sb_out->stmts[j]);
   6642          VG_(printf)("\n");
   6643       }
   6644       VG_(printf)("\n");
   6645    }
   6646 
   6647    /* If this fails, there's been some serious snafu with tmp management
   6648       that should be investigated. */
   6649    tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   6650    VG_(deleteXA)( mce.tmpMap );
   6651 
   6652    tl_assert(mce.sb == sb_out);
   6653    return sb_out;
   6654 }
   6655 
   6656 
   6657 /*------------------------------------------------------------*/
   6658 /*--- Post-tree-build final tidying                        ---*/
   6659 /*------------------------------------------------------------*/
   6660 
   6661 /* This exploits the observation that Memcheck often produces
   6662    repeated conditional calls of the form
   6663 
   6664    Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
   6665 
   6666    with the same guard expression G guarding the same helper call.
   6667    The second and subsequent calls are redundant.  This usually
   6668    results from instrumentation of guest code containing multiple
   6669    memory references at different constant offsets from the same base
   6670    register.  After optimisation of the instrumentation, you get a
   6671    test for the definedness of the base register for each memory
   6672    reference, which is kinda pointless.  MC_(final_tidy) therefore
   6673    looks for such repeated calls and removes all but the first. */
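
/* Illustration of the pattern being removed (temporary names are made up
   for the example):

      if (t5) DIRTY MC_(helperc_value_check4_fail)(otag)
      ... other statements ...
      if (t5) DIRTY MC_(helperc_value_check4_fail)(otag)

   where t5 is the definedness test for the shared base register.  The
   second (and any later identical) call adds nothing, so it is removed. */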
   6674 
   6675 
   6676 /* With some testing on perf/bz2.c, on amd64 and x86, compiled with
   6677    gcc-5.3.1 -O2, it appears that 16 entries in the array are enough to
   6678    get almost all the benefits of this transformation whilst causing
   6679    the slide-back case to occur just often enough to be verifiably
   6680    correct.  For posterity, the numbers are:
   6681 
   6682    bz2-32
   6683 
   6684    1   4,336 (112,212 -> 1,709,473; ratio 15.2)
   6685    2   4,336 (112,194 -> 1,669,895; ratio 14.9)
   6686    3   4,336 (112,194 -> 1,660,713; ratio 14.8)
   6687    4   4,336 (112,194 -> 1,658,555; ratio 14.8)
   6688    5   4,336 (112,194 -> 1,655,447; ratio 14.8)
   6689    6   4,336 (112,194 -> 1,655,101; ratio 14.8)
   6690    7   4,336 (112,194 -> 1,654,858; ratio 14.7)
   6691    8   4,336 (112,194 -> 1,654,810; ratio 14.7)
   6692    10  4,336 (112,194 -> 1,654,621; ratio 14.7)
   6693    12  4,336 (112,194 -> 1,654,678; ratio 14.7)
   6694    16  4,336 (112,194 -> 1,654,494; ratio 14.7)
   6695    32  4,336 (112,194 -> 1,654,602; ratio 14.7)
   6696    inf 4,336 (112,194 -> 1,654,602; ratio 14.7)
   6697 
   6698    bz2-64
   6699 
   6700    1   4,113 (107,329 -> 1,822,171; ratio 17.0)
   6701    2   4,113 (107,329 -> 1,806,443; ratio 16.8)
   6702    3   4,113 (107,329 -> 1,803,967; ratio 16.8)
   6703    4   4,113 (107,329 -> 1,802,785; ratio 16.8)
   6704    5   4,113 (107,329 -> 1,802,412; ratio 16.8)
   6705    6   4,113 (107,329 -> 1,802,062; ratio 16.8)
   6706    7   4,113 (107,329 -> 1,801,976; ratio 16.8)
   6707    8   4,113 (107,329 -> 1,801,886; ratio 16.8)
   6708    10  4,113 (107,329 -> 1,801,653; ratio 16.8)
   6709    12  4,113 (107,329 -> 1,801,526; ratio 16.8)
   6710    16  4,113 (107,329 -> 1,801,298; ratio 16.8)
   6711    32  4,113 (107,329 -> 1,800,827; ratio 16.8)
   6712    inf 4,113 (107,329 -> 1,800,827; ratio 16.8)
   6713 */
   6714 
   6715 /* Structs for recording which (helper, guard) pairs we have already
   6716    seen. */
   6717 
   6718 #define N_TIDYING_PAIRS 16
   6719 
   6720 typedef
   6721    struct { void* entry; IRExpr* guard; }
   6722    Pair;
   6723 
   6724 typedef
   6725    struct {
   6726       Pair pairs[N_TIDYING_PAIRS +1/*for bounds checking*/];
   6727       UInt pairsUsed;
   6728    }
   6729    Pairs;
   6730 
   6731 
   6732 /* Return True if e1 and e2 definitely denote the same value (used to
   6733    compare guards).  Return False if unknown; False is the safe
   6734    answer.  Since guest registers and guest memory do not have the
   6735    SSA property we must return False if any Gets or Loads appear in
   6736    the expression.  This implicitly assumes that e1 and e2 have the
   6737    same IR type, which is always true here -- the type is Ity_I1. */
   6738 
   6739 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
   6740 {
   6741    if (e1->tag != e2->tag)
   6742       return False;
   6743    switch (e1->tag) {
   6744       case Iex_Const:
   6745          return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
   6746       case Iex_Binop:
   6747          return e1->Iex.Binop.op == e2->Iex.Binop.op
   6748                 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
   6749                 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
   6750       case Iex_Unop:
   6751          return e1->Iex.Unop.op == e2->Iex.Unop.op
   6752                 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
   6753       case Iex_RdTmp:
   6754          return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
   6755       case Iex_ITE:
   6756          return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
   6757                 && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
   6758                 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
   6759       case Iex_Qop:
   6760       case Iex_Triop:
   6761       case Iex_CCall:
   6762          /* be lazy.  Could define equality for these, but they never
   6763             appear to be used. */
   6764          return False;
   6765       case Iex_Get:
   6766       case Iex_GetI:
   6767       case Iex_Load:
   6768          /* be conservative - these may not give the same value each
   6769             time */
   6770          return False;
   6771       case Iex_Binder:
   6772          /* should never see this */
   6773          /* fallthrough */
   6774       default:
   6775          VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
   6776          ppIRExpr(e1);
   6777          VG_(tool_panic)("memcheck:sameIRValue");
   6778          return False;
   6779    }
   6780 }
   6781 
   6782 /* See if 'pairs' already has an entry for (entry, guard).  Return
   6783    True if so.  If not, add an entry. */
   6784 
   6785 static
   6786 Bool check_or_add ( Pairs* tidyingEnv, IRExpr* guard, void* entry )
   6787 {
   6788    UInt i, n = tidyingEnv->pairsUsed;
   6789    tl_assert(n <= N_TIDYING_PAIRS);
   6790    for (i = 0; i < n; i++) {
   6791       if (tidyingEnv->pairs[i].entry == entry
   6792           && sameIRValue(tidyingEnv->pairs[i].guard, guard))
   6793          return True;
   6794    }
   6795    /* (guard, entry) wasn't found in the array.  Add it at the end.
   6796       If the array is already full, slide the entries one slot
   6797       backwards.  This means we will lose the ability to detect
   6798       duplicates from the pair in slot zero, but that happens so
   6799       rarely that it's unlikely to have much effect on overall code
   6800       quality.  Also, this strategy drops the check for the oldest
   6801       tracked exit (memory reference, basically), which is (I'd
   6802       guess) the one least likely to be re-used after this point. */
   6803    tl_assert(i == n);
   6804    if (n == N_TIDYING_PAIRS) {
   6805       for (i = 1; i < N_TIDYING_PAIRS; i++) {
   6806          tidyingEnv->pairs[i-1] = tidyingEnv->pairs[i];
   6807       }
   6808       tidyingEnv->pairs[N_TIDYING_PAIRS-1].entry = entry;
   6809       tidyingEnv->pairs[N_TIDYING_PAIRS-1].guard = guard;
   6810    } else {
   6811       tl_assert(n < N_TIDYING_PAIRS);
   6812       tidyingEnv->pairs[n].entry = entry;
   6813       tidyingEnv->pairs[n].guard = guard;
   6814       n++;
   6815       tidyingEnv->pairsUsed = n;
   6816    }
   6817    return False;
   6818 }
   6819 
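        /* Worked example of the slide-back case (hypothetical sizes, for
           illustration only): if N_TIDYING_PAIRS were 4 and the array
           already held pairs P0 P1 P2 P3, then a miss on a new pair P4
           slides everything back one slot and stores P4 at the end,
           giving P1 P2 P3 P4.  Only the ability to recognise a later
           duplicate of P0, the oldest entry, is lost. */
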
   6820 static Bool is_helperc_value_checkN_fail ( const HChar* name )
   6821 {
   6822    /* This is expensive because it happens a lot.  We are checking to
   6823       see whether |name| is one of the following 8 strings:
   6824 
   6825          MC_(helperc_value_check8_fail_no_o)
   6826          MC_(helperc_value_check4_fail_no_o)
   6827          MC_(helperc_value_check0_fail_no_o)
   6828          MC_(helperc_value_check1_fail_no_o)
   6829          MC_(helperc_value_check8_fail_w_o)
   6830          MC_(helperc_value_check0_fail_w_o)
   6831          MC_(helperc_value_check1_fail_w_o)
   6832          MC_(helperc_value_check4_fail_w_o)
   6833 
   6834       To speed it up, check the common prefix just once, rather than
   6835       all 8 times.
   6836    */
   6837    const HChar* prefix = "MC_(helperc_value_check";
   6838 
   6839    HChar n, p;
   6840    while (True) {
   6841       n = *name;
   6842       p = *prefix;
   6843       if (p == 0) break; /* ran off the end of the prefix */
   6844       /* We still have some prefix to use */
   6845       if (n == 0) return False; /* have prefix, but name ran out */
   6846       if (n != p) return False; /* have both pfx and name, but no match */
   6847       name++;
   6848       prefix++;
   6849    }
   6850 
   6851    /* Check the part after the prefix. */
   6852    tl_assert(*prefix == 0 && *name != 0);
   6853    return    0==VG_(strcmp)(name, "8_fail_no_o)")
   6854           || 0==VG_(strcmp)(name, "4_fail_no_o)")
   6855           || 0==VG_(strcmp)(name, "0_fail_no_o)")
   6856           || 0==VG_(strcmp)(name, "1_fail_no_o)")
   6857           || 0==VG_(strcmp)(name, "8_fail_w_o)")
   6858           || 0==VG_(strcmp)(name, "4_fail_w_o)")
   6859           || 0==VG_(strcmp)(name, "0_fail_w_o)")
   6860           || 0==VG_(strcmp)(name, "1_fail_w_o)");
   6861 }
   6862 
   6863 IRSB* MC_(final_tidy) ( IRSB* sb_in )
   6864 {
   6865    Int       i;
   6866    IRStmt*   st;
   6867    IRDirty*  di;
   6868    IRExpr*   guard;
   6869    IRCallee* cee;
   6870    Bool      alreadyPresent;
   6871    Pairs     pairs;
   6872 
   6873    pairs.pairsUsed = 0;
   6874 
   6875    pairs.pairs[N_TIDYING_PAIRS].entry = (void*)0x123;
   6876    pairs.pairs[N_TIDYING_PAIRS].guard = (IRExpr*)0x456;
   6877 
   6878    /* Scan forwards through the statements.  Each time a call to one
   6879       of the relevant helpers is seen, check if we have made a
   6880       previous call to the same helper using the same guard
   6881       expression, and if so, delete the call. */
   6882    for (i = 0; i < sb_in->stmts_used; i++) {
   6883       st = sb_in->stmts[i];
   6884       tl_assert(st);
   6885       if (st->tag != Ist_Dirty)
   6886          continue;
   6887       di = st->Ist.Dirty.details;
   6888       guard = di->guard;
   6889       tl_assert(guard);
   6890       if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
   6891       cee = di->cee;
   6892       if (!is_helperc_value_checkN_fail( cee->name ))
   6893          continue;
   6894       /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
   6895          guard 'guard'.  Check if we have already seen a call to this
   6896          function with the same guard.  If so, delete it.  If not,
   6897          add it to the set of calls we do know about. */
   6898       alreadyPresent = check_or_add( &pairs, guard, cee->addr );
   6899       if (alreadyPresent) {
   6900          sb_in->stmts[i] = IRStmt_NoOp();
   6901          if (0) VG_(printf)("XX\n");
   6902       }
   6903    }
   6904 
   6905    tl_assert(pairs.pairs[N_TIDYING_PAIRS].entry == (void*)0x123);
   6906    tl_assert(pairs.pairs[N_TIDYING_PAIRS].guard == (IRExpr*)0x456);
   6907 
   6908    return sb_in;
   6909 }
   6910 
   6911 #undef N_TIDYING_PAIRS
   6912 
   6913 
   6914 /*------------------------------------------------------------*/
   6915 /*--- Origin tracking stuff                                ---*/
   6916 /*------------------------------------------------------------*/
   6917 
   6918 /* Almost identical to findShadowTmpV. */
   6919 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
   6920 {
   6921    TempMapEnt* ent;
   6922    /* VG_(indexXA) range-checks 'orig', hence no need to check
   6923       here. */
   6924    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6925    tl_assert(ent->kind == Orig);
   6926    if (ent->shadowB == IRTemp_INVALID) {
   6927       IRTemp tmpB
   6928         = newTemp( mce, Ity_I32, BSh );
   6929       /* newTemp may cause mce->tmpMap to resize, hence previous results
   6930          from VG_(indexXA) are invalid. */
   6931       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6932       tl_assert(ent->kind == Orig);
   6933       tl_assert(ent->shadowB == IRTemp_INVALID);
   6934       ent->shadowB = tmpB;
   6935    }
   6936    return ent->shadowB;
   6937 }
   6938 
   6939 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
   6940 {
   6941    return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
   6942 }
   6943 
   6944 
   6945 /* Make a guarded origin load, with no special handling in the
   6946    didn't-happen case.  A GUARD of NULL is assumed to mean "always
   6947    True".
   6948 
   6949    Generate IR to do a shadow origins load from BASEADDR+OFFSET and
   6950    return the otag.  The loaded size is SZB.  If GUARD evaluates to
   6951    False at run time then the returned otag is zero.
   6952 */
   6953 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
   6954                                     IRAtom* baseaddr,
   6955                                     Int offset, IRExpr* guard )
   6956 {
   6957    void*    hFun;
   6958    const HChar* hName;
   6959    IRTemp   bTmp;
   6960    IRDirty* di;
   6961    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6962    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6963    IRAtom*  ea    = baseaddr;
   6964    if (offset != 0) {
   6965       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6966                                    : mkU64( (Long)(Int)offset );
   6967       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
   6968    }
   6969    bTmp = newTemp(mce, mce->hWordTy, BSh);
   6970 
   6971    switch (szB) {
   6972       case 1: hFun  = (void*)&MC_(helperc_b_load1);
   6973               hName = "MC_(helperc_b_load1)";
   6974               break;
   6975       case 2: hFun  = (void*)&MC_(helperc_b_load2);
   6976               hName = "MC_(helperc_b_load2)";
   6977               break;
   6978       case 4: hFun  = (void*)&MC_(helperc_b_load4);
   6979               hName = "MC_(helperc_b_load4)";
   6980               break;
   6981       case 8: hFun  = (void*)&MC_(helperc_b_load8);
   6982               hName = "MC_(helperc_b_load8)";
   6983               break;
   6984       case 16: hFun  = (void*)&MC_(helperc_b_load16);
   6985                hName = "MC_(helperc_b_load16)";
   6986                break;
   6987       case 32: hFun  = (void*)&MC_(helperc_b_load32);
   6988                hName = "MC_(helperc_b_load32)";
   6989                break;
   6990       default:
   6991          VG_(printf)("mc_translate.c: gen_guarded_load_b: unhandled szB == %d\n", szB);
   6992          tl_assert(0);
   6993    }
   6994    di = unsafeIRDirty_1_N(
   6995            bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
   6996            mkIRExprVec_1( ea )
   6997         );
   6998    if (guard) {
   6999       di->guard = guard;
   7000       /* Ideally the didn't-happen return value here would be
   7001          all-zeroes (unknown-origin), so it'd be harmless if it got
   7002          used inadvertently.  We slum it out with the IR-mandated
   7003          default value (0b01 repeating, 0x55 etc) as that'll probably
   7004          trump all legitimate otags via Max32, and it's pretty
   7005          obviously bogus. */
   7006    }
   7007    /* no need to mess with any annotations.  This call accesses
   7008       neither guest state nor guest memory. */
   7009    stmt( 'B', mce, IRStmt_Dirty(di) );
   7010    if (mce->hWordTy == Ity_I64) {
   7011       /* 64-bit host */
   7012       IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
   7013       assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
   7014       return mkexpr(bTmp32);
   7015    } else {
   7016       /* 32-bit host */
   7017       return mkexpr(bTmp);
   7018    }
   7019 }
   7020 
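        /* Shape of the IR emitted above, sketched for a 4-byte guarded
           load on a 64-bit host (illustrative only; exact temporaries
           and printing are up to the IR builder and iropt):

              tEA  = Add64(baseaddr, offset)      (only when offset != 0)
              tB   = DIRTY guard ::: MC_(helperc_b_load4)(tEA)
              tB32 = 64to32(tB)

           and tB32 is the otag that is returned.  On a 32-bit host the
           final narrowing step is not needed. */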
   7021 
   7022 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
   7023    loaded size is SZB.  The load is regarded as unconditional (always
   7024    happens).
   7025 */
   7026 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
   7027                             Int offset )
   7028 {
   7029    return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
   7030 }
   7031 
   7032 
   7033 /* The most general handler for guarded origin loads.  A GUARD of NULL
   7034    is assumed to mean "always True".
   7035 
   7036    Generate IR to do a shadow origin load from ADDR+BIAS and return
   7037    the B bits.  The loaded type is TY.  If GUARD evaluates to False at
   7038    run time then the returned B bits are simply BALT instead.
   7039 */
   7040 static
   7041 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
   7042                                         IRType ty,
   7043                                         IRAtom* addr, UInt bias,
   7044                                         IRAtom* guard, IRAtom* balt )
   7045 {
   7046    /* If the guard evaluates to True, this will hold the loaded
   7047       origin.  If the guard evaluates to False, this will be zero,
   7048       meaning "unknown origin", in which case we will have to replace
   7049       it using an ITE below. */
   7050    IRAtom* iftrue
   7051       = assignNew('B', mce, Ity_I32,
   7052                   gen_guarded_load_b(mce, sizeofIRType(ty),
   7053                                      addr, bias, guard));
   7054    /* These are the bits we will return if the load doesn't take
   7055       place. */
   7056    IRAtom* iffalse
   7057       = balt;
   7058    /* Prepare the cond for the ITE.  Convert a NULL cond into
   7059       something that iropt knows how to fold out later. */
   7060    IRAtom* cond
   7061       = guard == NULL  ? mkU1(1)  : guard;
   7062    /* And assemble the final result. */
   7063    return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
   7064 }
   7065 
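        /* Net effect, as a sketch rather than literal IR:

              result = ITE(GUARD, otag loaded from ADDR+BIAS, BALT)

           A NULL GUARD becomes a constant-true condition, which iropt can
           fold away, leaving just the unconditional load. */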
   7066 
   7067 /* Generate a shadow origins store.  guard :: Ity_I1 controls whether
   7068    the store really happens; NULL means it unconditionally does. */
   7069 static void gen_store_b ( MCEnv* mce, Int szB,
   7070                           IRAtom* baseaddr, Int offset, IRAtom* dataB,
   7071                           IRAtom* guard )
   7072 {
   7073    void*    hFun;
   7074    const HChar* hName;
   7075    IRDirty* di;
   7076    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   7077    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   7078    IRAtom*  ea    = baseaddr;
   7079    if (guard) {
   7080       tl_assert(isOriginalAtom(mce, guard));
   7081       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   7082    }
   7083    if (offset != 0) {
   7084       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   7085                                    : mkU64( (Long)(Int)offset );
   7086       ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
   7087    }
   7088    if (mce->hWordTy == Ity_I64)
   7089       dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
   7090 
   7091    switch (szB) {
   7092       case 1: hFun  = (void*)&MC_(helperc_b_store1);
   7093               hName = "MC_(helperc_b_store1)";
   7094               break;
   7095       case 2: hFun  = (void*)&MC_(helperc_b_store2);
   7096               hName = "MC_(helperc_b_store2)";
   7097               break;
   7098       case 4: hFun  = (void*)&MC_(helperc_b_store4);
   7099               hName = "MC_(helperc_b_store4)";
   7100               break;
   7101       case 8: hFun  = (void*)&MC_(helperc_b_store8);
   7102               hName = "MC_(helperc_b_store8)";
   7103               break;
   7104       case 16: hFun  = (void*)&MC_(helperc_b_store16);
   7105                hName = "MC_(helperc_b_store16)";
   7106                break;
   7107       case 32: hFun  = (void*)&MC_(helperc_b_store32);
   7108                hName = "MC_(helperc_b_store32)";
   7109                break;
   7110       default:
   7111          tl_assert(0);
   7112    }
   7113    di = unsafeIRDirty_0_N( 2/*regparms*/,
   7114            hName, VG_(fnptr_to_fnentry)( hFun ),
   7115            mkIRExprVec_2( ea, dataB )
   7116         );
   7117    /* no need to mess with any annotations.  This call accesses
   7118       neither guest state nor guest memory. */
   7119    if (guard) di->guard = guard;
   7120    stmt( 'B', mce, IRStmt_Dirty(di) );
   7121 }
   7122 
   7123 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
   7124    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   7125    if (eTy == Ity_I64)
   7126       return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
   7127    if (eTy == Ity_I32)
   7128       return e;
   7129    tl_assert(0);
   7130 }
   7131 
   7132 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
   7133    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   7134    tl_assert(eTy == Ity_I32);
   7135    if (dstTy == Ity_I64)
   7136       return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
   7137    tl_assert(0);
   7138 }
   7139 
   7140 
   7141 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
   7142 {
   7143    tl_assert(MC_(clo_mc_level) == 3);
   7144 
   7145    switch (e->tag) {
   7146 
   7147       case Iex_GetI: {
   7148          IRRegArray* descr_b;
   7149          IRAtom      *t1, *t2, *t3, *t4;
   7150          IRRegArray* descr      = e->Iex.GetI.descr;
   7151          IRType equivIntTy
   7152             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   7153          /* If this array is unshadowable for whatever reason, use the
   7154             usual approximation. */
   7155          if (equivIntTy == Ity_INVALID)
   7156             return mkU32(0);
   7157          tl_assert(sizeofIRType(equivIntTy) >= 4);
   7158          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   7159          descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   7160                                  equivIntTy, descr->nElems );
   7161          /* Do a shadow indexed get of the same size, giving t1.  Take
   7162             the bottom 32 bits of it, giving t2.  Compute into t3 the
   7163             origin for the index (almost certainly zero, but there's
   7164             no harm in being completely general here, since iropt will
   7165             remove any useless code), and fold it in, giving a final
   7166             value t4. */
   7167          t1 = assignNew( 'B', mce, equivIntTy,
   7168                           IRExpr_GetI( descr_b, e->Iex.GetI.ix,
   7169                                                 e->Iex.GetI.bias ));
   7170          t2 = narrowTo32( mce, t1 );
   7171          t3 = schemeE( mce, e->Iex.GetI.ix );
   7172          t4 = gen_maxU32( mce, t2, t3 );
   7173          return t4;
   7174       }
   7175       case Iex_CCall: {
   7176          Int i;
   7177          IRAtom*  here;
   7178          IRExpr** args = e->Iex.CCall.args;
   7179          IRAtom*  curr = mkU32(0);
   7180          for (i = 0; args[i]; i++) {
   7181             tl_assert(i < 32);
   7182             tl_assert(isOriginalAtom(mce, args[i]));
   7183             /* Only take notice of this arg if the callee's
   7184                mc-exclusion mask does not say it is to be excluded. */
   7185             if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
   7186                /* the arg is to be excluded from definedness checking.
   7187                   Do nothing. */
   7188                if (0) VG_(printf)("excluding %s(%d)\n",
   7189                                   e->Iex.CCall.cee->name, i);
   7190             } else {
   7191             /* calculate the arg's origin, and pessimistically
   7192                   merge it in. */
   7193                here = schemeE( mce, args[i] );
   7194                curr = gen_maxU32( mce, curr, here );
   7195             }
   7196          }
   7197          return curr;
   7198       }
   7199       case Iex_Load: {
   7200          Int dszB;
   7201          dszB = sizeofIRType(e->Iex.Load.ty);
   7202          /* assert that the B value for the address is already
   7203             available (somewhere) */
   7204          tl_assert(isIRAtom(e->Iex.Load.addr));
   7205          tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
   7206          return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
   7207       }
   7208       case Iex_ITE: {
   7209          IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
   7210          IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
   7211          IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
   7212          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
   7213       }
   7214       case Iex_Qop: {
   7215          IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
   7216          IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
   7217          IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
   7218          IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
   7219          return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
   7220                                  gen_maxU32( mce, b3, b4 ) );
   7221       }
   7222       case Iex_Triop: {
   7223          IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
   7224          IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
   7225          IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
   7226          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
   7227       }
   7228       case Iex_Binop: {
   7229          switch (e->Iex.Binop.op) {
   7230             case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   7231             case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   7232             case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   7233             case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   7234                /* Just say these all produce a defined result,
   7235                   regardless of their arguments.  See
   7236                   COMMENT_ON_CasCmpEQ in this file. */
   7237                return mkU32(0);
   7238             default: {
   7239                IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
   7240                IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
   7241                return gen_maxU32( mce, b1, b2 );
   7242             }
   7243          }
   7244          tl_assert(0);
   7245          /*NOTREACHED*/
   7246       }
   7247       case Iex_Unop: {
   7248          IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
   7249          return b1;
   7250       }
   7251       case Iex_Const:
   7252          return mkU32(0);
   7253       case Iex_RdTmp:
   7254          return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
   7255       case Iex_Get: {
   7256          Int b_offset = MC_(get_otrack_shadow_offset)(
   7257                            e->Iex.Get.offset,
   7258                            sizeofIRType(e->Iex.Get.ty)
   7259                         );
   7260          tl_assert(b_offset >= -1
   7261                    && b_offset <= mce->layout->total_sizeB -4);
   7262          if (b_offset >= 0) {
   7263             /* FIXME: this isn't an atom! */
   7264             return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
   7265                                Ity_I32 );
   7266          }
   7267          return mkU32(0);
   7268       }
   7269       default:
   7270          VG_(printf)("mc_translate.c: schemeE: unhandled: ");
   7271          ppIRExpr(e);
   7272          VG_(tool_panic)("memcheck:schemeE");
   7273    }
   7274 }
   7275 
   7276 
   7277 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
   7278 {
   7279    // This is a hacked version of do_shadow_Dirty
   7280    Int       i, k, n, toDo, gSz, gOff;
   7281    IRAtom    *here, *curr;
   7282    IRTemp    dst;
   7283 
   7284    /* First check the guard. */
   7285    curr = schemeE( mce, d->guard );
   7286 
   7287    /* Now round up all inputs and maxU32 over them. */
   7288 
   7289    /* Inputs: unmasked args
   7290       Note: arguments are evaluated REGARDLESS of the guard expression */
   7291    for (i = 0; d->args[i]; i++) {
   7292       IRAtom* arg = d->args[i];
   7293       if ( (d->cee->mcx_mask & (1<<i))
   7294            || UNLIKELY(is_IRExpr_VECRET_or_GSPTR(arg)) ) {
   7295          /* ignore this arg */
   7296       } else {
   7297          here = schemeE( mce, arg );
   7298          curr = gen_maxU32( mce, curr, here );
   7299       }
   7300    }
   7301 
   7302    /* Inputs: guest state that we read. */
   7303    for (i = 0; i < d->nFxState; i++) {
   7304       tl_assert(d->fxState[i].fx != Ifx_None);
   7305       if (d->fxState[i].fx == Ifx_Write)
   7306          continue;
   7307 
   7308       /* Enumerate the described state segments */
   7309       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   7310          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   7311          gSz  = d->fxState[i].size;
   7312 
   7313          /* Ignore any sections marked as 'always defined'. */
   7314          if (isAlwaysDefd(mce, gOff, gSz)) {
   7315             if (0)
   7316             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   7317                         gOff, gSz);
   7318             continue;
   7319          }
   7320 
   7321          /* This state element is read or modified.  So we need to
   7322             consider it.  If larger than 4 bytes, deal with it in
   7323             4-byte chunks. */
   7324          while (True) {
   7325             Int b_offset;
   7326             tl_assert(gSz >= 0);
   7327             if (gSz == 0) break;
   7328             n = gSz <= 4 ? gSz : 4;
   7329             /* update 'curr' with maxU32 of the state slice
   7330                gOff .. gOff+n-1 */
   7331             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   7332             if (b_offset != -1) {
   7333                /* Observe the guard expression. If it is false use 0, i.e.
   7334                   nothing is known about the origin */
   7335                IRAtom *cond, *iffalse, *iftrue;
   7336 
   7337                cond = assignNew( 'B', mce, Ity_I1, d->guard);
   7338                iffalse = mkU32(0);
   7339                iftrue  = assignNew( 'B', mce, Ity_I32,
   7340                                     IRExpr_Get(b_offset
   7341                                                  + 2*mce->layout->total_sizeB,
   7342                                                Ity_I32));
   7343                here = assignNew( 'B', mce, Ity_I32,
   7344                                  IRExpr_ITE(cond, iftrue, iffalse));
   7345                curr = gen_maxU32( mce, curr, here );
   7346             }
   7347             gSz -= n;
   7348             gOff += n;
   7349          }
   7350       }
   7351    }
   7352 
   7353    /* Inputs: memory */
   7354 
   7355    if (d->mFx != Ifx_None) {
   7356       /* Because we may do multiple shadow loads/stores from the same
   7357          base address, it's best to do a single test of its
   7358          definedness right now.  Post-instrumentation optimisation
   7359          should remove all but this test. */
   7360       tl_assert(d->mAddr);
   7361       here = schemeE( mce, d->mAddr );
   7362       curr = gen_maxU32( mce, curr, here );
   7363    }
   7364 
   7365    /* Deal with memory inputs (reads or modifies) */
   7366    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   7367       toDo   = d->mSize;
   7368       /* chew off 32-bit chunks.  We don't care about the endianness
   7369          since it's all going to be condensed down to a single otag,
   7370          but nevertheless choose an endianness which is hopefully
   7371          native to the platform. */
   7372       while (toDo >= 4) {
   7373          here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
   7374                                     d->guard );
   7375          curr = gen_maxU32( mce, curr, here );
   7376          toDo -= 4;
   7377       }
   7378       /* handle possible 16-bit excess */
   7379       while (toDo >= 2) {
   7380          here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
   7381                                     d->guard );
   7382          curr = gen_maxU32( mce, curr, here );
   7383          toDo -= 2;
   7384       }
   7385       /* chew off the remaining 8-bit chunk, if any */
   7386       if (toDo == 1) {
   7387          here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
   7388                                     d->guard );
   7389          curr = gen_maxU32( mce, curr, here );
   7390          toDo -= 1;
   7391       }
   7392       tl_assert(toDo == 0);
   7393    }
   7394 
   7395    /* Whew!  So curr is a 32-bit B-value which should give an origin
   7396       of some use if any of the inputs to the helper are undefined.
   7397       Now we need to re-distribute the results to all destinations. */
   7398 
   7399    /* Outputs: the destination temporary, if there is one. */
   7400    if (d->tmp != IRTemp_INVALID) {
   7401       dst   = findShadowTmpB(mce, d->tmp);
   7402       assign( 'V', mce, dst, curr );
   7403    }
   7404 
   7405    /* Outputs: guest state that we write or modify. */
   7406    for (i = 0; i < d->nFxState; i++) {
   7407       tl_assert(d->fxState[i].fx != Ifx_None);
   7408       if (d->fxState[i].fx == Ifx_Read)
   7409          continue;
   7410 
   7411       /* Enumerate the described state segments */
   7412       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   7413          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   7414          gSz  = d->fxState[i].size;
   7415 
   7416          /* Ignore any sections marked as 'always defined'. */
   7417          if (isAlwaysDefd(mce, gOff, gSz))
   7418             continue;
   7419 
   7420          /* This state element is written or modified.  So we need to
   7421             consider it.  If larger than 4 bytes, deal with it in
   7422             4-byte chunks. */
   7423          while (True) {
   7424             Int b_offset;
   7425             tl_assert(gSz >= 0);
   7426             if (gSz == 0) break;
   7427             n = gSz <= 4 ? gSz : 4;
   7428             /* Write 'curr' to the state slice gOff .. gOff+n-1 */
   7429             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   7430             if (b_offset != -1) {
   7431 
   7432                /* If the guard expression evaluates to false we simply Put
   7433                   the value that is already stored in the guest state slot */
   7434                IRAtom *cond, *iffalse;
   7435 
   7436                cond    = assignNew('B', mce, Ity_I1,
   7437                                    d->guard);
   7438                iffalse = assignNew('B', mce, Ity_I32,
   7439                                    IRExpr_Get(b_offset +
   7440                                               2*mce->layout->total_sizeB,
   7441                                               Ity_I32));
   7442                curr = assignNew('V', mce, Ity_I32,
   7443                                 IRExpr_ITE(cond, curr, iffalse));
   7444 
   7445                stmt( 'B', mce, IRStmt_Put(b_offset
   7446                                           + 2*mce->layout->total_sizeB,
   7447                                           curr ));
   7448             }
   7449             gSz -= n;
   7450             gOff += n;
   7451          }
   7452       }
   7453    }
   7454 
   7455    /* Outputs: memory that we write or modify.  Same comments about
   7456       endianness as above apply. */
   7457    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   7458       toDo   = d->mSize;
   7459       /* chew off 32-bit chunks */
   7460       while (toDo >= 4) {
   7461          gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
   7462                       d->guard );
   7463          toDo -= 4;
   7464       }
   7465       /* handle possible 16-bit excess */
   7466       while (toDo >= 2) {
   7467          gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
   7468                       d->guard );
   7469          toDo -= 2;
   7470       }
   7471       /* chew off the remaining 8-bit chunk, if any */
   7472       if (toDo == 1) {
   7473          gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
   7474                       d->guard );
   7475          toDo -= 1;
   7476       }
   7477       tl_assert(toDo == 0);
   7478    }
   7479 }
   7480 
   7481 
   7482 /* Generate IR for origin shadowing for a general guarded store. */
   7483 static void do_origins_Store_guarded ( MCEnv* mce,
   7484                                        IREndness stEnd,
   7485                                        IRExpr* stAddr,
   7486                                        IRExpr* stData,
   7487                                        IRExpr* guard )
   7488 {
   7489    Int     dszB;
   7490    IRAtom* dataB;
   7491    /* assert that the B value for the address is already available
   7492       (somewhere), since the call to schemeE will want to see it.
   7493       XXXX how does this actually ensure that?? */
   7494    tl_assert(isIRAtom(stAddr));
   7495    tl_assert(isIRAtom(stData));
   7496    dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
   7497    dataB = schemeE( mce, stData );
   7498    gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
   7499 }
   7500 
   7501 
   7502 /* Generate IR for origin shadowing for a plain store. */
   7503 static void do_origins_Store_plain ( MCEnv* mce,
   7504                                      IREndness stEnd,
   7505                                      IRExpr* stAddr,
   7506                                      IRExpr* stData )
   7507 {
   7508    do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
   7509                               NULL/*guard*/ );
   7510 }
   7511 
   7512 
   7513 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   7514 
   7515 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
   7516 {
   7517    do_origins_Store_guarded( mce, sg->end, sg->addr,
   7518                              sg->data, sg->guard );
   7519 }
   7520 
   7521 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
   7522 {
   7523    IRType loadedTy = Ity_INVALID;
   7524    switch (lg->cvt) {
   7525       case ILGop_IdentV128: loadedTy = Ity_V128; break;
   7526       case ILGop_Ident64:   loadedTy = Ity_I64;  break;
   7527       case ILGop_Ident32:   loadedTy = Ity_I32;  break;
   7528       case ILGop_16Uto32:   loadedTy = Ity_I16;  break;
   7529       case ILGop_16Sto32:   loadedTy = Ity_I16;  break;
   7530       case ILGop_8Uto32:    loadedTy = Ity_I8;   break;
   7531       case ILGop_8Sto32:    loadedTy = Ity_I8;   break;
   7532       default: VG_(tool_panic)("schemeS.IRLoadG");
   7533    }
   7534    IRAtom* ori_alt
   7535       = schemeE( mce, lg->alt );
   7536    IRAtom* ori_final
   7537       = expr2ori_Load_guarded_General(mce, loadedTy,
   7538                                       lg->addr, 0/*addr bias*/,
   7539                                       lg->guard, ori_alt );
   7540    /* And finally, bind the origin to the destination temporary. */
   7541    assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
   7542 }
   7543 
   7544 
   7545 static void schemeS ( MCEnv* mce, IRStmt* st )
   7546 {
   7547    tl_assert(MC_(clo_mc_level) == 3);
   7548 
   7549    switch (st->tag) {
   7550 
   7551       case Ist_AbiHint:
   7552          /* The value-check instrumenter handles this - by arranging
   7553             to pass the address of the next instruction to
   7554             MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
   7555             happen for origin tracking w.r.t. AbiHints.  So there is
   7556             nothing to do here. */
   7557          break;
   7558 
   7559       case Ist_PutI: {
   7560          IRPutI *puti = st->Ist.PutI.details;
   7561          IRRegArray* descr_b;
   7562          IRAtom      *t1, *t2, *t3, *t4;
   7563          IRRegArray* descr = puti->descr;
   7564          IRType equivIntTy
   7565             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   7566          /* If this array is unshadowable for whatever reason,
   7567             generate no code. */
   7568          if (equivIntTy == Ity_INVALID)
   7569             break;
   7570          tl_assert(sizeofIRType(equivIntTy) >= 4);
   7571          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   7572          descr_b
   7573             = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   7574                             equivIntTy, descr->nElems );
   7575          /* Compute a value to Put - the conjoinment of the origin for
   7576             the data to be Put-ted (obviously) and of the index value
   7577             (not so obviously). */
   7578          t1 = schemeE( mce, puti->data );
   7579          t2 = schemeE( mce, puti->ix );
   7580          t3 = gen_maxU32( mce, t1, t2 );
   7581          t4 = zWidenFrom32( mce, equivIntTy, t3 );
   7582          stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
   7583                                                puti->bias, t4) ));
   7584          break;
   7585       }
   7586 
   7587       case Ist_Dirty:
   7588          do_origins_Dirty( mce, st->Ist.Dirty.details );
   7589          break;
   7590 
   7591       case Ist_Store:
   7592          do_origins_Store_plain( mce, st->Ist.Store.end,
   7593                                       st->Ist.Store.addr,
   7594                                       st->Ist.Store.data );
   7595          break;
   7596 
   7597       case Ist_StoreG:
   7598          do_origins_StoreG( mce, st->Ist.StoreG.details );
   7599          break;
   7600 
   7601       case Ist_LoadG:
   7602          do_origins_LoadG( mce, st->Ist.LoadG.details );
   7603          break;
   7604 
   7605       case Ist_LLSC: {
   7606          /* In short: treat a load-linked like a normal load followed
   7607             by an assignment of the loaded (shadow) data to the result
   7608             temporary.  Treat a store-conditional like a normal store,
   7609             and mark the result temporary as defined. */
   7610          if (st->Ist.LLSC.storedata == NULL) {
   7611             /* Load Linked */
   7612             IRType resTy
   7613                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
   7614             IRExpr* vanillaLoad
   7615                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
   7616             tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   7617                       || resTy == Ity_I16 || resTy == Ity_I8);
   7618             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7619                               schemeE(mce, vanillaLoad));
   7620          } else {
   7621             /* Store conditional */
   7622             do_origins_Store_plain( mce, st->Ist.LLSC.end,
   7623                                     st->Ist.LLSC.addr,
   7624                                     st->Ist.LLSC.storedata );
   7625             /* For the rationale behind this, see comments at the
   7626                place where the V-shadow for .result is constructed, in
   7627                do_shadow_LLSC.  In short, we regard .result as
   7628                always-defined. */
   7629             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7630                               mkU32(0) );
   7631          }
   7632          break;
   7633       }
   7634 
   7635       case Ist_Put: {
   7636          Int b_offset
   7637             = MC_(get_otrack_shadow_offset)(
   7638                  st->Ist.Put.offset,
   7639                  sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
   7640               );
   7641          if (b_offset >= 0) {
   7642             /* FIXME: this isn't an atom! */
   7643             stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
   7644                                        schemeE( mce, st->Ist.Put.data )) );
   7645          }
   7646          break;
   7647       }
   7648 
   7649       case Ist_WrTmp:
   7650          assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
   7651                            schemeE(mce, st->Ist.WrTmp.data) );
   7652          break;
   7653 
   7654       case Ist_MBE:
   7655       case Ist_NoOp:
   7656       case Ist_Exit:
   7657       case Ist_IMark:
   7658          break;
   7659 
   7660       default:
   7661          VG_(printf)("mc_translate.c: schemeS: unhandled: ");
   7662          ppIRStmt(st);
   7663          VG_(tool_panic)("memcheck:schemeS");
   7664    }
   7665 }
   7666 
   7667 
   7668 /*------------------------------------------------------------*/
   7669 /*--- Startup assertion checking                           ---*/
   7670 /*------------------------------------------------------------*/
   7671 
   7672 void MC_(do_instrumentation_startup_checks)( void )
   7673 {
   7674    /* Make a best-effort check to see that is_helperc_value_checkN_fail
   7675       is working as we expect. */
   7676 
   7677 #  define CHECK(_expected, _string) \
   7678       tl_assert((_expected) == is_helperc_value_checkN_fail(_string))
   7679 
   7680    /* It should identify these 8, and no others, as targets. */
   7681    CHECK(True, "MC_(helperc_value_check8_fail_no_o)");
   7682    CHECK(True, "MC_(helperc_value_check4_fail_no_o)");
   7683    CHECK(True, "MC_(helperc_value_check0_fail_no_o)");
   7684    CHECK(True, "MC_(helperc_value_check1_fail_no_o)");
   7685    CHECK(True, "MC_(helperc_value_check8_fail_w_o)");
   7686    CHECK(True, "MC_(helperc_value_check0_fail_w_o)");
   7687    CHECK(True, "MC_(helperc_value_check1_fail_w_o)");
   7688    CHECK(True, "MC_(helperc_value_check4_fail_w_o)");
   7689 
   7690    /* Ad-hoc selection of other strings gathered via a quick test. */
   7691    CHECK(False, "amd64g_dirtyhelper_CPUID_avx2");
   7692    CHECK(False, "amd64g_dirtyhelper_RDTSC");
   7693    CHECK(False, "MC_(helperc_b_load1)");
   7694    CHECK(False, "MC_(helperc_b_load2)");
   7695    CHECK(False, "MC_(helperc_b_load4)");
   7696    CHECK(False, "MC_(helperc_b_load8)");
   7697    CHECK(False, "MC_(helperc_b_load16)");
   7698    CHECK(False, "MC_(helperc_b_load32)");
   7699    CHECK(False, "MC_(helperc_b_store1)");
   7700    CHECK(False, "MC_(helperc_b_store2)");
   7701    CHECK(False, "MC_(helperc_b_store4)");
   7702    CHECK(False, "MC_(helperc_b_store8)");
   7703    CHECK(False, "MC_(helperc_b_store16)");
   7704    CHECK(False, "MC_(helperc_b_store32)");
   7705    CHECK(False, "MC_(helperc_LOADV8)");
   7706    CHECK(False, "MC_(helperc_LOADV16le)");
   7707    CHECK(False, "MC_(helperc_LOADV32le)");
   7708    CHECK(False, "MC_(helperc_LOADV64le)");
   7709    CHECK(False, "MC_(helperc_LOADV128le)");
   7710    CHECK(False, "MC_(helperc_LOADV256le)");
   7711    CHECK(False, "MC_(helperc_STOREV16le)");
   7712    CHECK(False, "MC_(helperc_STOREV32le)");
   7713    CHECK(False, "MC_(helperc_STOREV64le)");
   7714    CHECK(False, "MC_(helperc_STOREV8)");
   7715    CHECK(False, "track_die_mem_stack_8");
   7716    CHECK(False, "track_new_mem_stack_8_w_ECU");
   7717    CHECK(False, "MC_(helperc_MAKE_STACK_UNINIT_w_o)");
   7718    CHECK(False, "VG_(unknown_SP_update_w_ECU)");
   7719 
   7720 #  undef CHECK
   7721 }
   7722 
   7723 
   7724 /*--------------------------------------------------------------------*/
   7725 /*--- end                                           mc_translate.c ---*/
   7726 /*--------------------------------------------------------------------*/
   7727