      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Instrument IR to perform memory checking operations.         ---*/
      4 /*---                                               mc_translate.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of MemCheck, a heavyweight Valgrind tool for
      9    detecting memory errors.
     10 
     11    Copyright (C) 2000-2013 Julian Seward
     12       jseward (at) acm.org
     13 
     14    This program is free software; you can redistribute it and/or
     15    modify it under the terms of the GNU General Public License as
     16    published by the Free Software Foundation; either version 2 of the
     17    License, or (at your option) any later version.
     18 
     19    This program is distributed in the hope that it will be useful, but
     20    WITHOUT ANY WARRANTY; without even the implied warranty of
     21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     22    General Public License for more details.
     23 
     24    You should have received a copy of the GNU General Public License
     25    along with this program; if not, write to the Free Software
     26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     27    02111-1307, USA.
     28 
     29    The GNU General Public License is contained in the file COPYING.
     30 */
     31 
     32 #include "pub_tool_basics.h"
     33 #include "pub_tool_poolalloc.h"     // For mc_include.h
     34 #include "pub_tool_hashtable.h"     // For mc_include.h
     35 #include "pub_tool_libcassert.h"
     36 #include "pub_tool_libcprint.h"
     37 #include "pub_tool_tooliface.h"
     38 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
     39 #include "pub_tool_xarray.h"
     40 #include "pub_tool_mallocfree.h"
     41 #include "pub_tool_libcbase.h"
     42 
     43 #include "mc_include.h"
     44 
     45 
     46 /* FIXMEs JRS 2011-June-16.
     47 
     48    Check the interpretation for vector narrowing and widening ops,
      49    particularly the saturating ones.  I suspect they are overly
      50    pessimistic, wrong, or both.
     51 */
     52 
     53 /* This file implements the Memcheck instrumentation, and in
     54    particular contains the core of its undefined value detection
      55    machinery.  For comprehensive background on the terminology,
     56    algorithms and rationale used herein, read:
     57 
     58      Using Valgrind to detect undefined value errors with
     59      bit-precision
     60 
     61      Julian Seward and Nicholas Nethercote
     62 
     63      2005 USENIX Annual Technical Conference (General Track),
     64      Anaheim, CA, USA, April 10-15, 2005.
     65 
     66    ----
     67 
     68    Here is as good a place as any to record exactly when V bits are and
     69    should be checked, why, and what function is responsible.
     70 
     71 
     72    Memcheck complains when an undefined value is used:
     73 
     74    1. In the condition of a conditional branch.  Because it could cause
     75       incorrect control flow, and thus cause incorrect externally-visible
     76       behaviour.  [mc_translate.c:complainIfUndefined]
     77 
     78    2. As an argument to a system call, or as the value that specifies
     79       the system call number.  Because it could cause an incorrect
     80       externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
     81 
     82    3. As the address in a load or store.  Because it could cause an
     83       incorrect value to be used later, which could cause externally-visible
     84       behaviour (eg. via incorrect control flow or an incorrect system call
     85       argument)  [complainIfUndefined]
     86 
     87    4. As the target address of a branch.  Because it could cause incorrect
     88       control flow.  [complainIfUndefined]
     89 
     90    5. As an argument to setenv, unsetenv, or putenv.  Because it could put
     91       an incorrect value into the external environment.
     92       [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
     93 
     94    6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
     95       [complainIfUndefined]
     96 
     97    7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
     98       VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
     99       requested it.  [in memcheck.h]
    100 
    101 
    102    Memcheck also complains, but should not, when an undefined value is used:
    103 
    104    8. As the shift value in certain SIMD shift operations (but not in the
    105       standard integer shift operations).  This inconsistency is due to
     106       historical reasons.  [complainIfUndefined]
    107 
    108 
    109    Memcheck does not complain, but should, when an undefined value is used:
    110 
    111    9. As an input to a client request.  Because the client request may
    112       affect the visible behaviour -- see bug #144362 for an example
    113       involving the malloc replacements in vg_replace_malloc.c and
    114       VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
    115       isn't identified.  That bug report also has some info on how to solve
    116       the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
    117 
    118 
    119    In practice, 1 and 2 account for the vast majority of cases.
    120 */
    121 
    122 /* Generation of addr-definedness, addr-validity and
    123    guard-definedness checks pertaining to loads and stores (Iex_Load,
    124    Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
    125    loads/stores) was re-checked 11 May 2013. */
    126 
    127 /*------------------------------------------------------------*/
    128 /*--- Forward decls                                        ---*/
    129 /*------------------------------------------------------------*/
    130 
    131 struct _MCEnv;
    132 
    133 static IRType  shadowTypeV ( IRType ty );
    134 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
    135 static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
    136 
    137 static IRExpr *i128_const_zero(void);
    138 
    139 /*------------------------------------------------------------*/
    140 /*--- Memcheck running state, and tmp management.          ---*/
    141 /*------------------------------------------------------------*/
    142 
    143 /* Carries info about a particular tmp.  The tmp's number is not
    144    recorded, as this is implied by (equal to) its index in the tmpMap
    145    in MCEnv.  The tmp's type is also not recorded, as this is present
    146    in MCEnv.sb->tyenv.
    147 
    148    When .kind is Orig, .shadowV and .shadowB may give the identities
    149    of the temps currently holding the associated definedness (shadowV)
    150    and origin (shadowB) values, or these may be IRTemp_INVALID if code
    151    to compute such values has not yet been emitted.
    152 
     153    When .kind is VSh or BSh then the tmp holds a V- or B- value,
    154    and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
    155    illogical for a shadow tmp itself to be shadowed.
    156 */
    157 typedef
    158    enum { Orig=1, VSh=2, BSh=3 }
    159    TempKind;
    160 
    161 typedef
    162    struct {
    163       TempKind kind;
    164       IRTemp   shadowV;
    165       IRTemp   shadowB;
    166    }
    167    TempMapEnt;
    168 
    169 
    170 /* Carries around state during memcheck instrumentation. */
    171 typedef
    172    struct _MCEnv {
    173       /* MODIFIED: the superblock being constructed.  IRStmts are
    174          added. */
    175       IRSB* sb;
    176       Bool  trace;
    177 
    178       /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
    179          current kind and possibly shadow temps for each temp in the
    180          IRSB being constructed.  Note that it does not contain the
    181          type of each tmp.  If you want to know the type, look at the
    182          relevant entry in sb->tyenv.  It follows that at all times
    183          during the instrumentation process, the valid indices for
    184          tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
     185          the total number of Orig, V- and B- temps allocated so far.
    186 
    187          The reason for this strange split (types in one place, all
    188          other info in another) is that we need the types to be
    189          attached to sb so as to make it possible to do
    190          "typeOfIRExpr(mce->bb->tyenv, ...)" at various places in the
    191          instrumentation process. */
    192       XArray* /* of TempMapEnt */ tmpMap;
    193 
    194       /* MODIFIED: indicates whether "bogus" literals have so far been
    195          found.  Starts off False, and may change to True. */
    196       Bool bogusLiterals;
    197 
    198       /* READONLY: indicates whether we should use expensive
    199          interpretations of integer adds, since unfortunately LLVM
    200          uses them to do ORs in some circumstances.  Defaulted to True
    201          on MacOS and False everywhere else. */
    202       Bool useLLVMworkarounds;
    203 
    204       /* READONLY: the guest layout.  This indicates which parts of
    205          the guest state should be regarded as 'always defined'. */
    206       VexGuestLayout* layout;
    207 
    208       /* READONLY: the host word type.  Needed for constructing
    209          arguments of type 'HWord' to be passed to helper functions.
    210          Ity_I32 or Ity_I64 only. */
    211       IRType hWordTy;
    212    }
    213    MCEnv;
    214 
    215 /* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
    216    demand), as they are encountered.  This is for two reasons.
    217 
    218    (1) (less important reason): Many original tmps are unused due to
     219    initial IR optimisation, and we do not want to waste space in
     220    tables tracking them.
    221 
    222    Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
    223    table indexed [0 .. n_types-1], which gives the current shadow for
     224    each original tmp, or IRTemp_INVALID if none has been assigned so far.
    225    It is necessary to support making multiple assignments to a shadow
    226    -- specifically, after testing a shadow for definedness, it needs
    227    to be made defined.  But IR's SSA property disallows this.
    228 
    229    (2) (more important reason): Therefore, when a shadow needs to get
    230    a new value, a new temporary is created, the value is assigned to
    231    that, and the tmpMap is updated to reflect the new binding.
    232 
    233    A corollary is that if the tmpMap maps a given tmp to
    234    IRTemp_INVALID and we are hoping to read that shadow tmp, it means
    235    there's a read-before-write error in the original tmps.  The IR
    236    sanity checker should catch all such anomalies, however.
    237 */
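         /* Illustrative example (not taken from any generated code): suppose
            original tmp t5 is currently shadowed by t17.  After a definedness
            test on t5 the instrumenter needs to mark its value as defined.
            It cannot assign to t17 a second time (that would break SSA), so
            it allocates a fresh shadow, say t23, via newShadowTmpV, which
            rebinds tmpMap[t5].shadowV to t23; an all-defined constant is then
            assigned to t23, and later reads of t5's shadow see t23. */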
    238 
    239 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
    240    both the table in mce->sb and to our auxiliary mapping.  Note that
    241    newTemp may cause mce->tmpMap to resize, hence previous results
    242    from VG_(indexXA)(mce->tmpMap) are invalidated. */
    243 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
    244 {
    245    Word       newIx;
    246    TempMapEnt ent;
    247    IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
    248    ent.kind    = kind;
    249    ent.shadowV = IRTemp_INVALID;
    250    ent.shadowB = IRTemp_INVALID;
    251    newIx = VG_(addToXA)( mce->tmpMap, &ent );
    252    tl_assert(newIx == (Word)tmp);
    253    return tmp;
    254 }
    255 
    256 
    257 /* Find the tmp currently shadowing the given original tmp.  If none
    258    so far exists, allocate one.  */
    259 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
    260 {
    261    TempMapEnt* ent;
    262    /* VG_(indexXA) range-checks 'orig', hence no need to check
    263       here. */
    264    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    265    tl_assert(ent->kind == Orig);
    266    if (ent->shadowV == IRTemp_INVALID) {
    267       IRTemp tmpV
    268         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
    269       /* newTemp may cause mce->tmpMap to resize, hence previous results
    270          from VG_(indexXA) are invalid. */
    271       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    272       tl_assert(ent->kind == Orig);
    273       tl_assert(ent->shadowV == IRTemp_INVALID);
    274       ent->shadowV = tmpV;
    275    }
    276    return ent->shadowV;
    277 }
    278 
    279 /* Allocate a new shadow for the given original tmp.  This means any
    280    previous shadow is abandoned.  This is needed because it is
    281    necessary to give a new value to a shadow once it has been tested
    282    for undefinedness, but unfortunately IR's SSA property disallows
    283    this.  Instead we must abandon the old shadow, allocate a new one
    284    and use that instead.
    285 
    286    This is the same as findShadowTmpV, except we don't bother to see
    287    if a shadow temp already existed -- we simply allocate a new one
    288    regardless. */
    289 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
    290 {
    291    TempMapEnt* ent;
    292    /* VG_(indexXA) range-checks 'orig', hence no need to check
    293       here. */
    294    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    295    tl_assert(ent->kind == Orig);
    296    if (1) {
    297       IRTemp tmpV
    298         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
    299       /* newTemp may cause mce->tmpMap to resize, hence previous results
    300          from VG_(indexXA) are invalid. */
    301       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    302       tl_assert(ent->kind == Orig);
    303       ent->shadowV = tmpV;
    304    }
    305 }
    306 
    307 
    308 /*------------------------------------------------------------*/
    309 /*--- IRAtoms -- a subset of IRExprs                       ---*/
    310 /*------------------------------------------------------------*/
    311 
    312 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
    313    isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
    314    input, most of this code deals in atoms.  Usefully, a value atom
    315    always has a V-value which is also an atom: constants are shadowed
    316    by constants, and temps are shadowed by the corresponding shadow
    317    temporary. */
    318 
    319 typedef  IRExpr  IRAtom;
    320 
    321 /* (used for sanity checks only): is this an atom which looks
    322    like it's from original code? */
    323 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
    324 {
    325    if (a1->tag == Iex_Const)
    326       return True;
    327    if (a1->tag == Iex_RdTmp) {
    328       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
    329       return ent->kind == Orig;
    330    }
    331    return False;
    332 }
    333 
    334 /* (used for sanity checks only): is this an atom which looks
    335    like it's from shadow code? */
    336 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
    337 {
    338    if (a1->tag == Iex_Const)
    339       return True;
    340    if (a1->tag == Iex_RdTmp) {
    341       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
    342       return ent->kind == VSh || ent->kind == BSh;
    343    }
    344    return False;
    345 }
    346 
    347 /* (used for sanity checks only): check that both args are atoms and
    348    are identically-kinded. */
    349 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
    350 {
    351    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
    352       return True;
    353    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
    354       return True;
    355    return False;
    356 }
    357 
    358 
    359 /*------------------------------------------------------------*/
    360 /*--- Type management                                      ---*/
    361 /*------------------------------------------------------------*/
    362 
    363 /* Shadow state is always accessed using integer types.  This returns
    364    an integer type with the same size (as per sizeofIRType) as the
    365    given type.  The only valid shadow types are Bit, I8, I16, I32,
    366    I64, I128, V128, V256. */
    367 
    368 static IRType shadowTypeV ( IRType ty )
    369 {
    370    switch (ty) {
    371       case Ity_I1:
    372       case Ity_I8:
    373       case Ity_I16:
    374       case Ity_I32:
    375       case Ity_I64:
    376       case Ity_I128: return ty;
    377       case Ity_F32:  return Ity_I32;
    378       case Ity_D32:  return Ity_I32;
    379       case Ity_F64:  return Ity_I64;
    380       case Ity_D64:  return Ity_I64;
    381       case Ity_F128: return Ity_I128;
    382       case Ity_D128: return Ity_I128;
    383       case Ity_V128: return Ity_V128;
    384       case Ity_V256: return Ity_V256;
    385       default: ppIRType(ty);
    386                VG_(tool_panic)("memcheck:shadowTypeV");
    387    }
    388 }
    389 
    390 /* Produce a 'defined' value of the given shadow type.  Should only be
     391    supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256). */
    392 static IRExpr* definedOfType ( IRType ty ) {
    393    switch (ty) {
    394       case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
    395       case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
    396       case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
    397       case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
    398       case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
    399       case Ity_I128: return i128_const_zero();
    400       case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
    401       case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
    402       default:       VG_(tool_panic)("memcheck:definedOfType");
    403    }
    404 }
    405 
    406 
    407 /*------------------------------------------------------------*/
    408 /*--- Constructing IR fragments                            ---*/
    409 /*------------------------------------------------------------*/
    410 
    411 /* add stmt to a bb */
    412 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
    413    if (mce->trace) {
    414       VG_(printf)("  %c: ", cat);
    415       ppIRStmt(st);
    416       VG_(printf)("\n");
    417    }
    418    addStmtToIRSB(mce->sb, st);
    419 }
    420 
    421 /* assign value to tmp */
    422 static inline
    423 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
    424    stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
    425 }
    426 
    427 /* build various kinds of expressions */
    428 #define triop(_op, _arg1, _arg2, _arg3) \
    429                                  IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
    430 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
    431 #define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
    432 #define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
    433 #define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
    434 #define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
    435 #define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
    436 #define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
    437 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
    438 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
    439 
    440 /* Bind the given expression to a new temporary, and return the
    441    temporary.  This effectively converts an arbitrary expression into
    442    an atom.
    443 
    444    'ty' is the type of 'e' and hence the type that the new temporary
    445    needs to be.  But passing it in is redundant, since we can deduce
    446    the type merely by inspecting 'e'.  So at least use that fact to
    447    assert that the two types agree. */
    448 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
    449 {
    450    TempKind k;
    451    IRTemp   t;
    452    IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
    453 
    454    tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
    455    switch (cat) {
    456       case 'V': k = VSh;  break;
    457       case 'B': k = BSh;  break;
    458       case 'C': k = Orig; break;
    459                 /* happens when we are making up new "orig"
    460                    expressions, for IRCAS handling */
    461       default: tl_assert(0);
    462    }
    463    t = newTemp(mce, ty, k);
    464    assign(cat, mce, t, e);
    465    return mkexpr(t);
    466 }
    467 
    468 
    469 /*------------------------------------------------------------*/
    470 /*--- Helper functions for 128-bit ops                     ---*/
    471 /*------------------------------------------------------------*/
    472 
    473 static IRExpr *i128_const_zero(void)
    474 {
    475    IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
    476    return binop(Iop_64HLto128, z64, z64);
    477 }
    478 
     479 /* There are no I128 loads or stores [as generated by any current
     480    front ends], so we do not need to worry about them in
     481    expr2vbits_Load. */
    482 
    483 
    484 /*------------------------------------------------------------*/
    485 /*--- Constructing definedness primitive ops               ---*/
    486 /*------------------------------------------------------------*/
    487 
    488 /* --------- Defined-if-either-defined --------- */
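         /* In the V-bit encoding, 0 means "defined" and 1 means "undefined",
            so bitwise AND of two shadow values gives a result that is defined
            wherever either input is defined.  Worked example (illustrative):
            mkDifD8 applied to vbits 0x0F and 0x3C yields 0x0F & 0x3C = 0x0C,
            i.e. only bits 2 and 3 remain undefined. */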
    489 
    490 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    491    tl_assert(isShadowAtom(mce,a1));
    492    tl_assert(isShadowAtom(mce,a2));
    493    return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
    494 }
    495 
    496 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    497    tl_assert(isShadowAtom(mce,a1));
    498    tl_assert(isShadowAtom(mce,a2));
    499    return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
    500 }
    501 
    502 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    503    tl_assert(isShadowAtom(mce,a1));
    504    tl_assert(isShadowAtom(mce,a2));
    505    return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
    506 }
    507 
    508 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    509    tl_assert(isShadowAtom(mce,a1));
    510    tl_assert(isShadowAtom(mce,a2));
    511    return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
    512 }
    513 
    514 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    515    tl_assert(isShadowAtom(mce,a1));
    516    tl_assert(isShadowAtom(mce,a2));
    517    return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
    518 }
    519 
    520 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    521    tl_assert(isShadowAtom(mce,a1));
    522    tl_assert(isShadowAtom(mce,a2));
    523    return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
    524 }
    525 
    526 /* --------- Undefined-if-either-undefined --------- */
    527 
    528 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    529    tl_assert(isShadowAtom(mce,a1));
    530    tl_assert(isShadowAtom(mce,a2));
    531    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
    532 }
    533 
    534 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    535    tl_assert(isShadowAtom(mce,a1));
    536    tl_assert(isShadowAtom(mce,a2));
    537    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
    538 }
    539 
    540 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    541    tl_assert(isShadowAtom(mce,a1));
    542    tl_assert(isShadowAtom(mce,a2));
    543    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
    544 }
    545 
    546 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    547    tl_assert(isShadowAtom(mce,a1));
    548    tl_assert(isShadowAtom(mce,a2));
    549    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
    550 }
    551 
    552 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    553    IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
    554    tl_assert(isShadowAtom(mce,a1));
    555    tl_assert(isShadowAtom(mce,a2));
    556    tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
    557    tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
    558    tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
    559    tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
    560    tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
    561    tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
    562 
    563    return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
    564 }
    565 
    566 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    567    tl_assert(isShadowAtom(mce,a1));
    568    tl_assert(isShadowAtom(mce,a2));
    569    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
    570 }
    571 
    572 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    573    tl_assert(isShadowAtom(mce,a1));
    574    tl_assert(isShadowAtom(mce,a2));
    575    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
    576 }
    577 
    578 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
    579    switch (vty) {
    580       case Ity_I8:   return mkUifU8(mce, a1, a2);
    581       case Ity_I16:  return mkUifU16(mce, a1, a2);
    582       case Ity_I32:  return mkUifU32(mce, a1, a2);
    583       case Ity_I64:  return mkUifU64(mce, a1, a2);
    584       case Ity_I128: return mkUifU128(mce, a1, a2);
    585       case Ity_V128: return mkUifUV128(mce, a1, a2);
    586       case Ity_V256: return mkUifUV256(mce, a1, a2);
    587       default:
    588          VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
    589          VG_(tool_panic)("memcheck:mkUifU");
    590    }
    591 }
    592 
    593 /* --------- The Left-family of operations. --------- */
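         /* These wrap VEX's Iop_LeftN primops, which (informally) compute
            x | -x: every bit at or above the lowest set bit of x becomes 1.
            Applied to V-bits this models carry propagation in additions: an
            undefined bit may taint everything to its left.  Example
            (illustrative): Left8(0x10) = 0x10 | 0xF0 = 0xF0, so an undefined
            bit 4 pessimistically makes bits 4..7 undefined. */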
    594 
    595 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
    596    tl_assert(isShadowAtom(mce,a1));
    597    return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
    598 }
    599 
    600 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
    601    tl_assert(isShadowAtom(mce,a1));
    602    return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
    603 }
    604 
    605 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
    606    tl_assert(isShadowAtom(mce,a1));
    607    return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
    608 }
    609 
    610 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    611    tl_assert(isShadowAtom(mce,a1));
    612    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
    613 }
    614 
    615 /* --------- 'Improvement' functions for AND/OR. --------- */
    616 
     617 /* ImproveAND(data, vbits) = data OR vbits.  Wherever data holds a bit
     618    that is both defined (0) and zero, the result bit is defined (0);
     619    all other combinations yield undefined (1). */
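         /* Worked example (illustrative): for x & y, any position where x
            holds a defined 0 forces the result bit to a defined 0, whatever
            y may be.  With data = 0x05 and vbits = 0x03 (bits 0 and 1 of data
            undefined), data | vbits = 0x07: its 0 bits (positions 3..7) are
            exactly the defined zeros of data, i.e. the positions this operand
            alone certifies as defined. */
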
    620 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    621 {
    622    tl_assert(isOriginalAtom(mce, data));
    623    tl_assert(isShadowAtom(mce, vbits));
    624    tl_assert(sameKindedAtoms(data, vbits));
    625    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
    626 }
    627 
    628 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    629 {
    630    tl_assert(isOriginalAtom(mce, data));
    631    tl_assert(isShadowAtom(mce, vbits));
    632    tl_assert(sameKindedAtoms(data, vbits));
    633    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
    634 }
    635 
    636 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    637 {
    638    tl_assert(isOriginalAtom(mce, data));
    639    tl_assert(isShadowAtom(mce, vbits));
    640    tl_assert(sameKindedAtoms(data, vbits));
    641    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
    642 }
    643 
    644 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    645 {
    646    tl_assert(isOriginalAtom(mce, data));
    647    tl_assert(isShadowAtom(mce, vbits));
    648    tl_assert(sameKindedAtoms(data, vbits));
    649    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
    650 }
    651 
    652 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    653 {
    654    tl_assert(isOriginalAtom(mce, data));
    655    tl_assert(isShadowAtom(mce, vbits));
    656    tl_assert(sameKindedAtoms(data, vbits));
    657    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
    658 }
    659 
    660 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    661 {
    662    tl_assert(isOriginalAtom(mce, data));
    663    tl_assert(isShadowAtom(mce, vbits));
    664    tl_assert(sameKindedAtoms(data, vbits));
    665    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
    666 }
    667 
     668 /* ImproveOR(data, vbits) = ~data OR vbits.  Wherever data holds a bit
     669    that is both defined (0) and one, the result bit is defined (0);
     670    all other combinations yield undefined (1). */
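         /* Mirror-image example (illustrative): for x | y, a defined 1 bit in
            data forces the result bit to a defined 1.  With data = 0xA0 and
            vbits = 0x00, ~data | vbits = 0x5F, whose 0 bits (positions 5 and
            7) are exactly the defined ones of data. */
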
    671 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    672 {
    673    tl_assert(isOriginalAtom(mce, data));
    674    tl_assert(isShadowAtom(mce, vbits));
    675    tl_assert(sameKindedAtoms(data, vbits));
    676    return assignNew(
    677              'V', mce, Ity_I8,
    678              binop(Iop_Or8,
    679                    assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
    680                    vbits) );
    681 }
    682 
    683 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    684 {
    685    tl_assert(isOriginalAtom(mce, data));
    686    tl_assert(isShadowAtom(mce, vbits));
    687    tl_assert(sameKindedAtoms(data, vbits));
    688    return assignNew(
    689              'V', mce, Ity_I16,
    690              binop(Iop_Or16,
    691                    assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
    692                    vbits) );
    693 }
    694 
    695 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    696 {
    697    tl_assert(isOriginalAtom(mce, data));
    698    tl_assert(isShadowAtom(mce, vbits));
    699    tl_assert(sameKindedAtoms(data, vbits));
    700    return assignNew(
    701              'V', mce, Ity_I32,
    702              binop(Iop_Or32,
    703                    assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
    704                    vbits) );
    705 }
    706 
    707 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    708 {
    709    tl_assert(isOriginalAtom(mce, data));
    710    tl_assert(isShadowAtom(mce, vbits));
    711    tl_assert(sameKindedAtoms(data, vbits));
    712    return assignNew(
    713              'V', mce, Ity_I64,
    714              binop(Iop_Or64,
    715                    assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
    716                    vbits) );
    717 }
    718 
    719 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    720 {
    721    tl_assert(isOriginalAtom(mce, data));
    722    tl_assert(isShadowAtom(mce, vbits));
    723    tl_assert(sameKindedAtoms(data, vbits));
    724    return assignNew(
    725              'V', mce, Ity_V128,
    726              binop(Iop_OrV128,
    727                    assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
    728                    vbits) );
    729 }
    730 
    731 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    732 {
    733    tl_assert(isOriginalAtom(mce, data));
    734    tl_assert(isShadowAtom(mce, vbits));
    735    tl_assert(sameKindedAtoms(data, vbits));
    736    return assignNew(
    737              'V', mce, Ity_V256,
    738              binop(Iop_OrV256,
    739                    assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
    740                    vbits) );
    741 }
    742 
    743 /* --------- Pessimising casts. --------- */
    744 
     745 /* The function returns an expression of type DST_TY.  If any bit of
     746    VBITS is undefined (value == 1), the resulting expression has all
     747    bits set to 1.  Otherwise, all bits are 0. */
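         /* For instance (illustrative): mkPCastTo(mce, Ity_I32, v8), where v8
            is an Ity_I8 shadow value, evaluates to 0xFFFFFFFF whenever v8 is
            nonzero (at least one undefined bit) and to 0x00000000 when v8 is
            zero (all bits defined). */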
    748 
    749 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
    750 {
    751    IRType  src_ty;
    752    IRAtom* tmp1;
    753 
    754    /* Note, dst_ty is a shadow type, not an original type. */
    755    tl_assert(isShadowAtom(mce,vbits));
    756    src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
    757 
    758    /* Fast-track some common cases */
    759    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
    760       return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    761 
    762    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
    763       return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
    764 
    765    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
    766       /* PCast the arg, then clone it. */
    767       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    768       return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    769    }
    770 
    771    if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
    772       /* PCast the arg, then clone it 4 times. */
    773       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    774       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    775       return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
    776    }
    777 
    778    if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
    779       /* PCast the arg, then clone it 8 times. */
    780       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    781       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    782       tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
    783       return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
    784    }
    785 
    786    if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
    787       /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
    788          the top half. */
    789       IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
    790       return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
    791    }
    792 
    793    /* Else do it the slow way .. */
    794    /* First of all, collapse vbits down to a single bit. */
    795    tmp1   = NULL;
    796    switch (src_ty) {
    797       case Ity_I1:
    798          tmp1 = vbits;
    799          break;
    800       case Ity_I8:
    801          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
    802          break;
    803       case Ity_I16:
    804          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
    805          break;
    806       case Ity_I32:
    807          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
    808          break;
    809       case Ity_I64:
    810          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
    811          break;
    812       case Ity_I128: {
    813          /* Gah.  Chop it in half, OR the halves together, and compare
    814             that with zero. */
    815          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
    816          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
    817          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
    818          tmp1         = assignNew('V', mce, Ity_I1,
    819                                        unop(Iop_CmpNEZ64, tmp4));
    820          break;
    821       }
    822       default:
    823          ppIRType(src_ty);
    824          VG_(tool_panic)("mkPCastTo(1)");
    825    }
    826    tl_assert(tmp1);
    827    /* Now widen up to the dst type. */
    828    switch (dst_ty) {
    829       case Ity_I1:
    830          return tmp1;
    831       case Ity_I8:
    832          return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
    833       case Ity_I16:
    834          return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
    835       case Ity_I32:
    836          return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
    837       case Ity_I64:
    838          return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
    839       case Ity_V128:
    840          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    841          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
    842          return tmp1;
    843       case Ity_I128:
    844          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    845          tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
    846          return tmp1;
    847       case Ity_V256:
    848          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    849          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
    850                                                     tmp1, tmp1));
    851          tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
    852                                                     tmp1, tmp1));
    853          return tmp1;
    854       default:
    855          ppIRType(dst_ty);
    856          VG_(tool_panic)("mkPCastTo(2)");
    857    }
    858 }
    859 
    860 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
    861 /*
    862    Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
     863    PCasting to Ity_I1.  However, sometimes it is necessary to be more
    864    accurate.  The insight is that the result is defined if two
    865    corresponding bits can be found, one from each argument, so that
    866    both bits are defined but are different -- that makes EQ say "No"
    867    and NE say "Yes".  Hence, we compute an improvement term and DifD
    868    it onto the "normal" (UifU) result.
    869 
    870    The result is:
    871 
    872    PCastTo<1> (
    873       -- naive version
    874       PCastTo<sz>( UifU<sz>(vxx, vyy) )
    875 
    876       `DifD<sz>`
    877 
    878       -- improvement term
    879       PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
    880    )
    881 
    882    where
    883      vec contains 0 (defined) bits where the corresponding arg bits
    884      are defined but different, and 1 bits otherwise.
    885 
    886      vec = Or<sz>( vxx,   // 0 iff bit defined
    887                    vyy,   // 0 iff bit defined
    888                    Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
    889                  )
    890 
    891      If any bit of vec is 0, the result is defined and so the
    892      improvement term should produce 0...0, else it should produce
    893      1...1.
    894 
    895      Hence require for the improvement term:
    896 
    897         if vec == 1...1 then 1...1 else 0...0
    898      ->
    899         PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )
    900 
    901    This was extensively re-analysed and checked on 6 July 05.
    902 */
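         /* Concrete instance (illustrative), at 32 bits: xx = 2, yy = 0,
            vxx = 1 (bit 0 of xx undefined), vyy = 0.  The naive term
            PCast(vxx `UifU` vyy) is all-ones (undefined).  But
            vec = vxx | vyy | ~(xx ^ yy) = 0xFFFFFFFD, which differs from
            1...1, so the improvement term is 0...0; DifD-ing it in gives a
            defined result -- correctly so, since bit 1 differs and is defined
            in both arguments, hence the undefined bit cannot affect the
            comparison's outcome. */
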
    903 static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
    904                                     IRType  ty,
    905                                     IRAtom* vxx, IRAtom* vyy,
    906                                     IRAtom* xx,  IRAtom* yy )
    907 {
    908    IRAtom *naive, *vec, *improvement_term;
    909    IRAtom *improved, *final_cast, *top;
    910    IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;
    911 
    912    tl_assert(isShadowAtom(mce,vxx));
    913    tl_assert(isShadowAtom(mce,vyy));
    914    tl_assert(isOriginalAtom(mce,xx));
    915    tl_assert(isOriginalAtom(mce,yy));
    916    tl_assert(sameKindedAtoms(vxx,xx));
    917    tl_assert(sameKindedAtoms(vyy,yy));
    918 
    919    switch (ty) {
    920       case Ity_I16:
    921          opOR   = Iop_Or16;
    922          opDIFD = Iop_And16;
    923          opUIFU = Iop_Or16;
    924          opNOT  = Iop_Not16;
    925          opXOR  = Iop_Xor16;
    926          opCMP  = Iop_CmpEQ16;
    927          top    = mkU16(0xFFFF);
    928          break;
    929       case Ity_I32:
    930          opOR   = Iop_Or32;
    931          opDIFD = Iop_And32;
    932          opUIFU = Iop_Or32;
    933          opNOT  = Iop_Not32;
    934          opXOR  = Iop_Xor32;
    935          opCMP  = Iop_CmpEQ32;
    936          top    = mkU32(0xFFFFFFFF);
    937          break;
    938       case Ity_I64:
    939          opOR   = Iop_Or64;
    940          opDIFD = Iop_And64;
    941          opUIFU = Iop_Or64;
    942          opNOT  = Iop_Not64;
    943          opXOR  = Iop_Xor64;
    944          opCMP  = Iop_CmpEQ64;
    945          top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
    946          break;
    947       default:
    948          VG_(tool_panic)("expensiveCmpEQorNE");
    949    }
    950 
    951    naive
    952       = mkPCastTo(mce,ty,
    953                   assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
    954 
    955    vec
    956       = assignNew(
    957            'V', mce,ty,
    958            binop( opOR,
    959                   assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
    960                   assignNew(
    961                      'V', mce,ty,
    962                      unop( opNOT,
    963                            assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
    964 
    965    improvement_term
    966       = mkPCastTo( mce,ty,
    967                    assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
    968 
    969    improved
    970       = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
    971 
    972    final_cast
    973       = mkPCastTo( mce, Ity_I1, improved );
    974 
    975    return final_cast;
    976 }
    977 
    978 
    979 /* --------- Semi-accurate interpretation of CmpORD. --------- */
    980 
    981 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
    982 
    983       CmpORD32S(x,y) = 1<<3   if  x <s y
    984                      = 1<<2   if  x >s y
    985                      = 1<<1   if  x == y
    986 
    987    and similarly the unsigned variant.  The default interpretation is:
    988 
    989       CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
    990                                   & (7<<1)
    991 
    992    The "& (7<<1)" reflects the fact that all result bits except 3,2,1
    993    are zero and therefore defined (viz, zero).
    994 
    995    Also deal with a special case better:
    996 
    997       CmpORD32S(x,0)
    998 
    999    Here, bit 3 (LT) of the result is a copy of the top bit of x and
   1000    will be defined even if the rest of x isn't.  In which case we do:
   1001 
   1002       CmpORD32S#(x,x#,0,{impliedly 0}#)
   1003          = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
   1004            | (x# >>u 31) << 3      -- LT# = x#[31]
   1005 
   1006    Analogous handling for CmpORD64{S,U}.
   1007 */
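         /* Illustrative instance of the special case: CmpORD32S(x, 0) with
            x# = 0x0000FFFF (low 16 bits of x undefined, sign bit defined).
            The standard rule would mark all of bits 3..1 undefined; the
            refined rule computes PCast(x#) & (3<<1) = 6 for the GT#/EQ# bits,
            OR'd with (x# >>u 31) << 3 = 0, so the LT bit (bit 3) is reported
            defined, since it depends only on x's (defined) sign bit. */
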
   1008 static Bool isZeroU32 ( IRAtom* e )
   1009 {
   1010    return
   1011       toBool( e->tag == Iex_Const
   1012               && e->Iex.Const.con->tag == Ico_U32
   1013               && e->Iex.Const.con->Ico.U32 == 0 );
   1014 }
   1015 
   1016 static Bool isZeroU64 ( IRAtom* e )
   1017 {
   1018    return
   1019       toBool( e->tag == Iex_Const
   1020               && e->Iex.Const.con->tag == Ico_U64
   1021               && e->Iex.Const.con->Ico.U64 == 0 );
   1022 }
   1023 
   1024 static IRAtom* doCmpORD ( MCEnv*  mce,
   1025                           IROp    cmp_op,
   1026                           IRAtom* xxhash, IRAtom* yyhash,
   1027                           IRAtom* xx,     IRAtom* yy )
   1028 {
   1029    Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
   1030    Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
   1031    IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
   1032    IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
   1033    IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
   1034    IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
   1035    IRType ty     = m64 ? Ity_I64   : Ity_I32;
   1036    Int    width  = m64 ? 64        : 32;
   1037 
   1038    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
   1039 
   1040    IRAtom* threeLeft1 = NULL;
   1041    IRAtom* sevenLeft1 = NULL;
   1042 
   1043    tl_assert(isShadowAtom(mce,xxhash));
   1044    tl_assert(isShadowAtom(mce,yyhash));
   1045    tl_assert(isOriginalAtom(mce,xx));
   1046    tl_assert(isOriginalAtom(mce,yy));
   1047    tl_assert(sameKindedAtoms(xxhash,xx));
   1048    tl_assert(sameKindedAtoms(yyhash,yy));
   1049    tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
   1050              || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
   1051 
   1052    if (0) {
   1053       ppIROp(cmp_op); VG_(printf)(" ");
   1054       ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
   1055    }
   1056 
   1057    if (syned && isZero(yy)) {
   1058       /* fancy interpretation */
   1059       /* if yy is zero, then it must be fully defined (zero#). */
   1060       tl_assert(isZero(yyhash));
   1061       threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
   1062       return
   1063          binop(
   1064             opOR,
   1065             assignNew(
   1066                'V', mce,ty,
   1067                binop(
   1068                   opAND,
   1069                   mkPCastTo(mce,ty, xxhash),
   1070                   threeLeft1
   1071                )),
   1072             assignNew(
   1073                'V', mce,ty,
   1074                binop(
   1075                   opSHL,
   1076                   assignNew(
   1077                      'V', mce,ty,
   1078                      binop(opSHR, xxhash, mkU8(width-1))),
   1079                   mkU8(3)
   1080                ))
    1081          );
   1082    } else {
   1083       /* standard interpretation */
   1084       sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
   1085       return
   1086          binop(
   1087             opAND,
   1088             mkPCastTo( mce,ty,
   1089                        mkUifU(mce,ty, xxhash,yyhash)),
   1090             sevenLeft1
   1091          );
   1092    }
   1093 }
   1094 
   1095 
   1096 /*------------------------------------------------------------*/
   1097 /*--- Emit a test and complaint if something is undefined. ---*/
   1098 /*------------------------------------------------------------*/
   1099 
   1100 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
   1101 
   1102 
   1103 /* Set the annotations on a dirty helper to indicate that the stack
    1104    pointer and instruction pointer might be read.  This is the
   1105    behaviour of all 'emit-a-complaint' style functions we might
   1106    call. */
   1107 
   1108 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
   1109    di->nFxState = 2;
   1110    di->fxState[0].fx        = Ifx_Read;
   1111    di->fxState[0].offset    = mce->layout->offset_SP;
   1112    di->fxState[0].size      = mce->layout->sizeof_SP;
   1113    di->fxState[0].nRepeats  = 0;
   1114    di->fxState[0].repeatLen = 0;
   1115    di->fxState[1].fx        = Ifx_Read;
   1116    di->fxState[1].offset    = mce->layout->offset_IP;
   1117    di->fxState[1].size      = mce->layout->sizeof_IP;
   1118    di->fxState[1].nRepeats  = 0;
   1119    di->fxState[1].repeatLen = 0;
   1120 }
   1121 
   1122 
   1123 /* Check the supplied *original* |atom| for undefinedness, and emit a
   1124    complaint if so.  Once that happens, mark it as defined.  This is
   1125    possible because the atom is either a tmp or literal.  If it's a
   1126    tmp, it will be shadowed by a tmp, and so we can set the shadow to
   1127    be defined.  In fact as mentioned above, we will have to allocate a
   1128    new tmp to carry the new 'defined' shadow value, and update the
   1129    original->tmp mapping accordingly; we cannot simply assign a new
   1130    value to an existing shadow tmp as this breaks SSAness.
   1131 
   1132    The checks are performed, any resulting complaint emitted, and
   1133    |atom|'s shadow temp set to 'defined', ONLY in the case that
   1134    |guard| evaluates to True at run-time.  If it evaluates to False
   1135    then no action is performed.  If |guard| is NULL (the usual case)
   1136    then it is assumed to be always-true, and hence these actions are
   1137    performed unconditionally.
   1138 
   1139    This routine does not generate code to check the definedness of
   1140    |guard|.  The caller is assumed to have taken care of that already.
   1141 */
   1142 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
   1143 {
   1144    IRAtom*  vatom;
   1145    IRType   ty;
   1146    Int      sz;
   1147    IRDirty* di;
   1148    IRAtom*  cond;
   1149    IRAtom*  origin;
   1150    void*    fn;
   1151    const HChar* nm;
   1152    IRExpr** args;
   1153    Int      nargs;
   1154 
   1155    // Don't do V bit tests if we're not reporting undefined value errors.
   1156    if (MC_(clo_mc_level) == 1)
   1157       return;
   1158 
   1159    if (guard)
   1160       tl_assert(isOriginalAtom(mce, guard));
   1161 
   1162    /* Since the original expression is atomic, there's no duplicated
   1163       work generated by making multiple V-expressions for it.  So we
   1164       don't really care about the possibility that someone else may
    1165       also create a V-interpretation for it. */
   1166    tl_assert(isOriginalAtom(mce, atom));
   1167    vatom = expr2vbits( mce, atom );
   1168    tl_assert(isShadowAtom(mce, vatom));
   1169    tl_assert(sameKindedAtoms(atom, vatom));
   1170 
   1171    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1172 
   1173    /* sz is only used for constructing the error message */
   1174    sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
   1175 
   1176    cond = mkPCastTo( mce, Ity_I1, vatom );
   1177    /* cond will be 0 if all defined, and 1 if any not defined. */
   1178 
   1179    /* Get the origin info for the value we are about to check.  At
   1180       least, if we are doing origin tracking.  If not, use a dummy
   1181       zero origin. */
   1182    if (MC_(clo_mc_level) == 3) {
   1183       origin = schemeE( mce, atom );
   1184       if (mce->hWordTy == Ity_I64) {
   1185          origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
   1186       }
   1187    } else {
   1188       origin = NULL;
   1189    }
   1190 
   1191    fn    = NULL;
   1192    nm    = NULL;
   1193    args  = NULL;
   1194    nargs = -1;
   1195 
   1196    switch (sz) {
   1197       case 0:
   1198          if (origin) {
   1199             fn    = &MC_(helperc_value_check0_fail_w_o);
   1200             nm    = "MC_(helperc_value_check0_fail_w_o)";
   1201             args  = mkIRExprVec_1(origin);
   1202             nargs = 1;
   1203          } else {
   1204             fn    = &MC_(helperc_value_check0_fail_no_o);
   1205             nm    = "MC_(helperc_value_check0_fail_no_o)";
   1206             args  = mkIRExprVec_0();
   1207             nargs = 0;
   1208          }
   1209          break;
   1210       case 1:
   1211          if (origin) {
   1212             fn    = &MC_(helperc_value_check1_fail_w_o);
   1213             nm    = "MC_(helperc_value_check1_fail_w_o)";
   1214             args  = mkIRExprVec_1(origin);
   1215             nargs = 1;
   1216          } else {
   1217             fn    = &MC_(helperc_value_check1_fail_no_o);
   1218             nm    = "MC_(helperc_value_check1_fail_no_o)";
   1219             args  = mkIRExprVec_0();
   1220             nargs = 0;
   1221          }
   1222          break;
   1223       case 4:
   1224          if (origin) {
   1225             fn    = &MC_(helperc_value_check4_fail_w_o);
   1226             nm    = "MC_(helperc_value_check4_fail_w_o)";
   1227             args  = mkIRExprVec_1(origin);
   1228             nargs = 1;
   1229          } else {
   1230             fn    = &MC_(helperc_value_check4_fail_no_o);
   1231             nm    = "MC_(helperc_value_check4_fail_no_o)";
   1232             args  = mkIRExprVec_0();
   1233             nargs = 0;
   1234          }
   1235          break;
   1236       case 8:
   1237          if (origin) {
   1238             fn    = &MC_(helperc_value_check8_fail_w_o);
   1239             nm    = "MC_(helperc_value_check8_fail_w_o)";
   1240             args  = mkIRExprVec_1(origin);
   1241             nargs = 1;
   1242          } else {
   1243             fn    = &MC_(helperc_value_check8_fail_no_o);
   1244             nm    = "MC_(helperc_value_check8_fail_no_o)";
   1245             args  = mkIRExprVec_0();
   1246             nargs = 0;
   1247          }
   1248          break;
   1249       case 2:
   1250       case 16:
   1251          if (origin) {
   1252             fn    = &MC_(helperc_value_checkN_fail_w_o);
   1253             nm    = "MC_(helperc_value_checkN_fail_w_o)";
   1254             args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
   1255             nargs = 2;
   1256          } else {
   1257             fn    = &MC_(helperc_value_checkN_fail_no_o);
   1258             nm    = "MC_(helperc_value_checkN_fail_no_o)";
   1259             args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
   1260             nargs = 1;
   1261          }
   1262          break;
   1263       default:
   1264          VG_(tool_panic)("unexpected szB");
   1265    }
   1266 
   1267    tl_assert(fn);
   1268    tl_assert(nm);
   1269    tl_assert(args);
   1270    tl_assert(nargs >= 0 && nargs <= 2);
   1271    tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
   1272               || (MC_(clo_mc_level) == 2 && origin == NULL) );
   1273 
   1274    di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
   1275                            VG_(fnptr_to_fnentry)( fn ), args );
   1276    di->guard = cond; // and cond is PCast-to-1(atom#)
   1277 
   1278    /* If the complaint is to be issued under a guard condition, AND
   1279       that into the guard condition for the helper call. */
   1280    if (guard) {
   1281       IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
   1282       IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
   1283       IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
   1284       di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
   1285    }
   1286 
   1287    setHelperAnns( mce, di );
   1288    stmt( 'V', mce, IRStmt_Dirty(di));
   1289 
   1290    /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
   1291       defined -- but only in the case where the guard evaluates to
   1292       True at run-time.  Do the update by setting the orig->shadow
   1293       mapping for tmp to reflect the fact that this shadow is getting
   1294       a new value. */
   1295    tl_assert(isIRAtom(vatom));
   1296    /* sameKindedAtoms ... */
   1297    if (vatom->tag == Iex_RdTmp) {
   1298       tl_assert(atom->tag == Iex_RdTmp);
   1299       if (guard == NULL) {
   1300          // guard is 'always True', hence update unconditionally
   1301          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1302          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
   1303                           definedOfType(ty));
   1304       } else {
   1305          // update the temp only conditionally.  Do this by copying
   1306          // its old value when the guard is False.
   1307          // The old value ..
   1308          IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1309          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1310          IRAtom* new_tmpV
   1311             = assignNew('V', mce, shadowTypeV(ty),
   1312                         IRExpr_ITE(guard, definedOfType(ty),
   1313                                           mkexpr(old_tmpV)));
   1314          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
   1315       }
   1316    }
   1317 }
   1318 
   1319 
   1320 /*------------------------------------------------------------*/
   1321 /*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
   1322 /*------------------------------------------------------------*/
   1323 
   1324 /* Examine the always-defined sections declared in layout to see if
   1325    the (offset,size) section is within one.  Note, it is an error to
   1326    partially fall into such a region: (offset,size) should either be
   1327    completely in such a region or completely not-in such a region.
   1328 */
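        /* Illustrative example (not part of the original source; offsets are
           hypothetical): if the layout declares an always-defined section at
           offset 168 with size 8, then isAlwaysDefd(mce, 168, 8) and
           isAlwaysDefd(mce, 172, 4) return True, isAlwaysDefd(mce, 200, 4)
           returns False, and a query such as isAlwaysDefd(mce, 164, 8),
           which straddles the section boundary, hits the partial-overlap
           panic below. */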
   1329 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
   1330 {
   1331    Int minoffD, maxoffD, i;
   1332    Int minoff = offset;
   1333    Int maxoff = minoff + size - 1;
   1334    tl_assert((minoff & ~0xFFFF) == 0);
   1335    tl_assert((maxoff & ~0xFFFF) == 0);
   1336 
   1337    for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
   1338       minoffD = mce->layout->alwaysDefd[i].offset;
   1339       maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
   1340       tl_assert((minoffD & ~0xFFFF) == 0);
   1341       tl_assert((maxoffD & ~0xFFFF) == 0);
   1342 
   1343       if (maxoff < minoffD || maxoffD < minoff)
   1344          continue; /* no overlap */
   1345       if (minoff >= minoffD && maxoff <= maxoffD)
   1346          return True; /* completely contained in an always-defd section */
   1347 
   1348       VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
   1349    }
   1350    return False; /* could not find any containing section */
   1351 }
   1352 
   1353 
   1354 /* Generate into bb suitable actions to shadow this Put.  If the state
   1355    slice is marked 'always defined', do nothing.  Otherwise, write the
   1356    supplied V bits to the shadow state.  We can pass in either an
   1357    original atom or a V-atom, but not both.  In the former case the
   1358    relevant V-bits are then generated from the original.
   1359    We assume here that the definedness of GUARD has already been checked.
   1360 */
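        /* Sketch of the IR this generates for a guarded Put (illustrative,
           assuming MC_(clo_mc_level) > 1 and the slice is not always-defined):

              t_old = GET:ty(offset + layout->total_sizeB)   -- current shadow
              t_new = ITE(guard, vbits-of-atom, t_old)
              PUT(offset + layout->total_sizeB) = t_new

           so when the guard is false the shadow slot simply keeps its old
           value, mirroring the unmodified guest state. */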
   1361 static
   1362 void do_shadow_PUT ( MCEnv* mce,  Int offset,
   1363                      IRAtom* atom, IRAtom* vatom, IRExpr *guard )
   1364 {
   1365    IRType ty;
   1366 
   1367    // Don't do shadow PUTs if we're not doing undefined value checking.
   1368    // Their absence lets Vex's optimiser remove all the shadow computation
   1369    // that they depend on, which includes GETs of the shadow registers.
   1370    if (MC_(clo_mc_level) == 1)
   1371       return;
   1372 
   1373    if (atom) {
   1374       tl_assert(!vatom);
   1375       tl_assert(isOriginalAtom(mce, atom));
   1376       vatom = expr2vbits( mce, atom );
   1377    } else {
   1378       tl_assert(vatom);
   1379       tl_assert(isShadowAtom(mce, vatom));
   1380    }
   1381 
   1382    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1383    tl_assert(ty != Ity_I1);
   1384    tl_assert(ty != Ity_I128);
   1385    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1386       /* later: no ... */
   1387       /* emit code to emit a complaint if any of the vbits are 1. */
   1388       /* complainIfUndefined(mce, atom); */
   1389    } else {
   1390       /* Do a plain shadow Put. */
   1391       if (guard) {
   1392          /* If the guard expression evaluates to false we simply Put the value
   1393             that is already stored in the guest state slot */
   1394          IRAtom *cond, *iffalse;
   1395 
   1396          cond    = assignNew('V', mce, Ity_I1, guard);
   1397          iffalse = assignNew('V', mce, ty,
   1398                              IRExpr_Get(offset + mce->layout->total_sizeB, ty));
   1399          vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
   1400       }
   1401       stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
   1402    }
   1403 }
   1404 
   1405 
   1406 /* Generate into bb suitable actions to shadow this PutI.  If the
   1407    state slice is marked 'always defined', do nothing; otherwise
   1408    write the V bits for puti->data to the shadow register array. */
   1409 static
   1410 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
   1411 {
   1412    IRAtom* vatom;
   1413    IRType  ty, tyS;
   1414    Int     arrSize;
   1415    IRRegArray* descr = puti->descr;
   1416    IRAtom*     ix    = puti->ix;
   1417    Int         bias  = puti->bias;
   1418    IRAtom*     atom  = puti->data;
   1419 
   1420    // Don't do shadow PUTIs if we're not doing undefined value checking.
   1421    // Their absence lets Vex's optimiser remove all the shadow computation
   1422    // that they depend on, which includes GETIs of the shadow registers.
   1423    if (MC_(clo_mc_level) == 1)
   1424       return;
   1425 
   1426    tl_assert(isOriginalAtom(mce,atom));
   1427    vatom = expr2vbits( mce, atom );
   1428    tl_assert(sameKindedAtoms(atom, vatom));
   1429    ty   = descr->elemTy;
   1430    tyS  = shadowTypeV(ty);
   1431    arrSize = descr->nElems * sizeofIRType(ty);
   1432    tl_assert(ty != Ity_I1);
   1433    tl_assert(isOriginalAtom(mce,ix));
   1434    complainIfUndefined(mce, ix, NULL);
   1435    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1436       /* later: no ... */
   1437       /* emit code to emit a complaint if any of the vbits are 1. */
   1438       /* complainIfUndefined(mce, atom); */
   1439    } else {
   1440       /* Do a cloned version of the Put that refers to the shadow
   1441          area. */
   1442       IRRegArray* new_descr
   1443          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1444                          tyS, descr->nElems);
   1445       stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
   1446    }
   1447 }
   1448 
   1449 
   1450 /* Return an expression which contains the V bits corresponding to the
   1451    given GET (passed in in pieces).
   1452 */
   1453 static
   1454 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
   1455 {
   1456    IRType tyS = shadowTypeV(ty);
   1457    tl_assert(ty != Ity_I1);
   1458    tl_assert(ty != Ity_I128);
   1459    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1460       /* Always defined, return all zeroes of the relevant type */
   1461       return definedOfType(tyS);
   1462    } else {
   1463       /* return a cloned version of the Get that refers to the shadow
   1464          area. */
   1465       /* FIXME: this isn't an atom! */
   1466       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
   1467    }
   1468 }
   1469 
   1470 
   1471 /* Return an expression which contains the V bits corresponding to the
   1472    given GETI (passed in in pieces).
   1473 */
   1474 static
   1475 IRExpr* shadow_GETI ( MCEnv* mce,
   1476                       IRRegArray* descr, IRAtom* ix, Int bias )
   1477 {
   1478    IRType ty   = descr->elemTy;
   1479    IRType tyS  = shadowTypeV(ty);
   1480    Int arrSize = descr->nElems * sizeofIRType(ty);
   1481    tl_assert(ty != Ity_I1);
   1482    tl_assert(isOriginalAtom(mce,ix));
   1483    complainIfUndefined(mce, ix, NULL);
   1484    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1485       /* Always defined, return all zeroes of the relevant type */
   1486       return definedOfType(tyS);
   1487    } else {
   1488       /* return a cloned version of the Get that refers to the shadow
   1489          area. */
   1490       IRRegArray* new_descr
   1491          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1492                          tyS, descr->nElems);
   1493       return IRExpr_GetI( new_descr, ix, bias );
   1494    }
   1495 }
   1496 
   1497 
   1498 /*------------------------------------------------------------*/
   1499 /*--- Generating approximations for unknown operations,    ---*/
   1500 /*--- using lazy-propagate semantics                       ---*/
   1501 /*------------------------------------------------------------*/
   1502 
   1503 /* Lazy propagation of undefinedness from two values, resulting in the
   1504    specified shadow type.
   1505 */
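        /* In other words (illustrative): the result is treated as wholly
           undefined if either input shadow contains any undefined bit; no
           attempt is made to track which particular output bits are affected.
           For two I64 shadows and an I64 result this is just
           PCast64(UifU64(va1, va2)), as in the first special case below. */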
   1506 static
   1507 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
   1508 {
   1509    IRAtom* at;
   1510    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1511    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1512    tl_assert(isShadowAtom(mce,va1));
   1513    tl_assert(isShadowAtom(mce,va2));
   1514 
   1515    /* The general case is inefficient because PCast is an expensive
   1516       operation.  Here are some special cases which use PCast only
   1517       once rather than twice. */
   1518 
   1519    /* I64 x I64 -> I64 */
   1520    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
   1521       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
   1522       at = mkUifU(mce, Ity_I64, va1, va2);
   1523       at = mkPCastTo(mce, Ity_I64, at);
   1524       return at;
   1525    }
   1526 
   1527    /* I64 x I64 -> I32 */
   1528    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
   1529       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
   1530       at = mkUifU(mce, Ity_I64, va1, va2);
   1531       at = mkPCastTo(mce, Ity_I32, at);
   1532       return at;
   1533    }
   1534 
   1535    if (0) {
   1536       VG_(printf)("mkLazy2 ");
   1537       ppIRType(t1);
   1538       VG_(printf)("_");
   1539       ppIRType(t2);
   1540       VG_(printf)("_");
   1541       ppIRType(finalVty);
   1542       VG_(printf)("\n");
   1543    }
   1544 
   1545    /* General case: force everything via 32-bit intermediaries. */
   1546    at = mkPCastTo(mce, Ity_I32, va1);
   1547    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1548    at = mkPCastTo(mce, finalVty, at);
   1549    return at;
   1550 }
   1551 
   1552 
   1553 /* 3-arg version of the above. */
   1554 static
   1555 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
   1556                   IRAtom* va1, IRAtom* va2, IRAtom* va3 )
   1557 {
   1558    IRAtom* at;
   1559    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1560    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1561    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1562    tl_assert(isShadowAtom(mce,va1));
   1563    tl_assert(isShadowAtom(mce,va2));
   1564    tl_assert(isShadowAtom(mce,va3));
   1565 
   1566    /* The general case is inefficient because PCast is an expensive
   1567       operation.  Here are some special cases which use PCast only
   1568       twice rather than three times. */
   1569 
   1570    /* I32 x I64 x I64 -> I64 */
   1571    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1572    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1573        && finalVty == Ity_I64) {
   1574       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
   1575       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1576          mode indication which is fully defined, this should get
   1577          folded out later. */
   1578       at = mkPCastTo(mce, Ity_I64, va1);
   1579       /* Now fold in 2nd and 3rd args. */
   1580       at = mkUifU(mce, Ity_I64, at, va2);
   1581       at = mkUifU(mce, Ity_I64, at, va3);
   1582       /* and PCast once again. */
   1583       at = mkPCastTo(mce, Ity_I64, at);
   1584       return at;
   1585    }
   1586 
   1587    /* I32 x I8 x I64 -> I64 */
   1588    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
   1589        && finalVty == Ity_I64) {
   1590       if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
   1591       /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
   1592        * rounding mode indication which is fully defined, this should
   1593        * get folded out later.
   1594       */
   1595       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1596       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1597       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1598       at = mkUifU(mce, Ity_I64, at, va3);
   1599       /* and PCast once again. */
   1600       at = mkPCastTo(mce, Ity_I64, at);
   1601       return at;
   1602    }
   1603 
   1604    /* I32 x I64 x I64 -> I32 */
   1605    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1606        && finalVty == Ity_I32) {
   1607       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
   1608       at = mkPCastTo(mce, Ity_I64, va1);
   1609       at = mkUifU(mce, Ity_I64, at, va2);
   1610       at = mkUifU(mce, Ity_I64, at, va3);
   1611       at = mkPCastTo(mce, Ity_I32, at);
   1612       return at;
   1613    }
   1614 
   1615    /* I32 x I32 x I32 -> I32 */
   1616    /* 32-bit FP idiom, as (eg) happens on ARM */
   1617    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
   1618        && finalVty == Ity_I32) {
   1619       if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
   1620       at = va1;
   1621       at = mkUifU(mce, Ity_I32, at, va2);
   1622       at = mkUifU(mce, Ity_I32, at, va3);
   1623       at = mkPCastTo(mce, Ity_I32, at);
   1624       return at;
   1625    }
   1626 
   1627    /* I32 x I128 x I128 -> I128 */
   1628    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1629    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
   1630        && finalVty == Ity_I128) {
   1631       if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
   1632       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
   1633          mode indication which is fully defined, this should get
   1634          folded out later. */
   1635       at = mkPCastTo(mce, Ity_I128, va1);
   1636       /* Now fold in 2nd and 3rd args. */
   1637       at = mkUifU(mce, Ity_I128, at, va2);
   1638       at = mkUifU(mce, Ity_I128, at, va3);
   1639       /* and PCast once again. */
   1640       at = mkPCastTo(mce, Ity_I128, at);
   1641       return at;
   1642    }
   1643 
   1644    /* I32 x I8 x I128 -> I128 */
   1645    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1646    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
   1647        && finalVty == Ity_I128) {
   1648       if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
   1649       /* Use I64 as an intermediate type, which means PCasting all 3
   1650          args to I64 to start with. 1st arg is typically a rounding
   1651          mode indication which is fully defined, so we hope that it
   1652          will get folded out later. */
   1653       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1654       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1655       IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
   1656       /* Now UifU all three together. */
   1657       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1658       at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
   1659       /* and PCast once again. */
   1660       at = mkPCastTo(mce, Ity_I128, at);
   1661       return at;
   1662    }
   1663    if (1) {
   1664       VG_(printf)("mkLazy3: ");
   1665       ppIRType(t1);
   1666       VG_(printf)(" x ");
   1667       ppIRType(t2);
   1668       VG_(printf)(" x ");
   1669       ppIRType(t3);
   1670       VG_(printf)(" -> ");
   1671       ppIRType(finalVty);
   1672       VG_(printf)("\n");
   1673    }
   1674 
   1675    tl_assert(0);
   1676    /* General case: force everything via 32-bit intermediaries. */
   1677    /*
   1678    at = mkPCastTo(mce, Ity_I32, va1);
   1679    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1680    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
   1681    at = mkPCastTo(mce, finalVty, at);
   1682    return at;
   1683    */
   1684 }
   1685 
   1686 
   1687 /* 4-arg version of the above. */
   1688 static
   1689 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
   1690                   IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
   1691 {
   1692    IRAtom* at;
   1693    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1694    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1695    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1696    IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
   1697    tl_assert(isShadowAtom(mce,va1));
   1698    tl_assert(isShadowAtom(mce,va2));
   1699    tl_assert(isShadowAtom(mce,va3));
   1700    tl_assert(isShadowAtom(mce,va4));
   1701 
   1702    /* The general case is inefficient because PCast is an expensive
   1703       operation.  Here are some special cases which use PCast only
   1704       twice rather than four times. */
   1705 
   1706    /* I32 x I64 x I64 x I64 -> I64 */
   1707    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1708    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
   1709        && finalVty == Ity_I64) {
   1710       if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
   1711       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1712          mode indication which is fully defined, this should get
   1713          folded out later. */
   1714       at = mkPCastTo(mce, Ity_I64, va1);
   1715       /* Now fold in 2nd, 3rd, 4th args. */
   1716       at = mkUifU(mce, Ity_I64, at, va2);
   1717       at = mkUifU(mce, Ity_I64, at, va3);
   1718       at = mkUifU(mce, Ity_I64, at, va4);
   1719       /* and PCast once again. */
   1720       at = mkPCastTo(mce, Ity_I64, at);
   1721       return at;
   1722    }
   1723    /* I32 x I32 x I32 x I32 -> I32 */
   1724    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1725    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
   1726        && finalVty == Ity_I32) {
   1727       if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
   1728       at = va1;
   1729       /* Now fold in 2nd, 3rd, 4th args. */
   1730       at = mkUifU(mce, Ity_I32, at, va2);
   1731       at = mkUifU(mce, Ity_I32, at, va3);
   1732       at = mkUifU(mce, Ity_I32, at, va4);
   1733       at = mkPCastTo(mce, Ity_I32, at);
   1734       return at;
   1735    }
   1736 
   1737    if (1) {
   1738       VG_(printf)("mkLazy4: ");
   1739       ppIRType(t1);
   1740       VG_(printf)(" x ");
   1741       ppIRType(t2);
   1742       VG_(printf)(" x ");
   1743       ppIRType(t3);
   1744       VG_(printf)(" x ");
   1745       ppIRType(t4);
   1746       VG_(printf)(" -> ");
   1747       ppIRType(finalVty);
   1748       VG_(printf)("\n");
   1749    }
   1750 
   1751    tl_assert(0);
   1752 }
   1753 
   1754 
   1755 /* Do the lazy propagation game from a null-terminated vector of
   1756    atoms.  These are presumably the arguments to a helper call, so the
   1757    IRCallee info is also supplied in order that we can know which
   1758    arguments should be ignored (via the .mcx_mask field).
   1759 */
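        /* Illustrative example (hypothetical mask value): if cee->mcx_mask is
           0x5 (binary 101), args 0 and 2 are skipped entirely, and only the
           remaining args have their definedness PCast-ed and UifU-ed into the
           final result. */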
   1760 static
   1761 IRAtom* mkLazyN ( MCEnv* mce,
   1762                   IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
   1763 {
   1764    Int     i;
   1765    IRAtom* here;
   1766    IRAtom* curr;
   1767    IRType  mergeTy;
   1768    Bool    mergeTy64 = True;
   1769 
   1770    /* Decide on the type of the merge intermediary.  If all relevant
   1771       args are I64, then it's I64.  In all other circumstances, use
   1772       I32. */
   1773    for (i = 0; exprvec[i]; i++) {
   1774       tl_assert(i < 32);
   1775       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1776       if (cee->mcx_mask & (1<<i))
   1777          continue;
   1778       if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
   1779          mergeTy64 = False;
   1780    }
   1781 
   1782    mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
   1783    curr    = definedOfType(mergeTy);
   1784 
   1785    for (i = 0; exprvec[i]; i++) {
   1786       tl_assert(i < 32);
   1787       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1788       /* Only take notice of this arg if the callee's mc-exclusion
   1789          mask does not say it is to be excluded. */
   1790       if (cee->mcx_mask & (1<<i)) {
   1791          /* the arg is to be excluded from definedness checking.  Do
   1792             nothing. */
   1793          if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
   1794       } else {
   1795          /* calculate the arg's definedness, and pessimistically merge
   1796             it in. */
   1797          here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
   1798          curr = mergeTy64
   1799                    ? mkUifU64(mce, here, curr)
   1800                    : mkUifU32(mce, here, curr);
   1801       }
   1802    }
   1803    return mkPCastTo(mce, finalVtype, curr );
   1804 }
   1805 
   1806 
   1807 /*------------------------------------------------------------*/
   1808 /*--- Generating expensive sequences for exact carry-chain ---*/
   1809 /*--- propagation in add/sub and related operations.       ---*/
   1810 /*------------------------------------------------------------*/
   1811 
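        /* How the bounds trick below works (illustrative, shown at 8 bits for
           brevity): with aa = 0b00000110 and qaa = 0b00000001 (only bit 0
           undefined), a_min = aa & ~qaa = 0b00000110 and a_max = aa | qaa =
           0b00000111 are the smallest and largest values the true aa could
           take.  XORing (a_min + b_min) with (a_max + b_max) marks the result
           bits that differ between the two extreme sums -- an approximation of
           the positions the undefined inputs can influence via carries -- and
           OR-ing in (qaa | qbb) re-adds the directly undefined positions. */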
   1812 static
   1813 IRAtom* expensiveAddSub ( MCEnv*  mce,
   1814                           Bool    add,
   1815                           IRType  ty,
   1816                           IRAtom* qaa, IRAtom* qbb,
   1817                           IRAtom* aa,  IRAtom* bb )
   1818 {
   1819    IRAtom *a_min, *b_min, *a_max, *b_max;
   1820    IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
   1821 
   1822    tl_assert(isShadowAtom(mce,qaa));
   1823    tl_assert(isShadowAtom(mce,qbb));
   1824    tl_assert(isOriginalAtom(mce,aa));
   1825    tl_assert(isOriginalAtom(mce,bb));
   1826    tl_assert(sameKindedAtoms(qaa,aa));
   1827    tl_assert(sameKindedAtoms(qbb,bb));
   1828 
   1829    switch (ty) {
   1830       case Ity_I32:
   1831          opAND = Iop_And32;
   1832          opOR  = Iop_Or32;
   1833          opXOR = Iop_Xor32;
   1834          opNOT = Iop_Not32;
   1835          opADD = Iop_Add32;
   1836          opSUB = Iop_Sub32;
   1837          break;
   1838       case Ity_I64:
   1839          opAND = Iop_And64;
   1840          opOR  = Iop_Or64;
   1841          opXOR = Iop_Xor64;
   1842          opNOT = Iop_Not64;
   1843          opADD = Iop_Add64;
   1844          opSUB = Iop_Sub64;
   1845          break;
   1846       default:
   1847          VG_(tool_panic)("expensiveAddSub");
   1848    }
   1849 
   1850    // a_min = aa & ~qaa
   1851    a_min = assignNew('V', mce,ty,
   1852                      binop(opAND, aa,
   1853                                   assignNew('V', mce,ty, unop(opNOT, qaa))));
   1854 
   1855    // b_min = bb & ~qbb
   1856    b_min = assignNew('V', mce,ty,
   1857                      binop(opAND, bb,
   1858                                   assignNew('V', mce,ty, unop(opNOT, qbb))));
   1859 
   1860    // a_max = aa | qaa
   1861    a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
   1862 
   1863    // b_max = bb | qbb
   1864    b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
   1865 
   1866    if (add) {
   1867       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
   1868       return
   1869       assignNew('V', mce,ty,
   1870          binop( opOR,
   1871                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1872                 assignNew('V', mce,ty,
   1873                    binop( opXOR,
   1874                           assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
   1875                           assignNew('V', mce,ty, binop(opADD, a_max, b_max))
   1876                    )
   1877                 )
   1878          )
   1879       );
   1880    } else {
   1881       // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
   1882       return
   1883       assignNew('V', mce,ty,
   1884          binop( opOR,
   1885                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1886                 assignNew('V', mce,ty,
   1887                    binop( opXOR,
   1888                           assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
   1889                           assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
   1890                    )
   1891                 )
   1892          )
   1893       );
   1894    }
   1895 
   1896 }
   1897 
   1898 
   1899 static
   1900 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
   1901                                        IRAtom* atom, IRAtom* vatom )
   1902 {
   1903    IRType ty;
   1904    IROp xorOp, subOp, andOp;
   1905    IRExpr *one;
   1906    IRAtom *improver, *improved;
   1907    tl_assert(isShadowAtom(mce,vatom));
   1908    tl_assert(isOriginalAtom(mce,atom));
   1909    tl_assert(sameKindedAtoms(atom,vatom));
   1910 
   1911    switch (czop) {
   1912       case Iop_Ctz32:
   1913          ty = Ity_I32;
   1914          xorOp = Iop_Xor32;
   1915          subOp = Iop_Sub32;
   1916          andOp = Iop_And32;
   1917          one = mkU32(1);
   1918          break;
   1919       case Iop_Ctz64:
   1920          ty = Ity_I64;
   1921          xorOp = Iop_Xor64;
   1922          subOp = Iop_Sub64;
   1923          andOp = Iop_And64;
   1924          one = mkU64(1);
   1925          break;
   1926       default:
   1927          ppIROp(czop);
   1928          VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
   1929    }
   1930 
   1931    // improver = atom ^ (atom - 1)
   1932    //
   1933    // That is, improver has its low ctz(atom)+1 bits equal to one;
   1934    // all higher bits (if any) equal to zero.
   1935    improver = assignNew('V', mce,ty,
   1936                         binop(xorOp,
   1937                               atom,
   1938                               assignNew('V', mce, ty,
   1939                                         binop(subOp, atom, one))));
   1940 
   1941    // improved = vatom & improver
   1942    //
   1943    // That is, treat any V bits above the first ctz(atom) bits as
   1944    // That is, treat any V bits above the lowest ctz(atom)+1 bit
   1945    // positions as "defined".
   1946                         binop(andOp, vatom, improver));
   1947 
   1948    // Return pessimizing cast of improved.
   1949    return mkPCastTo(mce, ty, improved);
   1950 }
   1951 
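        /* Worked example for the above (illustrative): atom = 0b...01000, so
           ctz(atom) = 3.  Then atom - 1 = 0b...00111 and improver =
           atom ^ (atom - 1) = 0b...01111, i.e. ones in the bottom four bit
           positions.  AND-ing vatom with improver discards undefinedness in
           positions that cannot affect the Ctz result, and the final PCast
           makes the whole result undefined only if one of the surviving
           positions is undefined. */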
   1952 
   1953 /*------------------------------------------------------------*/
   1954 /*--- Scalar shifts.                                       ---*/
   1955 /*------------------------------------------------------------*/
   1956 
   1957 /* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
   1958    idea is to shift the definedness bits by the original shift amount.
   1959    This introduces 0s ("defined") in new positions for left shifts and
   1960    unsigned right shifts, and copies the top definedness bit for
   1961    signed right shifts.  So, conveniently, applying the original shift
   1962    operator to the definedness bits for the left arg is exactly the
   1963    right thing to do:
   1964 
   1965       (qaa << bb)
   1966 
   1967    However if the shift amount is undefined then the whole result
   1968    is undefined.  Hence need:
   1969 
   1970       (qaa << bb) `UifU` PCast(qbb)
   1971 
   1972    If the shift amount bb is a literal then qbb will say 'all defined'
   1973    and the UifU and PCast will get folded out by post-instrumentation
   1974    optimisation.
   1975 */
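        /* Illustrative instance of the above: for Shl32(aa, bb) the shadow
           result is UifU32( Shl32(qaa, bb), PCast32(qbb) ).  If bb is the
           constant 4, qbb is all zeroes, the PCast term folds away, and the
           result is just qaa shifted left by 4: the definedness bits move in
           lockstep with the data bits. */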
   1976 static IRAtom* scalarShift ( MCEnv*  mce,
   1977                              IRType  ty,
   1978                              IROp    original_op,
   1979                              IRAtom* qaa, IRAtom* qbb,
   1980                              IRAtom* aa,  IRAtom* bb )
   1981 {
   1982    tl_assert(isShadowAtom(mce,qaa));
   1983    tl_assert(isShadowAtom(mce,qbb));
   1984    tl_assert(isOriginalAtom(mce,aa));
   1985    tl_assert(isOriginalAtom(mce,bb));
   1986    tl_assert(sameKindedAtoms(qaa,aa));
   1987    tl_assert(sameKindedAtoms(qbb,bb));
   1988    return
   1989       assignNew(
   1990          'V', mce, ty,
   1991          mkUifU( mce, ty,
   1992                  assignNew('V', mce, ty, binop(original_op, qaa, bb)),
   1993                  mkPCastTo(mce, ty, qbb)
   1994          )
   1995    );
   1996 }
   1997 
   1998 
   1999 /*------------------------------------------------------------*/
   2000 /*--- Helpers for dealing with vector primops.             ---*/
   2001 /*------------------------------------------------------------*/
   2002 
   2003 /* Vector pessimisation -- pessimise within each lane individually. */
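        /* Illustrative semantics: CmpNEZ8x16 maps each 8-bit lane to 0xFF if
           the lane is nonzero and to 0x00 otherwise.  Applied to a shadow
           vector, this turns "some bit in this lane is undefined" into "every
           bit in this lane is undefined" -- the per-lane analogue of the
           scalar PCast used elsewhere. */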
   2004 
   2005 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
   2006 {
   2007    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
   2008 }
   2009 
   2010 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
   2011 {
   2012    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
   2013 }
   2014 
   2015 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
   2016 {
   2017    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
   2018 }
   2019 
   2020 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
   2021 {
   2022    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
   2023 }
   2024 
   2025 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
   2026 {
   2027    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
   2028 }
   2029 
   2030 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
   2031 {
   2032    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
   2033 }
   2034 
   2035 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
   2036 {
   2037    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
   2038 }
   2039 
   2040 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
   2041 {
   2042    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
   2043 }
   2044 
   2045 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
   2046 {
   2047    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
   2048 }
   2049 
   2050 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
   2051 {
   2052    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
   2053 }
   2054 
   2055 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
   2056 {
   2057    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
   2058 }
   2059 
   2060 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
   2061 {
   2062    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
   2063 }
   2064 
   2065 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
   2066 {
   2067    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
   2068 }
   2069 
   2070 
   2071 /* Here's a simple scheme capable of handling ops derived from SSE1
   2072    code, while only generating ops that can be efficiently
   2073    implemented in SSE1. */
   2074 
   2075 /* All-lanes versions are straightforward:
   2076 
   2077    binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
   2078 
   2079    unary32Fx4(x)      ==> PCast32x4(x#)
   2080 
   2081    Lowest-lane-only versions are more complex:
   2082 
   2083    binary32F0x4(x,y)  ==> SetV128lo32(
   2084                              x#,
   2085                              PCast32(V128to32(UifUV128(x#,y#)))
   2086                           )
   2087 
   2088    This is perhaps not so obvious.  In particular, it's faster to
   2089    do a V128-bit UifU and then take the bottom 32 bits than the more
   2090    obvious scheme of taking the bottom 32 bits of each operand
   2091    and doing a 32-bit UifU.  This is because UifU is fast and
   2092    chopping lanes off vector values is slow.
   2093 
   2094    Finally:
   2095 
   2096    unary32F0x4(x)     ==> SetV128lo32(
   2097                              x#,
   2098                              PCast32(V128to32(x#))
   2099                           )
   2100 
   2101    Where:
   2102 
   2103    PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
   2104    PCast32x4(v#) = CmpNEZ32x4(v#)
   2105 */
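        /* Note (illustrative): the two formulations agree because UifU on
           V128 values is just a bitwise OrV128 of the shadow words, so
           V128to32(UifUV128(x#,y#)) equals UifU32(V128to32(x#), V128to32(y#));
           the V128-wide form is preferred only because extracting lanes is
           the expensive part. */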
   2106 
   2107 static
   2108 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2109 {
   2110    IRAtom* at;
   2111    tl_assert(isShadowAtom(mce, vatomX));
   2112    tl_assert(isShadowAtom(mce, vatomY));
   2113    at = mkUifUV128(mce, vatomX, vatomY);
   2114    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
   2115    return at;
   2116 }
   2117 
   2118 static
   2119 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2120 {
   2121    IRAtom* at;
   2122    tl_assert(isShadowAtom(mce, vatomX));
   2123    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
   2124    return at;
   2125 }
   2126 
   2127 static
   2128 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2129 {
   2130    IRAtom* at;
   2131    tl_assert(isShadowAtom(mce, vatomX));
   2132    tl_assert(isShadowAtom(mce, vatomY));
   2133    at = mkUifUV128(mce, vatomX, vatomY);
   2134    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
   2135    at = mkPCastTo(mce, Ity_I32, at);
   2136    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2137    return at;
   2138 }
   2139 
   2140 static
   2141 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
   2142 {
   2143    IRAtom* at;
   2144    tl_assert(isShadowAtom(mce, vatomX));
   2145    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
   2146    at = mkPCastTo(mce, Ity_I32, at);
   2147    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2148    return at;
   2149 }
   2150 
   2151 /* --- ... and ... 64Fx2 versions of the same ... --- */
   2152 
   2153 static
   2154 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2155 {
   2156    IRAtom* at;
   2157    tl_assert(isShadowAtom(mce, vatomX));
   2158    tl_assert(isShadowAtom(mce, vatomY));
   2159    at = mkUifUV128(mce, vatomX, vatomY);
   2160    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
   2161    return at;
   2162 }
   2163 
   2164 static
   2165 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2166 {
   2167    IRAtom* at;
   2168    tl_assert(isShadowAtom(mce, vatomX));
   2169    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
   2170    return at;
   2171 }
   2172 
   2173 static
   2174 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2175 {
   2176    IRAtom* at;
   2177    tl_assert(isShadowAtom(mce, vatomX));
   2178    tl_assert(isShadowAtom(mce, vatomY));
   2179    at = mkUifUV128(mce, vatomX, vatomY);
   2180    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
   2181    at = mkPCastTo(mce, Ity_I64, at);
   2182    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2183    return at;
   2184 }
   2185 
   2186 static
   2187 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
   2188 {
   2189    IRAtom* at;
   2190    tl_assert(isShadowAtom(mce, vatomX));
   2191    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
   2192    at = mkPCastTo(mce, Ity_I64, at);
   2193    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2194    return at;
   2195 }
   2196 
   2197 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
   2198 
   2199 static
   2200 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2201 {
   2202    IRAtom* at;
   2203    tl_assert(isShadowAtom(mce, vatomX));
   2204    tl_assert(isShadowAtom(mce, vatomY));
   2205    at = mkUifU64(mce, vatomX, vatomY);
   2206    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
   2207    return at;
   2208 }
   2209 
   2210 static
   2211 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2212 {
   2213    IRAtom* at;
   2214    tl_assert(isShadowAtom(mce, vatomX));
   2215    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
   2216    return at;
   2217 }
   2218 
   2219 /* --- ... and ... 64Fx4 versions of the same ... --- */
   2220 
   2221 static
   2222 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2223 {
   2224    IRAtom* at;
   2225    tl_assert(isShadowAtom(mce, vatomX));
   2226    tl_assert(isShadowAtom(mce, vatomY));
   2227    at = mkUifUV256(mce, vatomX, vatomY);
   2228    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
   2229    return at;
   2230 }
   2231 
   2232 static
   2233 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2234 {
   2235    IRAtom* at;
   2236    tl_assert(isShadowAtom(mce, vatomX));
   2237    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
   2238    return at;
   2239 }
   2240 
   2241 /* --- ... and ... 32Fx8 versions of the same ... --- */
   2242 
   2243 static
   2244 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2245 {
   2246    IRAtom* at;
   2247    tl_assert(isShadowAtom(mce, vatomX));
   2248    tl_assert(isShadowAtom(mce, vatomY));
   2249    at = mkUifUV256(mce, vatomX, vatomY);
   2250    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
   2251    return at;
   2252 }
   2253 
   2254 static
   2255 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
   2256 {
   2257    IRAtom* at;
   2258    tl_assert(isShadowAtom(mce, vatomX));
   2259    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
   2260    return at;
   2261 }
   2262 
   2263 /* --- 64Fx2 binary FP ops, with rounding mode --- */
   2264 
   2265 static
   2266 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
   2267                                        IRAtom* vatomX, IRAtom* vatomY )
   2268 {
   2269    /* This is the same as binary64Fx2, except that we subsequently
   2270       pessimise vRM (definedness of the rounding mode), widen to 128
   2271       bits and UifU it into the result.  As with the scalar cases, if
   2272       the RM is a constant then it is defined and so this extra bit
   2273       will get constant-folded out later. */
   2274    // "do" the vector args
   2275    IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
   2276    // PCast the RM, and widen it to 128 bits
   2277    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2278    // Roll it into the result
   2279    t1 = mkUifUV128(mce, t1, t2);
   2280    return t1;
   2281 }
   2282 
   2283 /* --- ... and ... 32Fx4 versions of the same --- */
   2284 
   2285 static
   2286 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2287                                        IRAtom* vatomX, IRAtom* vatomY )
   2288 {
   2289    IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
   2290    // PCast the RM, and widen it to 128 bits
   2291    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2292    // Roll it into the result
   2293    t1 = mkUifUV128(mce, t1, t2);
   2294    return t1;
   2295 }
   2296 
   2297 /* --- ... and ... 64Fx4 versions of the same --- */
   2298 
   2299 static
   2300 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2301                                        IRAtom* vatomX, IRAtom* vatomY )
   2302 {
   2303    IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
   2304    // PCast the RM, and widen it to 256 bits
   2305    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2306    // Roll it into the result
   2307    t1 = mkUifUV256(mce, t1, t2);
   2308    return t1;
   2309 }
   2310 
   2311 /* --- ... and ... 32Fx8 versions of the same --- */
   2312 
   2313 static
   2314 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
   2315                                        IRAtom* vatomX, IRAtom* vatomY )
   2316 {
   2317    IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
   2318    // PCast the RM, and widen it to 256 bits
   2319    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2320    // Roll it into the result
   2321    t1 = mkUifUV256(mce, t1, t2);
   2322    return t1;
   2323 }
   2324 
   2325 
   2326 /* --- --- Vector saturated narrowing --- --- */
   2327 
   2328 /* We used to do something very clever here, but on closer inspection
   2329    (2011-Jun-15), and in particular bug #279698, it turns out to be
   2330    wrong.  Part of the problem came from the fact that for a long
   2331    time, the IR primops to do with saturated narrowing were
   2332    underspecified and managed to confuse multiple cases which needed
   2333    to be separate: the op names had a signedness qualifier, but in
   2334    fact the source and destination signednesses needed to be specified
   2335    independently, so the op names really need two independent
   2336    signedness specifiers.
   2337 
   2338    As of 2011-Jun-15 (ish) the underspecification was sorted out
   2339    properly.  The incorrect instrumentation remained, though.  That
   2340    has now (2011-Oct-22) been fixed.
   2341 
   2342    What we now do is simple:
   2343 
   2344    Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
   2345    number of lanes, X is the source lane width and signedness, and Y
   2346    is the destination lane width and signedness.  In all cases the
   2347    destination lane width is half the source lane width, so the names
   2348    have a bit of redundancy, but are at least easy to read.
   2349 
   2350    For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
   2351    to unsigned 16s.
   2352 
   2353    Let Vanilla(OP) be a function that takes OP, one of these
   2354    saturating narrowing ops, and produces the same "shaped" narrowing
   2355    op which is not saturating, but merely dumps the most significant
   2356    bits.  "same shape" means that the lane numbers and widths are the
   2357    same as with OP.
   2358 
   2359    For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
   2360                   = Iop_NarrowBin32to16x8,
   2361    that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
   2362    dumping the top half of each lane.
   2363 
   2364    So, with that in place, the scheme is simple, and it is simple to
   2365    pessimise each lane individually and then apply Vanilla(OP) so as
   2366    to get the result in the right "shape".  If the original OP is
   2367    QNarrowBinXtoYxZ then we produce
   2368 
   2369    Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
   2370 
   2371    or for the case when OP is unary (Iop_QNarrowUn*)
   2372 
   2373    Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
   2374 */
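        /* Concrete instance (illustrative): for OP = Iop_QNarrowBin32Sto16Sx8,
           Vanilla(OP) = Iop_NarrowBin32to16x8, and the shadow result is

              NarrowBin32to16x8( CmpNEZ32x4(vatom1), CmpNEZ32x4(vatom2) )

           i.e. each 32-bit source lane is first smeared to all-ones if any of
           its bits is undefined, and the vanilla narrowing then keeps a 16-bit
           slice of each smeared lane. */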
   2375 static
   2376 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
   2377 {
   2378    switch (qnarrowOp) {
   2379       /* Binary: (128, 128) -> 128 */
   2380       case Iop_QNarrowBin16Sto8Ux16:
   2381       case Iop_QNarrowBin16Sto8Sx16:
   2382       case Iop_QNarrowBin16Uto8Ux16:
   2383       case Iop_QNarrowBin64Sto32Sx4:
   2384       case Iop_QNarrowBin64Uto32Ux4:
   2385          return Iop_NarrowBin16to8x16;
   2386       case Iop_QNarrowBin32Sto16Ux8:
   2387       case Iop_QNarrowBin32Sto16Sx8:
   2388       case Iop_QNarrowBin32Uto16Ux8:
   2389          return Iop_NarrowBin32to16x8;
   2390       /* Binary: (64, 64) -> 64 */
   2391       case Iop_QNarrowBin32Sto16Sx4:
   2392          return Iop_NarrowBin32to16x4;
   2393       case Iop_QNarrowBin16Sto8Ux8:
   2394       case Iop_QNarrowBin16Sto8Sx8:
   2395          return Iop_NarrowBin16to8x8;
   2396       /* Unary: 128 -> 64 */
   2397       case Iop_QNarrowUn64Uto32Ux2:
   2398       case Iop_QNarrowUn64Sto32Sx2:
   2399       case Iop_QNarrowUn64Sto32Ux2:
   2400          return Iop_NarrowUn64to32x2;
   2401       case Iop_QNarrowUn32Uto16Ux4:
   2402       case Iop_QNarrowUn32Sto16Sx4:
   2403       case Iop_QNarrowUn32Sto16Ux4:
   2404          return Iop_NarrowUn32to16x4;
   2405       case Iop_QNarrowUn16Uto8Ux8:
   2406       case Iop_QNarrowUn16Sto8Sx8:
   2407       case Iop_QNarrowUn16Sto8Ux8:
   2408          return Iop_NarrowUn16to8x8;
   2409       default:
   2410          ppIROp(qnarrowOp);
   2411          VG_(tool_panic)("vanillaNarrowOpOfShape");
   2412    }
   2413 }
   2414 
   2415 static
   2416 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
   2417                               IRAtom* vatom1, IRAtom* vatom2)
   2418 {
   2419    IRAtom *at1, *at2, *at3;
   2420    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2421    switch (narrow_op) {
   2422       case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
   2423       case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
   2424       case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
   2425       case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
   2426       case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
   2427       case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
   2428       case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
   2429       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
   2430       default: VG_(tool_panic)("vectorNarrowBinV128");
   2431    }
   2432    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2433    tl_assert(isShadowAtom(mce,vatom1));
   2434    tl_assert(isShadowAtom(mce,vatom2));
   2435    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2436    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
   2437    at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
   2438    return at3;
   2439 }
   2440 
   2441 static
   2442 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
   2443                             IRAtom* vatom1, IRAtom* vatom2)
   2444 {
   2445    IRAtom *at1, *at2, *at3;
   2446    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2447    switch (narrow_op) {
   2448       case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
   2449       case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
   2450       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
   2451       default: VG_(tool_panic)("vectorNarrowBin64");
   2452    }
   2453    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2454    tl_assert(isShadowAtom(mce,vatom1));
   2455    tl_assert(isShadowAtom(mce,vatom2));
   2456    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
   2457    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
   2458    at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
   2459    return at3;
   2460 }
   2461 
   2462 static
   2463 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
   2464                              IRAtom* vatom1)
   2465 {
   2466    IRAtom *at1, *at2;
   2467    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2468    tl_assert(isShadowAtom(mce,vatom1));
   2469    /* For vanilla narrowing (non-saturating), we can just apply
   2470       the op directly to the V bits. */
   2471    switch (narrow_op) {
   2472       case Iop_NarrowUn16to8x8:
   2473       case Iop_NarrowUn32to16x4:
   2474       case Iop_NarrowUn64to32x2:
   2475          at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
   2476          return at1;
   2477       default:
   2478          break; /* Do Plan B */
   2479    }
   2480    /* Plan B: for ops that involve a saturation operation on the args,
   2481       we must PCast before the vanilla narrow. */
   2482    switch (narrow_op) {
   2483       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
   2484       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
   2485       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
   2486       case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
   2487       case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
   2488       case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
   2489       case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
   2490       case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
   2491       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
   2492       default: VG_(tool_panic)("vectorNarrowUnV128");
   2493    }
   2494    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2495    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2496    at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
   2497    return at2;
   2498 }
   2499 
   2500 static
   2501 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
   2502                          IRAtom* vatom1)
   2503 {
   2504    IRAtom *at1, *at2;
   2505    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2506    switch (longen_op) {
   2507       case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
   2508       case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
   2509       case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
   2510       case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
   2511       case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
   2512       case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
   2513       default: VG_(tool_panic)("vectorWidenI64");
   2514    }
   2515    tl_assert(isShadowAtom(mce,vatom1));
   2516    at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
   2517    at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
   2518    return at2;
   2519 }
   2520 
   2521 
   2522 /* --- --- Vector integer arithmetic --- --- */
   2523 
   2524 /* Simple ... UifU the args and per-lane pessimise the results. */
   2525 
   2526 /* --- V256-bit versions --- */
   2527 
   2528 static
   2529 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2530 {
   2531    IRAtom* at;
   2532    at = mkUifUV256(mce, vatom1, vatom2);
   2533    at = mkPCast8x32(mce, at);
   2534    return at;
   2535 }
   2536 
   2537 static
   2538 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2539 {
   2540    IRAtom* at;
   2541    at = mkUifUV256(mce, vatom1, vatom2);
   2542    at = mkPCast16x16(mce, at);
   2543    return at;
   2544 }
   2545 
   2546 static
   2547 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2548 {
   2549    IRAtom* at;
   2550    at = mkUifUV256(mce, vatom1, vatom2);
   2551    at = mkPCast32x8(mce, at);
   2552    return at;
   2553 }
   2554 
   2555 static
   2556 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2557 {
   2558    IRAtom* at;
   2559    at = mkUifUV256(mce, vatom1, vatom2);
   2560    at = mkPCast64x4(mce, at);
   2561    return at;
   2562 }
   2563 
   2564 /* --- V128-bit versions --- */
   2565 
   2566 static
   2567 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2568 {
   2569    IRAtom* at;
   2570    at = mkUifUV128(mce, vatom1, vatom2);
   2571    at = mkPCast8x16(mce, at);
   2572    return at;
   2573 }
   2574 
   2575 static
   2576 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2577 {
   2578    IRAtom* at;
   2579    at = mkUifUV128(mce, vatom1, vatom2);
   2580    at = mkPCast16x8(mce, at);
   2581    return at;
   2582 }
   2583 
   2584 static
   2585 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2586 {
   2587    IRAtom* at;
   2588    at = mkUifUV128(mce, vatom1, vatom2);
   2589    at = mkPCast32x4(mce, at);
   2590    return at;
   2591 }
   2592 
   2593 static
   2594 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2595 {
   2596    IRAtom* at;
   2597    at = mkUifUV128(mce, vatom1, vatom2);
   2598    at = mkPCast64x2(mce, at);
   2599    return at;
   2600 }
   2601 
   2602 /* --- 64-bit versions --- */
   2603 
   2604 static
   2605 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2606 {
   2607    IRAtom* at;
   2608    at = mkUifU64(mce, vatom1, vatom2);
   2609    at = mkPCast8x8(mce, at);
   2610    return at;
   2611 }
   2612 
   2613 static
   2614 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2615 {
   2616    IRAtom* at;
   2617    at = mkUifU64(mce, vatom1, vatom2);
   2618    at = mkPCast16x4(mce, at);
   2619    return at;
   2620 }
   2621 
   2622 static
   2623 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2624 {
   2625    IRAtom* at;
   2626    at = mkUifU64(mce, vatom1, vatom2);
   2627    at = mkPCast32x2(mce, at);
   2628    return at;
   2629 }
   2630 
   2631 static
   2632 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2633 {
   2634    IRAtom* at;
   2635    at = mkUifU64(mce, vatom1, vatom2);
   2636    at = mkPCastTo(mce, Ity_I64, at);
   2637    return at;
   2638 }
   2639 
   2640 /* --- 32-bit versions --- */
   2641 
   2642 static
   2643 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2644 {
   2645    IRAtom* at;
   2646    at = mkUifU32(mce, vatom1, vatom2);
   2647    at = mkPCast8x4(mce, at);
   2648    return at;
   2649 }
   2650 
   2651 static
   2652 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2653 {
   2654    IRAtom* at;
   2655    at = mkUifU32(mce, vatom1, vatom2);
   2656    at = mkPCast16x2(mce, at);
   2657    return at;
   2658 }
   2659 
   2660 
   2661 /*------------------------------------------------------------*/
   2662 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
   2663 /*------------------------------------------------------------*/
   2664 
   2665 static
   2666 IRAtom* expr2vbits_Qop ( MCEnv* mce,
   2667                          IROp op,
   2668                          IRAtom* atom1, IRAtom* atom2,
   2669                          IRAtom* atom3, IRAtom* atom4 )
   2670 {
   2671    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2672    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2673    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2674    IRAtom* vatom4 = expr2vbits( mce, atom4 );
   2675 
   2676    tl_assert(isOriginalAtom(mce,atom1));
   2677    tl_assert(isOriginalAtom(mce,atom2));
   2678    tl_assert(isOriginalAtom(mce,atom3));
   2679    tl_assert(isOriginalAtom(mce,atom4));
   2680    tl_assert(isShadowAtom(mce,vatom1));
   2681    tl_assert(isShadowAtom(mce,vatom2));
   2682    tl_assert(isShadowAtom(mce,vatom3));
   2683    tl_assert(isShadowAtom(mce,vatom4));
   2684    tl_assert(sameKindedAtoms(atom1,vatom1));
   2685    tl_assert(sameKindedAtoms(atom2,vatom2));
   2686    tl_assert(sameKindedAtoms(atom3,vatom3));
   2687    tl_assert(sameKindedAtoms(atom4,vatom4));
   2688    switch (op) {
   2689       case Iop_MAddF64:
   2690       case Iop_MAddF64r32:
   2691       case Iop_MSubF64:
   2692       case Iop_MSubF64r32:
   2693          /* I32(rm) x F64 x F64 x F64 -> F64 */
   2694          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
   2695 
   2696       case Iop_MAddF32:
   2697       case Iop_MSubF32:
   2698          /* I32(rm) x F32 x F32 x F32 -> F32 */
   2699          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
   2700 
   2701       /* V256-bit data-steering */
   2702       case Iop_64x4toV256:
   2703          return assignNew('V', mce, Ity_V256,
   2704                           IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
   2705 
   2706       default:
   2707          ppIROp(op);
   2708          VG_(tool_panic)("memcheck:expr2vbits_Qop");
   2709    }
   2710 }
   2711 
   2712 
   2713 static
   2714 IRAtom* expr2vbits_Triop ( MCEnv* mce,
   2715                            IROp op,
   2716                            IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
   2717 {
   2718    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2719    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2720    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2721 
   2722    tl_assert(isOriginalAtom(mce,atom1));
   2723    tl_assert(isOriginalAtom(mce,atom2));
   2724    tl_assert(isOriginalAtom(mce,atom3));
   2725    tl_assert(isShadowAtom(mce,vatom1));
   2726    tl_assert(isShadowAtom(mce,vatom2));
   2727    tl_assert(isShadowAtom(mce,vatom3));
   2728    tl_assert(sameKindedAtoms(atom1,vatom1));
   2729    tl_assert(sameKindedAtoms(atom2,vatom2));
   2730    tl_assert(sameKindedAtoms(atom3,vatom3));
   2731    switch (op) {
   2732       case Iop_AddF128:
   2733       case Iop_AddD128:
   2734       case Iop_SubF128:
   2735       case Iop_SubD128:
   2736       case Iop_MulF128:
   2737       case Iop_MulD128:
   2738       case Iop_DivF128:
   2739       case Iop_DivD128:
   2740       case Iop_QuantizeD128:
   2741          /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
   2742          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2743       case Iop_AddF64:
   2744       case Iop_AddD64:
   2745       case Iop_AddF64r32:
   2746       case Iop_SubF64:
   2747       case Iop_SubD64:
   2748       case Iop_SubF64r32:
   2749       case Iop_MulF64:
   2750       case Iop_MulD64:
   2751       case Iop_MulF64r32:
   2752       case Iop_DivF64:
   2753       case Iop_DivD64:
   2754       case Iop_DivF64r32:
   2755       case Iop_ScaleF64:
   2756       case Iop_Yl2xF64:
   2757       case Iop_Yl2xp1F64:
   2758       case Iop_AtanF64:
   2759       case Iop_PRemF64:
   2760       case Iop_PRem1F64:
   2761       case Iop_QuantizeD64:
   2762          /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
   2763          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2764       case Iop_PRemC3210F64:
   2765       case Iop_PRem1C3210F64:
   2766          /* I32(rm) x F64 x F64 -> I32 */
   2767          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
   2768       case Iop_AddF32:
   2769       case Iop_SubF32:
   2770       case Iop_MulF32:
   2771       case Iop_DivF32:
   2772          /* I32(rm) x F32 x F32 -> I32 */
   2773          /* I32(rm) x F32 x F32 -> F32 */
   2774       case Iop_SignificanceRoundD64:
   2775          /* IRRoundingMode(I32) x I8 x D64 -> D64 */
   2776          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2777       case Iop_SignificanceRoundD128:
   2778          /* IRRoundingMode(I32) x I8 x D128 -> D128 */
   2779          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
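              /* For the Extract, SetElem and BCD cases below, one argument
                 is an index or small immediate which steers data rather
                 than contributing data bits.  So insist that it is defined
                 (complainIfUndefined) and then pass the original value
                 through unchanged, while the same op is applied to the
                 other operands' V bits. */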
   2780       case Iop_ExtractV128:
   2781          complainIfUndefined(mce, atom3, NULL);
   2782          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2783       case Iop_Extract64:
   2784          complainIfUndefined(mce, atom3, NULL);
   2785          return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
   2786       case Iop_SetElem8x8:
   2787       case Iop_SetElem16x4:
   2788       case Iop_SetElem32x2:
   2789          complainIfUndefined(mce, atom2, NULL);
   2790          return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
   2791       /* BCDIops */
   2792       case Iop_BCDAdd:
   2793       case Iop_BCDSub:
   2794          complainIfUndefined(mce, atom3, NULL);
   2795          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2796 
   2797       /* Vector FP with rounding mode as the first arg */
   2798       case Iop_Add64Fx2:
   2799       case Iop_Sub64Fx2:
   2800       case Iop_Mul64Fx2:
   2801       case Iop_Div64Fx2:
   2802          return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
   2803 
   2804       case Iop_Add32Fx4:
   2805       case Iop_Sub32Fx4:
   2806       case Iop_Mul32Fx4:
   2807       case Iop_Div32Fx4:
   2808          return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2809 
   2810       case Iop_Add64Fx4:
   2811       case Iop_Sub64Fx4:
   2812       case Iop_Mul64Fx4:
   2813       case Iop_Div64Fx4:
   2814          return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2815 
   2816       case Iop_Add32Fx8:
   2817       case Iop_Sub32Fx8:
   2818       case Iop_Mul32Fx8:
   2819       case Iop_Div32Fx8:
   2820          return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
   2821 
   2822       default:
   2823          ppIROp(op);
   2824          VG_(tool_panic)("memcheck:expr2vbits_Triop");
   2825    }
   2826 }
   2827 
   2828 
   2829 static
   2830 IRAtom* expr2vbits_Binop ( MCEnv* mce,
   2831                            IROp op,
   2832                            IRAtom* atom1, IRAtom* atom2 )
   2833 {
   2834    IRType  and_or_ty;
   2835    IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
   2836    IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
   2837    IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
   2838 
   2839    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2840    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2841 
   2842    tl_assert(isOriginalAtom(mce,atom1));
   2843    tl_assert(isOriginalAtom(mce,atom2));
   2844    tl_assert(isShadowAtom(mce,vatom1));
   2845    tl_assert(isShadowAtom(mce,vatom2));
   2846    tl_assert(sameKindedAtoms(atom1,vatom1));
   2847    tl_assert(sameKindedAtoms(atom2,vatom2));
   2848    switch (op) {
   2849 
   2850       /* 32-bit SIMD */
   2851 
   2852       case Iop_Add16x2:
   2853       case Iop_HAdd16Ux2:
   2854       case Iop_HAdd16Sx2:
   2855       case Iop_Sub16x2:
   2856       case Iop_HSub16Ux2:
   2857       case Iop_HSub16Sx2:
   2858       case Iop_QAdd16Sx2:
   2859       case Iop_QSub16Sx2:
   2860       case Iop_QSub16Ux2:
   2861       case Iop_QAdd16Ux2:
   2862          return binary16Ix2(mce, vatom1, vatom2);
   2863 
   2864       case Iop_Add8x4:
   2865       case Iop_HAdd8Ux4:
   2866       case Iop_HAdd8Sx4:
   2867       case Iop_Sub8x4:
   2868       case Iop_HSub8Ux4:
   2869       case Iop_HSub8Sx4:
   2870       case Iop_QSub8Ux4:
   2871       case Iop_QAdd8Ux4:
   2872       case Iop_QSub8Sx4:
   2873       case Iop_QAdd8Sx4:
   2874          return binary8Ix4(mce, vatom1, vatom2);
   2875 
   2876       /* 64-bit SIMD */
   2877 
   2878       case Iop_ShrN8x8:
   2879       case Iop_ShrN16x4:
   2880       case Iop_ShrN32x2:
   2881       case Iop_SarN8x8:
   2882       case Iop_SarN16x4:
   2883       case Iop_SarN32x2:
   2884       case Iop_ShlN16x4:
   2885       case Iop_ShlN32x2:
   2886       case Iop_ShlN8x8:
   2887          /* Same scheme as with all other shifts. */
   2888          complainIfUndefined(mce, atom2, NULL);
   2889          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
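                 /* ("Same scheme" meaning: insist that the shift amount is
                    completely defined, then apply the very same shift to
                    the V bits, so definedness travels with the data bits it
                    describes; shifted-in bits are constants and hence
                    defined.) */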
   2890 
   2891       case Iop_QNarrowBin32Sto16Sx4:
   2892       case Iop_QNarrowBin16Sto8Sx8:
   2893       case Iop_QNarrowBin16Sto8Ux8:
   2894          return vectorNarrowBin64(mce, op, vatom1, vatom2);
   2895 
   2896       case Iop_Min8Ux8:
   2897       case Iop_Min8Sx8:
   2898       case Iop_Max8Ux8:
   2899       case Iop_Max8Sx8:
   2900       case Iop_Avg8Ux8:
   2901       case Iop_QSub8Sx8:
   2902       case Iop_QSub8Ux8:
   2903       case Iop_Sub8x8:
   2904       case Iop_CmpGT8Sx8:
   2905       case Iop_CmpGT8Ux8:
   2906       case Iop_CmpEQ8x8:
   2907       case Iop_QAdd8Sx8:
   2908       case Iop_QAdd8Ux8:
   2909       case Iop_QSal8x8:
   2910       case Iop_QShl8x8:
   2911       case Iop_Add8x8:
   2912       case Iop_Mul8x8:
   2913       case Iop_PolynomialMul8x8:
   2914          return binary8Ix8(mce, vatom1, vatom2);
   2915 
   2916       case Iop_Min16Sx4:
   2917       case Iop_Min16Ux4:
   2918       case Iop_Max16Sx4:
   2919       case Iop_Max16Ux4:
   2920       case Iop_Avg16Ux4:
   2921       case Iop_QSub16Ux4:
   2922       case Iop_QSub16Sx4:
   2923       case Iop_Sub16x4:
   2924       case Iop_Mul16x4:
   2925       case Iop_MulHi16Sx4:
   2926       case Iop_MulHi16Ux4:
   2927       case Iop_CmpGT16Sx4:
   2928       case Iop_CmpGT16Ux4:
   2929       case Iop_CmpEQ16x4:
   2930       case Iop_QAdd16Sx4:
   2931       case Iop_QAdd16Ux4:
   2932       case Iop_QSal16x4:
   2933       case Iop_QShl16x4:
   2934       case Iop_Add16x4:
   2935       case Iop_QDMulHi16Sx4:
   2936       case Iop_QRDMulHi16Sx4:
   2937          return binary16Ix4(mce, vatom1, vatom2);
   2938 
   2939       case Iop_Sub32x2:
   2940       case Iop_Mul32x2:
   2941       case Iop_Max32Sx2:
   2942       case Iop_Max32Ux2:
   2943       case Iop_Min32Sx2:
   2944       case Iop_Min32Ux2:
   2945       case Iop_CmpGT32Sx2:
   2946       case Iop_CmpGT32Ux2:
   2947       case Iop_CmpEQ32x2:
   2948       case Iop_Add32x2:
   2949       case Iop_QAdd32Ux2:
   2950       case Iop_QAdd32Sx2:
   2951       case Iop_QSub32Ux2:
   2952       case Iop_QSub32Sx2:
   2953       case Iop_QSal32x2:
   2954       case Iop_QShl32x2:
   2955       case Iop_QDMulHi32Sx2:
   2956       case Iop_QRDMulHi32Sx2:
   2957          return binary32Ix2(mce, vatom1, vatom2);
   2958 
   2959       case Iop_QSub64Ux1:
   2960       case Iop_QSub64Sx1:
   2961       case Iop_QAdd64Ux1:
   2962       case Iop_QAdd64Sx1:
   2963       case Iop_QSal64x1:
   2964       case Iop_QShl64x1:
   2965       case Iop_Sal64x1:
   2966          return binary64Ix1(mce, vatom1, vatom2);
   2967 
   2968       case Iop_QShlN8Sx8:
   2969       case Iop_QShlN8x8:
   2970       case Iop_QSalN8x8:
   2971          complainIfUndefined(mce, atom2, NULL);
   2972          return mkPCast8x8(mce, vatom1);
   2973 
   2974       case Iop_QShlN16Sx4:
   2975       case Iop_QShlN16x4:
   2976       case Iop_QSalN16x4:
   2977          complainIfUndefined(mce, atom2, NULL);
   2978          return mkPCast16x4(mce, vatom1);
   2979 
   2980       case Iop_QShlN32Sx2:
   2981       case Iop_QShlN32x2:
   2982       case Iop_QSalN32x2:
   2983          complainIfUndefined(mce, atom2, NULL);
   2984          return mkPCast32x2(mce, vatom1);
   2985 
   2986       case Iop_QShlN64Sx1:
   2987       case Iop_QShlN64x1:
   2988       case Iop_QSalN64x1:
   2989          complainIfUndefined(mce, atom2, NULL);
   2990          return mkPCast32x2(mce, vatom1);
   2991 
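              /* Pairwise min/max: note that Iop_PwMax32Ux2 (and the 16x4 and
                 8x8 equivalents below) is used whichever variant is being
                 shadowed.  Both operands have already been PCast'd, so every
                 lane is all-0s or all-1s, and an unsigned pairwise max of
                 such lanes simply ORs each pair -- in effect a per-pair
                 UifU. */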
   2992       case Iop_PwMax32Sx2:
   2993       case Iop_PwMax32Ux2:
   2994       case Iop_PwMin32Sx2:
   2995       case Iop_PwMin32Ux2:
   2996       case Iop_PwMax32Fx2:
   2997       case Iop_PwMin32Fx2:
   2998          return assignNew('V', mce, Ity_I64,
   2999                           binop(Iop_PwMax32Ux2,
   3000                                 mkPCast32x2(mce, vatom1),
   3001                                 mkPCast32x2(mce, vatom2)));
   3002 
   3003       case Iop_PwMax16Sx4:
   3004       case Iop_PwMax16Ux4:
   3005       case Iop_PwMin16Sx4:
   3006       case Iop_PwMin16Ux4:
   3007          return assignNew('V', mce, Ity_I64,
   3008                           binop(Iop_PwMax16Ux4,
   3009                                 mkPCast16x4(mce, vatom1),
   3010                                 mkPCast16x4(mce, vatom2)));
   3011 
   3012       case Iop_PwMax8Sx8:
   3013       case Iop_PwMax8Ux8:
   3014       case Iop_PwMin8Sx8:
   3015       case Iop_PwMin8Ux8:
   3016          return assignNew('V', mce, Ity_I64,
   3017                           binop(Iop_PwMax8Ux8,
   3018                                 mkPCast8x8(mce, vatom1),
   3019                                 mkPCast8x8(mce, vatom2)));
   3020 
   3021       case Iop_PwAdd32x2:
   3022       case Iop_PwAdd32Fx2:
   3023          return mkPCast32x2(mce,
   3024                assignNew('V', mce, Ity_I64,
   3025                          binop(Iop_PwAdd32x2,
   3026                                mkPCast32x2(mce, vatom1),
   3027                                mkPCast32x2(mce, vatom2))));
   3028 
   3029       case Iop_PwAdd16x4:
   3030          return mkPCast16x4(mce,
   3031                assignNew('V', mce, Ity_I64,
   3032                          binop(op, mkPCast16x4(mce, vatom1),
   3033                                    mkPCast16x4(mce, vatom2))));
   3034 
   3035       case Iop_PwAdd8x8:
   3036          return mkPCast8x8(mce,
   3037                assignNew('V', mce, Ity_I64,
   3038                          binop(op, mkPCast8x8(mce, vatom1),
   3039                                    mkPCast8x8(mce, vatom2))));
   3040 
   3041       case Iop_Shl8x8:
   3042       case Iop_Shr8x8:
   3043       case Iop_Sar8x8:
   3044       case Iop_Sal8x8:
   3045          return mkUifU64(mce,
   3046                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3047                    mkPCast8x8(mce,vatom2)
   3048                 );
   3049 
   3050       case Iop_Shl16x4:
   3051       case Iop_Shr16x4:
   3052       case Iop_Sar16x4:
   3053       case Iop_Sal16x4:
   3054          return mkUifU64(mce,
   3055                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3056                    mkPCast16x4(mce,vatom2)
   3057                 );
   3058 
   3059       case Iop_Shl32x2:
   3060       case Iop_Shr32x2:
   3061       case Iop_Sar32x2:
   3062       case Iop_Sal32x2:
   3063          return mkUifU64(mce,
   3064                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3065                    mkPCast32x2(mce,vatom2)
   3066                 );
   3067 
   3068       /* 64-bit data-steering */
   3069       case Iop_InterleaveLO32x2:
   3070       case Iop_InterleaveLO16x4:
   3071       case Iop_InterleaveLO8x8:
   3072       case Iop_InterleaveHI32x2:
   3073       case Iop_InterleaveHI16x4:
   3074       case Iop_InterleaveHI8x8:
   3075       case Iop_CatOddLanes8x8:
   3076       case Iop_CatEvenLanes8x8:
   3077       case Iop_CatOddLanes16x4:
   3078       case Iop_CatEvenLanes16x4:
   3079       case Iop_InterleaveOddLanes8x8:
   3080       case Iop_InterleaveEvenLanes8x8:
   3081       case Iop_InterleaveOddLanes16x4:
   3082       case Iop_InterleaveEvenLanes16x4:
   3083          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3084 
   3085       case Iop_GetElem8x8:
   3086          complainIfUndefined(mce, atom2, NULL);
   3087          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3088       case Iop_GetElem16x4:
   3089          complainIfUndefined(mce, atom2, NULL);
   3090          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3091       case Iop_GetElem32x2:
   3092          complainIfUndefined(mce, atom2, NULL);
   3093          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3094 
   3095       /* Perm8x8: rearrange values in left arg using steering values
   3096         from right arg.  So rearrange the vbits in the same way but
   3097         pessimise wrt steering values. */
   3098       case Iop_Perm8x8:
   3099          return mkUifU64(
   3100                    mce,
   3101                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3102                    mkPCast8x8(mce, vatom2)
   3103                 );
   3104 
   3105       /* V128-bit SIMD */
   3106 
   3107       case Iop_ShrN8x16:
   3108       case Iop_ShrN16x8:
   3109       case Iop_ShrN32x4:
   3110       case Iop_ShrN64x2:
   3111       case Iop_SarN8x16:
   3112       case Iop_SarN16x8:
   3113       case Iop_SarN32x4:
   3114       case Iop_SarN64x2:
   3115       case Iop_ShlN8x16:
   3116       case Iop_ShlN16x8:
   3117       case Iop_ShlN32x4:
   3118       case Iop_ShlN64x2:
   3119          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
   3120             this is wrong now, scalar shifts are done properly lazily.
   3121             Vector shifts should be fixed too. */
   3122          complainIfUndefined(mce, atom2, NULL);
   3123          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3124 
   3125       /* V x V shifts/rotates are done using the standard lazy scheme. */
   3126       case Iop_Shl8x16:
   3127       case Iop_Shr8x16:
   3128       case Iop_Sar8x16:
   3129       case Iop_Sal8x16:
   3130       case Iop_Rol8x16:
   3131          return mkUifUV128(mce,
   3132                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3133                    mkPCast8x16(mce,vatom2)
   3134                 );
   3135 
   3136       case Iop_Shl16x8:
   3137       case Iop_Shr16x8:
   3138       case Iop_Sar16x8:
   3139       case Iop_Sal16x8:
   3140       case Iop_Rol16x8:
   3141          return mkUifUV128(mce,
   3142                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3143                    mkPCast16x8(mce,vatom2)
   3144                 );
   3145 
   3146       case Iop_Shl32x4:
   3147       case Iop_Shr32x4:
   3148       case Iop_Sar32x4:
   3149       case Iop_Sal32x4:
   3150       case Iop_Rol32x4:
   3151       case Iop_Rol64x2:
   3152          return mkUifUV128(mce,
   3153                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3154                    mkPCast32x4(mce,vatom2)
   3155                 );
   3156 
   3157       case Iop_Shl64x2:
   3158       case Iop_Shr64x2:
   3159       case Iop_Sar64x2:
   3160       case Iop_Sal64x2:
   3161          return mkUifUV128(mce,
   3162                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3163                    mkPCast64x2(mce,vatom2)
   3164                 );
   3165 
   3166       case Iop_F32ToFixed32Ux4_RZ:
   3167       case Iop_F32ToFixed32Sx4_RZ:
   3168       case Iop_Fixed32UToF32x4_RN:
   3169       case Iop_Fixed32SToF32x4_RN:
   3170          complainIfUndefined(mce, atom2, NULL);
   3171          return mkPCast32x4(mce, vatom1);
   3172 
   3173       case Iop_F32ToFixed32Ux2_RZ:
   3174       case Iop_F32ToFixed32Sx2_RZ:
   3175       case Iop_Fixed32UToF32x2_RN:
   3176       case Iop_Fixed32SToF32x2_RN:
   3177          complainIfUndefined(mce, atom2, NULL);
   3178          return mkPCast32x2(mce, vatom1);
   3179 
   3180       case Iop_QSub8Ux16:
   3181       case Iop_QSub8Sx16:
   3182       case Iop_Sub8x16:
   3183       case Iop_Min8Ux16:
   3184       case Iop_Min8Sx16:
   3185       case Iop_Max8Ux16:
   3186       case Iop_Max8Sx16:
   3187       case Iop_CmpGT8Sx16:
   3188       case Iop_CmpGT8Ux16:
   3189       case Iop_CmpEQ8x16:
   3190       case Iop_Avg8Ux16:
   3191       case Iop_Avg8Sx16:
   3192       case Iop_QAdd8Ux16:
   3193       case Iop_QAdd8Sx16:
   3194       case Iop_QSal8x16:
   3195       case Iop_QShl8x16:
   3196       case Iop_Add8x16:
   3197       case Iop_Mul8x16:
   3198       case Iop_PolynomialMul8x16:
   3199       case Iop_PolynomialMulAdd8x16:
   3200          return binary8Ix16(mce, vatom1, vatom2);
   3201 
   3202       case Iop_QSub16Ux8:
   3203       case Iop_QSub16Sx8:
   3204       case Iop_Sub16x8:
   3205       case Iop_Mul16x8:
   3206       case Iop_MulHi16Sx8:
   3207       case Iop_MulHi16Ux8:
   3208       case Iop_Min16Sx8:
   3209       case Iop_Min16Ux8:
   3210       case Iop_Max16Sx8:
   3211       case Iop_Max16Ux8:
   3212       case Iop_CmpGT16Sx8:
   3213       case Iop_CmpGT16Ux8:
   3214       case Iop_CmpEQ16x8:
   3215       case Iop_Avg16Ux8:
   3216       case Iop_Avg16Sx8:
   3217       case Iop_QAdd16Ux8:
   3218       case Iop_QAdd16Sx8:
   3219       case Iop_QSal16x8:
   3220       case Iop_QShl16x8:
   3221       case Iop_Add16x8:
   3222       case Iop_QDMulHi16Sx8:
   3223       case Iop_QRDMulHi16Sx8:
   3224       case Iop_PolynomialMulAdd16x8:
   3225          return binary16Ix8(mce, vatom1, vatom2);
   3226 
   3227       case Iop_Sub32x4:
   3228       case Iop_CmpGT32Sx4:
   3229       case Iop_CmpGT32Ux4:
   3230       case Iop_CmpEQ32x4:
   3231       case Iop_QAdd32Sx4:
   3232       case Iop_QAdd32Ux4:
   3233       case Iop_QSub32Sx4:
   3234       case Iop_QSub32Ux4:
   3235       case Iop_QSal32x4:
   3236       case Iop_QShl32x4:
   3237       case Iop_Avg32Ux4:
   3238       case Iop_Avg32Sx4:
   3239       case Iop_Add32x4:
   3240       case Iop_Max32Ux4:
   3241       case Iop_Max32Sx4:
   3242       case Iop_Min32Ux4:
   3243       case Iop_Min32Sx4:
   3244       case Iop_Mul32x4:
   3245       case Iop_QDMulHi32Sx4:
   3246       case Iop_QRDMulHi32Sx4:
   3247       case Iop_PolynomialMulAdd32x4:
   3248          return binary32Ix4(mce, vatom1, vatom2);
   3249 
   3250       case Iop_Sub64x2:
   3251       case Iop_Add64x2:
   3252       case Iop_Max64Sx2:
   3253       case Iop_Max64Ux2:
   3254       case Iop_Min64Sx2:
   3255       case Iop_Min64Ux2:
   3256       case Iop_CmpEQ64x2:
   3257       case Iop_CmpGT64Sx2:
   3258       case Iop_CmpGT64Ux2:
   3259       case Iop_QSal64x2:
   3260       case Iop_QShl64x2:
   3261       case Iop_QAdd64Ux2:
   3262       case Iop_QAdd64Sx2:
   3263       case Iop_QSub64Ux2:
   3264       case Iop_QSub64Sx2:
   3265       case Iop_PolynomialMulAdd64x2:
   3266       case Iop_CipherV128:
   3267       case Iop_CipherLV128:
   3268       case Iop_NCipherV128:
   3269       case Iop_NCipherLV128:
   3270          return binary64Ix2(mce, vatom1, vatom2);
   3271 
   3272       case Iop_QNarrowBin64Sto32Sx4:
   3273       case Iop_QNarrowBin64Uto32Ux4:
   3274       case Iop_QNarrowBin32Sto16Sx8:
   3275       case Iop_QNarrowBin32Uto16Ux8:
   3276       case Iop_QNarrowBin32Sto16Ux8:
   3277       case Iop_QNarrowBin16Sto8Sx16:
   3278       case Iop_QNarrowBin16Uto8Ux16:
   3279       case Iop_QNarrowBin16Sto8Ux16:
   3280          return vectorNarrowBinV128(mce, op, vatom1, vatom2);
   3281 
   3282       case Iop_Min64Fx2:
   3283       case Iop_Max64Fx2:
   3284       case Iop_CmpLT64Fx2:
   3285       case Iop_CmpLE64Fx2:
   3286       case Iop_CmpEQ64Fx2:
   3287       case Iop_CmpUN64Fx2:
   3288          return binary64Fx2(mce, vatom1, vatom2);
   3289 
   3290       case Iop_Sub64F0x2:
   3291       case Iop_Mul64F0x2:
   3292       case Iop_Min64F0x2:
   3293       case Iop_Max64F0x2:
   3294       case Iop_Div64F0x2:
   3295       case Iop_CmpLT64F0x2:
   3296       case Iop_CmpLE64F0x2:
   3297       case Iop_CmpEQ64F0x2:
   3298       case Iop_CmpUN64F0x2:
   3299       case Iop_Add64F0x2:
   3300          return binary64F0x2(mce, vatom1, vatom2);
   3301 
   3302       case Iop_Min32Fx4:
   3303       case Iop_Max32Fx4:
   3304       case Iop_CmpLT32Fx4:
   3305       case Iop_CmpLE32Fx4:
   3306       case Iop_CmpEQ32Fx4:
   3307       case Iop_CmpUN32Fx4:
   3308       case Iop_CmpGT32Fx4:
   3309       case Iop_CmpGE32Fx4:
   3310       case Iop_Recps32Fx4:
   3311       case Iop_Rsqrts32Fx4:
   3312          return binary32Fx4(mce, vatom1, vatom2);
   3313 
   3314       case Iop_Sub32Fx2:
   3315       case Iop_Mul32Fx2:
   3316       case Iop_Min32Fx2:
   3317       case Iop_Max32Fx2:
   3318       case Iop_CmpEQ32Fx2:
   3319       case Iop_CmpGT32Fx2:
   3320       case Iop_CmpGE32Fx2:
   3321       case Iop_Add32Fx2:
   3322       case Iop_Recps32Fx2:
   3323       case Iop_Rsqrts32Fx2:
   3324          return binary32Fx2(mce, vatom1, vatom2);
   3325 
   3326       case Iop_Sub32F0x4:
   3327       case Iop_Mul32F0x4:
   3328       case Iop_Min32F0x4:
   3329       case Iop_Max32F0x4:
   3330       case Iop_Div32F0x4:
   3331       case Iop_CmpLT32F0x4:
   3332       case Iop_CmpLE32F0x4:
   3333       case Iop_CmpEQ32F0x4:
   3334       case Iop_CmpUN32F0x4:
   3335       case Iop_Add32F0x4:
   3336          return binary32F0x4(mce, vatom1, vatom2);
   3337 
   3338       case Iop_QShlN8Sx16:
   3339       case Iop_QShlN8x16:
   3340       case Iop_QSalN8x16:
   3341          complainIfUndefined(mce, atom2, NULL);
   3342          return mkPCast8x16(mce, vatom1);
   3343 
   3344       case Iop_QShlN16Sx8:
   3345       case Iop_QShlN16x8:
   3346       case Iop_QSalN16x8:
   3347          complainIfUndefined(mce, atom2, NULL);
   3348          return mkPCast16x8(mce, vatom1);
   3349 
   3350       case Iop_QShlN32Sx4:
   3351       case Iop_QShlN32x4:
   3352       case Iop_QSalN32x4:
   3353          complainIfUndefined(mce, atom2, NULL);
   3354          return mkPCast32x4(mce, vatom1);
   3355 
   3356       case Iop_QShlN64Sx2:
   3357       case Iop_QShlN64x2:
   3358       case Iop_QSalN64x2:
   3359          complainIfUndefined(mce, atom2, NULL);
   3360          return mkPCast32x4(mce, vatom1);
   3361 
   3362       case Iop_Mull32Sx2:
   3363       case Iop_Mull32Ux2:
   3364       case Iop_QDMulLong32Sx2:
   3365          return vectorWidenI64(mce, Iop_Widen32Sto64x2,
   3366                                     mkUifU64(mce, vatom1, vatom2));
   3367 
   3368       case Iop_Mull16Sx4:
   3369       case Iop_Mull16Ux4:
   3370       case Iop_QDMulLong16Sx4:
   3371          return vectorWidenI64(mce, Iop_Widen16Sto32x4,
   3372                                     mkUifU64(mce, vatom1, vatom2));
   3373 
   3374       case Iop_Mull8Sx8:
   3375       case Iop_Mull8Ux8:
   3376       case Iop_PolynomialMull8x8:
   3377          return vectorWidenI64(mce, Iop_Widen8Sto16x8,
   3378                                     mkUifU64(mce, vatom1, vatom2));
   3379 
   3380       case Iop_PwAdd32x4:
   3381          return mkPCast32x4(mce,
   3382                assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
   3383                      mkPCast32x4(mce, vatom2))));
   3384 
   3385       case Iop_PwAdd16x8:
   3386          return mkPCast16x8(mce,
   3387                assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
   3388                      mkPCast16x8(mce, vatom2))));
   3389 
   3390       case Iop_PwAdd8x16:
   3391          return mkPCast8x16(mce,
   3392                assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
   3393                      mkPCast8x16(mce, vatom2))));
   3394 
   3395       /* V128-bit data-steering */
   3396       case Iop_SetV128lo32:
   3397       case Iop_SetV128lo64:
   3398       case Iop_64HLtoV128:
   3399       case Iop_InterleaveLO64x2:
   3400       case Iop_InterleaveLO32x4:
   3401       case Iop_InterleaveLO16x8:
   3402       case Iop_InterleaveLO8x16:
   3403       case Iop_InterleaveHI64x2:
   3404       case Iop_InterleaveHI32x4:
   3405       case Iop_InterleaveHI16x8:
   3406       case Iop_InterleaveHI8x16:
   3407       case Iop_CatOddLanes8x16:
   3408       case Iop_CatOddLanes16x8:
   3409       case Iop_CatOddLanes32x4:
   3410       case Iop_CatEvenLanes8x16:
   3411       case Iop_CatEvenLanes16x8:
   3412       case Iop_CatEvenLanes32x4:
   3413       case Iop_InterleaveOddLanes8x16:
   3414       case Iop_InterleaveOddLanes16x8:
   3415       case Iop_InterleaveOddLanes32x4:
   3416       case Iop_InterleaveEvenLanes8x16:
   3417       case Iop_InterleaveEvenLanes16x8:
   3418       case Iop_InterleaveEvenLanes32x4:
   3419          return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
   3420 
   3421       case Iop_GetElem8x16:
   3422          complainIfUndefined(mce, atom2, NULL);
   3423          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3424       case Iop_GetElem16x8:
   3425          complainIfUndefined(mce, atom2, NULL);
   3426          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3427       case Iop_GetElem32x4:
   3428          complainIfUndefined(mce, atom2, NULL);
   3429          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3430       case Iop_GetElem64x2:
   3431          complainIfUndefined(mce, atom2, NULL);
   3432          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   3433 
   3434      /* Perm8x16: rearrange values in left arg using steering values
   3435         from right arg.  So rearrange the vbits in the same way but
   3436         pessimise wrt steering values.  Perm32x4 ditto. */
   3437       case Iop_Perm8x16:
   3438          return mkUifUV128(
   3439                    mce,
   3440                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3441                    mkPCast8x16(mce, vatom2)
   3442                 );
   3443       case Iop_Perm32x4:
   3444          return mkUifUV128(
   3445                    mce,
   3446                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3447                    mkPCast32x4(mce, vatom2)
   3448                 );
   3449 
   3450      /* These two take the lower (even) 16-bit half of each 32-bit
   3451         lane, sign/zero extend it to 32, and multiply the halves,
   3452         producing a 32x4 result (and implicitly ignoring half the
   3453         operand bits).  So treat it as a bunch of independent 16x8
   3454         operations, but then do 32-bit shifts left then right to copy
   3455         the lower-half results (all 0s or all 1s, thanks to the
   3456         PCasting in binary16Ix8) into the upper half of each lane. */
   3457       case Iop_MullEven16Ux8:
   3458       case Iop_MullEven16Sx8: {
   3459          IRAtom* at;
   3460          at = binary16Ix8(mce,vatom1,vatom2);
   3461          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
   3462          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
   3463          return at;
   3464       }
   3465 
   3466       /* Same deal as Iop_MullEven16{S,U}x8 */
   3467       case Iop_MullEven8Ux16:
   3468       case Iop_MullEven8Sx16: {
   3469          IRAtom* at;
   3470          at = binary8Ix16(mce,vatom1,vatom2);
   3471          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
   3472          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
   3473          return at;
   3474       }
   3475 
   3476       /* Same deal as Iop_MullEven16{S,U}x8 */
   3477       case Iop_MullEven32Ux4:
   3478       case Iop_MullEven32Sx4: {
   3479          IRAtom* at;
   3480          at = binary32Ix4(mce,vatom1,vatom2);
   3481          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
   3482          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
   3483          return at;
   3484       }
   3485 
   3486       /* Narrow 2xV128 into 1xV128, hi half from the left arg, e.g. in
   3487          a 2 x 32x4 -> 16x8 laneage, discarding the upper half of each
   3488          lane.  Simply apply the same op to the V bits, since this is
   3489          really no more than a data-steering operation. */
   3490       case Iop_NarrowBin32to16x8:
   3491       case Iop_NarrowBin16to8x16:
   3492       case Iop_NarrowBin64to32x4:
   3493          return assignNew('V', mce, Ity_V128,
   3494                                     binop(op, vatom1, vatom2));
   3495 
   3496       case Iop_ShrV128:
   3497       case Iop_ShlV128:
   3498          /* Same scheme as with all other shifts.  Note: 10 Nov 05:
   3499             this is wrong now, scalar shifts are done properly lazily.
   3500             Vector shifts should be fixed too. */
   3501          complainIfUndefined(mce, atom2, NULL);
   3502          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3503 
   3504       /* SHA Iops */
   3505       case Iop_SHA256:
   3506       case Iop_SHA512:
   3507          complainIfUndefined(mce, atom2, NULL);
   3508          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3509 
   3510       /* I128-bit data-steering */
   3511       case Iop_64HLto128:
   3512          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
   3513 
   3514       /* V256-bit SIMD */
   3515 
   3516       case Iop_Max64Fx4:
   3517       case Iop_Min64Fx4:
   3518          return binary64Fx4(mce, vatom1, vatom2);
   3519 
   3520       case Iop_Max32Fx8:
   3521       case Iop_Min32Fx8:
   3522          return binary32Fx8(mce, vatom1, vatom2);
   3523 
   3524       /* V256-bit data-steering */
   3525       case Iop_V128HLtoV256:
   3526          return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
   3527 
   3528       /* Scalar floating point */
   3529 
   3530       case Iop_F32toI64S:
   3531       case Iop_F32toI64U:
   3532          /* I32(rm) x F32 -> I64 */
   3533          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3534 
   3535       case Iop_I64StoF32:
   3536          /* I32(rm) x I64 -> F32 */
   3537          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3538 
   3539       case Iop_RoundF64toInt:
   3540       case Iop_RoundF64toF32:
   3541       case Iop_F64toI64S:
   3542       case Iop_F64toI64U:
   3543       case Iop_I64StoF64:
   3544       case Iop_I64UtoF64:
   3545       case Iop_SinF64:
   3546       case Iop_CosF64:
   3547       case Iop_TanF64:
   3548       case Iop_2xm1F64:
   3549       case Iop_SqrtF64:
   3550          /* I32(rm) x I64/F64 -> I64/F64 */
   3551          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3552 
   3553       case Iop_ShlD64:
   3554       case Iop_ShrD64:
   3555       case Iop_RoundD64toInt:
   3556          /* I32(rm) x D64 -> D64 */
   3557          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3558 
   3559       case Iop_ShlD128:
   3560       case Iop_ShrD128:
   3561       case Iop_RoundD128toInt:
   3562          /* I32(rm) x D128 -> D128 */
   3563          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3564 
   3565       case Iop_D64toI64S:
   3566       case Iop_D64toI64U:
   3567       case Iop_I64StoD64:
   3568       case Iop_I64UtoD64:
   3569          /* I32(rm) x I64/D64 -> D64/I64 */
   3570          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3571 
   3572       case Iop_F32toD32:
   3573       case Iop_F64toD32:
   3574       case Iop_F128toD32:
   3575       case Iop_D32toF32:
   3576       case Iop_D64toF32:
   3577       case Iop_D128toF32:
   3578          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
   3579          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3580 
   3581       case Iop_F32toD64:
   3582       case Iop_F64toD64:
   3583       case Iop_F128toD64:
   3584       case Iop_D32toF64:
   3585       case Iop_D64toF64:
   3586       case Iop_D128toF64:
   3587          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
   3588          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3589 
   3590       case Iop_F32toD128:
   3591       case Iop_F64toD128:
   3592       case Iop_F128toD128:
   3593       case Iop_D32toF128:
   3594       case Iop_D64toF128:
   3595       case Iop_D128toF128:
   3596          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
   3597          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3598 
   3599       case Iop_RoundF32toInt:
   3600       case Iop_SqrtF32:
   3601          /* I32(rm) x I32/F32 -> I32/F32 */
   3602          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3603 
   3604       case Iop_SqrtF128:
   3605          /* I32(rm) x F128 -> F128 */
   3606          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3607 
   3608       case Iop_I32StoF32:
   3609       case Iop_I32UtoF32:
   3610       case Iop_F32toI32S:
   3611       case Iop_F32toI32U:
   3612          /* First arg is I32 (rounding mode), second is F32/I32 (data). */
   3613          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3614 
   3615       case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
   3616       case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
   3617       case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
   3618       case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
   3619       case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
   3620          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3621 
   3622       case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
   3623       case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
   3624       case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
   3625       case Iop_D128toD64:  /* IRRoundingMode(I64) x D128 -> D64 */
   3626       case Iop_D128toI64S: /* IRRoundingMode(I64) x D128 -> signed I64  */
   3627       case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
   3628          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3629 
   3630       case Iop_F64HLtoF128:
   3631       case Iop_D64HLtoD128:
   3632          return assignNew('V', mce, Ity_I128,
   3633                           binop(Iop_64HLto128, vatom1, vatom2));
   3634 
   3635       case Iop_F64toI32U:
   3636       case Iop_F64toI32S:
   3637       case Iop_F64toF32:
   3638       case Iop_I64UtoF32:
   3639       case Iop_D64toI32U:
   3640       case Iop_D64toI32S:
   3641          /* First arg is I32 (rounding mode), second is F64/D64 (data). */
   3642          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3643 
   3644       case Iop_D64toD32:
   3645          /* First arg is I32 (rounding mode), second is D64 (data). */
   3646          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3647 
   3648       case Iop_F64toI16S:
   3649          /* First arg is I32 (rounding mode), second is F64 (data). */
   3650          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3651 
   3652       case Iop_InsertExpD64:
   3653          /*  I64 x D64 -> D64 */
   3654          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3655 
   3656       case Iop_InsertExpD128:
   3657          /*  I64 x D128 -> D128 */
   3658          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3659 
   3660       case Iop_CmpF32:
   3661       case Iop_CmpF64:
   3662       case Iop_CmpF128:
   3663       case Iop_CmpD64:
   3664       case Iop_CmpD128:
   3665       case Iop_CmpExpD64:
   3666       case Iop_CmpExpD128:
   3667          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3668 
   3669       /* non-FP after here */
   3670 
   3671       case Iop_DivModU64to32:
   3672       case Iop_DivModS64to32:
   3673          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3674 
   3675       case Iop_DivModU128to64:
   3676       case Iop_DivModS128to64:
   3677          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3678 
   3679       case Iop_8HLto16:
   3680          return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
   3681       case Iop_16HLto32:
   3682          return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
   3683       case Iop_32HLto64:
   3684          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3685 
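              /* For the widening multiplies (and DivModS64to64) the shadow
                 for the low half is computed with the cheap left-propagation
                 scheme, and the high half is its PCast: the upper half is
                 marked fully undefined unless the low half came out fully
                 defined. */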
   3686       case Iop_DivModS64to64:
   3687       case Iop_MullS64:
   3688       case Iop_MullU64: {
   3689          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   3690          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
   3691          return assignNew('V', mce, Ity_I128,
   3692                           binop(Iop_64HLto128, vHi64, vLo64));
   3693       }
   3694 
   3695       case Iop_MullS32:
   3696       case Iop_MullU32: {
   3697          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3698          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
   3699          return assignNew('V', mce, Ity_I64,
   3700                           binop(Iop_32HLto64, vHi32, vLo32));
   3701       }
   3702 
   3703       case Iop_MullS16:
   3704       case Iop_MullU16: {
   3705          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   3706          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
   3707          return assignNew('V', mce, Ity_I32,
   3708                           binop(Iop_16HLto32, vHi16, vLo16));
   3709       }
   3710 
   3711       case Iop_MullS8:
   3712       case Iop_MullU8: {
   3713          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   3714          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
   3715          return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
   3716       }
   3717 
   3718       case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
   3719       case Iop_DivS32:
   3720       case Iop_DivU32:
   3721       case Iop_DivU32E:
   3722       case Iop_DivS32E:
   3723       case Iop_QAdd32S: /* could probably do better */
   3724       case Iop_QSub32S: /* could probably do better */
   3725          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3726 
   3727       case Iop_DivS64:
   3728       case Iop_DivU64:
   3729       case Iop_DivS64E:
   3730       case Iop_DivU64E:
   3731          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3732 
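              /* Add/Sub: when bogus literals (or LLVM workarounds) are in
                 play, a more precise per-bit scheme (expensiveAddSub) is
                 used to avoid false positives; otherwise the cheap scheme
                 suffices.  The cheap scheme, mkLeft, smears undefinedness
                 leftwards (roughly v | -v on the V bits), reflecting the
                 fact that carries only propagate towards the more
                 significant end, so result bits below the lowest undefined
                 input bit stay accurate. */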
   3733       case Iop_Add32:
   3734          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   3735             return expensiveAddSub(mce,True,Ity_I32,
   3736                                    vatom1,vatom2, atom1,atom2);
   3737          else
   3738             goto cheap_AddSub32;
   3739       case Iop_Sub32:
   3740          if (mce->bogusLiterals)
   3741             return expensiveAddSub(mce,False,Ity_I32,
   3742                                    vatom1,vatom2, atom1,atom2);
   3743          else
   3744             goto cheap_AddSub32;
   3745 
   3746       cheap_AddSub32:
   3747       case Iop_Mul32:
   3748          return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3749 
   3750       case Iop_CmpORD32S:
   3751       case Iop_CmpORD32U:
   3752       case Iop_CmpORD64S:
   3753       case Iop_CmpORD64U:
   3754          return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
   3755 
   3756       case Iop_Add64:
   3757          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   3758             return expensiveAddSub(mce,True,Ity_I64,
   3759                                    vatom1,vatom2, atom1,atom2);
   3760          else
   3761             goto cheap_AddSub64;
   3762       case Iop_Sub64:
   3763          if (mce->bogusLiterals)
   3764             return expensiveAddSub(mce,False,Ity_I64,
   3765                                    vatom1,vatom2, atom1,atom2);
   3766          else
   3767             goto cheap_AddSub64;
   3768 
   3769       cheap_AddSub64:
   3770       case Iop_Mul64:
   3771          return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   3772 
   3773       case Iop_Mul16:
   3774       case Iop_Add16:
   3775       case Iop_Sub16:
   3776          return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   3777 
   3778       case Iop_Mul8:
   3779       case Iop_Sub8:
   3780       case Iop_Add8:
   3781          return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   3782 
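              /* Compares: for the ordering comparisons the result bit is
                 simply undefined if any bit of either operand is undefined.
                 Equality/non-equality can do better when bogus literals are
                 in play: expensiveCmpEQorNE gives a defined result whenever
                 the two operands already differ in their defined bits,
                 which avoids false positives on partially-initialised but
                 clearly distinguishable values. */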
   3783       case Iop_CmpEQ64:
   3784       case Iop_CmpNE64:
   3785          if (mce->bogusLiterals)
   3786             goto expensive_cmp64;
   3787          else
   3788             goto cheap_cmp64;
   3789 
   3790       expensive_cmp64:
   3791       case Iop_ExpCmpNE64:
   3792          return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
   3793 
   3794       cheap_cmp64:
   3795       case Iop_CmpLE64S: case Iop_CmpLE64U:
   3796       case Iop_CmpLT64U: case Iop_CmpLT64S:
   3797          return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
   3798 
   3799       case Iop_CmpEQ32:
   3800       case Iop_CmpNE32:
   3801          if (mce->bogusLiterals)
   3802             goto expensive_cmp32;
   3803          else
   3804             goto cheap_cmp32;
   3805 
   3806       expensive_cmp32:
   3807       case Iop_ExpCmpNE32:
   3808          return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
   3809 
   3810       cheap_cmp32:
   3811       case Iop_CmpLE32S: case Iop_CmpLE32U:
   3812       case Iop_CmpLT32U: case Iop_CmpLT32S:
   3813          return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
   3814 
   3815       case Iop_CmpEQ16: case Iop_CmpNE16:
   3816          return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
   3817 
   3818       case Iop_ExpCmpNE16:
   3819          return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
   3820 
   3821       case Iop_CmpEQ8: case Iop_CmpNE8:
   3822          return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
   3823 
   3824       case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   3825       case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   3826       case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   3827       case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   3828          /* Just say these all produce a defined result, regardless
   3829             of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
   3830          return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
   3831 
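              /* Scalar shifts are handled lazily by scalarShift (see the
                 notes at the vector shift cases above): the first operand's
                 V bits are shifted by the original amount and, in effect,
                 the result is additionally marked wholly undefined if any
                 bit of the shift amount is undefined. */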
   3832       case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
   3833          return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
   3834 
   3835       case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
   3836          return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
   3837 
   3838       case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
   3839          return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
   3840 
   3841       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
   3842          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
   3843 
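              /* And/Or: start from UifU (undefined if either operand is
                 undefined) and then improve on it.  For AND, a defined 0 in
                 either operand forces that result bit to a defined 0
                 regardless of the other operand; for OR, a defined 1 does
                 the same.  improve() computes those forcing positions, and
                 the DifDs in do_And_Or fold them back in, so a result bit
                 is defined wherever either operand forces it. */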
   3844       case Iop_AndV256:
   3845          uifu = mkUifUV256; difd = mkDifDV256;
   3846          and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
   3847       case Iop_AndV128:
   3848          uifu = mkUifUV128; difd = mkDifDV128;
   3849          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
   3850       case Iop_And64:
   3851          uifu = mkUifU64; difd = mkDifD64;
   3852          and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
   3853       case Iop_And32:
   3854          uifu = mkUifU32; difd = mkDifD32;
   3855          and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
   3856       case Iop_And16:
   3857          uifu = mkUifU16; difd = mkDifD16;
   3858          and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
   3859       case Iop_And8:
   3860          uifu = mkUifU8; difd = mkDifD8;
   3861          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
   3862 
   3863       case Iop_OrV256:
   3864          uifu = mkUifUV256; difd = mkDifDV256;
   3865          and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
   3866       case Iop_OrV128:
   3867          uifu = mkUifUV128; difd = mkDifDV128;
   3868          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
   3869       case Iop_Or64:
   3870          uifu = mkUifU64; difd = mkDifD64;
   3871          and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
   3872       case Iop_Or32:
   3873          uifu = mkUifU32; difd = mkDifD32;
   3874          and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
   3875       case Iop_Or16:
   3876          uifu = mkUifU16; difd = mkDifD16;
   3877          and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
   3878       case Iop_Or8:
   3879          uifu = mkUifU8; difd = mkDifD8;
   3880          and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
   3881 
   3882       do_And_Or:
   3883          return
   3884          assignNew(
   3885             'V', mce,
   3886             and_or_ty,
   3887             difd(mce, uifu(mce, vatom1, vatom2),
   3888                       difd(mce, improve(mce, atom1, vatom1),
   3889                                 improve(mce, atom2, vatom2) ) ) );
   3890 
   3891       case Iop_Xor8:
   3892          return mkUifU8(mce, vatom1, vatom2);
   3893       case Iop_Xor16:
   3894          return mkUifU16(mce, vatom1, vatom2);
   3895       case Iop_Xor32:
   3896          return mkUifU32(mce, vatom1, vatom2);
   3897       case Iop_Xor64:
   3898          return mkUifU64(mce, vatom1, vatom2);
   3899       case Iop_XorV128:
   3900          return mkUifUV128(mce, vatom1, vatom2);
   3901       case Iop_XorV256:
   3902          return mkUifUV256(mce, vatom1, vatom2);
   3903 
   3904       /* V256-bit SIMD */
   3905 
   3906       case Iop_ShrN16x16:
   3907       case Iop_ShrN32x8:
   3908       case Iop_ShrN64x4:
   3909       case Iop_SarN16x16:
   3910       case Iop_SarN32x8:
   3911       case Iop_ShlN16x16:
   3912       case Iop_ShlN32x8:
   3913       case Iop_ShlN64x4:
   3914          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
   3915             this is wrong now, scalar shifts are done properly lazily.
   3916             Vector shifts should be fixed too. */
   3917          complainIfUndefined(mce, atom2, NULL);
   3918          return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
   3919 
   3920       case Iop_QSub8Ux32:
   3921       case Iop_QSub8Sx32:
   3922       case Iop_Sub8x32:
   3923       case Iop_Min8Ux32:
   3924       case Iop_Min8Sx32:
   3925       case Iop_Max8Ux32:
   3926       case Iop_Max8Sx32:
   3927       case Iop_CmpGT8Sx32:
   3928       case Iop_CmpEQ8x32:
   3929       case Iop_Avg8Ux32:
   3930       case Iop_QAdd8Ux32:
   3931       case Iop_QAdd8Sx32:
   3932       case Iop_Add8x32:
   3933          return binary8Ix32(mce, vatom1, vatom2);
   3934 
   3935       case Iop_QSub16Ux16:
   3936       case Iop_QSub16Sx16:
   3937       case Iop_Sub16x16:
   3938       case Iop_Mul16x16:
   3939       case Iop_MulHi16Sx16:
   3940       case Iop_MulHi16Ux16:
   3941       case Iop_Min16Sx16:
   3942       case Iop_Min16Ux16:
   3943       case Iop_Max16Sx16:
   3944       case Iop_Max16Ux16:
   3945       case Iop_CmpGT16Sx16:
   3946       case Iop_CmpEQ16x16:
   3947       case Iop_Avg16Ux16:
   3948       case Iop_QAdd16Ux16:
   3949       case Iop_QAdd16Sx16:
   3950       case Iop_Add16x16:
   3951          return binary16Ix16(mce, vatom1, vatom2);
   3952 
   3953       case Iop_Sub32x8:
   3954       case Iop_CmpGT32Sx8:
   3955       case Iop_CmpEQ32x8:
   3956       case Iop_Add32x8:
   3957       case Iop_Max32Ux8:
   3958       case Iop_Max32Sx8:
   3959       case Iop_Min32Ux8:
   3960       case Iop_Min32Sx8:
   3961       case Iop_Mul32x8:
   3962          return binary32Ix8(mce, vatom1, vatom2);
   3963 
   3964       case Iop_Sub64x4:
   3965       case Iop_Add64x4:
   3966       case Iop_CmpEQ64x4:
   3967       case Iop_CmpGT64Sx4:
   3968          return binary64Ix4(mce, vatom1, vatom2);
   3969 
   3970      /* Perm32x8: rearrange values in left arg using steering values
   3971         from right arg.  So rearrange the vbits in the same way but
   3972         pessimise wrt steering values. */
   3973       case Iop_Perm32x8:
   3974          return mkUifUV256(
   3975                    mce,
   3976                    assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
   3977                    mkPCast32x8(mce, vatom2)
   3978                 );
   3979 
   3980       default:
   3981          ppIROp(op);
   3982          VG_(tool_panic)("memcheck:expr2vbits_Binop");
   3983    }
   3984 }
   3985 
   3986 
   3987 static
   3988 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
   3989 {
   3990    /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
   3991       selection of shadow operation implicitly duplicates the logic in
   3992       do_shadow_LoadG and should be kept in sync (in the very unlikely
   3993       event that the interpretation of such widening ops changes in
   3994       future).  See comment in do_shadow_LoadG. */
   3995    IRAtom* vatom = expr2vbits( mce, atom );
   3996    tl_assert(isOriginalAtom(mce,atom));
   3997    switch (op) {
   3998 
   3999       case Iop_Sqrt64Fx2:
   4000       case Iop_Abs64Fx2:
   4001       case Iop_Neg64Fx2:
   4002          return unary64Fx2(mce, vatom);
   4003 
   4004       case Iop_Sqrt64F0x2:
   4005          return unary64F0x2(mce, vatom);
   4006 
   4007       case Iop_Sqrt32Fx8:
   4008       case Iop_RSqrt32Fx8:
   4009       case Iop_Recip32Fx8:
   4010          return unary32Fx8(mce, vatom);
   4011 
   4012       case Iop_Sqrt64Fx4:
   4013          return unary64Fx4(mce, vatom);
   4014 
   4015       case Iop_Sqrt32Fx4:
   4016       case Iop_RSqrt32Fx4:
   4017       case Iop_Recip32Fx4:
   4018       case Iop_I32UtoFx4:
   4019       case Iop_I32StoFx4:
   4020       case Iop_QFtoI32Ux4_RZ:
   4021       case Iop_QFtoI32Sx4_RZ:
   4022       case Iop_RoundF32x4_RM:
   4023       case Iop_RoundF32x4_RP:
   4024       case Iop_RoundF32x4_RN:
   4025       case Iop_RoundF32x4_RZ:
   4026       case Iop_Recip32x4:
   4027       case Iop_Abs32Fx4:
   4028       case Iop_Neg32Fx4:
   4029       case Iop_Rsqrte32Fx4:
   4030          return unary32Fx4(mce, vatom);
   4031 
   4032       case Iop_I32UtoFx2:
   4033       case Iop_I32StoFx2:
   4034       case Iop_Recip32Fx2:
   4035       case Iop_Recip32x2:
   4036       case Iop_Abs32Fx2:
   4037       case Iop_Neg32Fx2:
   4038       case Iop_Rsqrte32Fx2:
   4039          return unary32Fx2(mce, vatom);
   4040 
   4041       case Iop_Sqrt32F0x4:
   4042       case Iop_RSqrt32F0x4:
   4043       case Iop_Recip32F0x4:
   4044          return unary32F0x4(mce, vatom);
   4045 
   4046       case Iop_32UtoV128:
   4047       case Iop_64UtoV128:
   4048       case Iop_Dup8x16:
   4049       case Iop_Dup16x8:
   4050       case Iop_Dup32x4:
   4051       case Iop_Reverse16_8x16:
   4052       case Iop_Reverse32_8x16:
   4053       case Iop_Reverse32_16x8:
   4054       case Iop_Reverse64_8x16:
   4055       case Iop_Reverse64_16x8:
   4056       case Iop_Reverse64_32x4:
   4057       case Iop_V256toV128_1: case Iop_V256toV128_0:
   4058       case Iop_ZeroHI64ofV128:
   4059       case Iop_ZeroHI96ofV128:
   4060       case Iop_ZeroHI112ofV128:
   4061       case Iop_ZeroHI120ofV128:
   4062          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4063 
   4064       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
   4065       case Iop_D128HItoD64:  /* D128 -> high half of D128 */
   4066          return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
   4067       case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
   4068       case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
   4069          return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
   4070 
   4071       case Iop_NegF128:
   4072       case Iop_AbsF128:
   4073          return mkPCastTo(mce, Ity_I128, vatom);
   4074 
   4075       case Iop_I32StoF128: /* signed I32 -> F128 */
   4076       case Iop_I64StoF128: /* signed I64 -> F128 */
   4077       case Iop_I32UtoF128: /* unsigned I32 -> F128 */
   4078       case Iop_I64UtoF128: /* unsigned I64 -> F128 */
   4079       case Iop_F32toF128:  /* F32 -> F128 */
   4080       case Iop_F64toF128:  /* F64 -> F128 */
   4081       case Iop_I32StoD128: /* signed I32 -> D128 */
   4082       case Iop_I64StoD128: /* signed I64 -> D128 */
   4083       case Iop_I32UtoD128: /* unsigned I32 -> D128 */
   4084       case Iop_I64UtoD128: /* unsigned I64 -> D128 */
   4085          return mkPCastTo(mce, Ity_I128, vatom);
   4086 
   4087       case Iop_F32toF64:
   4088       case Iop_I32StoF64:
   4089       case Iop_I32UtoF64:
   4090       case Iop_NegF64:
   4091       case Iop_AbsF64:
   4092       case Iop_Est5FRSqrt:
   4093       case Iop_RoundF64toF64_NEAREST:
   4094       case Iop_RoundF64toF64_NegINF:
   4095       case Iop_RoundF64toF64_PosINF:
   4096       case Iop_RoundF64toF64_ZERO:
   4097       case Iop_Clz64:
   4098       case Iop_D32toD64:
   4099       case Iop_I32StoD64:
   4100       case Iop_I32UtoD64:
   4101       case Iop_ExtractExpD64:    /* D64  -> I64 */
   4102       case Iop_ExtractExpD128:   /* D128 -> I64 */
   4103       case Iop_ExtractSigD64:    /* D64  -> I64 */
   4104       case Iop_ExtractSigD128:   /* D128 -> I64 */
   4105       case Iop_DPBtoBCD:
   4106       case Iop_BCDtoDPB:
   4107          return mkPCastTo(mce, Ity_I64, vatom);
   4108 
   4109       case Iop_D64toD128:
   4110          return mkPCastTo(mce, Ity_I128, vatom);
   4111 
   4112       case Iop_Clz32:
   4113       case Iop_TruncF64asF32:
   4114       case Iop_NegF32:
   4115       case Iop_AbsF32:
   4116          return mkPCastTo(mce, Ity_I32, vatom);
   4117 
   4118       case Iop_Ctz32:
   4119       case Iop_Ctz64:
   4120          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
   4121 
   4122       case Iop_1Uto64:
   4123       case Iop_1Sto64:
   4124       case Iop_8Uto64:
   4125       case Iop_8Sto64:
   4126       case Iop_16Uto64:
   4127       case Iop_16Sto64:
   4128       case Iop_32Sto64:
   4129       case Iop_32Uto64:
   4130       case Iop_V128to64:
   4131       case Iop_V128HIto64:
   4132       case Iop_128HIto64:
   4133       case Iop_128to64:
   4134       case Iop_Dup8x8:
   4135       case Iop_Dup16x4:
   4136       case Iop_Dup32x2:
   4137       case Iop_Reverse16_8x8:
   4138       case Iop_Reverse32_8x8:
   4139       case Iop_Reverse32_16x4:
   4140       case Iop_Reverse64_8x8:
   4141       case Iop_Reverse64_16x4:
   4142       case Iop_Reverse64_32x2:
   4143       case Iop_V256to64_0: case Iop_V256to64_1:
   4144       case Iop_V256to64_2: case Iop_V256to64_3:
   4145          return assignNew('V', mce, Ity_I64, unop(op, vatom));
   4146 
   4147       case Iop_64to32:
   4148       case Iop_64HIto32:
   4149       case Iop_1Uto32:
   4150       case Iop_1Sto32:
   4151       case Iop_8Uto32:
   4152       case Iop_16Uto32:
   4153       case Iop_16Sto32:
   4154       case Iop_8Sto32:
   4155       case Iop_V128to32:
   4156          return assignNew('V', mce, Ity_I32, unop(op, vatom));
   4157 
   4158       case Iop_8Sto16:
   4159       case Iop_8Uto16:
   4160       case Iop_32to16:
   4161       case Iop_32HIto16:
   4162       case Iop_64to16:
   4163       case Iop_GetMSBs8x16:
   4164          return assignNew('V', mce, Ity_I16, unop(op, vatom));
   4165 
   4166       case Iop_1Uto8:
   4167       case Iop_1Sto8:
   4168       case Iop_16to8:
   4169       case Iop_16HIto8:
   4170       case Iop_32to8:
   4171       case Iop_64to8:
   4172       case Iop_GetMSBs8x8:
   4173          return assignNew('V', mce, Ity_I8, unop(op, vatom));
   4174 
   4175       case Iop_32to1:
   4176          return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
   4177 
   4178       case Iop_64to1:
   4179          return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
   4180 
   4181       case Iop_ReinterpF64asI64:
   4182       case Iop_ReinterpI64asF64:
   4183       case Iop_ReinterpI32asF32:
   4184       case Iop_ReinterpF32asI32:
   4185       case Iop_ReinterpI64asD64:
   4186       case Iop_ReinterpD64asI64:
   4187       case Iop_NotV256:
   4188       case Iop_NotV128:
   4189       case Iop_Not64:
   4190       case Iop_Not32:
   4191       case Iop_Not16:
   4192       case Iop_Not8:
   4193       case Iop_Not1:
   4194          return vatom;
   4195 
   4196       case Iop_CmpNEZ8x8:
   4197       case Iop_Cnt8x8:
   4198       case Iop_Clz8Sx8:
   4199       case Iop_Cls8Sx8:
   4200       case Iop_Abs8x8:
   4201          return mkPCast8x8(mce, vatom);
   4202 
   4203       case Iop_CmpNEZ8x16:
   4204       case Iop_Cnt8x16:
   4205       case Iop_Clz8Sx16:
   4206       case Iop_Cls8Sx16:
   4207       case Iop_Abs8x16:
   4208          return mkPCast8x16(mce, vatom);
   4209 
   4210       case Iop_CmpNEZ16x4:
   4211       case Iop_Clz16Sx4:
   4212       case Iop_Cls16Sx4:
   4213       case Iop_Abs16x4:
   4214          return mkPCast16x4(mce, vatom);
   4215 
   4216       case Iop_CmpNEZ16x8:
   4217       case Iop_Clz16Sx8:
   4218       case Iop_Cls16Sx8:
   4219       case Iop_Abs16x8:
   4220          return mkPCast16x8(mce, vatom);
   4221 
   4222       case Iop_CmpNEZ32x2:
   4223       case Iop_Clz32Sx2:
   4224       case Iop_Cls32Sx2:
   4225       case Iop_FtoI32Ux2_RZ:
   4226       case Iop_FtoI32Sx2_RZ:
   4227       case Iop_Abs32x2:
   4228          return mkPCast32x2(mce, vatom);
   4229 
   4230       case Iop_CmpNEZ32x4:
   4231       case Iop_Clz32Sx4:
   4232       case Iop_Cls32Sx4:
   4233       case Iop_FtoI32Ux4_RZ:
   4234       case Iop_FtoI32Sx4_RZ:
   4235       case Iop_Abs32x4:
   4236          return mkPCast32x4(mce, vatom);
   4237 
   4238       case Iop_CmpwNEZ32:
   4239          return mkPCastTo(mce, Ity_I32, vatom);
   4240 
   4241       case Iop_CmpwNEZ64:
   4242          return mkPCastTo(mce, Ity_I64, vatom);
   4243 
   4244       case Iop_CmpNEZ64x2:
   4245       case Iop_CipherSV128:
   4246       case Iop_Clz64x2:
   4247          return mkPCast64x2(mce, vatom);
   4248 
   4249       case Iop_PwBitMtxXpose64x2:
   4250          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4251 
   4252       case Iop_NarrowUn16to8x8:
   4253       case Iop_NarrowUn32to16x4:
   4254       case Iop_NarrowUn64to32x2:
   4255       case Iop_QNarrowUn16Sto8Sx8:
   4256       case Iop_QNarrowUn16Sto8Ux8:
   4257       case Iop_QNarrowUn16Uto8Ux8:
   4258       case Iop_QNarrowUn32Sto16Sx4:
   4259       case Iop_QNarrowUn32Sto16Ux4:
   4260       case Iop_QNarrowUn32Uto16Ux4:
   4261       case Iop_QNarrowUn64Sto32Sx2:
   4262       case Iop_QNarrowUn64Sto32Ux2:
   4263       case Iop_QNarrowUn64Uto32Ux2:
   4264          return vectorNarrowUnV128(mce, op, vatom);
   4265 
   4266       case Iop_Widen8Sto16x8:
   4267       case Iop_Widen8Uto16x8:
   4268       case Iop_Widen16Sto32x4:
   4269       case Iop_Widen16Uto32x4:
   4270       case Iop_Widen32Sto64x2:
   4271       case Iop_Widen32Uto64x2:
   4272          return vectorWidenI64(mce, op, vatom);
   4273 
   4274       case Iop_PwAddL32Ux2:
   4275       case Iop_PwAddL32Sx2:
   4276          return mkPCastTo(mce, Ity_I64,
   4277                assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
   4278 
   4279       case Iop_PwAddL16Ux4:
   4280       case Iop_PwAddL16Sx4:
   4281          return mkPCast32x2(mce,
   4282                assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
   4283 
   4284       case Iop_PwAddL8Ux8:
   4285       case Iop_PwAddL8Sx8:
   4286          return mkPCast16x4(mce,
   4287                assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
   4288 
   4289       case Iop_PwAddL32Ux4:
   4290       case Iop_PwAddL32Sx4:
   4291          return mkPCast64x2(mce,
   4292                assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
   4293 
   4294       case Iop_PwAddL16Ux8:
   4295       case Iop_PwAddL16Sx8:
   4296          return mkPCast32x4(mce,
   4297                assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
   4298 
   4299       case Iop_PwAddL8Ux16:
   4300       case Iop_PwAddL8Sx16:
   4301          return mkPCast16x8(mce,
   4302                assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
   4303 
   4304       // TODO: is this correct?
   4305       case Iop_AddLV8Ux16:
   4306       case Iop_AddLV8Sx16:
   4307          return assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom)));
   4308 
   4309       case Iop_AddLV16Ux8:
   4310       case Iop_AddLV16Sx8:
   4311          return assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom)));
   4312 
   4313       case Iop_AddLV32Ux4:
   4314       case Iop_AddLV32Sx4:
   4315          return assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom)));
   4316 
   4317       case Iop_I64UtoF32:
   4318       default:
   4319          ppIROp(op);
   4320          VG_(tool_panic)("memcheck:expr2vbits_Unop");
   4321    }
   4322 }
   4323 
   4324 
   4325 /* Worker function -- do not call directly.  See comments on
   4326    expr2vbits_Load for the meaning of |guard|.
   4327 
   4328    Generates IR to (1) perform a definedness test of |addr|, (2)
   4329    perform a validity test of |addr|, and (3) return the Vbits for the
   4330    location indicated by |addr|.  All of this only happens when
   4331    |guard| is NULL or |guard| evaluates to True at run time.
   4332 
   4333    If |guard| evaluates to False at run time, the returned value is
   4334    the IR-mandated 0x55..55 value, and no checks nor shadow loads are
   4335    performed.
   4336 
   4337    The definedness of |guard| itself is not checked.  That is assumed
   4338    to have been done before this point, by the caller. */
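         /* Illustrative sketch (assumed shape, not emitted verbatim): for an
            unbiased, unguarded 32-bit little-endian load this boils down to
            roughly

               complainIfUndefined(addr);                 -- check addr's V bits
               vbits = DIRTY MC_(helperc_LOADV32le)(addr);   -- shadow load

            i.e. a single dirty helper call whose result is the V bits of the
            addressed word.  V128/V256 loads instead return their result via a
            vector out-parameter (IRExpr_VECRET). */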
   4339 static
   4340 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
   4341                               IREndness end, IRType ty,
   4342                               IRAtom* addr, UInt bias, IRAtom* guard )
   4343 {
   4344    tl_assert(isOriginalAtom(mce,addr));
   4345    tl_assert(end == Iend_LE || end == Iend_BE);
   4346 
   4347    /* First, emit a definedness test for the address.  This also sets
   4348       the address (shadow) to 'defined' following the test. */
   4349    complainIfUndefined( mce, addr, guard );
   4350 
   4351    /* Now cook up a call to the relevant helper function, to read the
   4352       data V bits from shadow memory. */
   4353    ty = shadowTypeV(ty);
   4354 
   4355    void*        helper           = NULL;
   4356    const HChar* hname            = NULL;
   4357    Bool         ret_via_outparam = False;
   4358 
   4359    if (end == Iend_LE) {
   4360       switch (ty) {
   4361          case Ity_V256: helper = &MC_(helperc_LOADV256le);
   4362                         hname = "MC_(helperc_LOADV256le)";
   4363                         ret_via_outparam = True;
   4364                         break;
   4365          case Ity_V128: helper = &MC_(helperc_LOADV128le);
   4366                         hname = "MC_(helperc_LOADV128le)";
   4367                         ret_via_outparam = True;
   4368                         break;
   4369          case Ity_I64:  helper = &MC_(helperc_LOADV64le);
   4370                         hname = "MC_(helperc_LOADV64le)";
   4371                         break;
   4372          case Ity_I32:  helper = &MC_(helperc_LOADV32le);
   4373                         hname = "MC_(helperc_LOADV32le)";
   4374                         break;
   4375          case Ity_I16:  helper = &MC_(helperc_LOADV16le);
   4376                         hname = "MC_(helperc_LOADV16le)";
   4377                         break;
   4378          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4379                         hname = "MC_(helperc_LOADV8)";
   4380                         break;
   4381          default:       ppIRType(ty);
   4382                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
   4383       }
   4384    } else {
   4385       switch (ty) {
   4386          case Ity_V256: helper = &MC_(helperc_LOADV256be);
   4387                         hname = "MC_(helperc_LOADV256be)";
   4388                         ret_via_outparam = True;
   4389                         break;
   4390          case Ity_V128: helper = &MC_(helperc_LOADV128be);
   4391                         hname = "MC_(helperc_LOADV128be)";
   4392                         ret_via_outparam = True;
   4393                         break;
   4394          case Ity_I64:  helper = &MC_(helperc_LOADV64be);
   4395                         hname = "MC_(helperc_LOADV64be)";
   4396                         break;
   4397          case Ity_I32:  helper = &MC_(helperc_LOADV32be);
   4398                         hname = "MC_(helperc_LOADV32be)";
   4399                         break;
   4400          case Ity_I16:  helper = &MC_(helperc_LOADV16be);
   4401                         hname = "MC_(helperc_LOADV16be)";
   4402                         break;
   4403          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4404                         hname = "MC_(helperc_LOADV8)";
   4405                         break;
   4406          default:       ppIRType(ty);
   4407                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
   4408       }
   4409    }
   4410 
   4411    tl_assert(helper);
   4412    tl_assert(hname);
   4413 
   4414    /* Generate the actual address into addrAct. */
   4415    IRAtom* addrAct;
   4416    if (bias == 0) {
   4417       addrAct = addr;
   4418    } else {
   4419       IROp    mkAdd;
   4420       IRAtom* eBias;
   4421       IRType  tyAddr  = mce->hWordTy;
   4422       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   4423       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   4424       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   4425       addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
   4426    }
   4427 
   4428    /* We need to have a place to park the V bits we're just about to
   4429       read. */
   4430    IRTemp datavbits = newTemp(mce, ty, VSh);
   4431 
   4432    /* Here's the call. */
   4433    IRDirty* di;
   4434    if (ret_via_outparam) {
   4435       di = unsafeIRDirty_1_N( datavbits,
   4436                               2/*regparms*/,
   4437                               hname, VG_(fnptr_to_fnentry)( helper ),
   4438                               mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
   4439    } else {
   4440       di = unsafeIRDirty_1_N( datavbits,
   4441                               1/*regparms*/,
   4442                               hname, VG_(fnptr_to_fnentry)( helper ),
   4443                               mkIRExprVec_1( addrAct ) );
   4444    }
   4445 
   4446    setHelperAnns( mce, di );
   4447    if (guard) {
   4448       di->guard = guard;
   4449       /* Ideally the didn't-happen return value here would be all-ones
   4450          (all-undefined), so it'd be obvious if it got used
    4451          inadvertently.  We can get by with the IR-mandated default
   4452          value (0b01 repeating, 0x55 etc) as that'll still look pretty
   4453          undefined if it ever leaks out. */
   4454    }
   4455    stmt( 'V', mce, IRStmt_Dirty(di) );
   4456 
   4457    return mkexpr(datavbits);
   4458 }
   4459 
   4460 
   4461 /* Generate IR to do a shadow load.  The helper is expected to check
   4462    the validity of the address and return the V bits for that address.
   4463    This can optionally be controlled by a guard, which is assumed to
   4464    be True if NULL.  In the case where the guard is False at runtime,
   4465    the helper will return the didn't-do-the-call value of 0x55..55.
    4466    Since that reads as an essentially undefined result, the caller of
   4467    this function will need to fix up the result somehow in that
   4468    case.
   4469 
   4470    Caller of this function is also expected to have checked the
   4471    definedness of |guard| before this point.
   4472 */
   4473 static
   4474 IRAtom* expr2vbits_Load ( MCEnv* mce,
   4475                           IREndness end, IRType ty,
   4476                           IRAtom* addr, UInt bias,
   4477                           IRAtom* guard )
   4478 {
   4479    tl_assert(end == Iend_LE || end == Iend_BE);
   4480    switch (shadowTypeV(ty)) {
   4481       case Ity_I8:
   4482       case Ity_I16:
   4483       case Ity_I32:
   4484       case Ity_I64:
   4485       case Ity_V128:
   4486       case Ity_V256:
   4487          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
   4488       default:
   4489          VG_(tool_panic)("expr2vbits_Load");
   4490    }
   4491 }
   4492 
   4493 
   4494 /* The most general handler for guarded loads.  Assumes the
   4495    definedness of GUARD has already been checked by the caller.  A
   4496    GUARD of NULL is assumed to mean "always True".  Generates code to
   4497    check the definedness and validity of ADDR.
   4498 
   4499    Generate IR to do a shadow load from ADDR and return the V bits.
   4500    The loaded type is TY.  The loaded data is then (shadow) widened by
   4501    using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
   4502    evaluates to False at run time then the returned Vbits are simply
   4503    VALT instead.  Note therefore that the argument type of VWIDEN must
   4504    be TY and the result type of VWIDEN must equal the type of VALT.
   4505 */
   4506 static
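         /* Illustrative sketch: with TY = Ity_I8, VWIDEN = Iop_8Uto32 and a
            non-NULL GUARD, the returned V bits are roughly

               ITE(guard, 8Uto32(shadow-load-I8(addr)), valt)

            so the caller sees either the widened loaded V bits or VALT,
            depending on the run-time value of GUARD. */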
   4507 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
   4508                                           IREndness end, IRType ty,
   4509                                           IRAtom* addr, UInt bias,
   4510                                           IRAtom* guard,
   4511                                           IROp vwiden, IRAtom* valt )
   4512 {
   4513    /* Sanity check the conversion operation, and also set TYWIDE. */
   4514    IRType tyWide = Ity_INVALID;
   4515    switch (vwiden) {
   4516       case Iop_INVALID:
   4517          tyWide = ty;
   4518          break;
   4519       case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
   4520          tyWide = Ity_I32;
   4521          break;
   4522       default:
   4523          VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
   4524    }
   4525 
   4526    /* If the guard evaluates to True, this will hold the loaded V bits
    4527       at TY.  If the guard evaluates to False, this will be the
    4528       IR-mandated 0x55..55 value, in which case we will have to
   4529       replace it using an ITE below. */
   4530    IRAtom* iftrue1
   4531       = assignNew('V', mce, ty,
   4532                   expr2vbits_Load(mce, end, ty, addr, bias, guard));
   4533    /* Now (shadow-) widen the loaded V bits to the desired width.  In
   4534       the guard-is-False case, the allowable widening operators will
   4535       in the worst case (unsigned widening) at least leave the
    4536       pre-widened part marked as (mostly) undefined, and in the best
   4537       case (signed widening) mark the whole widened result as
   4538       undefined.  Anyway, it doesn't matter really, since in this case
   4539       we will replace said value with the default value |valt| using an
   4540       ITE. */
   4541    IRAtom* iftrue2
   4542       = vwiden == Iop_INVALID
   4543            ? iftrue1
   4544            : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
   4545    /* These are the V bits we will return if the load doesn't take
   4546       place. */
   4547    IRAtom* iffalse
   4548       = valt;
   4549    /* Prepare the cond for the ITE.  Convert a NULL cond into
   4550       something that iropt knows how to fold out later. */
   4551    IRAtom* cond
   4552       = guard == NULL  ? mkU1(1)  : guard;
   4553    /* And assemble the final result. */
   4554    return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
   4555 }
   4556 
   4557 
   4558 /* A simpler handler for guarded loads, in which there is no
   4559    conversion operation, and the default V bit return (when the guard
   4560    evaluates to False at runtime) is "all defined".  If there is no
   4561    guard expression or the guard is always TRUE this function behaves
   4562    like expr2vbits_Load.  It is assumed that definedness of GUARD has
   4563    already been checked at the call site. */
   4564 static
   4565 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
   4566                                          IREndness end, IRType ty,
   4567                                          IRAtom* addr, UInt bias,
   4568                                          IRAtom *guard )
   4569 {
   4570    return expr2vbits_Load_guarded_General(
   4571              mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
   4572           );
   4573 }
   4574 
   4575 
   4576 static
   4577 IRAtom* expr2vbits_ITE ( MCEnv* mce,
   4578                          IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
   4579 {
   4580    IRAtom *vbitsC, *vbits0, *vbits1;
   4581    IRType ty;
   4582    /* Given ITE(cond, iftrue,  iffalse),  generate
   4583             ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
   4584       That is, steer the V bits like the originals, but trash the
   4585       result if the steering value is undefined.  This gives
   4586       lazy propagation. */
   4587    tl_assert(isOriginalAtom(mce, cond));
   4588    tl_assert(isOriginalAtom(mce, iftrue));
   4589    tl_assert(isOriginalAtom(mce, iffalse));
   4590 
   4591    vbitsC = expr2vbits(mce, cond);
   4592    vbits1 = expr2vbits(mce, iftrue);
   4593    vbits0 = expr2vbits(mce, iffalse);
   4594    ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
   4595 
   4596    return
   4597       mkUifU(mce, ty, assignNew('V', mce, ty,
   4598                                      IRExpr_ITE(cond, vbits1, vbits0)),
   4599                       mkPCastTo(mce, ty, vbitsC) );
   4600 }
   4601 
   4602 /* --------- This is the main expression-handling function. --------- */
   4603 
   4604 static
   4605 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
   4606 {
   4607    switch (e->tag) {
   4608 
   4609       case Iex_Get:
   4610          return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
   4611 
   4612       case Iex_GetI:
   4613          return shadow_GETI( mce, e->Iex.GetI.descr,
   4614                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
   4615 
   4616       case Iex_RdTmp:
   4617          return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
   4618 
   4619       case Iex_Const:
   4620          return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
   4621 
   4622       case Iex_Qop:
   4623          return expr2vbits_Qop(
   4624                    mce,
   4625                    e->Iex.Qop.details->op,
   4626                    e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
   4627                    e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
   4628                 );
   4629 
   4630       case Iex_Triop:
   4631          return expr2vbits_Triop(
   4632                    mce,
   4633                    e->Iex.Triop.details->op,
   4634                    e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
   4635                    e->Iex.Triop.details->arg3
   4636                 );
   4637 
   4638       case Iex_Binop:
   4639          return expr2vbits_Binop(
   4640                    mce,
   4641                    e->Iex.Binop.op,
   4642                    e->Iex.Binop.arg1, e->Iex.Binop.arg2
   4643                 );
   4644 
   4645       case Iex_Unop:
   4646          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
   4647 
   4648       case Iex_Load:
   4649          return expr2vbits_Load( mce, e->Iex.Load.end,
   4650                                       e->Iex.Load.ty,
   4651                                       e->Iex.Load.addr, 0/*addr bias*/,
   4652                                       NULL/* guard == "always True"*/ );
   4653 
   4654       case Iex_CCall:
   4655          return mkLazyN( mce, e->Iex.CCall.args,
   4656                               e->Iex.CCall.retty,
   4657                               e->Iex.CCall.cee );
   4658 
   4659       case Iex_ITE:
   4660          return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
   4661                                      e->Iex.ITE.iffalse);
   4662 
   4663       default:
   4664          VG_(printf)("\n");
   4665          ppIRExpr(e);
   4666          VG_(printf)("\n");
   4667          VG_(tool_panic)("memcheck: expr2vbits");
   4668    }
   4669 }
   4670 
   4671 /*------------------------------------------------------------*/
   4672 /*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
   4673 /*------------------------------------------------------------*/
   4674 
   4675 /* Widen a value to the host word size. */
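         /* For example, on a 64-bit host an Ity_I16 vbits-value is widened via
            Iop_16Uto32 followed by Iop_32Uto64; on a 32-bit host a single
            Iop_16Uto32 suffices.  The zero padding reads as 'defined' in the
            V-bit encoding. */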
   4676 
   4677 static
   4678 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
   4679 {
   4680    IRType ty, tyH;
   4681 
    4682    /* vatom is a vbits-value and as such can only have a shadow type. */
   4683    tl_assert(isShadowAtom(mce,vatom));
   4684 
   4685    ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
   4686    tyH = mce->hWordTy;
   4687 
   4688    if (tyH == Ity_I32) {
   4689       switch (ty) {
   4690          case Ity_I32:
   4691             return vatom;
   4692          case Ity_I16:
   4693             return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
   4694          case Ity_I8:
   4695             return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
   4696          default:
   4697             goto unhandled;
   4698       }
   4699    } else
   4700    if (tyH == Ity_I64) {
   4701       switch (ty) {
   4702          case Ity_I32:
   4703             return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
   4704          case Ity_I16:
   4705             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   4706                    assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
   4707          case Ity_I8:
   4708             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   4709                    assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
   4710          default:
   4711             goto unhandled;
   4712       }
   4713    } else {
   4714       goto unhandled;
   4715    }
   4716   unhandled:
   4717    VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
   4718    VG_(tool_panic)("zwidenToHostWord");
   4719 }
   4720 
   4721 
   4722 /* Generate a shadow store.  |addr| is always the original address
   4723    atom.  You can pass in either originals or V-bits for the data
   4724    atom, but obviously not both.  This function generates a check for
   4725    the definedness and (indirectly) the validity of |addr|, but only
   4726    when |guard| evaluates to True at run time (or is NULL).
   4727 
   4728    |guard| :: Ity_I1 controls whether the store really happens; NULL
   4729    means it unconditionally does.  Note that |guard| itself is not
   4730    checked for definedness; the caller of this function must do that
   4731    if necessary.
   4732 */
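         /* Illustrative sketch (assumed shape): an unbiased, unguarded 32-bit
            little-endian store of DATA at ADDR becomes roughly

               complainIfUndefined(addr);                          -- check addr
               DIRTY MC_(helperc_STOREV32le)(addr,
                                             zwidenToHostWord(vdata))

            where vdata are DATA's V bits.  V128 stores are split into two
            64-bit helper calls, and V256 stores into four. */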
   4733 static
   4734 void do_shadow_Store ( MCEnv* mce,
   4735                        IREndness end,
   4736                        IRAtom* addr, UInt bias,
   4737                        IRAtom* data, IRAtom* vdata,
   4738                        IRAtom* guard )
   4739 {
   4740    IROp     mkAdd;
   4741    IRType   ty, tyAddr;
   4742    void*    helper = NULL;
   4743    const HChar* hname = NULL;
   4744    IRConst* c;
   4745 
   4746    tyAddr = mce->hWordTy;
   4747    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   4748    tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   4749    tl_assert( end == Iend_LE || end == Iend_BE );
   4750 
   4751    if (data) {
   4752       tl_assert(!vdata);
   4753       tl_assert(isOriginalAtom(mce, data));
   4754       tl_assert(bias == 0);
   4755       vdata = expr2vbits( mce, data );
   4756    } else {
   4757       tl_assert(vdata);
   4758    }
   4759 
   4760    tl_assert(isOriginalAtom(mce,addr));
   4761    tl_assert(isShadowAtom(mce,vdata));
   4762 
   4763    if (guard) {
   4764       tl_assert(isOriginalAtom(mce, guard));
   4765       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   4766    }
   4767 
   4768    ty = typeOfIRExpr(mce->sb->tyenv, vdata);
   4769 
   4770    // If we're not doing undefined value checking, pretend that this value
   4771    // is "all valid".  That lets Vex's optimiser remove some of the V bit
   4772    // shadow computation ops that precede it.
   4773    if (MC_(clo_mc_level) == 1) {
   4774       switch (ty) {
   4775          case Ity_V256: // V256 weirdness -- used four times
   4776                         c = IRConst_V256(V_BITS32_DEFINED); break;
   4777          case Ity_V128: // V128 weirdness -- used twice
   4778                         c = IRConst_V128(V_BITS16_DEFINED); break;
   4779          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
   4780          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
   4781          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
   4782          case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
   4783          default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   4784       }
   4785       vdata = IRExpr_Const( c );
   4786    }
   4787 
   4788    /* First, emit a definedness test for the address.  This also sets
   4789       the address (shadow) to 'defined' following the test.  Both of
   4790       those actions are gated on |guard|. */
   4791    complainIfUndefined( mce, addr, guard );
   4792 
   4793    /* Now decide which helper function to call to write the data V
   4794       bits into shadow memory. */
   4795    if (end == Iend_LE) {
   4796       switch (ty) {
   4797          case Ity_V256: /* we'll use the helper four times */
   4798          case Ity_V128: /* we'll use the helper twice */
   4799          case Ity_I64: helper = &MC_(helperc_STOREV64le);
   4800                        hname = "MC_(helperc_STOREV64le)";
   4801                        break;
   4802          case Ity_I32: helper = &MC_(helperc_STOREV32le);
   4803                        hname = "MC_(helperc_STOREV32le)";
   4804                        break;
   4805          case Ity_I16: helper = &MC_(helperc_STOREV16le);
   4806                        hname = "MC_(helperc_STOREV16le)";
   4807                        break;
   4808          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   4809                        hname = "MC_(helperc_STOREV8)";
   4810                        break;
   4811          default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   4812       }
   4813    } else {
   4814       switch (ty) {
   4815          case Ity_V128: /* we'll use the helper twice */
   4816          case Ity_I64: helper = &MC_(helperc_STOREV64be);
   4817                        hname = "MC_(helperc_STOREV64be)";
   4818                        break;
   4819          case Ity_I32: helper = &MC_(helperc_STOREV32be);
   4820                        hname = "MC_(helperc_STOREV32be)";
   4821                        break;
   4822          case Ity_I16: helper = &MC_(helperc_STOREV16be);
   4823                        hname = "MC_(helperc_STOREV16be)";
   4824                        break;
   4825          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   4826                        hname = "MC_(helperc_STOREV8)";
   4827                        break;
   4828          /* Note, no V256 case here, because no big-endian target that
    4829             we support has 256-bit vectors. */
   4830          default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
   4831       }
   4832    }
   4833 
   4834    if (UNLIKELY(ty == Ity_V256)) {
   4835 
   4836       /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
   4837          Q3 being the most significant lane. */
   4838       /* These are the offsets of the Qs in memory. */
   4839       Int     offQ0, offQ1, offQ2, offQ3;
   4840 
   4841       /* Various bits for constructing the 4 lane helper calls */
   4842       IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
   4843       IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
   4844       IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
   4845       IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
   4846 
   4847       if (end == Iend_LE) {
   4848          offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
   4849       } else {
   4850          offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
   4851       }
   4852 
   4853       eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
   4854       addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
   4855       vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
   4856       diQ0    = unsafeIRDirty_0_N(
   4857                    1/*regparms*/,
   4858                    hname, VG_(fnptr_to_fnentry)( helper ),
   4859                    mkIRExprVec_2( addrQ0, vdataQ0 )
   4860                 );
   4861 
   4862       eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
   4863       addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
   4864       vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
   4865       diQ1    = unsafeIRDirty_0_N(
   4866                    1/*regparms*/,
   4867                    hname, VG_(fnptr_to_fnentry)( helper ),
   4868                    mkIRExprVec_2( addrQ1, vdataQ1 )
   4869                 );
   4870 
   4871       eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
   4872       addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
   4873       vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
   4874       diQ2    = unsafeIRDirty_0_N(
   4875                    1/*regparms*/,
   4876                    hname, VG_(fnptr_to_fnentry)( helper ),
   4877                    mkIRExprVec_2( addrQ2, vdataQ2 )
   4878                 );
   4879 
   4880       eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
   4881       addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
   4882       vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
   4883       diQ3    = unsafeIRDirty_0_N(
   4884                    1/*regparms*/,
   4885                    hname, VG_(fnptr_to_fnentry)( helper ),
   4886                    mkIRExprVec_2( addrQ3, vdataQ3 )
   4887                 );
   4888 
   4889       if (guard)
   4890          diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
   4891 
   4892       setHelperAnns( mce, diQ0 );
   4893       setHelperAnns( mce, diQ1 );
   4894       setHelperAnns( mce, diQ2 );
   4895       setHelperAnns( mce, diQ3 );
   4896       stmt( 'V', mce, IRStmt_Dirty(diQ0) );
   4897       stmt( 'V', mce, IRStmt_Dirty(diQ1) );
   4898       stmt( 'V', mce, IRStmt_Dirty(diQ2) );
   4899       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
   4900 
   4901    }
   4902    else if (UNLIKELY(ty == Ity_V128)) {
   4903 
   4904       /* V128-bit case */
   4905       /* See comment in next clause re 64-bit regparms */
   4906       /* also, need to be careful about endianness */
   4907 
   4908       Int     offLo64, offHi64;
   4909       IRDirty *diLo64, *diHi64;
   4910       IRAtom  *addrLo64, *addrHi64;
   4911       IRAtom  *vdataLo64, *vdataHi64;
   4912       IRAtom  *eBiasLo64, *eBiasHi64;
   4913 
   4914       if (end == Iend_LE) {
   4915          offLo64 = 0;
   4916          offHi64 = 8;
   4917       } else {
   4918          offLo64 = 8;
   4919          offHi64 = 0;
   4920       }
   4921 
   4922       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
   4923       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
   4924       vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
   4925       diLo64    = unsafeIRDirty_0_N(
   4926                      1/*regparms*/,
   4927                      hname, VG_(fnptr_to_fnentry)( helper ),
   4928                      mkIRExprVec_2( addrLo64, vdataLo64 )
   4929                   );
   4930       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
   4931       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
   4932       vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
   4933       diHi64    = unsafeIRDirty_0_N(
   4934                      1/*regparms*/,
   4935                      hname, VG_(fnptr_to_fnentry)( helper ),
   4936                      mkIRExprVec_2( addrHi64, vdataHi64 )
   4937                   );
   4938       if (guard) diLo64->guard = guard;
   4939       if (guard) diHi64->guard = guard;
   4940       setHelperAnns( mce, diLo64 );
   4941       setHelperAnns( mce, diHi64 );
   4942       stmt( 'V', mce, IRStmt_Dirty(diLo64) );
   4943       stmt( 'V', mce, IRStmt_Dirty(diHi64) );
   4944 
   4945    } else {
   4946 
   4947       IRDirty *di;
   4948       IRAtom  *addrAct;
   4949 
   4950       /* 8/16/32/64-bit cases */
   4951       /* Generate the actual address into addrAct. */
   4952       if (bias == 0) {
   4953          addrAct = addr;
   4954       } else {
   4955          IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   4956          addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
   4957       }
   4958 
   4959       if (ty == Ity_I64) {
   4960          /* We can't do this with regparm 2 on 32-bit platforms, since
   4961             the back ends aren't clever enough to handle 64-bit
    4962             regparm args.  Therefore use just one regparm here. */
   4963          di = unsafeIRDirty_0_N(
   4964                  1/*regparms*/,
   4965                  hname, VG_(fnptr_to_fnentry)( helper ),
   4966                  mkIRExprVec_2( addrAct, vdata )
   4967               );
   4968       } else {
   4969          di = unsafeIRDirty_0_N(
   4970                  2/*regparms*/,
   4971                  hname, VG_(fnptr_to_fnentry)( helper ),
   4972                  mkIRExprVec_2( addrAct,
   4973                                 zwidenToHostWord( mce, vdata ))
   4974               );
   4975       }
   4976       if (guard) di->guard = guard;
   4977       setHelperAnns( mce, di );
   4978       stmt( 'V', mce, IRStmt_Dirty(di) );
   4979    }
   4980 
   4981 }
   4982 
   4983 
   4984 /* Do lazy pessimistic propagation through a dirty helper call, by
   4985    looking at the annotations on it.  This is the most complex part of
   4986    Memcheck. */
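         /* In outline, do_shadow_Dirty below (1) checks the call's guard,
            (2) PCasts to Ity_I32 the V bits of every non-masked argument,
            every guest-state slice read (unless marked always-defined) and
            every memory range read, UifUing them all into a single value
            'curr', and then (3) PCasts 'curr' back out to the destination
            temporary, to any guest state written, and to any memory written.
            So undefinedness in any input pessimistically makes every output
            undefined. */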
   4987 
   4988 static IRType szToITy ( Int n )
   4989 {
   4990    switch (n) {
   4991       case 1: return Ity_I8;
   4992       case 2: return Ity_I16;
   4993       case 4: return Ity_I32;
   4994       case 8: return Ity_I64;
   4995       default: VG_(tool_panic)("szToITy(memcheck)");
   4996    }
   4997 }
   4998 
   4999 static
   5000 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
   5001 {
   5002    Int       i, k, n, toDo, gSz, gOff;
   5003    IRAtom    *src, *here, *curr;
   5004    IRType    tySrc, tyDst;
   5005    IRTemp    dst;
   5006    IREndness end;
   5007 
   5008    /* What's the native endianness?  We need to know this. */
   5009 #  if defined(VG_BIGENDIAN)
   5010    end = Iend_BE;
   5011 #  elif defined(VG_LITTLEENDIAN)
   5012    end = Iend_LE;
   5013 #  else
   5014 #    error "Unknown endianness"
   5015 #  endif
   5016 
   5017    /* First check the guard. */
   5018    complainIfUndefined(mce, d->guard, NULL);
   5019 
   5020    /* Now round up all inputs and PCast over them. */
   5021    curr = definedOfType(Ity_I32);
   5022 
   5023    /* Inputs: unmasked args
   5024       Note: arguments are evaluated REGARDLESS of the guard expression */
   5025    for (i = 0; d->args[i]; i++) {
   5026       IRAtom* arg = d->args[i];
   5027       if ( (d->cee->mcx_mask & (1<<i))
   5028            || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
   5029          /* ignore this arg */
   5030       } else {
   5031          here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
   5032          curr = mkUifU32(mce, here, curr);
   5033       }
   5034    }
   5035 
   5036    /* Inputs: guest state that we read. */
   5037    for (i = 0; i < d->nFxState; i++) {
   5038       tl_assert(d->fxState[i].fx != Ifx_None);
   5039       if (d->fxState[i].fx == Ifx_Write)
   5040          continue;
   5041 
   5042       /* Enumerate the described state segments */
   5043       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5044          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5045          gSz  = d->fxState[i].size;
   5046 
   5047          /* Ignore any sections marked as 'always defined'. */
   5048          if (isAlwaysDefd(mce, gOff, gSz)) {
   5049             if (0)
   5050             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   5051                         gOff, gSz);
   5052             continue;
   5053          }
   5054 
   5055          /* This state element is read or modified.  So we need to
   5056             consider it.  If larger than 8 bytes, deal with it in
   5057             8-byte chunks. */
   5058          while (True) {
   5059             tl_assert(gSz >= 0);
   5060             if (gSz == 0) break;
   5061             n = gSz <= 8 ? gSz : 8;
   5062             /* update 'curr' with UifU of the state slice
   5063                gOff .. gOff+n-1 */
   5064             tySrc = szToITy( n );
   5065 
   5066             /* Observe the guard expression. If it is false use an
   5067                all-bits-defined bit pattern */
   5068             IRAtom *cond, *iffalse, *iftrue;
   5069 
   5070             cond    = assignNew('V', mce, Ity_I1, d->guard);
   5071             iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
   5072             iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
   5073             src     = assignNew('V', mce, tySrc,
   5074                                 IRExpr_ITE(cond, iftrue, iffalse));
   5075 
   5076             here = mkPCastTo( mce, Ity_I32, src );
   5077             curr = mkUifU32(mce, here, curr);
   5078             gSz -= n;
   5079             gOff += n;
   5080          }
   5081       }
   5082    }
   5083 
   5084    /* Inputs: memory.  First set up some info needed regardless of
   5085       whether we're doing reads or writes. */
   5086 
   5087    if (d->mFx != Ifx_None) {
   5088       /* Because we may do multiple shadow loads/stores from the same
   5089          base address, it's best to do a single test of its
   5090          definedness right now.  Post-instrumentation optimisation
   5091          should remove all but this test. */
   5092       IRType tyAddr;
   5093       tl_assert(d->mAddr);
   5094       complainIfUndefined(mce, d->mAddr, d->guard);
   5095 
   5096       tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
   5097       tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
   5098       tl_assert(tyAddr == mce->hWordTy); /* not really right */
   5099    }
   5100 
   5101    /* Deal with memory inputs (reads or modifies) */
   5102    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   5103       toDo   = d->mSize;
   5104       /* chew off 32-bit chunks.  We don't care about the endianness
   5105          since it's all going to be condensed down to a single bit,
   5106          but nevertheless choose an endianness which is hopefully
   5107          native to the platform. */
   5108       while (toDo >= 4) {
   5109          here = mkPCastTo(
   5110                    mce, Ity_I32,
   5111                    expr2vbits_Load_guarded_Simple(
   5112                       mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
   5113                 );
   5114          curr = mkUifU32(mce, here, curr);
   5115          toDo -= 4;
   5116       }
   5117       /* chew off 16-bit chunks */
   5118       while (toDo >= 2) {
   5119          here = mkPCastTo(
   5120                    mce, Ity_I32,
   5121                    expr2vbits_Load_guarded_Simple(
   5122                       mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
   5123                 );
   5124          curr = mkUifU32(mce, here, curr);
   5125          toDo -= 2;
   5126       }
   5127       /* chew off the remaining 8-bit chunk, if any */
   5128       if (toDo == 1) {
   5129          here = mkPCastTo(
   5130                    mce, Ity_I32,
   5131                    expr2vbits_Load_guarded_Simple(
   5132                       mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
   5133                 );
   5134          curr = mkUifU32(mce, here, curr);
   5135          toDo -= 1;
   5136       }
   5137       tl_assert(toDo == 0);
   5138    }
   5139 
   5140    /* Whew!  So curr is a 32-bit V-value summarising pessimistically
   5141       all the inputs to the helper.  Now we need to re-distribute the
   5142       results to all destinations. */
   5143 
   5144    /* Outputs: the destination temporary, if there is one. */
   5145    if (d->tmp != IRTemp_INVALID) {
   5146       dst   = findShadowTmpV(mce, d->tmp);
   5147       tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
   5148       assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
   5149    }
   5150 
   5151    /* Outputs: guest state that we write or modify. */
   5152    for (i = 0; i < d->nFxState; i++) {
   5153       tl_assert(d->fxState[i].fx != Ifx_None);
   5154       if (d->fxState[i].fx == Ifx_Read)
   5155          continue;
   5156 
   5157       /* Enumerate the described state segments */
   5158       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5159          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5160          gSz  = d->fxState[i].size;
   5161 
   5162          /* Ignore any sections marked as 'always defined'. */
   5163          if (isAlwaysDefd(mce, gOff, gSz))
   5164             continue;
   5165 
   5166          /* This state element is written or modified.  So we need to
   5167             consider it.  If larger than 8 bytes, deal with it in
   5168             8-byte chunks. */
   5169          while (True) {
   5170             tl_assert(gSz >= 0);
   5171             if (gSz == 0) break;
   5172             n = gSz <= 8 ? gSz : 8;
   5173             /* Write suitably-casted 'curr' to the state slice
   5174                gOff .. gOff+n-1 */
   5175             tyDst = szToITy( n );
   5176             do_shadow_PUT( mce, gOff,
   5177                                 NULL, /* original atom */
   5178                                 mkPCastTo( mce, tyDst, curr ), d->guard );
   5179             gSz -= n;
   5180             gOff += n;
   5181          }
   5182       }
   5183    }
   5184 
   5185    /* Outputs: memory that we write or modify.  Same comments about
   5186       endianness as above apply. */
   5187    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   5188       toDo   = d->mSize;
   5189       /* chew off 32-bit chunks */
   5190       while (toDo >= 4) {
   5191          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5192                           NULL, /* original data */
   5193                           mkPCastTo( mce, Ity_I32, curr ),
   5194                           d->guard );
   5195          toDo -= 4;
   5196       }
   5197       /* chew off 16-bit chunks */
   5198       while (toDo >= 2) {
   5199          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5200                           NULL, /* original data */
   5201                           mkPCastTo( mce, Ity_I16, curr ),
   5202                           d->guard );
   5203          toDo -= 2;
   5204       }
   5205       /* chew off the remaining 8-bit chunk, if any */
   5206       if (toDo == 1) {
   5207          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5208                           NULL, /* original data */
   5209                           mkPCastTo( mce, Ity_I8, curr ),
   5210                           d->guard );
   5211          toDo -= 1;
   5212       }
   5213       tl_assert(toDo == 0);
   5214    }
   5215 
   5216 }
   5217 
   5218 
   5219 /* We have an ABI hint telling us that [base .. base+len-1] is to
   5220    become undefined ("writable").  Generate code to call a helper to
   5221    notify the A/V bit machinery of this fact.
   5222 
   5223    We call
   5224    void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
   5225                                                     Addr nia );
   5226 */
   5227 static
   5228 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
   5229 {
   5230    IRDirty* di;
   5231    /* Minor optimisation: if not doing origin tracking, ignore the
   5232       supplied nia and pass zero instead.  This is on the basis that
   5233       MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
   5234       almost always generate a shorter instruction to put zero into a
   5235       register than any other value. */
   5236    if (MC_(clo_mc_level) < 3)
   5237       nia = mkIRExpr_HWord(0);
   5238 
   5239    di = unsafeIRDirty_0_N(
   5240            0/*regparms*/,
   5241            "MC_(helperc_MAKE_STACK_UNINIT)",
   5242            VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
   5243            mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
   5244         );
   5245    stmt( 'V', mce, IRStmt_Dirty(di) );
   5246 }
   5247 
   5248 
   5249 /* ------ Dealing with IRCAS (big and complex) ------ */
   5250 
   5251 /* FWDS */
   5252 static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
   5253                              IRAtom* baseaddr, Int offset );
   5254 static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
   5255 static void    gen_store_b ( MCEnv* mce, Int szB,
   5256                              IRAtom* baseaddr, Int offset, IRAtom* dataB,
   5257                              IRAtom* guard );
   5258 
   5259 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
   5260 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
   5261 
   5262 
   5263 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
   5264    IRExpr.Consts, else this asserts.  If they are both Consts, it
   5265    doesn't do anything.  So that just leaves the RdTmp case.
   5266 
   5267    In which case: this assigns the shadow value SHADOW to the IR
   5268    shadow temporary associated with ORIG.  That is, ORIG, being an
   5269    original temporary, will have a shadow temporary associated with
   5270    it.  However, in the case envisaged here, there will so far have
   5271    been no IR emitted to actually write a shadow value into that
   5272    temporary.  What this routine does is to (emit IR to) copy the
   5273    value in SHADOW into said temporary, so that after this call,
   5274    IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
   5275    value in SHADOW.
   5276 
   5277    Point is to allow callers to compute "by hand" a shadow value for
   5278    ORIG, and force it to be associated with ORIG.
   5279 
    5280    How do we know that the shadow associated with ORIG has not so far
   5281    been assigned to?  Well, we don't per se know that, but supposing
   5282    it had.  Then this routine would create a second assignment to it,
   5283    and later the IR sanity checker would barf.  But that never
   5284    happens.  QED.
   5285 */
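         /* For example, do_shadow_CAS_single below computes voldLo/boldLo "by
            hand" (via expr2vbits_Load and gen_load_b) and then uses this
            routine to make them the shadow values of cas->oldLo. */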
   5286 static void bind_shadow_tmp_to_orig ( UChar how,
   5287                                       MCEnv* mce,
   5288                                       IRAtom* orig, IRAtom* shadow )
   5289 {
   5290    tl_assert(isOriginalAtom(mce, orig));
   5291    tl_assert(isShadowAtom(mce, shadow));
   5292    switch (orig->tag) {
   5293       case Iex_Const:
   5294          tl_assert(shadow->tag == Iex_Const);
   5295          break;
   5296       case Iex_RdTmp:
   5297          tl_assert(shadow->tag == Iex_RdTmp);
   5298          if (how == 'V') {
   5299             assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
   5300                    shadow);
   5301          } else {
   5302             tl_assert(how == 'B');
   5303             assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
   5304                    shadow);
   5305          }
   5306          break;
   5307       default:
   5308          tl_assert(0);
   5309    }
   5310 }
   5311 
   5312 
   5313 static
   5314 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
   5315 {
   5316    /* Scheme is (both single- and double- cases):
   5317 
   5318       1. fetch data#,dataB (the proposed new value)
   5319 
   5320       2. fetch expd#,expdB (what we expect to see at the address)
   5321 
   5322       3. check definedness of address
   5323 
   5324       4. load old#,oldB from shadow memory; this also checks
    5325          addressability of the address
   5326 
   5327       5. the CAS itself
   5328 
   5329       6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
   5330 
   5331       7. if "expected == old" (as computed by (6))
   5332             store data#,dataB to shadow memory
   5333 
   5334       Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
   5335       'data' but 7 stores 'data#'.  Hence it is possible for the
   5336       shadow data to be incorrectly checked and/or updated:
   5337 
   5338       * 7 is at least gated correctly, since the 'expected == old'
   5339         condition is derived from outputs of 5.  However, the shadow
   5340         write could happen too late: imagine after 5 we are
   5341         descheduled, a different thread runs, writes a different
   5342         (shadow) value at the address, and then we resume, hence
   5343         overwriting the shadow value written by the other thread.
   5344 
   5345       Because the original memory access is atomic, there's no way to
   5346       make both the original and shadow accesses into a single atomic
   5347       thing, hence this is unavoidable.
   5348 
   5349       At least as Valgrind stands, I don't think it's a problem, since
   5350       we're single threaded *and* we guarantee that there are no
   5351       context switches during the execution of any specific superblock
   5352       -- context switches can only happen at superblock boundaries.
   5353 
   5354       If Valgrind ever becomes MT in the future, then it might be more
   5355       of a problem.  A possible kludge would be to artificially
    5356       associate a lock with the location, which we must acquire and
   5357       release around the transaction as a whole.  Hmm, that probably
    5358       wouldn't work properly since it only guards us against other
   5359       threads doing CASs on the same location, not against other
   5360       threads doing normal reads and writes.
   5361 
   5362       ------------------------------------------------------------
   5363 
   5364       COMMENT_ON_CasCmpEQ:
   5365 
   5366       Note two things.  Firstly, in the sequence above, we compute
   5367       "expected == old", but we don't check definedness of it.  Why
   5368       not?  Also, the x86 and amd64 front ends use
   5369       Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
   5370       determination (expected == old ?) for themselves, and we also
   5371       don't check definedness for those primops; we just say that the
   5372       result is defined.  Why?  Details follow.
   5373 
   5374       x86/amd64 contains various forms of locked insns:
    5375       * lock prefix before all basic arithmetic insns;
   5376         eg lock xorl %reg1,(%reg2)
   5377       * atomic exchange reg-mem
   5378       * compare-and-swaps
   5379 
   5380       Rather than attempt to represent them all, which would be a
   5381       royal PITA, I used a result from Maurice Herlihy
   5382       (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
   5383       demonstrates that compare-and-swap is a primitive more general
   5384       than the other two, and so can be used to represent all of them.
   5385       So the translation scheme for (eg) lock incl (%reg) is as
   5386       follows:
   5387 
   5388         again:
   5389          old = * %reg
   5390          new = old + 1
   5391          atomically { if (* %reg == old) { * %reg = new } else { goto again } }
   5392 
   5393       The "atomically" is the CAS bit.  The scheme is always the same:
   5394       get old value from memory, compute new value, atomically stuff
   5395       new value back in memory iff the old value has not changed (iow,
   5396       no other thread modified it in the meantime).  If it has changed
   5397       then we've been out-raced and we have to start over.
   5398 
   5399       Now that's all very neat, but it has the bad side effect of
   5400       introducing an explicit equality test into the translation.
   5401       Consider the behaviour of said code on a memory location which
   5402       is uninitialised.  We will wind up doing a comparison on
   5403       uninitialised data, and mc duly complains.
   5404 
   5405       What's difficult about this is, the common case is that the
   5406       location is uncontended, and so we're usually comparing the same
   5407       value (* %reg) with itself.  So we shouldn't complain even if it
   5408       is undefined.  But mc doesn't know that.
   5409 
   5410       My solution is to mark the == in the IR specially, so as to tell
   5411       mc that it almost certainly compares a value with itself, and we
   5412       should just regard the result as always defined.  Rather than
   5413       add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
   5414       Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
   5415 
   5416       So there's always the question of, can this give a false
   5417       negative?  eg, imagine that initially, * %reg is defined; and we
   5418       read that; but then in the gap between the read and the CAS, a
   5419       different thread writes an undefined (and different) value at
   5420       the location.  Then the CAS in this thread will fail and we will
   5421       go back to "again:", but without knowing that the trip back
   5422       there was based on an undefined comparison.  No matter; at least
   5423       the other thread won the race and the location is correctly
   5424       marked as undefined.  What if it wrote an uninitialised version
   5425       of the same value that was there originally, though?
   5426 
   5427       etc etc.  Seems like there's a small corner case in which we
   5428       might lose the fact that something's defined -- we're out-raced
   5429       in between the "old = * reg" and the "atomically {", _and_ the
   5430       other thread is writing in an undefined version of what's
   5431       already there.  Well, that seems pretty unlikely.
   5432 
   5433       ---
   5434 
   5435       If we ever need to reinstate it .. code which generates a
   5436       definedness test for "expected == old" was removed at r10432 of
   5437       this file.
   5438    */
   5439    if (cas->oldHi == IRTemp_INVALID) {
   5440       do_shadow_CAS_single( mce, cas );
   5441    } else {
   5442       do_shadow_CAS_double( mce, cas );
   5443    }
   5444 }
   5445 
   5446 
   5447 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
   5448 {
   5449    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5450    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5451    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5452    IRAtom *expd_eq_old = NULL;
   5453    IROp   opCasCmpEQ;
   5454    Int    elemSzB;
   5455    IRType elemTy;
   5456    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5457 
   5458    /* single CAS */
   5459    tl_assert(cas->oldHi == IRTemp_INVALID);
   5460    tl_assert(cas->expdHi == NULL);
   5461    tl_assert(cas->dataHi == NULL);
   5462 
   5463    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5464    switch (elemTy) {
   5465       case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
   5466       case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
   5467       case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
   5468       case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
   5469       default: tl_assert(0); /* IR defn disallows any other types */
   5470    }
   5471 
   5472    /* 1. fetch data# (the proposed new value) */
   5473    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5474    vdataLo
   5475       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5476    tl_assert(isShadowAtom(mce, vdataLo));
   5477    if (otrak) {
   5478       bdataLo
   5479          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5480       tl_assert(isShadowAtom(mce, bdataLo));
   5481    }
   5482 
   5483    /* 2. fetch expected# (what we expect to see at the address) */
   5484    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5485    vexpdLo
   5486       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5487    tl_assert(isShadowAtom(mce, vexpdLo));
   5488    if (otrak) {
   5489       bexpdLo
   5490          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5491       tl_assert(isShadowAtom(mce, bexpdLo));
   5492    }
   5493 
   5494    /* 3. check definedness of address */
   5495    /* 4. fetch old# from shadow memory; this also checks
    5496          addressability of the address */
   5497    voldLo
   5498       = assignNew(
   5499            'V', mce, elemTy,
   5500            expr2vbits_Load(
   5501               mce,
   5502               cas->end, elemTy, cas->addr, 0/*Addr bias*/,
   5503               NULL/*always happens*/
   5504         ));
   5505    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5506    if (otrak) {
   5507       boldLo
   5508          = assignNew('B', mce, Ity_I32,
   5509                      gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
   5510       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5511    }
   5512 
   5513    /* 5. the CAS itself */
   5514    stmt( 'C', mce, IRStmt_CAS(cas) );
   5515 
   5516    /* 6. compute "expected == old" */
    5517    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5518    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5519       tree, but it's not copied from the input block. */
   5520    expd_eq_old
   5521       = assignNew('C', mce, Ity_I1,
   5522                   binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
   5523 
   5524    /* 7. if "expected == old"
   5525             store data# to shadow memory */
   5526    do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
   5527                     NULL/*data*/, vdataLo/*vdata*/,
   5528                     expd_eq_old/*guard for store*/ );
   5529    if (otrak) {
   5530       gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
   5531                    bdataLo/*bdata*/,
   5532                    expd_eq_old/*guard for store*/ );
   5533    }
   5534 }
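
         /* Illustrative sketch (not generated verbatim; the temporary names
            are made up): for a 32-bit single CAS, the net effect of steps
            1..7 above is roughly

               t_vdata = <V bits of cas->dataLo>
               t_vexpd = <V bits of cas->expdLo>
               t_vold  = <V bits loaded from the shadow of *cas->addr>
               oldLo#  = t_vold                     (bind shadow of .oldLo)
               <the CAS itself, copied to the output block>
               t_eq    = CasCmpEQ32(expdLo, oldLo)  (treated as always defined)
               if (t_eq) { store t_vdata to the shadow of *cas->addr }

            plus the corresponding B (origin) operations when otrak holds. */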
   5535 
   5536 
   5537 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
   5538 {
   5539    IRAtom *vdataHi = NULL, *bdataHi = NULL;
   5540    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5541    IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
   5542    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5543    IRAtom *voldHi  = NULL, *boldHi  = NULL;
   5544    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5545    IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
   5546    IRAtom *expd_eq_old = NULL, *zero = NULL;
   5547    IROp   opCasCmpEQ, opOr, opXor;
   5548    Int    elemSzB, memOffsLo, memOffsHi;
   5549    IRType elemTy;
   5550    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5551 
   5552    /* double CAS */
   5553    tl_assert(cas->oldHi != IRTemp_INVALID);
   5554    tl_assert(cas->expdHi != NULL);
   5555    tl_assert(cas->dataHi != NULL);
   5556 
   5557    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5558    switch (elemTy) {
   5559       case Ity_I8:
   5560          opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
   5561          elemSzB = 1; zero = mkU8(0);
   5562          break;
   5563       case Ity_I16:
   5564          opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
   5565          elemSzB = 2; zero = mkU16(0);
   5566          break;
   5567       case Ity_I32:
   5568          opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
   5569          elemSzB = 4; zero = mkU32(0);
   5570          break;
   5571       case Ity_I64:
   5572          opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
   5573          elemSzB = 8; zero = mkU64(0);
   5574          break;
   5575       default:
   5576          tl_assert(0); /* IR defn disallows any other types */
   5577    }
   5578 
   5579    /* 1. fetch data# (the proposed new value) */
   5580    tl_assert(isOriginalAtom(mce, cas->dataHi));
   5581    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5582    vdataHi
   5583       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
   5584    vdataLo
   5585       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5586    tl_assert(isShadowAtom(mce, vdataHi));
   5587    tl_assert(isShadowAtom(mce, vdataLo));
   5588    if (otrak) {
   5589       bdataHi
   5590          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
   5591       bdataLo
   5592          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5593       tl_assert(isShadowAtom(mce, bdataHi));
   5594       tl_assert(isShadowAtom(mce, bdataLo));
   5595    }
   5596 
   5597    /* 2. fetch expected# (what we expect to see at the address) */
   5598    tl_assert(isOriginalAtom(mce, cas->expdHi));
   5599    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5600    vexpdHi
   5601       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
   5602    vexpdLo
   5603       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5604    tl_assert(isShadowAtom(mce, vexpdHi));
   5605    tl_assert(isShadowAtom(mce, vexpdLo));
   5606    if (otrak) {
   5607       bexpdHi
   5608          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
   5609       bexpdLo
   5610          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5611       tl_assert(isShadowAtom(mce, bexpdHi));
   5612       tl_assert(isShadowAtom(mce, bexpdLo));
   5613    }
   5614 
   5615    /* 3. check definedness of address */
   5616    /* 4. fetch old# from shadow memory; this also checks
    5617          addressability of the address */
   5618    if (cas->end == Iend_LE) {
   5619       memOffsLo = 0;
   5620       memOffsHi = elemSzB;
   5621    } else {
   5622       tl_assert(cas->end == Iend_BE);
   5623       memOffsLo = elemSzB;
   5624       memOffsHi = 0;
   5625    }
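            /* For example (illustrative): in a little-endian 2 x 32-bit DCAS
               the Lo element lives at addr+0 and the Hi element at addr+4;
               big-endian swaps the two biases. */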
   5626    voldHi
   5627       = assignNew(
   5628            'V', mce, elemTy,
   5629            expr2vbits_Load(
   5630               mce,
   5631               cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
   5632               NULL/*always happens*/
   5633         ));
   5634    voldLo
   5635       = assignNew(
   5636            'V', mce, elemTy,
   5637            expr2vbits_Load(
   5638               mce,
   5639               cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
   5640               NULL/*always happens*/
   5641         ));
   5642    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
   5643    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5644    if (otrak) {
   5645       boldHi
   5646          = assignNew('B', mce, Ity_I32,
   5647                      gen_load_b(mce, elemSzB, cas->addr,
   5648                                 memOffsHi/*addr bias*/));
   5649       boldLo
   5650          = assignNew('B', mce, Ity_I32,
   5651                      gen_load_b(mce, elemSzB, cas->addr,
   5652                                 memOffsLo/*addr bias*/));
   5653       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
   5654       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5655    }
   5656 
   5657    /* 5. the CAS itself */
   5658    stmt( 'C', mce, IRStmt_CAS(cas) );
   5659 
   5660    /* 6. compute "expected == old" */
    5661    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5662    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5663       tree, but it's not copied from the input block. */
   5664    /*
   5665       xHi = oldHi ^ expdHi;
   5666       xLo = oldLo ^ expdLo;
   5667       xHL = xHi | xLo;
   5668       expd_eq_old = xHL == 0;
   5669    */
   5670    xHi = assignNew('C', mce, elemTy,
   5671                    binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
   5672    xLo = assignNew('C', mce, elemTy,
   5673                    binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
   5674    xHL = assignNew('C', mce, elemTy,
   5675                    binop(opOr, xHi, xLo));
   5676    expd_eq_old
   5677       = assignNew('C', mce, Ity_I1,
   5678                   binop(opCasCmpEQ, xHL, zero));
   5679 
   5680    /* 7. if "expected == old"
   5681             store data# to shadow memory */
   5682    do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
   5683                     NULL/*data*/, vdataHi/*vdata*/,
   5684                     expd_eq_old/*guard for store*/ );
   5685    do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
   5686                     NULL/*data*/, vdataLo/*vdata*/,
   5687                     expd_eq_old/*guard for store*/ );
   5688    if (otrak) {
   5689       gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
   5690                    bdataHi/*bdata*/,
   5691                    expd_eq_old/*guard for store*/ );
   5692       gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
   5693                    bdataLo/*bdata*/,
   5694                    expd_eq_old/*guard for store*/ );
   5695    }
   5696 }
   5697 
   5698 
   5699 /* ------ Dealing with LL/SC (not difficult) ------ */
   5700 
   5701 static void do_shadow_LLSC ( MCEnv*    mce,
   5702                              IREndness stEnd,
   5703                              IRTemp    stResult,
   5704                              IRExpr*   stAddr,
   5705                              IRExpr*   stStoredata )
   5706 {
   5707    /* In short: treat a load-linked like a normal load followed by an
   5708       assignment of the loaded (shadow) data to the result temporary.
   5709       Treat a store-conditional like a normal store, and mark the
   5710       result temporary as defined. */
   5711    IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
   5712    IRTemp resTmp = findShadowTmpV(mce, stResult);
   5713 
   5714    tl_assert(isIRAtom(stAddr));
   5715    if (stStoredata)
   5716       tl_assert(isIRAtom(stStoredata));
   5717 
   5718    if (stStoredata == NULL) {
   5719       /* Load Linked */
   5720       /* Just treat this as a normal load, followed by an assignment of
   5721          the value to .result. */
   5722       /* Stay sane */
   5723       tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   5724                 || resTy == Ity_I16 || resTy == Ity_I8);
   5725       assign( 'V', mce, resTmp,
   5726                    expr2vbits_Load(
   5727                       mce, stEnd, resTy, stAddr, 0/*addr bias*/,
   5728                       NULL/*always happens*/) );
   5729    } else {
   5730       /* Store Conditional */
   5731       /* Stay sane */
   5732       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
   5733                                    stStoredata);
   5734       tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
   5735                 || dataTy == Ity_I16 || dataTy == Ity_I8);
   5736       do_shadow_Store( mce, stEnd,
   5737                             stAddr, 0/* addr bias */,
   5738                             stStoredata,
   5739                             NULL /* shadow data */,
   5740                             NULL/*guard*/ );
   5741       /* This is a store conditional, so it writes to .result a value
   5742          indicating whether or not the store succeeded.  Just claim
   5743          this value is always defined.  In the PowerPC interpretation
   5744          of store-conditional, definedness of the success indication
   5745          depends on whether the address of the store matches the
   5746          reservation address.  But we can't tell that here (and
   5747          anyway, we're not being PowerPC-specific).  At least we are
   5748          guaranteed that the definedness of the store address, and its
    5749          addressability, will be checked as per normal.  So it seems
   5750          pretty safe to just say that the success indication is always
   5751          defined.
   5752 
   5753          In schemeS, for origin tracking, we must correspondingly set
   5754          a no-origin value for the origin shadow of .result.
   5755       */
   5756       tl_assert(resTy == Ity_I1);
   5757       assign( 'V', mce, resTmp, definedOfType(resTy) );
   5758    }
   5759 }
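
         /* Illustrative sketch (made-up names; "#" denotes the V-bit shadow):
            for a 32-bit load-linked "result = LL(addr)" the above produces
            roughly

               result# = <V bits loaded from the shadow of *addr>

            and for the matching store-conditional "result = SC(addr, data)"

               <shadow store of data# to the shadow of *addr>
               result# = 0   (success flag claimed always defined)
         */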
   5760 
   5761 
   5762 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   5763 
   5764 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
   5765 {
   5766    complainIfUndefined(mce, sg->guard, NULL);
   5767    /* do_shadow_Store will generate code to check the definedness and
   5768       validity of sg->addr, in the case where sg->guard evaluates to
   5769       True at run-time. */
   5770    do_shadow_Store( mce, sg->end,
   5771                     sg->addr, 0/* addr bias */,
   5772                     sg->data,
   5773                     NULL /* shadow data */,
   5774                     sg->guard );
   5775 }
   5776 
   5777 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
   5778 {
   5779    complainIfUndefined(mce, lg->guard, NULL);
   5780    /* expr2vbits_Load_guarded_General will generate code to check the
   5781       definedness and validity of lg->addr, in the case where
   5782       lg->guard evaluates to True at run-time. */
   5783 
   5784    /* Look at the LoadG's built-in conversion operation, to determine
   5785       the source (actual loaded data) type, and the equivalent IROp.
   5786       NOTE that implicitly we are taking a widening operation to be
   5787       applied to original atoms and producing one that applies to V
   5788       bits.  Since signed and unsigned widening are self-shadowing,
   5789       this is a straight copy of the op (modulo swapping from the
   5790       IRLoadGOp form to the IROp form).  Note also therefore that this
   5791       implicitly duplicates the logic to do with said widening ops in
   5792       expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
   5793    IROp   vwiden   = Iop_INVALID;
   5794    IRType loadedTy = Ity_INVALID;
   5795    switch (lg->cvt) {
   5796       case ILGop_Ident32: loadedTy = Ity_I32; vwiden = Iop_INVALID; break;
   5797       case ILGop_16Uto32: loadedTy = Ity_I16; vwiden = Iop_16Uto32; break;
   5798       case ILGop_16Sto32: loadedTy = Ity_I16; vwiden = Iop_16Sto32; break;
   5799       case ILGop_8Uto32:  loadedTy = Ity_I8;  vwiden = Iop_8Uto32;  break;
   5800       case ILGop_8Sto32:  loadedTy = Ity_I8;  vwiden = Iop_8Sto32;  break;
   5801       default: VG_(tool_panic)("do_shadow_LoadG");
   5802    }
   5803 
   5804    IRAtom* vbits_alt
   5805       = expr2vbits( mce, lg->alt );
   5806    IRAtom* vbits_final
   5807       = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
   5808                                         lg->addr, 0/*addr bias*/,
   5809                                         lg->guard, vwiden, vbits_alt );
   5810    /* And finally, bind the V bits to the destination temporary. */
   5811    assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
   5812 }
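
         /* Illustrative sketch only (the details live in
            expr2vbits_Load_guarded_General): for
            "dst = if (guard) 16Uto32(LD:I16(addr)) else alt" the intent is,
            conceptually,

               dst# = guard ? 16Uto32(<V bits of the 16-bit load>) : alt#

            with the address checks for the load performed only when guard
            evaluates to true. */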
   5813 
   5814 
   5815 /*------------------------------------------------------------*/
   5816 /*--- Memcheck main                                        ---*/
   5817 /*------------------------------------------------------------*/
   5818 
   5819 static void schemeS ( MCEnv* mce, IRStmt* st );
   5820 
   5821 static Bool isBogusAtom ( IRAtom* at )
   5822 {
   5823    ULong n = 0;
   5824    IRConst* con;
   5825    tl_assert(isIRAtom(at));
   5826    if (at->tag == Iex_RdTmp)
   5827       return False;
   5828    tl_assert(at->tag == Iex_Const);
   5829    con = at->Iex.Const.con;
   5830    switch (con->tag) {
   5831       case Ico_U1:   return False;
   5832       case Ico_U8:   n = (ULong)con->Ico.U8; break;
   5833       case Ico_U16:  n = (ULong)con->Ico.U16; break;
   5834       case Ico_U32:  n = (ULong)con->Ico.U32; break;
   5835       case Ico_U64:  n = (ULong)con->Ico.U64; break;
   5836       case Ico_F64:  return False;
   5837       case Ico_F32i: return False;
   5838       case Ico_F64i: return False;
   5839       case Ico_V128: return False;
   5840       case Ico_V256: return False;
   5841       default: ppIRExpr(at); tl_assert(0);
   5842    }
   5843    /* VG_(printf)("%llx\n", n); */
   5844    return (/*32*/    n == 0xFEFEFEFFULL
   5845            /*32*/ || n == 0x80808080ULL
   5846            /*32*/ || n == 0x7F7F7F7FULL
   5847            /*32*/ || n == 0x7EFEFEFFULL
   5848            /*32*/ || n == 0x81010100ULL
   5849            /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
   5850            /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
   5851            /*64*/ || n == 0x0000000000008080ULL
   5852            /*64*/ || n == 0x8080808080808080ULL
   5853            /*64*/ || n == 0x0101010101010101ULL
   5854           );
   5855 }
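
         /* Aside: these literals look like the magic constants used by
            word-at-a-time string/memory routines.  For example, the classic
            32-bit zero-byte test

               (x + 0xFEFEFEFF) & ~x & 0x80808080   (0xFEFEFEFF == -0x01010101)

            can legitimately touch words whose upper bytes are undefined.
            Spotting such literals is what switches on the extra-expensive,
            more precise instrumentation via mce.bogusLiterals; see
            MC_(instrument) below. */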
   5856 
   5857 static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
   5858 {
   5859    Int      i;
   5860    IRExpr*  e;
   5861    IRDirty* d;
   5862    IRCAS*   cas;
   5863    switch (st->tag) {
   5864       case Ist_WrTmp:
   5865          e = st->Ist.WrTmp.data;
   5866          switch (e->tag) {
   5867             case Iex_Get:
   5868             case Iex_RdTmp:
   5869                return False;
   5870             case Iex_Const:
   5871                return isBogusAtom(e);
   5872             case Iex_Unop:
   5873                return isBogusAtom(e->Iex.Unop.arg)
   5874                       || e->Iex.Unop.op == Iop_GetMSBs8x16;
   5875             case Iex_GetI:
   5876                return isBogusAtom(e->Iex.GetI.ix);
   5877             case Iex_Binop:
   5878                return isBogusAtom(e->Iex.Binop.arg1)
   5879                       || isBogusAtom(e->Iex.Binop.arg2);
   5880             case Iex_Triop:
   5881                return isBogusAtom(e->Iex.Triop.details->arg1)
   5882                       || isBogusAtom(e->Iex.Triop.details->arg2)
   5883                       || isBogusAtom(e->Iex.Triop.details->arg3);
   5884             case Iex_Qop:
   5885                return isBogusAtom(e->Iex.Qop.details->arg1)
   5886                       || isBogusAtom(e->Iex.Qop.details->arg2)
   5887                       || isBogusAtom(e->Iex.Qop.details->arg3)
   5888                       || isBogusAtom(e->Iex.Qop.details->arg4);
   5889             case Iex_ITE:
   5890                return isBogusAtom(e->Iex.ITE.cond)
   5891                       || isBogusAtom(e->Iex.ITE.iftrue)
   5892                       || isBogusAtom(e->Iex.ITE.iffalse);
   5893             case Iex_Load:
   5894                return isBogusAtom(e->Iex.Load.addr);
   5895             case Iex_CCall:
   5896                for (i = 0; e->Iex.CCall.args[i]; i++)
   5897                   if (isBogusAtom(e->Iex.CCall.args[i]))
   5898                      return True;
   5899                return False;
   5900             default:
   5901                goto unhandled;
   5902          }
   5903       case Ist_Dirty:
   5904          d = st->Ist.Dirty.details;
   5905          for (i = 0; d->args[i]; i++) {
   5906             IRAtom* atom = d->args[i];
   5907             if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(atom))) {
   5908                if (isBogusAtom(atom))
   5909                   return True;
   5910             }
   5911          }
   5912          if (isBogusAtom(d->guard))
   5913             return True;
   5914          if (d->mAddr && isBogusAtom(d->mAddr))
   5915             return True;
   5916          return False;
   5917       case Ist_Put:
   5918          return isBogusAtom(st->Ist.Put.data);
   5919       case Ist_PutI:
   5920          return isBogusAtom(st->Ist.PutI.details->ix)
   5921                 || isBogusAtom(st->Ist.PutI.details->data);
   5922       case Ist_Store:
   5923          return isBogusAtom(st->Ist.Store.addr)
   5924                 || isBogusAtom(st->Ist.Store.data);
   5925       case Ist_StoreG: {
   5926          IRStoreG* sg = st->Ist.StoreG.details;
   5927          return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
   5928                 || isBogusAtom(sg->guard);
   5929       }
   5930       case Ist_LoadG: {
   5931          IRLoadG* lg = st->Ist.LoadG.details;
   5932          return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
   5933                 || isBogusAtom(lg->guard);
   5934       }
   5935       case Ist_Exit:
   5936          return isBogusAtom(st->Ist.Exit.guard);
   5937       case Ist_AbiHint:
   5938          return isBogusAtom(st->Ist.AbiHint.base)
   5939                 || isBogusAtom(st->Ist.AbiHint.nia);
   5940       case Ist_NoOp:
   5941       case Ist_IMark:
   5942       case Ist_MBE:
   5943          return False;
   5944       case Ist_CAS:
   5945          cas = st->Ist.CAS.details;
   5946          return isBogusAtom(cas->addr)
   5947                 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
   5948                 || isBogusAtom(cas->expdLo)
   5949                 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
   5950                 || isBogusAtom(cas->dataLo);
   5951       case Ist_LLSC:
   5952          return isBogusAtom(st->Ist.LLSC.addr)
   5953                 || (st->Ist.LLSC.storedata
   5954                        ? isBogusAtom(st->Ist.LLSC.storedata)
   5955                        : False);
   5956       default:
   5957       unhandled:
   5958          ppIRStmt(st);
   5959          VG_(tool_panic)("hasBogusLiterals");
   5960    }
   5961 }
   5962 
   5963 
   5964 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
   5965                         IRSB* sb_in,
   5966                         VexGuestLayout* layout,
   5967                         VexGuestExtents* vge,
   5968                         VexArchInfo* archinfo_host,
   5969                         IRType gWordTy, IRType hWordTy )
   5970 {
   5971    Bool    verboze = 0||False;
   5972    Bool    bogus;
   5973    Int     i, j, first_stmt;
   5974    IRStmt* st;
   5975    MCEnv   mce;
   5976    IRSB*   sb_out;
   5977 
   5978    if (gWordTy != hWordTy) {
   5979       /* We don't currently support this case. */
   5980       VG_(tool_panic)("host/guest word size mismatch");
   5981    }
   5982 
   5983    /* Check we're not completely nuts */
   5984    tl_assert(sizeof(UWord)  == sizeof(void*));
   5985    tl_assert(sizeof(Word)   == sizeof(void*));
   5986    tl_assert(sizeof(Addr)   == sizeof(void*));
   5987    tl_assert(sizeof(ULong)  == 8);
   5988    tl_assert(sizeof(Long)   == 8);
   5989    tl_assert(sizeof(Addr64) == 8);
   5990    tl_assert(sizeof(UInt)   == 4);
   5991    tl_assert(sizeof(Int)    == 4);
   5992 
   5993    tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
   5994 
   5995    /* Set up SB */
   5996    sb_out = deepCopyIRSBExceptStmts(sb_in);
   5997 
   5998    /* Set up the running environment.  Both .sb and .tmpMap are
   5999       modified as we go along.  Note that tmps are added to both
   6000       .sb->tyenv and .tmpMap together, so the valid index-set for
   6001       those two arrays should always be identical. */
   6002    VG_(memset)(&mce, 0, sizeof(mce));
   6003    mce.sb             = sb_out;
   6004    mce.trace          = verboze;
   6005    mce.layout         = layout;
   6006    mce.hWordTy        = hWordTy;
   6007    mce.bogusLiterals  = False;
   6008 
   6009    /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
   6010       Darwin.  10.7 is mostly built with LLVM, which uses these for
   6011       bitfield inserts, and we get a lot of false errors if the cheap
   6012       interpretation is used, alas.  Could solve this much better if
   6013       we knew which of such adds came from x86/amd64 LEA instructions,
   6014       since these are the only ones really needing the expensive
   6015       interpretation, but that would require some way to tag them in
   6016       the _toIR.c front ends, which is a lot of faffing around.  So
   6017       for now just use the slow and blunt-instrument solution. */
   6018    mce.useLLVMworkarounds = False;
   6019 #  if defined(VGO_darwin)
   6020    mce.useLLVMworkarounds = True;
   6021 #  endif
   6022 
   6023    mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
   6024                             sizeof(TempMapEnt));
   6025    for (i = 0; i < sb_in->tyenv->types_used; i++) {
   6026       TempMapEnt ent;
   6027       ent.kind    = Orig;
   6028       ent.shadowV = IRTemp_INVALID;
   6029       ent.shadowB = IRTemp_INVALID;
   6030       VG_(addToXA)( mce.tmpMap, &ent );
   6031    }
   6032    tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
   6033 
   6034    /* Make a preliminary inspection of the statements, to see if there
   6035       are any dodgy-looking literals.  If there are, we generate
   6036       extra-detailed (hence extra-expensive) instrumentation in
    6037       places.  Scan the whole bb even if dodginess is found earlier,
   6038       so that the flatness assertion is applied to all stmts. */
   6039 
   6040    bogus = False;
   6041 
   6042    for (i = 0; i < sb_in->stmts_used; i++) {
   6043 
   6044       st = sb_in->stmts[i];
   6045       tl_assert(st);
   6046       tl_assert(isFlatIRStmt(st));
   6047 
   6048       if (!bogus) {
   6049          bogus = checkForBogusLiterals(st);
   6050          if (0 && bogus) {
   6051             VG_(printf)("bogus: ");
   6052             ppIRStmt(st);
   6053             VG_(printf)("\n");
   6054          }
   6055       }
   6056 
   6057    }
   6058 
   6059    mce.bogusLiterals = bogus;
   6060 
   6061    /* Copy verbatim any IR preamble preceding the first IMark */
   6062 
   6063    tl_assert(mce.sb == sb_out);
   6064    tl_assert(mce.sb != sb_in);
   6065 
   6066    i = 0;
   6067    while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
   6068 
   6069       st = sb_in->stmts[i];
   6070       tl_assert(st);
   6071       tl_assert(isFlatIRStmt(st));
   6072 
   6073       stmt( 'C', &mce, sb_in->stmts[i] );
   6074       i++;
   6075    }
   6076 
   6077    /* Nasty problem.  IR optimisation of the pre-instrumented IR may
   6078       cause the IR following the preamble to contain references to IR
   6079       temporaries defined in the preamble.  Because the preamble isn't
   6080       instrumented, these temporaries don't have any shadows.
   6081       Nevertheless uses of them following the preamble will cause
   6082       memcheck to generate references to their shadows.  End effect is
   6083       to cause IR sanity check failures, due to references to
   6084       non-existent shadows.  This is only evident for the complex
   6085       preambles used for function wrapping on TOC-afflicted platforms
   6086       (ppc64-linux).
   6087 
   6088       The following loop therefore scans the preamble looking for
   6089       assignments to temporaries.  For each one found it creates an
   6090       assignment to the corresponding (V) shadow temp, marking it as
   6091       'defined'.  This is the same resulting IR as if the main
    6092          instrumentation loop (below) had been applied to the statement
   6093       'tmp = CONSTANT'.
   6094 
   6095       Similarly, if origin tracking is enabled, we must generate an
   6096       assignment for the corresponding origin (B) shadow, claiming
   6097       no-origin, as appropriate for a defined value.
   6098    */
   6099    for (j = 0; j < i; j++) {
   6100       if (sb_in->stmts[j]->tag == Ist_WrTmp) {
   6101          /* findShadowTmpV checks its arg is an original tmp;
   6102             no need to assert that here. */
   6103          IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
   6104          IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
   6105          IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
   6106          assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
   6107          if (MC_(clo_mc_level) == 3) {
   6108             IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
   6109             tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
   6110             assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
   6111          }
   6112          if (0) {
   6113             VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
   6114             ppIRType( ty_v );
   6115             VG_(printf)("\n");
   6116          }
   6117       }
   6118    }
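            /* For example (illustrative, made-up shadow names): if the
               preamble contains "t5 = GET:I64(...)", the loop above emits
               "t5# = 0x0:I64" (all V bits defined) and, when
               MC_(clo_mc_level) == 3, "t5_b = 0x0:I32" (no origin). */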
   6119 
   6120    /* Iterate over the remaining stmts to generate instrumentation. */
   6121 
   6122    tl_assert(sb_in->stmts_used > 0);
   6123    tl_assert(i >= 0);
   6124    tl_assert(i < sb_in->stmts_used);
   6125    tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
   6126 
   6127    for (/* use current i*/; i < sb_in->stmts_used; i++) {
   6128 
   6129       st = sb_in->stmts[i];
   6130       first_stmt = sb_out->stmts_used;
   6131 
   6132       if (verboze) {
   6133          VG_(printf)("\n");
   6134          ppIRStmt(st);
   6135          VG_(printf)("\n");
   6136       }
   6137 
   6138       if (MC_(clo_mc_level) == 3) {
   6139          /* See comments on case Ist_CAS below. */
   6140          if (st->tag != Ist_CAS)
   6141             schemeS( &mce, st );
   6142       }
   6143 
   6144       /* Generate instrumentation code for each stmt ... */
   6145 
   6146       switch (st->tag) {
   6147 
   6148          case Ist_WrTmp:
   6149             assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
   6150                                expr2vbits( &mce, st->Ist.WrTmp.data) );
   6151             break;
   6152 
   6153          case Ist_Put:
   6154             do_shadow_PUT( &mce,
   6155                            st->Ist.Put.offset,
   6156                            st->Ist.Put.data,
   6157                            NULL /* shadow atom */, NULL /* guard */ );
   6158             break;
   6159 
   6160          case Ist_PutI:
   6161             do_shadow_PUTI( &mce, st->Ist.PutI.details);
   6162             break;
   6163 
   6164          case Ist_Store:
   6165             do_shadow_Store( &mce, st->Ist.Store.end,
   6166                                    st->Ist.Store.addr, 0/* addr bias */,
   6167                                    st->Ist.Store.data,
   6168                                    NULL /* shadow data */,
   6169                                    NULL/*guard*/ );
   6170             break;
   6171 
   6172          case Ist_StoreG:
   6173             do_shadow_StoreG( &mce, st->Ist.StoreG.details );
   6174             break;
   6175 
   6176          case Ist_LoadG:
   6177             do_shadow_LoadG( &mce, st->Ist.LoadG.details );
   6178             break;
   6179 
   6180          case Ist_Exit:
   6181             complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
   6182             break;
   6183 
   6184          case Ist_IMark:
   6185             break;
   6186 
   6187          case Ist_NoOp:
   6188          case Ist_MBE:
   6189             break;
   6190 
   6191          case Ist_Dirty:
   6192             do_shadow_Dirty( &mce, st->Ist.Dirty.details );
   6193             break;
   6194 
   6195          case Ist_AbiHint:
   6196             do_AbiHint( &mce, st->Ist.AbiHint.base,
   6197                               st->Ist.AbiHint.len,
   6198                               st->Ist.AbiHint.nia );
   6199             break;
   6200 
   6201          case Ist_CAS:
   6202             do_shadow_CAS( &mce, st->Ist.CAS.details );
   6203             /* Note, do_shadow_CAS copies the CAS itself to the output
   6204                block, because it needs to add instrumentation both
   6205                before and after it.  Hence skip the copy below.  Also
   6206                skip the origin-tracking stuff (call to schemeS) above,
   6207                since that's all tangled up with it too; do_shadow_CAS
   6208                does it all. */
   6209             break;
   6210 
   6211          case Ist_LLSC:
   6212             do_shadow_LLSC( &mce,
   6213                             st->Ist.LLSC.end,
   6214                             st->Ist.LLSC.result,
   6215                             st->Ist.LLSC.addr,
   6216                             st->Ist.LLSC.storedata );
   6217             break;
   6218 
   6219          default:
   6220             VG_(printf)("\n");
   6221             ppIRStmt(st);
   6222             VG_(printf)("\n");
   6223             VG_(tool_panic)("memcheck: unhandled IRStmt");
   6224 
   6225       } /* switch (st->tag) */
   6226 
   6227       if (0 && verboze) {
   6228          for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6229             VG_(printf)("   ");
   6230             ppIRStmt(sb_out->stmts[j]);
   6231             VG_(printf)("\n");
   6232          }
   6233          VG_(printf)("\n");
   6234       }
   6235 
   6236       /* ... and finally copy the stmt itself to the output.  Except,
   6237          skip the copy of IRCASs; see comments on case Ist_CAS
   6238          above. */
   6239       if (st->tag != Ist_CAS)
   6240          stmt('C', &mce, st);
   6241    }
   6242 
   6243    /* Now we need to complain if the jump target is undefined. */
   6244    first_stmt = sb_out->stmts_used;
   6245 
   6246    if (verboze) {
   6247       VG_(printf)("sb_in->next = ");
   6248       ppIRExpr(sb_in->next);
   6249       VG_(printf)("\n\n");
   6250    }
   6251 
   6252    complainIfUndefined( &mce, sb_in->next, NULL );
   6253 
   6254    if (0 && verboze) {
   6255       for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6256          VG_(printf)("   ");
   6257          ppIRStmt(sb_out->stmts[j]);
   6258          VG_(printf)("\n");
   6259       }
   6260       VG_(printf)("\n");
   6261    }
   6262 
   6263    /* If this fails, there's been some serious snafu with tmp management,
    6264       which should be investigated. */
   6265    tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   6266    VG_(deleteXA)( mce.tmpMap );
   6267 
   6268    tl_assert(mce.sb == sb_out);
   6269    return sb_out;
   6270 }
   6271 
   6272 /*------------------------------------------------------------*/
   6273 /*--- Post-tree-build final tidying                        ---*/
   6274 /*------------------------------------------------------------*/
   6275 
   6276 /* This exploits the observation that Memcheck often produces
   6277    repeated conditional calls of the form
   6278 
   6279    Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
   6280 
   6281    with the same guard expression G guarding the same helper call.
   6282    The second and subsequent calls are redundant.  This usually
   6283    results from instrumentation of guest code containing multiple
   6284    memory references at different constant offsets from the same base
   6285    register.  After optimisation of the instrumentation, you get a
   6286    test for the definedness of the base register for each memory
   6287    reference, which is kinda pointless.  MC_(final_tidy) therefore
   6288    looks for such repeated calls and removes all but the first. */
   6289 
   6290 /* A struct for recording which (helper, guard) pairs we have already
   6291    seen. */
   6292 typedef
   6293    struct { void* entry; IRExpr* guard; }
   6294    Pair;
   6295 
   6296 /* Return True if e1 and e2 definitely denote the same value (used to
   6297    compare guards).  Return False if unknown; False is the safe
   6298    answer.  Since guest registers and guest memory do not have the
   6299    SSA property we must return False if any Gets or Loads appear in
   6300    the expression. */
   6301 
   6302 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
   6303 {
   6304    if (e1->tag != e2->tag)
   6305       return False;
   6306    switch (e1->tag) {
   6307       case Iex_Const:
   6308          return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
   6309       case Iex_Binop:
   6310          return e1->Iex.Binop.op == e2->Iex.Binop.op
   6311                 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
   6312                 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
   6313       case Iex_Unop:
   6314          return e1->Iex.Unop.op == e2->Iex.Unop.op
   6315                 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
   6316       case Iex_RdTmp:
   6317          return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
   6318       case Iex_ITE:
   6319          return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
   6320                 && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
   6321                 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
   6322       case Iex_Qop:
   6323       case Iex_Triop:
   6324       case Iex_CCall:
   6325          /* be lazy.  Could define equality for these, but they never
   6326             appear to be used. */
   6327          return False;
   6328       case Iex_Get:
   6329       case Iex_GetI:
   6330       case Iex_Load:
   6331          /* be conservative - these may not give the same value each
   6332             time */
   6333          return False;
   6334       case Iex_Binder:
   6335          /* should never see this */
   6336          /* fallthrough */
   6337       default:
   6338          VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
   6339          ppIRExpr(e1);
   6340          VG_(tool_panic)("memcheck:sameIRValue");
   6341          return False;
   6342    }
   6343 }
   6344 
   6345 /* See if 'pairs' already has an entry for (entry, guard).  Return
   6346    True if so.  If not, add an entry. */
   6347 
   6348 static
   6349 Bool check_or_add ( XArray* /*of Pair*/ pairs, IRExpr* guard, void* entry )
   6350 {
   6351    Pair  p;
   6352    Pair* pp;
   6353    Int   i, n = VG_(sizeXA)( pairs );
   6354    for (i = 0; i < n; i++) {
   6355       pp = VG_(indexXA)( pairs, i );
   6356       if (pp->entry == entry && sameIRValue(pp->guard, guard))
   6357          return True;
   6358    }
   6359    p.guard = guard;
   6360    p.entry = entry;
   6361    VG_(addToXA)( pairs, &p );
   6362    return False;
   6363 }
   6364 
   6365 static Bool is_helperc_value_checkN_fail ( const HChar* name )
   6366 {
   6367    return
   6368       0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
   6369       || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
   6370       || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
   6371       || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
   6372       || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
   6373       || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
   6374       || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
   6375       || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
   6376 }
   6377 
   6378 IRSB* MC_(final_tidy) ( IRSB* sb_in )
   6379 {
   6380    Int i;
   6381    IRStmt*   st;
   6382    IRDirty*  di;
   6383    IRExpr*   guard;
   6384    IRCallee* cee;
   6385    Bool      alreadyPresent;
   6386    XArray*   pairs = VG_(newXA)( VG_(malloc), "mc.ft.1",
   6387                                  VG_(free), sizeof(Pair) );
   6388    /* Scan forwards through the statements.  Each time a call to one
   6389       of the relevant helpers is seen, check if we have made a
   6390       previous call to the same helper using the same guard
   6391       expression, and if so, delete the call. */
   6392    for (i = 0; i < sb_in->stmts_used; i++) {
   6393       st = sb_in->stmts[i];
   6394       tl_assert(st);
   6395       if (st->tag != Ist_Dirty)
   6396          continue;
   6397       di = st->Ist.Dirty.details;
   6398       guard = di->guard;
   6399       tl_assert(guard);
   6400       if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
   6401       cee = di->cee;
   6402       if (!is_helperc_value_checkN_fail( cee->name ))
   6403          continue;
   6404        /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
   6405           guard 'guard'.  Check if we have already seen a call to this
   6406           function with the same guard.  If so, delete it.  If not,
   6407           add it to the set of calls we do know about. */
   6408       alreadyPresent = check_or_add( pairs, guard, cee->addr );
   6409       if (alreadyPresent) {
   6410          sb_in->stmts[i] = IRStmt_NoOp();
   6411          if (0) VG_(printf)("XX\n");
   6412       }
   6413    }
   6414    VG_(deleteXA)( pairs );
   6415    return sb_in;
   6416 }
   6417 
   6418 
   6419 /*------------------------------------------------------------*/
   6420 /*--- Origin tracking stuff                                ---*/
   6421 /*------------------------------------------------------------*/
   6422 
   6423 /* Almost identical to findShadowTmpV. */
   6424 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
   6425 {
   6426    TempMapEnt* ent;
   6427    /* VG_(indexXA) range-checks 'orig', hence no need to check
   6428       here. */
   6429    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6430    tl_assert(ent->kind == Orig);
   6431    if (ent->shadowB == IRTemp_INVALID) {
   6432       IRTemp tmpB
   6433         = newTemp( mce, Ity_I32, BSh );
   6434       /* newTemp may cause mce->tmpMap to resize, hence previous results
   6435          from VG_(indexXA) are invalid. */
   6436       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6437       tl_assert(ent->kind == Orig);
   6438       tl_assert(ent->shadowB == IRTemp_INVALID);
   6439       ent->shadowB = tmpB;
   6440    }
   6441    return ent->shadowB;
   6442 }
   6443 
   6444 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
   6445 {
   6446    return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
   6447 }
   6448 
   6449 
   6450 /* Make a guarded origin load, with no special handling in the
   6451    didn't-happen case.  A GUARD of NULL is assumed to mean "always
   6452    True".
   6453 
   6454    Generate IR to do a shadow origins load from BASEADDR+OFFSET and
   6455    return the otag.  The loaded size is SZB.  If GUARD evaluates to
    6456    False at run time then the returned otag is not meaningful; see the
    6457    note in the function body about the didn't-happen return value. */
   6458 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
   6459                                     IRAtom* baseaddr,
   6460                                     Int offset, IRExpr* guard )
   6461 {
   6462    void*    hFun;
   6463    const HChar* hName;
   6464    IRTemp   bTmp;
   6465    IRDirty* di;
   6466    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6467    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6468    IRAtom*  ea    = baseaddr;
   6469    if (offset != 0) {
   6470       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6471                                    : mkU64( (Long)(Int)offset );
   6472       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
   6473    }
   6474    bTmp = newTemp(mce, mce->hWordTy, BSh);
   6475 
   6476    switch (szB) {
   6477       case 1: hFun  = (void*)&MC_(helperc_b_load1);
   6478               hName = "MC_(helperc_b_load1)";
   6479               break;
   6480       case 2: hFun  = (void*)&MC_(helperc_b_load2);
   6481               hName = "MC_(helperc_b_load2)";
   6482               break;
   6483       case 4: hFun  = (void*)&MC_(helperc_b_load4);
   6484               hName = "MC_(helperc_b_load4)";
   6485               break;
   6486       case 8: hFun  = (void*)&MC_(helperc_b_load8);
   6487               hName = "MC_(helperc_b_load8)";
   6488               break;
   6489       case 16: hFun  = (void*)&MC_(helperc_b_load16);
   6490                hName = "MC_(helperc_b_load16)";
   6491                break;
   6492       case 32: hFun  = (void*)&MC_(helperc_b_load32);
   6493                hName = "MC_(helperc_b_load32)";
   6494                break;
   6495       default:
   6496          VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
   6497          tl_assert(0);
   6498    }
   6499    di = unsafeIRDirty_1_N(
   6500            bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
   6501            mkIRExprVec_1( ea )
   6502         );
   6503    if (guard) {
   6504       di->guard = guard;
   6505       /* Ideally the didn't-happen return value here would be
   6506          all-zeroes (unknown-origin), so it'd be harmless if it got
    6507          used inadvertently.  We slum it out with the IR-mandated
   6508          default value (0b01 repeating, 0x55 etc) as that'll probably
   6509          trump all legitimate otags via Max32, and it's pretty
   6510          obviously bogus. */
   6511    }
   6512    /* no need to mess with any annotations.  This call accesses
   6513       neither guest state nor guest memory. */
   6514    stmt( 'B', mce, IRStmt_Dirty(di) );
   6515    if (mce->hWordTy == Ity_I64) {
   6516       /* 64-bit host */
   6517       IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
   6518       assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
   6519       return mkexpr(bTmp32);
   6520    } else {
   6521       /* 32-bit host */
   6522       return mkexpr(bTmp);
   6523    }
   6524 }
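
         /* Illustrative sketch (assuming a 64-bit host and szB == 4): the
            code above emits roughly

               ea     = Add64(baseaddr, offset)        (only if offset != 0)
               bTmp   = DIRTY [guard] MC_(helperc_b_load4)(ea)
               bTmp32 = 64to32(bTmp)

            and returns bTmp32, i.e. the otag is fetched by a (possibly
            guarded) dirty helper call and then narrowed to Ity_I32. */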
   6525 
   6526 
   6527 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
   6528    loaded size is SZB.  The load is regarded as unconditional (always
   6529    happens).
   6530 */
   6531 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
   6532                             Int offset )
   6533 {
   6534    return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
   6535 }
   6536 
   6537 
   6538 /* The most general handler for guarded origin loads.  A GUARD of NULL
   6539    is assumed to mean "always True".
   6540 
   6541    Generate IR to do a shadow origin load from ADDR+BIAS and return
   6542    the B bits.  The loaded type is TY.  If GUARD evaluates to False at
   6543    run time then the returned B bits are simply BALT instead.
   6544 */
   6545 static
   6546 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
   6547                                         IRType ty,
   6548                                         IRAtom* addr, UInt bias,
   6549                                         IRAtom* guard, IRAtom* balt )
   6550 {
   6551    /* If the guard evaluates to True, this will hold the loaded
   6552       origin.  If the guard evaluates to False, this will be zero,
   6553       meaning "unknown origin", in which case we will have to replace
   6554       it using an ITE below. */
   6555    IRAtom* iftrue
   6556       = assignNew('B', mce, Ity_I32,
   6557                   gen_guarded_load_b(mce, sizeofIRType(ty),
   6558                                      addr, bias, guard));
   6559    /* These are the bits we will return if the load doesn't take
   6560       place. */
   6561    IRAtom* iffalse
   6562       = balt;
   6563    /* Prepare the cond for the ITE.  Convert a NULL cond into
   6564       something that iropt knows how to fold out later. */
   6565    IRAtom* cond
   6566       = guard == NULL  ? mkU1(1)  : guard;
   6567    /* And assemble the final result. */
   6568    return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
   6569 }
   6570 
   6571 
   6572 /* Generate a shadow origins store.  guard :: Ity_I1 controls whether
   6573    the store really happens; NULL means it unconditionally does. */
   6574 static void gen_store_b ( MCEnv* mce, Int szB,
   6575                           IRAtom* baseaddr, Int offset, IRAtom* dataB,
   6576                           IRAtom* guard )
   6577 {
   6578    void*    hFun;
   6579    const HChar* hName;
   6580    IRDirty* di;
   6581    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6582    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6583    IRAtom*  ea    = baseaddr;
   6584    if (guard) {
   6585       tl_assert(isOriginalAtom(mce, guard));
   6586       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   6587    }
   6588    if (offset != 0) {
   6589       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6590                                    : mkU64( (Long)(Int)offset );
   6591       ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
   6592    }
   6593    if (mce->hWordTy == Ity_I64)
   6594       dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
   6595 
   6596    switch (szB) {
   6597       case 1: hFun  = (void*)&MC_(helperc_b_store1);
   6598               hName = "MC_(helperc_b_store1)";
   6599               break;
   6600       case 2: hFun  = (void*)&MC_(helperc_b_store2);
   6601               hName = "MC_(helperc_b_store2)";
   6602               break;
   6603       case 4: hFun  = (void*)&MC_(helperc_b_store4);
   6604               hName = "MC_(helperc_b_store4)";
   6605               break;
   6606       case 8: hFun  = (void*)&MC_(helperc_b_store8);
   6607               hName = "MC_(helperc_b_store8)";
   6608               break;
   6609       case 16: hFun  = (void*)&MC_(helperc_b_store16);
   6610                hName = "MC_(helperc_b_store16)";
   6611                break;
   6612       case 32: hFun  = (void*)&MC_(helperc_b_store32);
   6613                hName = "MC_(helperc_b_store32)";
   6614                break;
   6615       default:
   6616          tl_assert(0);
   6617    }
   6618    di = unsafeIRDirty_0_N( 2/*regparms*/,
   6619            hName, VG_(fnptr_to_fnentry)( hFun ),
   6620            mkIRExprVec_2( ea, dataB )
   6621         );
   6622    /* no need to mess with any annotations.  This call accesses
   6623       neither guest state nor guest memory. */
   6624    if (guard) di->guard = guard;
   6625    stmt( 'B', mce, IRStmt_Dirty(di) );
   6626 }
   6627 
   6628 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
   6629    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   6630    if (eTy == Ity_I64)
   6631       return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
   6632    if (eTy == Ity_I32)
   6633       return e;
   6634    tl_assert(0);
   6635 }
   6636 
   6637 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
   6638    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   6639    tl_assert(eTy == Ity_I32);
   6640    if (dstTy == Ity_I64)
   6641       return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
   6642    tl_assert(0);
   6643 }
   6644 
   6645 
   6646 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
   6647 {
   6648    tl_assert(MC_(clo_mc_level) == 3);
   6649 
   6650    switch (e->tag) {
   6651 
   6652       case Iex_GetI: {
   6653          IRRegArray* descr_b;
   6654          IRAtom      *t1, *t2, *t3, *t4;
   6655          IRRegArray* descr      = e->Iex.GetI.descr;
   6656          IRType equivIntTy
   6657             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   6658          /* If this array is unshadowable for whatever reason, use the
   6659             usual approximation. */
   6660          if (equivIntTy == Ity_INVALID)
   6661             return mkU32(0);
   6662          tl_assert(sizeofIRType(equivIntTy) >= 4);
   6663          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   6664          descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   6665                                  equivIntTy, descr->nElems );
   6666          /* Do a shadow indexed get of the same size, giving t1.  Take
   6667             the bottom 32 bits of it, giving t2.  Compute into t3 the
   6668             origin for the index (almost certainly zero, but there's
   6669             no harm in being completely general here, since iropt will
   6670             remove any useless code), and fold it in, giving a final
   6671             value t4. */
   6672          t1 = assignNew( 'B', mce, equivIntTy,
   6673                           IRExpr_GetI( descr_b, e->Iex.GetI.ix,
   6674                                                 e->Iex.GetI.bias ));
   6675          t2 = narrowTo32( mce, t1 );
   6676          t3 = schemeE( mce, e->Iex.GetI.ix );
   6677          t4 = gen_maxU32( mce, t2, t3 );
   6678          return t4;
   6679       }
   6680       case Iex_CCall: {
   6681          Int i;
   6682          IRAtom*  here;
   6683          IRExpr** args = e->Iex.CCall.args;
   6684          IRAtom*  curr = mkU32(0);
   6685          for (i = 0; args[i]; i++) {
   6686             tl_assert(i < 32);
   6687             tl_assert(isOriginalAtom(mce, args[i]));
   6688             /* Only take notice of this arg if the callee's
   6689                mc-exclusion mask does not say it is to be excluded. */
   6690             if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
   6691                /* the arg is to be excluded from definedness checking.
   6692                   Do nothing. */
   6693                if (0) VG_(printf)("excluding %s(%d)\n",
   6694                                   e->Iex.CCall.cee->name, i);
   6695             } else {
   6696                /* calculate the arg's definedness, and pessimistically
   6697                   merge it in. */
   6698                here = schemeE( mce, args[i] );
   6699                curr = gen_maxU32( mce, curr, here );
   6700             }
   6701          }
   6702          return curr;
   6703       }
   6704       case Iex_Load: {
   6705          Int dszB;
   6706          dszB = sizeofIRType(e->Iex.Load.ty);
   6707          /* assert that the B value for the address is already
   6708             available (somewhere) */
   6709          tl_assert(isIRAtom(e->Iex.Load.addr));
   6710          tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
   6711          return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
   6712       }
   6713       case Iex_ITE: {
   6714          IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
   6715          IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
   6716          IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
   6717          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
   6718       }
   6719       case Iex_Qop: {
   6720          IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
   6721          IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
   6722          IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
   6723          IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
   6724          return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
   6725                                  gen_maxU32( mce, b3, b4 ) );
   6726       }
   6727       case Iex_Triop: {
   6728          IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
   6729          IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
   6730          IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
   6731          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
   6732       }
   6733       case Iex_Binop: {
   6734          switch (e->Iex.Binop.op) {
   6735             case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   6736             case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   6737             case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   6738             case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   6739                /* Just say these all produce a defined result,
   6740                   regardless of their arguments.  See
   6741                   COMMENT_ON_CasCmpEQ in this file. */
   6742                return mkU32(0);
   6743             default: {
   6744                IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
   6745                IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
   6746                return gen_maxU32( mce, b1, b2 );
   6747             }
   6748          }
   6749          tl_assert(0);
   6750          /*NOTREACHED*/
   6751       }
   6752       case Iex_Unop: {
   6753          IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
   6754          return b1;
   6755       }
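              /* Note: in all of the Qop/Triop/Binop/Unop cases above
                 (the CasCmp special case aside), the origin of the
                 result is just the maxU32 join of the origins of the
                 arguments.  For example, for Add32(t7,t9) the result's
                 origin is gen_maxU32(mce, B(t7), B(t9)), writing B(t)
                 for the origin computed by schemeE for t. */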
   6756       case Iex_Const:
   6757          return mkU32(0);
   6758       case Iex_RdTmp:
   6759          return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
   6760       case Iex_Get: {
   6761          Int b_offset = MC_(get_otrack_shadow_offset)(
   6762                            e->Iex.Get.offset,
   6763                            sizeofIRType(e->Iex.Get.ty)
   6764                         );
   6765          tl_assert(b_offset >= -1
   6766                    && b_offset <= mce->layout->total_sizeB -4);
   6767          if (b_offset >= 0) {
   6768             /* FIXME: this isn't an atom! */
   6769             return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
   6770                                Ity_I32 );
   6771          }
   6772          return mkU32(0);
   6773       }
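              /* Layout reminder: the origin (B) shadow of the guest
                 state lives at 2 * mce->layout->total_sizeB past the
                 start of the guest state, after the guest state itself
                 and its V-bits shadow.  So, as a made-up example, if
                 total_sizeB were 1000 and get_otrack_shadow_offset
                 returned 64, the Get above would read the 32-bit
                 origin stored at offset 2064. */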
   6774       default:
   6775          VG_(printf)("mc_translate.c: schemeE: unhandled: ");
   6776          ppIRExpr(e);
   6777          VG_(tool_panic)("memcheck:schemeE");
   6778    }
   6779 }
   6780 
   6781 
   6782 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
   6783 {
   6784    /* This is a hacked version of do_shadow_Dirty. */
   6785    Int       i, k, n, toDo, gSz, gOff;
   6786    IRAtom    *here, *curr;
   6787    IRTemp    dst;
   6788 
   6789    /* First check the guard. */
   6790    curr = schemeE( mce, d->guard );
   6791 
   6792    /* Now round up all inputs and maxU32 over them. */
   6793 
   6794    /* Inputs: unmasked args
   6795       Note: arguments are evaluated REGARDLESS of the guard expression */
   6796    for (i = 0; d->args[i]; i++) {
   6797       IRAtom* arg = d->args[i];
   6798       if ( (d->cee->mcx_mask & (1<<i))
   6799            || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
   6800          /* ignore this arg */
   6801       } else {
   6802          here = schemeE( mce, arg );
   6803          curr = gen_maxU32( mce, curr, here );
   6804       }
   6805    }
   6806 
   6807    /* Inputs: guest state that we read. */
   6808    for (i = 0; i < d->nFxState; i++) {
   6809       tl_assert(d->fxState[i].fx != Ifx_None);
   6810       if (d->fxState[i].fx == Ifx_Write)
   6811          continue;
   6812 
   6813       /* Enumerate the described state segments */
   6814       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   6815          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   6816          gSz  = d->fxState[i].size;
   6817 
   6818          /* Ignore any sections marked as 'always defined'. */
   6819          if (isAlwaysDefd(mce, gOff, gSz)) {
   6820             if (0)
   6821                VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   6822                            gOff, gSz);
   6823             continue;
   6824          }
   6825 
   6826          /* This state element is read or modified.  So we need to
   6827             consider it.  If larger than 4 bytes, deal with it in
   6828             4-byte chunks. */
   6829          while (True) {
   6830             Int b_offset;
   6831             tl_assert(gSz >= 0);
   6832             if (gSz == 0) break;
   6833             n = gSz <= 4 ? gSz : 4;
   6834             /* update 'curr' with maxU32 of the state slice
   6835                gOff .. gOff+n-1 */
   6836             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   6837             if (b_offset != -1) {
   6838                /* Observe the guard expression.  If it is false, use 0,
   6839                   i.e. nothing is known about the origin. */
   6840                IRAtom *cond, *iffalse, *iftrue;
   6841 
   6842                cond = assignNew( 'B', mce, Ity_I1, d->guard);
   6843                iffalse = mkU32(0);
   6844                iftrue  = assignNew( 'B', mce, Ity_I32,
   6845                                     IRExpr_Get(b_offset
   6846                                                  + 2*mce->layout->total_sizeB,
   6847                                                Ity_I32));
   6848                here = assignNew( 'B', mce, Ity_I32,
   6849                                  IRExpr_ITE(cond, iftrue, iffalse));
   6850                curr = gen_maxU32( mce, curr, here );
   6851             }
   6852             gSz -= n;
   6853             gOff += n;
   6854          }
   6855       }
   6856    }
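           /* As an illustration of the loop above: an Ifx_Read entry
              with offset 100, size 16 and nRepeats == 0 is visited as
              four 4-byte slices at guest offsets 100, 104, 108 and
              112.  For each slice that is origin-tracked (b_offset !=
              -1), the 32-bit origin currently in the shadow area is
              folded into 'curr' with gen_maxU32, but only when the
              guard holds; otherwise the ITE contributes zero (no
              origin). */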
   6857 
   6858    /* Inputs: memory */
   6859 
   6860    if (d->mFx != Ifx_None) {
   6861       /* Because we may do multiple shadow loads/stores from the same
   6862          base address, it's best to fetch the origin of that address
   6863          just once, right now.  Post-instrumentation optimisation
   6864          should remove all but this lookup. */
   6865       tl_assert(d->mAddr);
   6866       here = schemeE( mce, d->mAddr );
   6867       curr = gen_maxU32( mce, curr, here );
   6868    }
   6869 
   6870    /* Deal with memory inputs (reads or modifies) */
   6871    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   6872       toDo   = d->mSize;
   6873       /* chew off 32-bit chunks.  We don't care about the endianness
   6874          since it's all going to be condensed down to a single 32-bit
   6875          origin value anyway, but nevertheless choose an endianness
   6876          which is hopefully native to the platform. */
   6877       while (toDo >= 4) {
   6878          here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
   6879                                     d->guard );
   6880          curr = gen_maxU32( mce, curr, here );
   6881          toDo -= 4;
   6882       }
   6883       /* handle possible 16-bit excess */
   6884       while (toDo >= 2) {
   6885          here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
   6886                                     d->guard );
   6887          curr = gen_maxU32( mce, curr, here );
   6888          toDo -= 2;
   6889       }
   6890       /* chew off the remaining 8-bit chunk, if any */
   6891       if (toDo == 1) {
   6892          here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
   6893                                     d->guard );
   6894          curr = gen_maxU32( mce, curr, here );
   6895          toDo -= 1;
   6896       }
   6897       tl_assert(toDo == 0);
   6898    }
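           /* For instance, with d->mSize == 7 the loops above issue a
              4-byte guarded origin load at offset 0, a 2-byte load at
              offset 4 and a 1-byte load at offset 6 from d->mAddr,
              folding each result into 'curr' via gen_maxU32. */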
   6899 
   6900    /* Whew!  So curr is a 32-bit B-value which should give an origin
   6901       of some use if any of the inputs to the helper are undefined.
   6902       Now we need to re-distribute the results to all destinations. */
   6903 
   6904    /* Outputs: the destination temporary, if there is one. */
   6905    if (d->tmp != IRTemp_INVALID) {
   6906       dst   = findShadowTmpB(mce, d->tmp);
   6907       assign( 'B', mce, dst, curr );
   6908    }
   6909 
   6910    /* Outputs: guest state that we write or modify. */
   6911    for (i = 0; i < d->nFxState; i++) {
   6912       tl_assert(d->fxState[i].fx != Ifx_None);
   6913       if (d->fxState[i].fx == Ifx_Read)
   6914          continue;
   6915 
   6916       /* Enumerate the described state segments */
   6917       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   6918          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   6919          gSz  = d->fxState[i].size;
   6920 
   6921          /* Ignore any sections marked as 'always defined'. */
   6922          if (isAlwaysDefd(mce, gOff, gSz))
   6923             continue;
   6924 
   6925          /* This state element is written or modified.  So we need to
   6926             consider it.  If larger than 4 bytes, deal with it in
   6927             4-byte chunks. */
   6928          while (True) {
   6929             Int b_offset;
   6930             tl_assert(gSz >= 0);
   6931             if (gSz == 0) break;
   6932             n = gSz <= 4 ? gSz : 4;
   6933             /* Write 'curr' to the state slice gOff .. gOff+n-1 */
   6934             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   6935             if (b_offset != -1) {
   6936 
   6937                /* If the guard expression evaluates to false, we simply
   6938                   Put the value already stored in the guest state slot. */
   6939                IRAtom *cond, *iffalse;
   6940 
   6941                cond    = assignNew('B', mce, Ity_I1,
   6942                                    d->guard);
   6943                iffalse = assignNew('B', mce, Ity_I32,
   6944                                    IRExpr_Get(b_offset +
   6945                                               2*mce->layout->total_sizeB,
   6946                                               Ity_I32));
   6947                curr = assignNew('B', mce, Ity_I32,
   6948                                 IRExpr_ITE(cond, curr, iffalse));
   6949 
   6950                stmt( 'B', mce, IRStmt_Put(b_offset
   6951                                           + 2*mce->layout->total_sizeB,
   6952                                           curr ));
   6953             }
   6954             gSz -= n;
   6955             gOff += n;
   6956          }
   6957       }
   6958    }
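           /* The net effect, per tracked 4-byte slice, is schematically

                 PUT( b_offset + 2*total_sizeB,
                      ITE(guard, curr, GET(b_offset + 2*total_sizeB)) )

              so the slice's origin is overwritten with 'curr' only when
              the guard evaluates to true; otherwise its existing origin
              is preserved. */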
   6959 
   6960    /* Outputs: memory that we write or modify.  Same comments about
   6961       endianness as above apply. */
   6962    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   6963       toDo   = d->mSize;
   6964       /* chew off 32-bit chunks */
   6965       while (toDo >= 4) {
   6966          gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
   6967                       d->guard );
   6968          toDo -= 4;
   6969       }
   6970       /* handle possible 16-bit excess */
   6971       while (toDo >= 2) {
   6972          gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
   6973                       d->guard );
   6974          toDo -= 2;
   6975       }
   6976       /* chew off the remaining 8-bit chunk, if any */
   6977       if (toDo == 1) {
   6978          gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
   6979                       d->guard );
   6980          toDo -= 1;
   6981       }
   6982       tl_assert(toDo == 0);
   6983    }
   6984 }
   6985 
   6986 
   6987 /* Generate IR for origin shadowing for a general guarded store. */
   6988 static void do_origins_Store_guarded ( MCEnv* mce,
   6989                                        IREndness stEnd,
   6990                                        IRExpr* stAddr,
   6991                                        IRExpr* stData,
   6992                                        IRExpr* guard )
   6993 {
   6994    Int     dszB;
   6995    IRAtom* dataB;
   6996    /* assert that the B value for the address is already available
   6997       (somewhere), since the call to schemeE will want to see it.
   6998       XXXX how does this actually ensure that?? */
   6999    tl_assert(isIRAtom(stAddr));
   7000    tl_assert(isIRAtom(stData));
   7001    dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
   7002    dataB = schemeE( mce, stData );
   7003    gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
   7004 }
   7005 
   7006 
   7007 /* Generate IR for origin shadowing for a plain store. */
   7008 static void do_origins_Store_plain ( MCEnv* mce,
   7009                                      IREndness stEnd,
   7010                                      IRExpr* stAddr,
   7011                                      IRExpr* stData )
   7012 {
   7013    do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
   7014                               NULL/*guard*/ );
   7015 }
   7016 
   7017 
   7018 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   7019 
   7020 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
   7021 {
   7022    do_origins_Store_guarded( mce, sg->end, sg->addr,
   7023                              sg->data, sg->guard );
   7024 }
   7025 
   7026 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
   7027 {
   7028    IRType loadedTy = Ity_INVALID;
   7029    switch (lg->cvt) {
   7030       case ILGop_Ident32: loadedTy = Ity_I32; break;
   7031       case ILGop_16Uto32: loadedTy = Ity_I16; break;
   7032       case ILGop_16Sto32: loadedTy = Ity_I16; break;
   7033       case ILGop_8Uto32:  loadedTy = Ity_I8;  break;
   7034       case ILGop_8Sto32:  loadedTy = Ity_I8;  break;
   7035       default: VG_(tool_panic)("schemeS.IRLoadG");
   7036    }
   7037    IRAtom* ori_alt
   7038       = schemeE( mce, lg->alt );
   7039    IRAtom* ori_final
   7040       = expr2ori_Load_guarded_General(mce, loadedTy,
   7041                                       lg->addr, 0/*addr bias*/,
   7042                                       lg->guard, ori_alt );
   7043    /* And finally, bind the origin to the destination temporary. */
   7044    assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
   7045 }
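        /* As an illustration: for a conditional widening load of the form
           "t = if (guard) then 8Uto32(LD(addr)) else alt", loadedTy is
           Ity_I8, ori_alt is the origin of 'alt', and the origin bound to
           't' is (roughly) the origin of the byte at 'addr' when the
           guard is true, and the origin of 'alt' otherwise, as built by
           expr2ori_Load_guarded_General. */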
   7046 
   7047 
   7048 static void schemeS ( MCEnv* mce, IRStmt* st )
   7049 {
   7050    tl_assert(MC_(clo_mc_level) == 3);
   7051 
   7052    switch (st->tag) {
   7053 
   7054       case Ist_AbiHint:
   7055          /* The value-check instrumenter handles this - by arranging
   7056             to pass the address of the next instruction to
   7057             MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
   7058             happen for origin tracking w.r.t. AbiHints.  So there is
   7059             nothing to do here. */
   7060          break;
   7061 
   7062       case Ist_PutI: {
   7063          IRPutI *puti = st->Ist.PutI.details;
   7064          IRRegArray* descr_b;
   7065          IRAtom      *t1, *t2, *t3, *t4;
   7066          IRRegArray* descr = puti->descr;
   7067          IRType equivIntTy
   7068             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   7069          /* If this array is unshadowable for whatever reason,
   7070             generate no code. */
   7071          if (equivIntTy == Ity_INVALID)
   7072             break;
   7073          tl_assert(sizeofIRType(equivIntTy) >= 4);
   7074          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   7075          descr_b
   7076             = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   7077                             equivIntTy, descr->nElems );
   7078          /* Compute a value to Put: the maxU32 join of the origin of
   7079             the data being Put (obviously) and the origin of the index
   7080             value (not so obviously). */
   7081          t1 = schemeE( mce, puti->data );
   7082          t2 = schemeE( mce, puti->ix );
   7083          t3 = gen_maxU32( mce, t1, t2 );
   7084          t4 = zWidenFrom32( mce, equivIntTy, t3 );
   7085          stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
   7086                                                puti->bias, t4) ));
   7087          break;
   7088       }
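              /* Schematically, the PutI case above emits

                    t3 = maxU32( schemeE(data), schemeE(ix) )
                    PutI( descr_b, ix, bias, zWidenFrom32(equivIntTy, t3) )

                 where descr_b describes the same register array,
                 relocated into the origin-shadow area at
                 base + 2*total_sizeB. */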
   7089 
   7090       case Ist_Dirty:
   7091          do_origins_Dirty( mce, st->Ist.Dirty.details );
   7092          break;
   7093 
   7094       case Ist_Store:
   7095          do_origins_Store_plain( mce, st->Ist.Store.end,
   7096                                       st->Ist.Store.addr,
   7097                                       st->Ist.Store.data );
   7098          break;
   7099 
   7100       case Ist_StoreG:
   7101          do_origins_StoreG( mce, st->Ist.StoreG.details );
   7102          break;
   7103 
   7104       case Ist_LoadG:
   7105          do_origins_LoadG( mce, st->Ist.LoadG.details );
   7106          break;
   7107 
   7108       case Ist_LLSC: {
   7109          /* In short: treat a load-linked like a normal load followed
   7110             by an assignment of the loaded (shadow) data to the result
   7111             temporary.  Treat a store-conditional like a normal store,
   7112             and mark the result temporary as defined. */
   7113          if (st->Ist.LLSC.storedata == NULL) {
   7114             /* Load Linked */
   7115             IRType resTy
   7116                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
   7117             IRExpr* vanillaLoad
   7118                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
   7119             tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   7120                       || resTy == Ity_I16 || resTy == Ity_I8);
   7121             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7122                               schemeE(mce, vanillaLoad));
   7123          } else {
   7124             /* Store conditional */
   7125             do_origins_Store_plain( mce, st->Ist.LLSC.end,
   7126                                     st->Ist.LLSC.addr,
   7127                                     st->Ist.LLSC.storedata );
   7128             /* For the rationale behind this, see comments at the
   7129                place where the V-shadow for .result is constructed, in
   7130                do_shadow_LLSC.  In short, we regard .result as
   7131                always-defined. */
   7132             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7133                               mkU32(0) );
   7134          }
   7135          break;
   7136       }
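              /* So, for example, a 32-bit load-linked "t = LL(addr)"
                 gets B(t) = schemeE of an ordinary I32 load from addr,
                 exactly as a normal load would, whereas a
                 store-conditional "t = SC(addr, data)" stores the
                 origin of 'data' via do_origins_Store_plain and then
                 sets B(t) = 0, i.e. the success/failure result is
                 treated as always defined. */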
   7137 
   7138       case Ist_Put: {
   7139          Int b_offset
   7140             = MC_(get_otrack_shadow_offset)(
   7141                  st->Ist.Put.offset,
   7142                  sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
   7143               );
   7144          if (b_offset >= 0) {
   7145             /* FIXME: this isn't an atom! */
   7146             stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
   7147                                        schemeE( mce, st->Ist.Put.data )) );
   7148          }
   7149          break;
   7150       }
   7151 
   7152       case Ist_WrTmp:
   7153          assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
   7154                            schemeE(mce, st->Ist.WrTmp.data) );
   7155          break;
   7156 
   7157       case Ist_MBE:
   7158       case Ist_NoOp:
   7159       case Ist_Exit:
   7160       case Ist_IMark:
   7161          break;
   7162 
   7163       default:
   7164          VG_(printf)("mc_translate.c: schemeS: unhandled: ");
   7165          ppIRStmt(st);
   7166          VG_(tool_panic)("memcheck:schemeS");
   7167    }
   7168 }
   7169 
   7170 
   7171 /*--------------------------------------------------------------------*/
   7172 /*--- end                                           mc_translate.c ---*/
   7173 /*--------------------------------------------------------------------*/
   7174