      1 
      2 /*--------------------------------------------------------------------*/
      3 /*--- Instrument IR to perform memory checking operations.         ---*/
      4 /*---                                               mc_translate.c ---*/
      5 /*--------------------------------------------------------------------*/
      6 
      7 /*
      8    This file is part of MemCheck, a heavyweight Valgrind tool for
      9    detecting memory errors.
     10 
     11    Copyright (C) 2000-2015 Julian Seward
     12       jseward (at) acm.org
     13 
     14    This program is free software; you can redistribute it and/or
     15    modify it under the terms of the GNU General Public License as
     16    published by the Free Software Foundation; either version 2 of the
     17    License, or (at your option) any later version.
     18 
     19    This program is distributed in the hope that it will be useful, but
     20    WITHOUT ANY WARRANTY; without even the implied warranty of
     21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     22    General Public License for more details.
     23 
     24    You should have received a copy of the GNU General Public License
     25    along with this program; if not, write to the Free Software
     26    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
     27    02111-1307, USA.
     28 
     29    The GNU General Public License is contained in the file COPYING.
     30 */
     31 
     32 #include "pub_tool_basics.h"
     33 #include "pub_tool_poolalloc.h"     // For mc_include.h
     34 #include "pub_tool_hashtable.h"     // For mc_include.h
     35 #include "pub_tool_libcassert.h"
     36 #include "pub_tool_libcprint.h"
     37 #include "pub_tool_tooliface.h"
     38 #include "pub_tool_machine.h"     // VG_(fnptr_to_fnentry)
     39 #include "pub_tool_xarray.h"
     40 #include "pub_tool_mallocfree.h"
     41 #include "pub_tool_libcbase.h"
     42 
     43 #include "mc_include.h"
     44 
     45 
     46 /* FIXMEs JRS 2011-June-16.
     47 
     48    Check the interpretation for vector narrowing and widening ops,
     49    particularly the saturating ones.  I suspect they are either overly
     50    pessimistic and/or wrong.
     51 
     52    Iop_QandSQsh64x2 and friends (vector-by-vector bidirectional
     53    saturating shifts): the interpretation is overly pessimistic.
     54    See comments on the relevant cases below for details.
     55 
     56    Iop_Sh64Sx2 and friends (vector-by-vector bidirectional shifts,
     57    both rounding and non-rounding variants): ditto
     58 */
     59 
     60 /* This file implements the Memcheck instrumentation, and in
     61    particular contains the core of its undefined value detection
     62    machinery.  For a comprehensive background of the terminology,
     63    algorithms and rationale used herein, read:
     64 
     65      Using Valgrind to detect undefined value errors with
     66      bit-precision
     67 
     68      Julian Seward and Nicholas Nethercote
     69 
     70      2005 USENIX Annual Technical Conference (General Track),
     71      Anaheim, CA, USA, April 10-15, 2005.
     72 
     73    ----
     74 
     75    Here is as good a place as any to record exactly when V bits are and
     76    should be checked, why, and what function is responsible.
     77 
     78 
     79    Memcheck complains when an undefined value is used:
     80 
     81    1. In the condition of a conditional branch.  Because it could cause
     82       incorrect control flow, and thus cause incorrect externally-visible
     83       behaviour.  [mc_translate.c:complainIfUndefined]
     84 
     85    2. As an argument to a system call, or as the value that specifies
     86       the system call number.  Because it could cause an incorrect
     87       externally-visible side effect.  [mc_translate.c:mc_pre_reg_read]
     88 
     89    3. As the address in a load or store.  Because it could cause an
     90       incorrect value to be used later, which could cause externally-visible
     91       behaviour (eg. via incorrect control flow or an incorrect system call
     92       argument)  [complainIfUndefined]
     93 
     94    4. As the target address of a branch.  Because it could cause incorrect
     95       control flow.  [complainIfUndefined]
     96 
     97    5. As an argument to setenv, unsetenv, or putenv.  Because it could put
     98       an incorrect value into the external environment.
     99       [mc_replace_strmem.c:VG_WRAP_FUNCTION_ZU(*, *env)]
    100 
    101    6. As the index in a GETI or PUTI operation.  I'm not sure why... (njn).
    102       [complainIfUndefined]
    103 
    104    7. As an argument to the VALGRIND_CHECK_MEM_IS_DEFINED and
    105       VALGRIND_CHECK_VALUE_IS_DEFINED client requests.  Because the user
    106       requested it.  [in memcheck.h]
    107 
    108 
    109    Memcheck also complains, but should not, when an undefined value is used:
    110 
    111    8. As the shift value in certain SIMD shift operations (but not in the
    112       standard integer shift operations).  This inconsistency is due to
    113       historical reasons.  [complainIfUndefined]
    114 
    115 
    116    Memcheck does not complain, but should, when an undefined value is used:
    117 
    118    9. As an input to a client request.  Because the client request may
    119       affect the visible behaviour -- see bug #144362 for an example
    120       involving the malloc replacements in vg_replace_malloc.c and
    121       VALGRIND_NON_SIMD_CALL* requests, where an uninitialised argument
    122       isn't identified.  That bug report also has some info on how to solve
    123       the problem.  [valgrind.h:VALGRIND_DO_CLIENT_REQUEST]
    124 
    125 
    126    In practice, 1 and 2 account for the vast majority of cases.
    127 */
    128 
    129 /* Generation of addr-definedness, addr-validity and
    130    guard-definedness checks pertaining to loads and stores (Iex_Load,
    131    Ist_Store, IRLoadG, IRStoreG, LLSC, CAS and Dirty memory
    132    loads/stores) was re-checked 11 May 2013. */
    133 
    134 /*------------------------------------------------------------*/
    135 /*--- Forward decls                                        ---*/
    136 /*------------------------------------------------------------*/
    137 
    138 struct _MCEnv;
    139 
    140 static IRType  shadowTypeV ( IRType ty );
    141 static IRExpr* expr2vbits ( struct _MCEnv* mce, IRExpr* e );
    142 static IRTemp  findShadowTmpB ( struct _MCEnv* mce, IRTemp orig );
    143 
    144 static IRExpr *i128_const_zero(void);
    145 
    146 /*------------------------------------------------------------*/
    147 /*--- Memcheck running state, and tmp management.          ---*/
    148 /*------------------------------------------------------------*/
    149 
    150 /* Carries info about a particular tmp.  The tmp's number is not
    151    recorded, as this is implied by (equal to) its index in the tmpMap
    152    in MCEnv.  The tmp's type is also not recorded, as this is present
    153    in MCEnv.sb->tyenv.
    154 
    155    When .kind is Orig, .shadowV and .shadowB may give the identities
    156    of the temps currently holding the associated definedness (shadowV)
    157    and origin (shadowB) values, or these may be IRTemp_INVALID if code
    158    to compute such values has not yet been emitted.
    159 
    160    When .kind is VSh or BSh then the tmp holds a V- or B- value,
    161    and so .shadowV and .shadowB must be IRTemp_INVALID, since it is
    162    illogical for a shadow tmp itself to be shadowed.
    163 */
    164 typedef
    165    enum { Orig=1, VSh=2, BSh=3 }
    166    TempKind;
    167 
    168 typedef
    169    struct {
    170       TempKind kind;
    171       IRTemp   shadowV;
    172       IRTemp   shadowB;
    173    }
    174    TempMapEnt;
    175 
    176 
    177 /* Carries around state during memcheck instrumentation. */
    178 typedef
    179    struct _MCEnv {
    180       /* MODIFIED: the superblock being constructed.  IRStmts are
    181          added. */
    182       IRSB* sb;
    183       Bool  trace;
    184 
    185       /* MODIFIED: a table [0 .. #temps_in_sb-1] which gives the
    186          current kind and possibly shadow temps for each temp in the
    187          IRSB being constructed.  Note that it does not contain the
    188          type of each tmp.  If you want to know the type, look at the
    189          relevant entry in sb->tyenv.  It follows that at all times
    190          during the instrumentation process, the valid indices for
    191          tmpMap and sb->tyenv are identical, being 0 .. N-1 where N is
    192          total number of Orig, V- and B- temps allocated so far.
    193 
    194          The reason for this strange split (types in one place, all
    195          other info in another) is that we need the types to be
    196          attached to sb so as to make it possible to do
    197          "typeOfIRExpr(mce->sb->tyenv, ...)" at various places in the
    198          instrumentation process. */
    199       XArray* /* of TempMapEnt */ tmpMap;
    200 
    201       /* MODIFIED: indicates whether "bogus" literals have so far been
    202          found.  Starts off False, and may change to True. */
    203       Bool bogusLiterals;
    204 
    205       /* READONLY: indicates whether we should use expensive
    206          interpretations of integer adds, since unfortunately LLVM
    207          uses them to do ORs in some circumstances.  Defaulted to True
    208          on MacOS and False everywhere else. */
    209       Bool useLLVMworkarounds;
    210 
    211       /* READONLY: the guest layout.  This indicates which parts of
    212          the guest state should be regarded as 'always defined'. */
    213       const VexGuestLayout* layout;
    214 
    215       /* READONLY: the host word type.  Needed for constructing
    216          arguments of type 'HWord' to be passed to helper functions.
    217          Ity_I32 or Ity_I64 only. */
    218       IRType hWordTy;
    219    }
    220    MCEnv;
    221 
    222 /* SHADOW TMP MANAGEMENT.  Shadow tmps are allocated lazily (on
    223    demand), as they are encountered.  This is for two reasons.
    224 
    225    (1) (less important reason): Many original tmps are unused due to
    226    initial IR optimisation, and we do not want to waste space in tables
    227    tracking them.
    228 
    229    Shadow IRTemps are therefore allocated on demand.  mce.tmpMap is a
    230    table indexed [0 .. n_types-1], which gives the current shadow for
    231    each original tmp, or IRTemp_INVALID if none is so far assigned.
    232    It is necessary to support making multiple assignments to a shadow
    233    -- specifically, after testing a shadow for definedness, it needs
    234    to be made defined.  But IR's SSA property disallows this.
    235 
    236    (2) (more important reason): Therefore, when a shadow needs to get
    237    a new value, a new temporary is created, the value is assigned to
    238    that, and the tmpMap is updated to reflect the new binding.
    239 
    240    A corollary is that if the tmpMap maps a given tmp to
    241    IRTemp_INVALID and we are hoping to read that shadow tmp, it means
    242    there's a read-before-write error in the original tmps.  The IR
    243    sanity checker should catch all such anomalies, however.
    244 */
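
        /* Illustrative sketch of the rebinding (tmp numbers are made up):
           suppose orig tmp t5 is currently shadowed by V-tmp t17.  If the
           instrumenter later needs to force t5's shadow to "defined"
           (e.g. after a definedness check), it cannot reassign t17;
           instead newShadowTmpV allocates a fresh V-tmp, say t23, and
           rebinds tmpMap[t5].shadowV to it.  Subsequent lookups via
           findShadowTmpV then yield t23. */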
    245 
    246 /* Create a new IRTemp of type 'ty' and kind 'kind', and add it to
    247    both the table in mce->sb and to our auxiliary mapping.  Note that
    248    newTemp may cause mce->tmpMap to resize, hence previous results
    249    from VG_(indexXA)(mce->tmpMap) are invalidated. */
    250 static IRTemp newTemp ( MCEnv* mce, IRType ty, TempKind kind )
    251 {
    252    Word       newIx;
    253    TempMapEnt ent;
    254    IRTemp     tmp = newIRTemp(mce->sb->tyenv, ty);
    255    ent.kind    = kind;
    256    ent.shadowV = IRTemp_INVALID;
    257    ent.shadowB = IRTemp_INVALID;
    258    newIx = VG_(addToXA)( mce->tmpMap, &ent );
    259    tl_assert(newIx == (Word)tmp);
    260    return tmp;
    261 }
    262 
    263 
    264 /* Find the tmp currently shadowing the given original tmp.  If none
    265    so far exists, allocate one.  */
    266 static IRTemp findShadowTmpV ( MCEnv* mce, IRTemp orig )
    267 {
    268    TempMapEnt* ent;
    269    /* VG_(indexXA) range-checks 'orig', hence no need to check
    270       here. */
    271    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    272    tl_assert(ent->kind == Orig);
    273    if (ent->shadowV == IRTemp_INVALID) {
    274       IRTemp tmpV
    275         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
    276       /* newTemp may cause mce->tmpMap to resize, hence previous results
    277          from VG_(indexXA) are invalid. */
    278       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    279       tl_assert(ent->kind == Orig);
    280       tl_assert(ent->shadowV == IRTemp_INVALID);
    281       ent->shadowV = tmpV;
    282    }
    283    return ent->shadowV;
    284 }
    285 
    286 /* Allocate a new shadow for the given original tmp.  This means any
    287    previous shadow is abandoned.  This is needed because it is
    288    necessary to give a new value to a shadow once it has been tested
    289    for undefinedness, but unfortunately IR's SSA property disallows
    290    this.  Instead we must abandon the old shadow, allocate a new one
    291    and use that instead.
    292 
    293    This is the same as findShadowTmpV, except we don't bother to see
    294    if a shadow temp already existed -- we simply allocate a new one
    295    regardless. */
    296 static void newShadowTmpV ( MCEnv* mce, IRTemp orig )
    297 {
    298    TempMapEnt* ent;
    299    /* VG_(indexXA) range-checks 'orig', hence no need to check
    300       here. */
    301    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    302    tl_assert(ent->kind == Orig);
    303    if (1) {
    304       IRTemp tmpV
    305         = newTemp( mce, shadowTypeV(mce->sb->tyenv->types[orig]), VSh );
    306       /* newTemp may cause mce->tmpMap to resize, hence previous results
    307          from VG_(indexXA) are invalid. */
    308       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
    309       tl_assert(ent->kind == Orig);
    310       ent->shadowV = tmpV;
    311    }
    312 }
    313 
    314 
    315 /*------------------------------------------------------------*/
    316 /*--- IRAtoms -- a subset of IRExprs                       ---*/
    317 /*------------------------------------------------------------*/
    318 
    319 /* An atom is either an IRExpr_Const or an IRExpr_Tmp, as defined by
    320    isIRAtom() in libvex_ir.h.  Because this instrumenter expects flat
    321    input, most of this code deals in atoms.  Usefully, a value atom
    322    always has a V-value which is also an atom: constants are shadowed
    323    by constants, and temps are shadowed by the corresponding shadow
    324    temporary. */
    325 
    326 typedef  IRExpr  IRAtom;
    327 
    328 /* (used for sanity checks only): is this an atom which looks
    329    like it's from original code? */
    330 static Bool isOriginalAtom ( MCEnv* mce, IRAtom* a1 )
    331 {
    332    if (a1->tag == Iex_Const)
    333       return True;
    334    if (a1->tag == Iex_RdTmp) {
    335       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
    336       return ent->kind == Orig;
    337    }
    338    return False;
    339 }
    340 
    341 /* (used for sanity checks only): is this an atom which looks
    342    like it's from shadow code? */
    343 static Bool isShadowAtom ( MCEnv* mce, IRAtom* a1 )
    344 {
    345    if (a1->tag == Iex_Const)
    346       return True;
    347    if (a1->tag == Iex_RdTmp) {
    348       TempMapEnt* ent = VG_(indexXA)( mce->tmpMap, a1->Iex.RdTmp.tmp );
    349       return ent->kind == VSh || ent->kind == BSh;
    350    }
    351    return False;
    352 }
    353 
    354 /* (used for sanity checks only): check that both args are atoms and
    355    are identically-kinded. */
    356 static Bool sameKindedAtoms ( IRAtom* a1, IRAtom* a2 )
    357 {
    358    if (a1->tag == Iex_RdTmp && a2->tag == Iex_RdTmp)
    359       return True;
    360    if (a1->tag == Iex_Const && a2->tag == Iex_Const)
    361       return True;
    362    return False;
    363 }
    364 
    365 
    366 /*------------------------------------------------------------*/
    367 /*--- Type management                                      ---*/
    368 /*------------------------------------------------------------*/
    369 
    370 /* Shadow state is always accessed using integer types.  This returns
    371    an integer type with the same size (as per sizeofIRType) as the
    372    given type.  The only valid shadow types are Bit, I8, I16, I32,
    373    I64, I128, V128, V256. */
    374 
    375 static IRType shadowTypeV ( IRType ty )
    376 {
    377    switch (ty) {
    378       case Ity_I1:
    379       case Ity_I8:
    380       case Ity_I16:
    381       case Ity_I32:
    382       case Ity_I64:
    383       case Ity_I128: return ty;
    384       case Ity_F16:  return Ity_I16;
    385       case Ity_F32:  return Ity_I32;
    386       case Ity_D32:  return Ity_I32;
    387       case Ity_F64:  return Ity_I64;
    388       case Ity_D64:  return Ity_I64;
    389       case Ity_F128: return Ity_I128;
    390       case Ity_D128: return Ity_I128;
    391       case Ity_V128: return Ity_V128;
    392       case Ity_V256: return Ity_V256;
    393       default: ppIRType(ty);
    394                VG_(tool_panic)("memcheck:shadowTypeV");
    395    }
    396 }
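
        /* For example, shadowTypeV(Ity_F64) == Ity_I64: the V bits of a
           64-bit float are carried in an ordinary 64-bit integer shadow,
           while vector types shadow themselves (V128 -> V128, V256 -> V256). */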
    397 
    398 /* Produce a 'defined' value of the given shadow type.  Should only be
    399    supplied shadow types (Bit/I8/I16/I32/I64/I128/V128/V256). */
    400 static IRExpr* definedOfType ( IRType ty ) {
    401    switch (ty) {
    402       case Ity_I1:   return IRExpr_Const(IRConst_U1(False));
    403       case Ity_I8:   return IRExpr_Const(IRConst_U8(0));
    404       case Ity_I16:  return IRExpr_Const(IRConst_U16(0));
    405       case Ity_I32:  return IRExpr_Const(IRConst_U32(0));
    406       case Ity_I64:  return IRExpr_Const(IRConst_U64(0));
    407       case Ity_I128: return i128_const_zero();
    408       case Ity_V128: return IRExpr_Const(IRConst_V128(0x0000));
    409       case Ity_V256: return IRExpr_Const(IRConst_V256(0x00000000));
    410       default:       VG_(tool_panic)("memcheck:definedOfType");
    411    }
    412 }
    413 
    414 
    415 /*------------------------------------------------------------*/
    416 /*--- Constructing IR fragments                            ---*/
    417 /*------------------------------------------------------------*/
    418 
    419 /* add stmt to a bb */
    420 static inline void stmt ( HChar cat, MCEnv* mce, IRStmt* st ) {
    421    if (mce->trace) {
    422       VG_(printf)("  %c: ", cat);
    423       ppIRStmt(st);
    424       VG_(printf)("\n");
    425    }
    426    addStmtToIRSB(mce->sb, st);
    427 }
    428 
    429 /* assign value to tmp */
    430 static inline
    431 void assign ( HChar cat, MCEnv* mce, IRTemp tmp, IRExpr* expr ) {
    432    stmt(cat, mce, IRStmt_WrTmp(tmp,expr));
    433 }
    434 
    435 /* build various kinds of expressions */
    436 #define triop(_op, _arg1, _arg2, _arg3) \
    437                                  IRExpr_Triop((_op),(_arg1),(_arg2),(_arg3))
    438 #define binop(_op, _arg1, _arg2) IRExpr_Binop((_op),(_arg1),(_arg2))
    439 #define unop(_op, _arg)          IRExpr_Unop((_op),(_arg))
    440 #define mkU1(_n)                 IRExpr_Const(IRConst_U1(_n))
    441 #define mkU8(_n)                 IRExpr_Const(IRConst_U8(_n))
    442 #define mkU16(_n)                IRExpr_Const(IRConst_U16(_n))
    443 #define mkU32(_n)                IRExpr_Const(IRConst_U32(_n))
    444 #define mkU64(_n)                IRExpr_Const(IRConst_U64(_n))
    445 #define mkV128(_n)               IRExpr_Const(IRConst_V128(_n))
    446 #define mkexpr(_tmp)             IRExpr_RdTmp((_tmp))
    447 
    448 /* Bind the given expression to a new temporary, and return the
    449    temporary.  This effectively converts an arbitrary expression into
    450    an atom.
    451 
    452    'ty' is the type of 'e' and hence the type that the new temporary
    453    needs to be.  But passing it in is redundant, since we can deduce
    454    the type merely by inspecting 'e'.  So at least use that fact to
    455    assert that the two types agree. */
    456 static IRAtom* assignNew ( HChar cat, MCEnv* mce, IRType ty, IRExpr* e )
    457 {
    458    TempKind k;
    459    IRTemp   t;
    460    IRType   tyE = typeOfIRExpr(mce->sb->tyenv, e);
    461 
    462    tl_assert(tyE == ty); /* so 'ty' is redundant (!) */
    463    switch (cat) {
    464       case 'V': k = VSh;  break;
    465       case 'B': k = BSh;  break;
    466       case 'C': k = Orig; break;
    467                 /* happens when we are making up new "orig"
    468                    expressions, for IRCAS handling */
    469       default: tl_assert(0);
    470    }
    471    t = newTemp(mce, ty, k);
    472    assign(cat, mce, t, e);
    473    return mkexpr(t);
    474 }
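
        /* Typical use (a sketch, with hypothetical atom names): to OR two
           I32 shadow atoms and obtain the result as an atom, one writes
              IRAtom* vres = assignNew('V', mce, Ity_I32,
                                       binop(Iop_Or32, va1, va2));
           which emits "tN = Or32(va1,va2)" into mce->sb and returns
           RdTmp(tN), keeping the instrumented IR flat. */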
    475 
    476 
    477 /*------------------------------------------------------------*/
    478 /*--- Helper functions for 128-bit ops                     ---*/
    479 /*------------------------------------------------------------*/
    480 
    481 static IRExpr *i128_const_zero(void)
    482 {
    483    IRAtom* z64 = IRExpr_Const(IRConst_U64(0));
    484    return binop(Iop_64HLto128, z64, z64);
    485 }
    486 
    487 /* There are no I128-bit loads and/or stores [as generated by any
    488 /* There are no I128 loads or stores [as generated by any
    489    expr2vbits_Load */
    490 
    491 
    492 /*------------------------------------------------------------*/
    493 /*--- Constructing definedness primitive ops               ---*/
    494 /*------------------------------------------------------------*/
    495 
    496 /* --------- Defined-if-either-defined --------- */
    497 
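        /* Since a V bit of 0 means "defined" and 1 means "undefined",
           "defined if either defined" is simply bitwise AND of the two
           shadow values.  For example (made-up values), DifD8 of 0xF0 and
           0x0F gives 0x00: every bit is defined in at least one input. */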
    498 static IRAtom* mkDifD8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    499    tl_assert(isShadowAtom(mce,a1));
    500    tl_assert(isShadowAtom(mce,a2));
    501    return assignNew('V', mce, Ity_I8, binop(Iop_And8, a1, a2));
    502 }
    503 
    504 static IRAtom* mkDifD16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    505    tl_assert(isShadowAtom(mce,a1));
    506    tl_assert(isShadowAtom(mce,a2));
    507    return assignNew('V', mce, Ity_I16, binop(Iop_And16, a1, a2));
    508 }
    509 
    510 static IRAtom* mkDifD32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    511    tl_assert(isShadowAtom(mce,a1));
    512    tl_assert(isShadowAtom(mce,a2));
    513    return assignNew('V', mce, Ity_I32, binop(Iop_And32, a1, a2));
    514 }
    515 
    516 static IRAtom* mkDifD64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    517    tl_assert(isShadowAtom(mce,a1));
    518    tl_assert(isShadowAtom(mce,a2));
    519    return assignNew('V', mce, Ity_I64, binop(Iop_And64, a1, a2));
    520 }
    521 
    522 static IRAtom* mkDifDV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    523    tl_assert(isShadowAtom(mce,a1));
    524    tl_assert(isShadowAtom(mce,a2));
    525    return assignNew('V', mce, Ity_V128, binop(Iop_AndV128, a1, a2));
    526 }
    527 
    528 static IRAtom* mkDifDV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    529    tl_assert(isShadowAtom(mce,a1));
    530    tl_assert(isShadowAtom(mce,a2));
    531    return assignNew('V', mce, Ity_V256, binop(Iop_AndV256, a1, a2));
    532 }
    533 
    534 /* --------- Undefined-if-either-undefined --------- */
    535 
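        /* Dually, "undefined if either undefined" is bitwise OR of the
           shadow values: e.g. (made-up values) UifU8 of 0xF0 and 0x0F
           gives 0xFF, since every bit is undefined in one input or the
           other. */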
    536 static IRAtom* mkUifU8 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    537    tl_assert(isShadowAtom(mce,a1));
    538    tl_assert(isShadowAtom(mce,a2));
    539    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, a1, a2));
    540 }
    541 
    542 static IRAtom* mkUifU16 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    543    tl_assert(isShadowAtom(mce,a1));
    544    tl_assert(isShadowAtom(mce,a2));
    545    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, a1, a2));
    546 }
    547 
    548 static IRAtom* mkUifU32 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    549    tl_assert(isShadowAtom(mce,a1));
    550    tl_assert(isShadowAtom(mce,a2));
    551    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, a1, a2));
    552 }
    553 
    554 static IRAtom* mkUifU64 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    555    tl_assert(isShadowAtom(mce,a1));
    556    tl_assert(isShadowAtom(mce,a2));
    557    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, a1, a2));
    558 }
    559 
    560 static IRAtom* mkUifU128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    561    IRAtom *tmp1, *tmp2, *tmp3, *tmp4, *tmp5, *tmp6;
    562    tl_assert(isShadowAtom(mce,a1));
    563    tl_assert(isShadowAtom(mce,a2));
    564    tmp1 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a1));
    565    tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a1));
    566    tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, a2));
    567    tmp4 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, a2));
    568    tmp5 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp1, tmp3));
    569    tmp6 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp4));
    570 
    571    return assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp6, tmp5));
    572 }
    573 
    574 static IRAtom* mkUifUV128 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    575    tl_assert(isShadowAtom(mce,a1));
    576    tl_assert(isShadowAtom(mce,a2));
    577    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, a1, a2));
    578 }
    579 
    580 static IRAtom* mkUifUV256 ( MCEnv* mce, IRAtom* a1, IRAtom* a2 ) {
    581    tl_assert(isShadowAtom(mce,a1));
    582    tl_assert(isShadowAtom(mce,a2));
    583    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, a1, a2));
    584 }
    585 
    586 static IRAtom* mkUifU ( MCEnv* mce, IRType vty, IRAtom* a1, IRAtom* a2 ) {
    587    switch (vty) {
    588       case Ity_I8:   return mkUifU8(mce, a1, a2);
    589       case Ity_I16:  return mkUifU16(mce, a1, a2);
    590       case Ity_I32:  return mkUifU32(mce, a1, a2);
    591       case Ity_I64:  return mkUifU64(mce, a1, a2);
    592       case Ity_I128: return mkUifU128(mce, a1, a2);
    593       case Ity_V128: return mkUifUV128(mce, a1, a2);
    594       case Ity_V256: return mkUifUV256(mce, a1, a2);
    595       default:
    596          VG_(printf)("\n"); ppIRType(vty); VG_(printf)("\n");
    597          VG_(tool_panic)("memcheck:mkUifU");
    598    }
    599 }
    600 
    601 /* --------- The Left-family of operations. --------- */
    602 
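        /* In VEX, Iop_LeftN(x) computes x | -x, that is, it copies the
           lowest set bit of x into all higher positions.  Applied to a
           shadow value this smears undefinedness leftwards from the least
           significant undefined bit -- a cheap (if pessimistic) model of
           how, e.g., a carry can propagate in an addition.  Example with
           made-up bits: Left8(0b00010100) = 0b11111100. */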
    603 static IRAtom* mkLeft8 ( MCEnv* mce, IRAtom* a1 ) {
    604    tl_assert(isShadowAtom(mce,a1));
    605    return assignNew('V', mce, Ity_I8, unop(Iop_Left8, a1));
    606 }
    607 
    608 static IRAtom* mkLeft16 ( MCEnv* mce, IRAtom* a1 ) {
    609    tl_assert(isShadowAtom(mce,a1));
    610    return assignNew('V', mce, Ity_I16, unop(Iop_Left16, a1));
    611 }
    612 
    613 static IRAtom* mkLeft32 ( MCEnv* mce, IRAtom* a1 ) {
    614    tl_assert(isShadowAtom(mce,a1));
    615    return assignNew('V', mce, Ity_I32, unop(Iop_Left32, a1));
    616 }
    617 
    618 static IRAtom* mkLeft64 ( MCEnv* mce, IRAtom* a1 ) {
    619    tl_assert(isShadowAtom(mce,a1));
    620    return assignNew('V', mce, Ity_I64, unop(Iop_Left64, a1));
    621 }
    622 
    623 /* --------- 'Improvement' functions for AND/OR. --------- */
    624 
    625 /* ImproveAND(data, vbits) = data OR vbits.  A result bit is defined
    626    (0) only where the corresponding data bit is a defined 0; all
    627    other result bits are undefined (1). */
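
        /* Sketch of how such a term helps (values made up): for "x & y"
           the naive shadow result is UifU(vx, vy).  If y is the fully
           defined constant 0x0F (vy = 0x00), then ImproveAND8(y, vy) =
           0x0F; DifD-ing that onto the naive result marks the top four
           result bits as defined, since they must be 0 whatever x holds. */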
    628 static IRAtom* mkImproveAND8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    629 {
    630    tl_assert(isOriginalAtom(mce, data));
    631    tl_assert(isShadowAtom(mce, vbits));
    632    tl_assert(sameKindedAtoms(data, vbits));
    633    return assignNew('V', mce, Ity_I8, binop(Iop_Or8, data, vbits));
    634 }
    635 
    636 static IRAtom* mkImproveAND16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    637 {
    638    tl_assert(isOriginalAtom(mce, data));
    639    tl_assert(isShadowAtom(mce, vbits));
    640    tl_assert(sameKindedAtoms(data, vbits));
    641    return assignNew('V', mce, Ity_I16, binop(Iop_Or16, data, vbits));
    642 }
    643 
    644 static IRAtom* mkImproveAND32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    645 {
    646    tl_assert(isOriginalAtom(mce, data));
    647    tl_assert(isShadowAtom(mce, vbits));
    648    tl_assert(sameKindedAtoms(data, vbits));
    649    return assignNew('V', mce, Ity_I32, binop(Iop_Or32, data, vbits));
    650 }
    651 
    652 static IRAtom* mkImproveAND64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    653 {
    654    tl_assert(isOriginalAtom(mce, data));
    655    tl_assert(isShadowAtom(mce, vbits));
    656    tl_assert(sameKindedAtoms(data, vbits));
    657    return assignNew('V', mce, Ity_I64, binop(Iop_Or64, data, vbits));
    658 }
    659 
    660 static IRAtom* mkImproveANDV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    661 {
    662    tl_assert(isOriginalAtom(mce, data));
    663    tl_assert(isShadowAtom(mce, vbits));
    664    tl_assert(sameKindedAtoms(data, vbits));
    665    return assignNew('V', mce, Ity_V128, binop(Iop_OrV128, data, vbits));
    666 }
    667 
    668 static IRAtom* mkImproveANDV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    669 {
    670    tl_assert(isOriginalAtom(mce, data));
    671    tl_assert(isShadowAtom(mce, vbits));
    672    tl_assert(sameKindedAtoms(data, vbits));
    673    return assignNew('V', mce, Ity_V256, binop(Iop_OrV256, data, vbits));
    674 }
    675 
    676 /* ImproveOR(data, vbits) = ~data OR vbits.  A result bit is defined
    677    (0) only where the corresponding data bit is a defined 1; all
    678    other result bits are undefined (1). */
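
        /* Symmetrically (sketch): a data bit that is a defined 1 forces
           the corresponding bit of "x | y" to 1, so that result bit can
           be reported as defined no matter what the other operand's V
           bits say. */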
    679 static IRAtom* mkImproveOR8 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    680 {
    681    tl_assert(isOriginalAtom(mce, data));
    682    tl_assert(isShadowAtom(mce, vbits));
    683    tl_assert(sameKindedAtoms(data, vbits));
    684    return assignNew(
    685              'V', mce, Ity_I8,
    686              binop(Iop_Or8,
    687                    assignNew('V', mce, Ity_I8, unop(Iop_Not8, data)),
    688                    vbits) );
    689 }
    690 
    691 static IRAtom* mkImproveOR16 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    692 {
    693    tl_assert(isOriginalAtom(mce, data));
    694    tl_assert(isShadowAtom(mce, vbits));
    695    tl_assert(sameKindedAtoms(data, vbits));
    696    return assignNew(
    697              'V', mce, Ity_I16,
    698              binop(Iop_Or16,
    699                    assignNew('V', mce, Ity_I16, unop(Iop_Not16, data)),
    700                    vbits) );
    701 }
    702 
    703 static IRAtom* mkImproveOR32 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    704 {
    705    tl_assert(isOriginalAtom(mce, data));
    706    tl_assert(isShadowAtom(mce, vbits));
    707    tl_assert(sameKindedAtoms(data, vbits));
    708    return assignNew(
    709              'V', mce, Ity_I32,
    710              binop(Iop_Or32,
    711                    assignNew('V', mce, Ity_I32, unop(Iop_Not32, data)),
    712                    vbits) );
    713 }
    714 
    715 static IRAtom* mkImproveOR64 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    716 {
    717    tl_assert(isOriginalAtom(mce, data));
    718    tl_assert(isShadowAtom(mce, vbits));
    719    tl_assert(sameKindedAtoms(data, vbits));
    720    return assignNew(
    721              'V', mce, Ity_I64,
    722              binop(Iop_Or64,
    723                    assignNew('V', mce, Ity_I64, unop(Iop_Not64, data)),
    724                    vbits) );
    725 }
    726 
    727 static IRAtom* mkImproveORV128 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    728 {
    729    tl_assert(isOriginalAtom(mce, data));
    730    tl_assert(isShadowAtom(mce, vbits));
    731    tl_assert(sameKindedAtoms(data, vbits));
    732    return assignNew(
    733              'V', mce, Ity_V128,
    734              binop(Iop_OrV128,
    735                    assignNew('V', mce, Ity_V128, unop(Iop_NotV128, data)),
    736                    vbits) );
    737 }
    738 
    739 static IRAtom* mkImproveORV256 ( MCEnv* mce, IRAtom* data, IRAtom* vbits )
    740 {
    741    tl_assert(isOriginalAtom(mce, data));
    742    tl_assert(isShadowAtom(mce, vbits));
    743    tl_assert(sameKindedAtoms(data, vbits));
    744    return assignNew(
    745              'V', mce, Ity_V256,
    746              binop(Iop_OrV256,
    747                    assignNew('V', mce, Ity_V256, unop(Iop_NotV256, data)),
    748                    vbits) );
    749 }
    750 
    751 /* --------- Pessimising casts. --------- */
    752 
    753 /* The function returns an expression of type DST_TY. If any of the VBITS
    754    is undefined (value == 1) the resulting expression has all bits set to
    755    1. Otherwise, all bits are 0. */
    756 
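        /* For example (a sketch): mkPCastTo(mce, Ity_I64, v32), where v32
           is an I32 shadow, yields 0 if every bit of v32 is defined and
           0xFFFFFFFFFFFFFFFF if any bit is undefined.  The fast paths
           below use CmpwNEZ for this; the general path narrows to I1 with
           CmpNEZ and then sign-extends with 1Sto<N>. */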
    757 static IRAtom* mkPCastTo( MCEnv* mce, IRType dst_ty, IRAtom* vbits )
    758 {
    759    IRType  src_ty;
    760    IRAtom* tmp1;
    761 
    762    /* Note, dst_ty is a shadow type, not an original type. */
    763    tl_assert(isShadowAtom(mce,vbits));
    764    src_ty = typeOfIRExpr(mce->sb->tyenv, vbits);
    765 
    766    /* Fast-track some common cases */
    767    if (src_ty == Ity_I32 && dst_ty == Ity_I32)
    768       return assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    769 
    770    if (src_ty == Ity_I64 && dst_ty == Ity_I64)
    771       return assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
    772 
    773    if (src_ty == Ity_I32 && dst_ty == Ity_I64) {
    774       /* PCast the arg, then clone it. */
    775       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    776       return assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    777    }
    778 
    779    if (src_ty == Ity_I32 && dst_ty == Ity_V128) {
    780       /* PCast the arg, then clone it 4 times. */
    781       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    782       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    783       return assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
    784    }
    785 
    786    if (src_ty == Ity_I32 && dst_ty == Ity_V256) {
    787       /* PCast the arg, then clone it 8 times. */
    788       IRAtom* tmp = assignNew('V', mce, Ity_I32, unop(Iop_CmpwNEZ32, vbits));
    789       tmp = assignNew('V', mce, Ity_I64, binop(Iop_32HLto64, tmp, tmp));
    790       tmp = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp, tmp));
    791       return assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256, tmp, tmp));
    792    }
    793 
    794    if (src_ty == Ity_I64 && dst_ty == Ity_I32) {
    795       /* PCast the arg.  This gives all 0s or all 1s.  Then throw away
    796          the top half. */
    797       IRAtom* tmp = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, vbits));
    798       return assignNew('V', mce, Ity_I32, unop(Iop_64to32, tmp));
    799    }
    800 
    801    if (src_ty == Ity_V128 && dst_ty == Ity_I64) {
    802       /* Use InterleaveHI64x2 to copy the top half of the vector into
    803          the bottom half.  Then we can UifU it with the original, throw
    804          away the upper half of the result, and PCast-I64-to-I64
    805          the lower half. */
    806       // Generates vbits[127:64] : vbits[127:64]
    807       IRAtom* hi64hi64
    808          = assignNew('V', mce, Ity_V128,
    809                      binop(Iop_InterleaveHI64x2, vbits, vbits));
    810       // Generates
    811       //   UifU(vbits[127:64],vbits[127:64]) : UifU(vbits[127:64],vbits[63:0])
    812       //   == vbits[127:64] : UifU(vbits[127:64],vbits[63:0])
    813       IRAtom* lohi64
    814          = mkUifUV128(mce, hi64hi64, vbits);
    815       // Generates UifU(vbits[127:64],vbits[63:0])
    816       IRAtom* lo64
    817          = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, lohi64));
    818       // Generates
    819       //   PCast-to-I64( UifU(vbits[127:64], vbits[63:0] )
    820       //   == PCast-to-I64( vbits[127:0] )
    821       IRAtom* res
    822          = assignNew('V', mce, Ity_I64, unop(Iop_CmpwNEZ64, lo64));
    823       return res;
    824    }
    825 
    826    /* Else do it the slow way .. */
    827    /* First of all, collapse vbits down to a single bit. */
    828    tmp1   = NULL;
    829    switch (src_ty) {
    830       case Ity_I1:
    831          tmp1 = vbits;
    832          break;
    833       case Ity_I8:
    834          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ8, vbits));
    835          break;
    836       case Ity_I16:
    837          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ16, vbits));
    838          break;
    839       case Ity_I32:
    840          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ32, vbits));
    841          break;
    842       case Ity_I64:
    843          tmp1 = assignNew('V', mce, Ity_I1, unop(Iop_CmpNEZ64, vbits));
    844          break;
    845       case Ity_I128: {
    846          /* Gah.  Chop it in half, OR the halves together, and compare
    847             that with zero. */
    848          IRAtom* tmp2 = assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vbits));
    849          IRAtom* tmp3 = assignNew('V', mce, Ity_I64, unop(Iop_128to64, vbits));
    850          IRAtom* tmp4 = assignNew('V', mce, Ity_I64, binop(Iop_Or64, tmp2, tmp3));
    851          tmp1         = assignNew('V', mce, Ity_I1,
    852                                        unop(Iop_CmpNEZ64, tmp4));
    853          break;
    854       }
    855       default:
    856          ppIRType(src_ty);
    857          VG_(tool_panic)("mkPCastTo(1)");
    858    }
    859    tl_assert(tmp1);
    860    /* Now widen up to the dst type. */
    861    switch (dst_ty) {
    862       case Ity_I1:
    863          return tmp1;
    864       case Ity_I8:
    865          return assignNew('V', mce, Ity_I8, unop(Iop_1Sto8, tmp1));
    866       case Ity_I16:
    867          return assignNew('V', mce, Ity_I16, unop(Iop_1Sto16, tmp1));
    868       case Ity_I32:
    869          return assignNew('V', mce, Ity_I32, unop(Iop_1Sto32, tmp1));
    870       case Ity_I64:
    871          return assignNew('V', mce, Ity_I64, unop(Iop_1Sto64, tmp1));
    872       case Ity_V128:
    873          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    874          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, tmp1, tmp1));
    875          return tmp1;
    876       case Ity_I128:
    877          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    878          tmp1 = assignNew('V', mce, Ity_I128, binop(Iop_64HLto128, tmp1, tmp1));
    879          return tmp1;
    880       case Ity_V256:
    881          tmp1 = assignNew('V', mce, Ity_I64,  unop(Iop_1Sto64, tmp1));
    882          tmp1 = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128,
    883                                                     tmp1, tmp1));
    884          tmp1 = assignNew('V', mce, Ity_V256, binop(Iop_V128HLtoV256,
    885                                                     tmp1, tmp1));
    886          return tmp1;
    887       default:
    888          ppIRType(dst_ty);
    889          VG_(tool_panic)("mkPCastTo(2)");
    890    }
    891 }
    892 
    893 /* This is a minor variant.  It takes an arg of some type and returns
    894    a value of the same type.  The result consists entirely of Defined
    895    (zero) bits except its least significant bit, which is a PCast of
    896    the entire argument down to a single bit. */
    897 static IRAtom* mkPCastXXtoXXlsb ( MCEnv* mce, IRAtom* varg, IRType ty )
    898 {
    899    if (ty == Ity_V128) {
    900       /* --- Case for V128 --- */
    901       IRAtom* varg128 = varg;
    902       // generates: PCast-to-I64(varg128)
    903       IRAtom* pcdTo64 = mkPCastTo(mce, Ity_I64, varg128);
    904       // Now introduce zeros (defined bits) in the top 63 places
    905       // generates: Def--(63)--Def PCast-to-I1(varg128)
    906       IRAtom* d63pc
    907          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcdTo64, mkU64(1)));
    908       // generates: Def--(64)--Def
    909       IRAtom* d64
    910          = definedOfType(Ity_I64);
    911       // generates: Def--(127)--Def PCast-to-I1(varg128)
    912       IRAtom* res
    913          = assignNew('V', mce, Ity_V128, binop(Iop_64HLtoV128, d64, d63pc));
    914       return res;
    915    }
    916    if (ty == Ity_I64) {
    917       /* --- Case for I64 --- */
    918       // PCast to 64
    919       IRAtom* pcd = mkPCastTo(mce, Ity_I64, varg);
    920       // Zero (Def) out the top 63 bits
    921       IRAtom* res
    922          = assignNew('V', mce, Ity_I64, binop(Iop_And64, pcd, mkU64(1)));
    923       return res;
    924    }
    925    /*NOTREACHED*/
    926    tl_assert(0);
    927 }
    928 
    929 /* --------- Accurate interpretation of CmpEQ/CmpNE. --------- */
    930 /*
    931    Normally, we can do CmpEQ/CmpNE by doing UifU on the arguments, and
    932    PCasting to Ity_I1.  However, sometimes it is necessary to be more
    933    accurate.  The insight is that the result is defined if two
    934    corresponding bits can be found, one from each argument, so that
    935    both bits are defined but are different -- that makes EQ say "No"
    936    and NE say "Yes".  Hence, we compute an improvement term and DifD
    937    it onto the "normal" (UifU) result.
    938 
    939    The result is:
    940 
    941    PCastTo<1> (
    942       -- naive version
    943       PCastTo<sz>( UifU<sz>(vxx, vyy) )
    944 
    945       `DifD<sz>`
    946 
    947       -- improvement term
    948       PCastTo<sz>( PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) ) )
    949    )
    950 
    951    where
    952      vec contains 0 (defined) bits where the corresponding arg bits
    953      are defined but different, and 1 bits otherwise.
    954 
    955      vec = Or<sz>( vxx,   // 0 iff bit defined
    956                    vyy,   // 0 iff bit defined
    957                    Not<sz>(Xor<sz>( xx, yy )) // 0 iff bits different
    958                  )
    959 
    960      If any bit of vec is 0, the result is defined and so the
    961      improvement term should produce 0...0, else it should produce
    962      1...1.
    963 
    964      Hence require for the improvement term:
    965 
    966         if vec == 1...1 then 1...1 else 0...0
    967      ->
    968         PCast<sz>( CmpEQ<sz> ( vec, 1...1 ) )
    969 
    970    This was extensively re-analysed and checked on 6 July 05.
    971 */
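
        /* Worked example (made-up 16-bit values): let xx = 0x0001 and
           yy = 0x0000, with only bit 0 defined in each (vxx = vyy =
           0xFFFE).  The naive term PCast(UifU(vxx,vyy)) says "undefined",
           but vec = vxx | vyy | ~(xx ^ yy) = 0xFFFE, which is not all
           ones, so the improvement term is 0...0 and the DifD makes the
           final result defined -- correctly, since bit 0 alone already
           proves xx != yy. */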
    972 static IRAtom* expensiveCmpEQorNE ( MCEnv*  mce,
    973                                     IRType  ty,
    974                                     IRAtom* vxx, IRAtom* vyy,
    975                                     IRAtom* xx,  IRAtom* yy )
    976 {
    977    IRAtom *naive, *vec, *improvement_term;
    978    IRAtom *improved, *final_cast, *top;
    979    IROp   opDIFD, opUIFU, opXOR, opNOT, opCMP, opOR;
    980 
    981    tl_assert(isShadowAtom(mce,vxx));
    982    tl_assert(isShadowAtom(mce,vyy));
    983    tl_assert(isOriginalAtom(mce,xx));
    984    tl_assert(isOriginalAtom(mce,yy));
    985    tl_assert(sameKindedAtoms(vxx,xx));
    986    tl_assert(sameKindedAtoms(vyy,yy));
    987 
    988    switch (ty) {
    989       case Ity_I16:
    990          opOR   = Iop_Or16;
    991          opDIFD = Iop_And16;
    992          opUIFU = Iop_Or16;
    993          opNOT  = Iop_Not16;
    994          opXOR  = Iop_Xor16;
    995          opCMP  = Iop_CmpEQ16;
    996          top    = mkU16(0xFFFF);
    997          break;
    998       case Ity_I32:
    999          opOR   = Iop_Or32;
   1000          opDIFD = Iop_And32;
   1001          opUIFU = Iop_Or32;
   1002          opNOT  = Iop_Not32;
   1003          opXOR  = Iop_Xor32;
   1004          opCMP  = Iop_CmpEQ32;
   1005          top    = mkU32(0xFFFFFFFF);
   1006          break;
   1007       case Ity_I64:
   1008          opOR   = Iop_Or64;
   1009          opDIFD = Iop_And64;
   1010          opUIFU = Iop_Or64;
   1011          opNOT  = Iop_Not64;
   1012          opXOR  = Iop_Xor64;
   1013          opCMP  = Iop_CmpEQ64;
   1014          top    = mkU64(0xFFFFFFFFFFFFFFFFULL);
   1015          break;
   1016       default:
   1017          VG_(tool_panic)("expensiveCmpEQorNE");
   1018    }
   1019 
   1020    naive
   1021       = mkPCastTo(mce,ty,
   1022                   assignNew('V', mce, ty, binop(opUIFU, vxx, vyy)));
   1023 
   1024    vec
   1025       = assignNew(
   1026            'V', mce,ty,
   1027            binop( opOR,
   1028                   assignNew('V', mce,ty, binop(opOR, vxx, vyy)),
   1029                   assignNew(
   1030                      'V', mce,ty,
   1031                      unop( opNOT,
   1032                            assignNew('V', mce,ty, binop(opXOR, xx, yy))))));
   1033 
   1034    improvement_term
   1035       = mkPCastTo( mce,ty,
   1036                    assignNew('V', mce,Ity_I1, binop(opCMP, vec, top)));
   1037 
   1038    improved
   1039       = assignNew( 'V', mce,ty, binop(opDIFD, naive, improvement_term) );
   1040 
   1041    final_cast
   1042       = mkPCastTo( mce, Ity_I1, improved );
   1043 
   1044    return final_cast;
   1045 }
   1046 
   1047 
   1048 /* --------- Semi-accurate interpretation of CmpORD. --------- */
   1049 
   1050 /* CmpORD32{S,U} does PowerPC-style 3-way comparisons:
   1051 
   1052       CmpORD32S(x,y) = 1<<3   if  x <s y
   1053                      = 1<<2   if  x >s y
   1054                      = 1<<1   if  x == y
   1055 
   1056    and similarly the unsigned variant.  The default interpretation is:
   1057 
   1058       CmpORD32{S,U}#(x,y,x#,y#) = PCast(x# `UifU` y#)
   1059                                   & (7<<1)
   1060 
   1061    The "& (7<<1)" reflects the fact that all result bits except 3,2,1
   1062    are zero and therefore defined (viz, zero).
   1063 
   1064    Also deal with a special case better:
   1065 
   1066       CmpORD32S(x,0)
   1067 
   1068    Here, bit 3 (LT) of the result is a copy of the top bit of x and
   1069    will be defined even if the rest of x isn't.  In which case we do:
   1070 
   1071       CmpORD32S#(x,x#,0,{impliedly 0}#)
   1072          = PCast(x#) & (3<<1)      -- standard interp for GT#,EQ#
   1073            | (x# >>u 31) << 3      -- LT# = x#[31]
   1074 
   1075    Analogous handling for CmpORD64{S,U}.
   1076 */
   1077 static Bool isZeroU32 ( IRAtom* e )
   1078 {
   1079    return
   1080       toBool( e->tag == Iex_Const
   1081               && e->Iex.Const.con->tag == Ico_U32
   1082               && e->Iex.Const.con->Ico.U32 == 0 );
   1083 }
   1084 
   1085 static Bool isZeroU64 ( IRAtom* e )
   1086 {
   1087    return
   1088       toBool( e->tag == Iex_Const
   1089               && e->Iex.Const.con->tag == Ico_U64
   1090               && e->Iex.Const.con->Ico.U64 == 0 );
   1091 }
   1092 
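        /* Worked instance of the special case (made-up shadow value): for
           CmpORD32S(x,0) with x# = 0x7FFFFFFF (only the sign bit of x is
           defined), the "fancy" branch below produces
           (PCast(x#) & (3<<1)) | ((x# >>u 31) << 3) = 6: the GT and EQ
           result bits are undefined, but the LT bit is defined, as
           desired. */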
   1093 static IRAtom* doCmpORD ( MCEnv*  mce,
   1094                           IROp    cmp_op,
   1095                           IRAtom* xxhash, IRAtom* yyhash,
   1096                           IRAtom* xx,     IRAtom* yy )
   1097 {
   1098    Bool   m64    = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U;
   1099    Bool   syned  = cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD32S;
   1100    IROp   opOR   = m64 ? Iop_Or64  : Iop_Or32;
   1101    IROp   opAND  = m64 ? Iop_And64 : Iop_And32;
   1102    IROp   opSHL  = m64 ? Iop_Shl64 : Iop_Shl32;
   1103    IROp   opSHR  = m64 ? Iop_Shr64 : Iop_Shr32;
   1104    IRType ty     = m64 ? Ity_I64   : Ity_I32;
   1105    Int    width  = m64 ? 64        : 32;
   1106 
   1107    Bool (*isZero)(IRAtom*) = m64 ? isZeroU64 : isZeroU32;
   1108 
   1109    IRAtom* threeLeft1 = NULL;
   1110    IRAtom* sevenLeft1 = NULL;
   1111 
   1112    tl_assert(isShadowAtom(mce,xxhash));
   1113    tl_assert(isShadowAtom(mce,yyhash));
   1114    tl_assert(isOriginalAtom(mce,xx));
   1115    tl_assert(isOriginalAtom(mce,yy));
   1116    tl_assert(sameKindedAtoms(xxhash,xx));
   1117    tl_assert(sameKindedAtoms(yyhash,yy));
   1118    tl_assert(cmp_op == Iop_CmpORD32S || cmp_op == Iop_CmpORD32U
   1119              || cmp_op == Iop_CmpORD64S || cmp_op == Iop_CmpORD64U);
   1120 
   1121    if (0) {
   1122       ppIROp(cmp_op); VG_(printf)(" ");
   1123       ppIRExpr(xx); VG_(printf)(" "); ppIRExpr( yy ); VG_(printf)("\n");
   1124    }
   1125 
   1126    if (syned && isZero(yy)) {
   1127       /* fancy interpretation */
   1128       /* if yy is zero, then it must be fully defined (zero#). */
   1129       tl_assert(isZero(yyhash));
   1130       threeLeft1 = m64 ? mkU64(3<<1) : mkU32(3<<1);
   1131       return
   1132          binop(
   1133             opOR,
   1134             assignNew(
   1135                'V', mce,ty,
   1136                binop(
   1137                   opAND,
   1138                   mkPCastTo(mce,ty, xxhash),
   1139                   threeLeft1
   1140                )),
   1141             assignNew(
   1142                'V', mce,ty,
   1143                binop(
   1144                   opSHL,
   1145                   assignNew(
   1146                      'V', mce,ty,
   1147                      binop(opSHR, xxhash, mkU8(width-1))),
   1148                   mkU8(3)
   1149                ))
   1150          );
   1151    } else {
   1152       /* standard interpretation */
   1153       sevenLeft1 = m64 ? mkU64(7<<1) : mkU32(7<<1);
   1154       return
   1155          binop(
   1156             opAND,
   1157             mkPCastTo( mce,ty,
   1158                        mkUifU(mce,ty, xxhash,yyhash)),
   1159             sevenLeft1
   1160          );
   1161    }
   1162 }
   1163 
   1164 
   1165 /*------------------------------------------------------------*/
   1166 /*--- Emit a test and complaint if something is undefined. ---*/
   1167 /*------------------------------------------------------------*/
   1168 
   1169 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e ); /* fwds */
   1170 
   1171 
   1172 /* Set the annotations on a dirty helper to indicate that the stack
   1173    pointer and instruction pointer might be read.  This is the
   1174    behaviour of all 'emit-a-complaint' style functions we might
   1175    call. */
   1176 
   1177 static void setHelperAnns ( MCEnv* mce, IRDirty* di ) {
   1178    di->nFxState = 2;
   1179    di->fxState[0].fx        = Ifx_Read;
   1180    di->fxState[0].offset    = mce->layout->offset_SP;
   1181    di->fxState[0].size      = mce->layout->sizeof_SP;
   1182    di->fxState[0].nRepeats  = 0;
   1183    di->fxState[0].repeatLen = 0;
   1184    di->fxState[1].fx        = Ifx_Read;
   1185    di->fxState[1].offset    = mce->layout->offset_IP;
   1186    di->fxState[1].size      = mce->layout->sizeof_IP;
   1187    di->fxState[1].nRepeats  = 0;
   1188    di->fxState[1].repeatLen = 0;
   1189 }
   1190 
   1191 
   1192 /* Check the supplied *original* |atom| for undefinedness, and emit a
   1193    complaint if so.  Once that happens, mark it as defined.  This is
   1194    possible because the atom is either a tmp or literal.  If it's a
   1195    tmp, it will be shadowed by a tmp, and so we can set the shadow to
   1196    be defined.  In fact as mentioned above, we will have to allocate a
   1197    new tmp to carry the new 'defined' shadow value, and update the
   1198    original->tmp mapping accordingly; we cannot simply assign a new
   1199    value to an existing shadow tmp as this breaks SSAness.
   1200 
   1201    The checks are performed, any resulting complaint emitted, and
   1202    |atom|'s shadow temp set to 'defined', ONLY in the case that
   1203    |guard| evaluates to True at run-time.  If it evaluates to False
   1204    then no action is performed.  If |guard| is NULL (the usual case)
   1205    then it is assumed to be always-true, and hence these actions are
   1206    performed unconditionally.
   1207 
   1208    This routine does not generate code to check the definedness of
   1209    |guard|.  The caller is assumed to have taken care of that already.
   1210 */
   1211 static void complainIfUndefined ( MCEnv* mce, IRAtom* atom, IRExpr *guard )
   1212 {
   1213    IRAtom*  vatom;
   1214    IRType   ty;
   1215    Int      sz;
   1216    IRDirty* di;
   1217    IRAtom*  cond;
   1218    IRAtom*  origin;
   1219    void*    fn;
   1220    const HChar* nm;
   1221    IRExpr** args;
   1222    Int      nargs;
   1223 
   1224    // Don't do V bit tests if we're not reporting undefined value errors.
   1225    if (MC_(clo_mc_level) == 1)
   1226       return;
   1227 
   1228    if (guard)
   1229       tl_assert(isOriginalAtom(mce, guard));
   1230 
   1231    /* Since the original expression is atomic, there's no duplicated
   1232       work generated by making multiple V-expressions for it.  So we
   1233       don't really care about the possibility that someone else may
   1234       also create a V-interpretation for it. */
   1235    tl_assert(isOriginalAtom(mce, atom));
   1236    vatom = expr2vbits( mce, atom );
   1237    tl_assert(isShadowAtom(mce, vatom));
   1238    tl_assert(sameKindedAtoms(atom, vatom));
   1239 
   1240    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1241 
   1242    /* sz is only used for constructing the error message */
   1243    sz = ty==Ity_I1 ? 0 : sizeofIRType(ty);
   1244 
   1245    cond = mkPCastTo( mce, Ity_I1, vatom );
   1246    /* cond will be 0 if all defined, and 1 if any not defined. */
   1247 
   1248    /* Get the origin info for the value we are about to check.  At
   1249       least, if we are doing origin tracking.  If not, use a dummy
   1250       zero origin. */
   1251    if (MC_(clo_mc_level) == 3) {
   1252       origin = schemeE( mce, atom );
   1253       if (mce->hWordTy == Ity_I64) {
   1254          origin = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, origin) );
   1255       }
   1256    } else {
   1257       origin = NULL;
   1258    }
   1259 
   1260    fn    = NULL;
   1261    nm    = NULL;
   1262    args  = NULL;
   1263    nargs = -1;
   1264 
   1265    switch (sz) {
   1266       case 0:
   1267          if (origin) {
   1268             fn    = &MC_(helperc_value_check0_fail_w_o);
   1269             nm    = "MC_(helperc_value_check0_fail_w_o)";
   1270             args  = mkIRExprVec_1(origin);
   1271             nargs = 1;
   1272          } else {
   1273             fn    = &MC_(helperc_value_check0_fail_no_o);
   1274             nm    = "MC_(helperc_value_check0_fail_no_o)";
   1275             args  = mkIRExprVec_0();
   1276             nargs = 0;
   1277          }
   1278          break;
   1279       case 1:
   1280          if (origin) {
   1281             fn    = &MC_(helperc_value_check1_fail_w_o);
   1282             nm    = "MC_(helperc_value_check1_fail_w_o)";
   1283             args  = mkIRExprVec_1(origin);
   1284             nargs = 1;
   1285          } else {
   1286             fn    = &MC_(helperc_value_check1_fail_no_o);
   1287             nm    = "MC_(helperc_value_check1_fail_no_o)";
   1288             args  = mkIRExprVec_0();
   1289             nargs = 0;
   1290          }
   1291          break;
   1292       case 4:
   1293          if (origin) {
   1294             fn    = &MC_(helperc_value_check4_fail_w_o);
   1295             nm    = "MC_(helperc_value_check4_fail_w_o)";
   1296             args  = mkIRExprVec_1(origin);
   1297             nargs = 1;
   1298          } else {
   1299             fn    = &MC_(helperc_value_check4_fail_no_o);
   1300             nm    = "MC_(helperc_value_check4_fail_no_o)";
   1301             args  = mkIRExprVec_0();
   1302             nargs = 0;
   1303          }
   1304          break;
   1305       case 8:
   1306          if (origin) {
   1307             fn    = &MC_(helperc_value_check8_fail_w_o);
   1308             nm    = "MC_(helperc_value_check8_fail_w_o)";
   1309             args  = mkIRExprVec_1(origin);
   1310             nargs = 1;
   1311          } else {
   1312             fn    = &MC_(helperc_value_check8_fail_no_o);
   1313             nm    = "MC_(helperc_value_check8_fail_no_o)";
   1314             args  = mkIRExprVec_0();
   1315             nargs = 0;
   1316          }
   1317          break;
   1318       case 2:
   1319       case 16:
   1320          if (origin) {
   1321             fn    = &MC_(helperc_value_checkN_fail_w_o);
   1322             nm    = "MC_(helperc_value_checkN_fail_w_o)";
   1323             args  = mkIRExprVec_2( mkIRExpr_HWord( sz ), origin);
   1324             nargs = 2;
   1325          } else {
   1326             fn    = &MC_(helperc_value_checkN_fail_no_o);
   1327             nm    = "MC_(helperc_value_checkN_fail_no_o)";
   1328             args  = mkIRExprVec_1( mkIRExpr_HWord( sz ) );
   1329             nargs = 1;
   1330          }
   1331          break;
   1332       default:
   1333          VG_(tool_panic)("unexpected szB");
   1334    }
   1335 
   1336    tl_assert(fn);
   1337    tl_assert(nm);
   1338    tl_assert(args);
   1339    tl_assert(nargs >= 0 && nargs <= 2);
   1340    tl_assert( (MC_(clo_mc_level) == 3 && origin != NULL)
   1341               || (MC_(clo_mc_level) == 2 && origin == NULL) );
   1342 
   1343    di = unsafeIRDirty_0_N( nargs/*regparms*/, nm,
   1344                            VG_(fnptr_to_fnentry)( fn ), args );
   1345    di->guard = cond; // and cond is PCast-to-1(atom#)
   1346 
   1347    /* If the complaint is to be issued under a guard condition, AND
   1348       that into the guard condition for the helper call. */
   1349    if (guard) {
   1350       IRAtom *g1 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, di->guard));
   1351       IRAtom *g2 = assignNew('V', mce, Ity_I32, unop(Iop_1Uto32, guard));
   1352       IRAtom *e  = assignNew('V', mce, Ity_I32, binop(Iop_And32, g1, g2));
   1353       di->guard  = assignNew('V', mce, Ity_I1,  unop(Iop_32to1, e));
   1354    }
   1355 
   1356    setHelperAnns( mce, di );
   1357    stmt( 'V', mce, IRStmt_Dirty(di));
   1358 
   1359    /* If |atom| is shadowed by an IRTemp, set the shadow tmp to be
   1360       defined -- but only in the case where the guard evaluates to
   1361       True at run-time.  Do the update by setting the orig->shadow
   1362       mapping for tmp to reflect the fact that this shadow is getting
   1363       a new value. */
   1364    tl_assert(isIRAtom(vatom));
   1365    /* sameKindedAtoms ... */
   1366    if (vatom->tag == Iex_RdTmp) {
   1367       tl_assert(atom->tag == Iex_RdTmp);
   1368       if (guard == NULL) {
   1369          // guard is 'always True', hence update unconditionally
   1370          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1371          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp),
   1372                           definedOfType(ty));
   1373       } else {
   1374          // update the temp only conditionally.  Do this by copying
   1375          // its old value when the guard is False.
   1376          // The old value ..
   1377          IRTemp old_tmpV = findShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1378          newShadowTmpV(mce, atom->Iex.RdTmp.tmp);
   1379          IRAtom* new_tmpV
   1380             = assignNew('V', mce, shadowTypeV(ty),
   1381                         IRExpr_ITE(guard, definedOfType(ty),
   1382                                           mkexpr(old_tmpV)));
   1383          assign('V', mce, findShadowTmpV(mce, atom->Iex.RdTmp.tmp), new_tmpV);
   1384       }
   1385    }
   1386 }
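/* As a concrete illustration of the above (a sketch only: the temporary
   names are hypothetical and the exact IR spelling of the PCast varies),
   checking an 8-byte value with origin tracking disabled and no guard
   emits roughly

      t_v  = <V bits of atom>                        -- expr2vbits
      cond = PCast-to-Ity_I1(t_v)                    -- 1 iff any V bit is 1
      DIRTY cond ::: MC_(helperc_value_check8_fail_no_o)()
      t_v' = 0x0:I64                                 -- shadow now "defined"

   The helper fires only when some bit is undefined, and (when the value
   lives in a temp) the shadow is then marked fully defined so that the
   same value does not produce a cascade of duplicate reports. */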
   1387 
   1388 
   1389 /*------------------------------------------------------------*/
   1390 /*--- Shadowing PUTs/GETs, and indexed variants thereof    ---*/
   1391 /*------------------------------------------------------------*/
   1392 
   1393 /* Examine the always-defined sections declared in layout to see if
    1394    the (offset,size) section is within one.  Note, it is an error to
   1395    partially fall into such a region: (offset,size) should either be
   1396    completely in such a region or completely not-in such a region.
   1397 */
   1398 static Bool isAlwaysDefd ( MCEnv* mce, Int offset, Int size )
   1399 {
   1400    Int minoffD, maxoffD, i;
   1401    Int minoff = offset;
   1402    Int maxoff = minoff + size - 1;
   1403    tl_assert((minoff & ~0xFFFF) == 0);
   1404    tl_assert((maxoff & ~0xFFFF) == 0);
   1405 
   1406    for (i = 0; i < mce->layout->n_alwaysDefd; i++) {
   1407       minoffD = mce->layout->alwaysDefd[i].offset;
   1408       maxoffD = minoffD + mce->layout->alwaysDefd[i].size - 1;
   1409       tl_assert((minoffD & ~0xFFFF) == 0);
   1410       tl_assert((maxoffD & ~0xFFFF) == 0);
   1411 
   1412       if (maxoff < minoffD || maxoffD < minoff)
   1413          continue; /* no overlap */
   1414       if (minoff >= minoffD && maxoff <= maxoffD)
   1415          return True; /* completely contained in an always-defd section */
   1416 
   1417       VG_(tool_panic)("memcheck:isAlwaysDefd:partial overlap");
   1418    }
   1419    return False; /* could not find any containing section */
   1420 }
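/* Worked example with hypothetical numbers: suppose the layout declares
   alwaysDefd = { (offset 64, size 8), (offset 128, size 4) }.  Then

      isAlwaysDefd(mce, 64, 8)  -> True    (exactly covers the first entry)
      isAlwaysDefd(mce, 16, 4)  -> False   (overlaps no entry)
      isAlwaysDefd(mce, 66, 8)  -> panic   (straddles the end of an entry)

   The panic in the partial-overlap case is deliberate: it would mean the
   guest state layout and the instrumenter disagree about which state is
   tracked. */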
   1421 
   1422 
   1423 /* Generate into bb suitable actions to shadow this Put.  If the state
   1424    slice is marked 'always defined', do nothing.  Otherwise, write the
   1425    supplied V bits to the shadow state.  We can pass in either an
   1426    original atom or a V-atom, but not both.  In the former case the
   1427    relevant V-bits are then generated from the original.
    1428    We assume here that the definedness of GUARD has already been checked.
   1429 */
   1430 static
   1431 void do_shadow_PUT ( MCEnv* mce,  Int offset,
   1432                      IRAtom* atom, IRAtom* vatom, IRExpr *guard )
   1433 {
   1434    IRType ty;
   1435 
   1436    // Don't do shadow PUTs if we're not doing undefined value checking.
   1437    // Their absence lets Vex's optimiser remove all the shadow computation
   1438    // that they depend on, which includes GETs of the shadow registers.
   1439    if (MC_(clo_mc_level) == 1)
   1440       return;
   1441 
   1442    if (atom) {
   1443       tl_assert(!vatom);
   1444       tl_assert(isOriginalAtom(mce, atom));
   1445       vatom = expr2vbits( mce, atom );
   1446    } else {
   1447       tl_assert(vatom);
   1448       tl_assert(isShadowAtom(mce, vatom));
   1449    }
   1450 
   1451    ty = typeOfIRExpr(mce->sb->tyenv, vatom);
   1452    tl_assert(ty != Ity_I1);
   1453    tl_assert(ty != Ity_I128);
   1454    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1455       /* later: no ... */
   1456       /* emit code to emit a complaint if any of the vbits are 1. */
   1457       /* complainIfUndefined(mce, atom); */
   1458    } else {
   1459       /* Do a plain shadow Put. */
   1460       if (guard) {
   1461          /* If the guard expression evaluates to false we simply Put the value
    1462             that is already stored in the shadow of the guest state slot */
   1463          IRAtom *cond, *iffalse;
   1464 
   1465          cond    = assignNew('V', mce, Ity_I1, guard);
   1466          iffalse = assignNew('V', mce, ty,
   1467                              IRExpr_Get(offset + mce->layout->total_sizeB, ty));
   1468          vatom   = assignNew('V', mce, ty, IRExpr_ITE(cond, vatom, iffalse));
   1469       }
   1470       stmt( 'V', mce, IRStmt_Put( offset + mce->layout->total_sizeB, vatom ));
   1471    }
   1472 }
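/* Sketch of the guarded case above, for a hypothetical 32-bit Put at
   guest offset OFF with guard g.  The shadow Put itself is emitted
   unconditionally, but when g is false it merely rewrites the shadow
   slot with its current contents, so the net effect is conditional:

      cond    = g
      iffalse = GET:I32(OFF + total_sizeB)     -- current shadow value
      vnew    = ITE(cond, vatom, iffalse)
      PUT(OFF + total_sizeB) = vnew
*/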
   1473 
   1474 
    1475 /* Generate into bb suitable actions to shadow this PutI.  If the state
    1476    slice is marked 'always defined', do nothing; otherwise write the
    1477    supplied V bits to the shadow state. */
   1478 static
   1479 void do_shadow_PUTI ( MCEnv* mce, IRPutI *puti)
   1480 {
   1481    IRAtom* vatom;
   1482    IRType  ty, tyS;
    1483    Int     arrSize;
   1484    IRRegArray* descr = puti->descr;
   1485    IRAtom*     ix    = puti->ix;
   1486    Int         bias  = puti->bias;
   1487    IRAtom*     atom  = puti->data;
   1488 
   1489    // Don't do shadow PUTIs if we're not doing undefined value checking.
   1490    // Their absence lets Vex's optimiser remove all the shadow computation
   1491    // that they depend on, which includes GETIs of the shadow registers.
   1492    if (MC_(clo_mc_level) == 1)
   1493       return;
   1494 
   1495    tl_assert(isOriginalAtom(mce,atom));
   1496    vatom = expr2vbits( mce, atom );
   1497    tl_assert(sameKindedAtoms(atom, vatom));
   1498    ty   = descr->elemTy;
   1499    tyS  = shadowTypeV(ty);
   1500    arrSize = descr->nElems * sizeofIRType(ty);
   1501    tl_assert(ty != Ity_I1);
   1502    tl_assert(isOriginalAtom(mce,ix));
   1503    complainIfUndefined(mce, ix, NULL);
   1504    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1505       /* later: no ... */
   1506       /* emit code to emit a complaint if any of the vbits are 1. */
   1507       /* complainIfUndefined(mce, atom); */
   1508    } else {
   1509       /* Do a cloned version of the Put that refers to the shadow
   1510          area. */
   1511       IRRegArray* new_descr
   1512          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1513                          tyS, descr->nElems);
   1514       stmt( 'V', mce, IRStmt_PutI( mkIRPutI(new_descr, ix, bias, vatom) ));
   1515    }
   1516 }
   1517 
   1518 
   1519 /* Return an expression which contains the V bits corresponding to the
   1520    given GET (passed in in pieces).
   1521 */
   1522 static
   1523 IRExpr* shadow_GET ( MCEnv* mce, Int offset, IRType ty )
   1524 {
   1525    IRType tyS = shadowTypeV(ty);
   1526    tl_assert(ty != Ity_I1);
   1527    tl_assert(ty != Ity_I128);
   1528    if (isAlwaysDefd(mce, offset, sizeofIRType(ty))) {
   1529       /* Always defined, return all zeroes of the relevant type */
   1530       return definedOfType(tyS);
   1531    } else {
   1532       /* return a cloned version of the Get that refers to the shadow
   1533          area. */
   1534       /* FIXME: this isn't an atom! */
   1535       return IRExpr_Get( offset + mce->layout->total_sizeB, tyS );
   1536    }
   1537 }
   1538 
   1539 
   1540 /* Return an expression which contains the V bits corresponding to the
   1541    given GETI (passed in in pieces).
   1542 */
   1543 static
   1544 IRExpr* shadow_GETI ( MCEnv* mce,
   1545                       IRRegArray* descr, IRAtom* ix, Int bias )
   1546 {
   1547    IRType ty   = descr->elemTy;
   1548    IRType tyS  = shadowTypeV(ty);
   1549    Int arrSize = descr->nElems * sizeofIRType(ty);
   1550    tl_assert(ty != Ity_I1);
   1551    tl_assert(isOriginalAtom(mce,ix));
   1552    complainIfUndefined(mce, ix, NULL);
   1553    if (isAlwaysDefd(mce, descr->base, arrSize)) {
   1554       /* Always defined, return all zeroes of the relevant type */
   1555       return definedOfType(tyS);
   1556    } else {
   1557       /* return a cloned version of the Get that refers to the shadow
   1558          area. */
   1559       IRRegArray* new_descr
   1560          = mkIRRegArray( descr->base + mce->layout->total_sizeB,
   1561                          tyS, descr->nElems);
   1562       return IRExpr_GetI( new_descr, ix, bias );
   1563    }
   1564 }
   1565 
   1566 
   1567 /*------------------------------------------------------------*/
   1568 /*--- Generating approximations for unknown operations,    ---*/
   1569 /*--- using lazy-propagate semantics                       ---*/
   1570 /*------------------------------------------------------------*/
   1571 
   1572 /* Lazy propagation of undefinedness from two values, resulting in the
   1573    specified shadow type.
   1574 */
   1575 static
   1576 IRAtom* mkLazy2 ( MCEnv* mce, IRType finalVty, IRAtom* va1, IRAtom* va2 )
   1577 {
   1578    IRAtom* at;
   1579    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1580    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1581    tl_assert(isShadowAtom(mce,va1));
   1582    tl_assert(isShadowAtom(mce,va2));
   1583 
   1584    /* The general case is inefficient because PCast is an expensive
   1585       operation.  Here are some special cases which use PCast only
   1586       once rather than twice. */
   1587 
   1588    /* I64 x I64 -> I64 */
   1589    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I64) {
   1590       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I64\n");
   1591       at = mkUifU(mce, Ity_I64, va1, va2);
   1592       at = mkPCastTo(mce, Ity_I64, at);
   1593       return at;
   1594    }
   1595 
   1596    /* I64 x I64 -> I32 */
   1597    if (t1 == Ity_I64 && t2 == Ity_I64 && finalVty == Ity_I32) {
   1598       if (0) VG_(printf)("mkLazy2: I64 x I64 -> I32\n");
   1599       at = mkUifU(mce, Ity_I64, va1, va2);
   1600       at = mkPCastTo(mce, Ity_I32, at);
   1601       return at;
   1602    }
   1603 
   1604    if (0) {
   1605       VG_(printf)("mkLazy2 ");
   1606       ppIRType(t1);
   1607       VG_(printf)("_");
   1608       ppIRType(t2);
   1609       VG_(printf)("_");
   1610       ppIRType(finalVty);
   1611       VG_(printf)("\n");
   1612    }
   1613 
   1614    /* General case: force everything via 32-bit intermediaries. */
   1615    at = mkPCastTo(mce, Ity_I32, va1);
   1616    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1617    at = mkPCastTo(mce, finalVty, at);
   1618    return at;
   1619 }
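/* For example (hypothetical primop): a 64 x 64 -> 32 operation for which
   Memcheck has no exact rule is handled by the second special case above
   as

      PCastTo:I32( UifU64(va1, va2) )

   so the 32-bit result is reported fully defined iff both 64-bit inputs
   are fully defined -- sound, but deliberately imprecise. */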
   1620 
   1621 
   1622 /* 3-arg version of the above. */
   1623 static
   1624 IRAtom* mkLazy3 ( MCEnv* mce, IRType finalVty,
   1625                   IRAtom* va1, IRAtom* va2, IRAtom* va3 )
   1626 {
   1627    IRAtom* at;
   1628    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1629    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1630    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1631    tl_assert(isShadowAtom(mce,va1));
   1632    tl_assert(isShadowAtom(mce,va2));
   1633    tl_assert(isShadowAtom(mce,va3));
   1634 
   1635    /* The general case is inefficient because PCast is an expensive
   1636       operation.  Here are some special cases which use PCast only
   1637       twice rather than three times. */
   1638 
   1639    /* I32 x I64 x I64 -> I64 */
   1640    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1641    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1642        && finalVty == Ity_I64) {
   1643       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I64\n");
   1644       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1645          mode indication which is fully defined, this should get
   1646          folded out later. */
   1647       at = mkPCastTo(mce, Ity_I64, va1);
   1648       /* Now fold in 2nd and 3rd args. */
   1649       at = mkUifU(mce, Ity_I64, at, va2);
   1650       at = mkUifU(mce, Ity_I64, at, va3);
   1651       /* and PCast once again. */
   1652       at = mkPCastTo(mce, Ity_I64, at);
   1653       return at;
   1654    }
   1655 
   1656    /* I32 x I8 x I64 -> I64 */
   1657    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I64
   1658        && finalVty == Ity_I64) {
   1659       if (0) VG_(printf)("mkLazy3: I32 x I8 x I64 -> I64\n");
   1660       /* Widen 1st and 2nd args to I64.  Since 1st arg is typically a
   1661        * rounding mode indication which is fully defined, this should
   1662        * get folded out later.
   1663       */
   1664       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1665       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1666       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1667       at = mkUifU(mce, Ity_I64, at, va3);
   1668       /* and PCast once again. */
   1669       at = mkPCastTo(mce, Ity_I64, at);
   1670       return at;
   1671    }
   1672 
   1673    /* I32 x I64 x I64 -> I32 */
   1674    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64
   1675        && finalVty == Ity_I32) {
   1676       if (0) VG_(printf)("mkLazy3: I32 x I64 x I64 -> I32\n");
   1677       at = mkPCastTo(mce, Ity_I64, va1);
   1678       at = mkUifU(mce, Ity_I64, at, va2);
   1679       at = mkUifU(mce, Ity_I64, at, va3);
   1680       at = mkPCastTo(mce, Ity_I32, at);
   1681       return at;
   1682    }
   1683 
   1684    /* I32 x I32 x I32 -> I32 */
   1685    /* 32-bit FP idiom, as (eg) happens on ARM */
   1686    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32
   1687        && finalVty == Ity_I32) {
   1688       if (0) VG_(printf)("mkLazy3: I32 x I32 x I32 -> I32\n");
   1689       at = va1;
   1690       at = mkUifU(mce, Ity_I32, at, va2);
   1691       at = mkUifU(mce, Ity_I32, at, va3);
   1692       at = mkPCastTo(mce, Ity_I32, at);
   1693       return at;
   1694    }
   1695 
   1696    /* I32 x I128 x I128 -> I128 */
   1697    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1698    if (t1 == Ity_I32 && t2 == Ity_I128 && t3 == Ity_I128
   1699        && finalVty == Ity_I128) {
   1700       if (0) VG_(printf)("mkLazy3: I32 x I128 x I128 -> I128\n");
   1701       /* Widen 1st arg to I128.  Since 1st arg is typically a rounding
   1702          mode indication which is fully defined, this should get
   1703          folded out later. */
   1704       at = mkPCastTo(mce, Ity_I128, va1);
   1705       /* Now fold in 2nd and 3rd args. */
   1706       at = mkUifU(mce, Ity_I128, at, va2);
   1707       at = mkUifU(mce, Ity_I128, at, va3);
   1708       /* and PCast once again. */
   1709       at = mkPCastTo(mce, Ity_I128, at);
   1710       return at;
   1711    }
   1712 
   1713    /* I32 x I8 x I128 -> I128 */
   1714    /* Standard FP idiom: rm x FParg1 x FParg2 -> FPresult */
   1715    if (t1 == Ity_I32 && t2 == Ity_I8 && t3 == Ity_I128
   1716        && finalVty == Ity_I128) {
   1717       if (0) VG_(printf)("mkLazy3: I32 x I8 x I128 -> I128\n");
   1718       /* Use I64 as an intermediate type, which means PCasting all 3
   1719          args to I64 to start with. 1st arg is typically a rounding
   1720          mode indication which is fully defined, so we hope that it
   1721          will get folded out later. */
   1722       IRAtom* at1 = mkPCastTo(mce, Ity_I64, va1);
   1723       IRAtom* at2 = mkPCastTo(mce, Ity_I64, va2);
   1724       IRAtom* at3 = mkPCastTo(mce, Ity_I64, va3);
   1725       /* Now UifU all three together. */
   1726       at = mkUifU(mce, Ity_I64, at1, at2);  // UifU(PCast(va1), PCast(va2))
   1727       at = mkUifU(mce, Ity_I64, at, at3);   // ... `UifU` PCast(va3)
   1728       /* and PCast once again. */
   1729       at = mkPCastTo(mce, Ity_I128, at);
   1730       return at;
   1731    }
   1732    if (1) {
   1733       VG_(printf)("mkLazy3: ");
   1734       ppIRType(t1);
   1735       VG_(printf)(" x ");
   1736       ppIRType(t2);
   1737       VG_(printf)(" x ");
   1738       ppIRType(t3);
   1739       VG_(printf)(" -> ");
   1740       ppIRType(finalVty);
   1741       VG_(printf)("\n");
   1742    }
   1743 
   1744    tl_assert(0);
   1745    /* General case: force everything via 32-bit intermediaries. */
   1746    /*
   1747    at = mkPCastTo(mce, Ity_I32, va1);
   1748    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va2));
   1749    at = mkUifU(mce, Ity_I32, at, mkPCastTo(mce, Ity_I32, va3));
   1750    at = mkPCastTo(mce, finalVty, at);
   1751    return at;
   1752    */
   1753 }
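/* The typical client of the I32 x I64 x I64 -> I64 case above is a scalar
   FP triop such as Iop_AddF64(rm, x, y): va1 shadows the rounding mode,
   va2/va3 shadow the operands.  Because rm is almost always a literal
   (hence fully defined), its PCast is a constant zero and Vex's
   post-instrumentation folding removes it, leaving just

      PCastTo:I64( UifU64(x#, y#) )

   (a sketch; the actual temporaries are named by assignNew). */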
   1754 
   1755 
   1756 /* 4-arg version of the above. */
   1757 static
   1758 IRAtom* mkLazy4 ( MCEnv* mce, IRType finalVty,
   1759                   IRAtom* va1, IRAtom* va2, IRAtom* va3, IRAtom* va4 )
   1760 {
   1761    IRAtom* at;
   1762    IRType t1 = typeOfIRExpr(mce->sb->tyenv, va1);
   1763    IRType t2 = typeOfIRExpr(mce->sb->tyenv, va2);
   1764    IRType t3 = typeOfIRExpr(mce->sb->tyenv, va3);
   1765    IRType t4 = typeOfIRExpr(mce->sb->tyenv, va4);
   1766    tl_assert(isShadowAtom(mce,va1));
   1767    tl_assert(isShadowAtom(mce,va2));
   1768    tl_assert(isShadowAtom(mce,va3));
   1769    tl_assert(isShadowAtom(mce,va4));
   1770 
   1771    /* The general case is inefficient because PCast is an expensive
   1772       operation.  Here are some special cases which use PCast only
    1773       twice rather than four times. */
   1774 
   1775    /* I32 x I64 x I64 x I64 -> I64 */
   1776    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1777    if (t1 == Ity_I32 && t2 == Ity_I64 && t3 == Ity_I64 && t4 == Ity_I64
   1778        && finalVty == Ity_I64) {
   1779       if (0) VG_(printf)("mkLazy4: I32 x I64 x I64 x I64 -> I64\n");
   1780       /* Widen 1st arg to I64.  Since 1st arg is typically a rounding
   1781          mode indication which is fully defined, this should get
   1782          folded out later. */
   1783       at = mkPCastTo(mce, Ity_I64, va1);
   1784       /* Now fold in 2nd, 3rd, 4th args. */
   1785       at = mkUifU(mce, Ity_I64, at, va2);
   1786       at = mkUifU(mce, Ity_I64, at, va3);
   1787       at = mkUifU(mce, Ity_I64, at, va4);
   1788       /* and PCast once again. */
   1789       at = mkPCastTo(mce, Ity_I64, at);
   1790       return at;
   1791    }
   1792    /* I32 x I32 x I32 x I32 -> I32 */
   1793    /* Standard FP idiom: rm x FParg1 x FParg2 x FParg3 -> FPresult */
   1794    if (t1 == Ity_I32 && t2 == Ity_I32 && t3 == Ity_I32 && t4 == Ity_I32
   1795        && finalVty == Ity_I32) {
   1796       if (0) VG_(printf)("mkLazy4: I32 x I32 x I32 x I32 -> I32\n");
   1797       at = va1;
   1798       /* Now fold in 2nd, 3rd, 4th args. */
   1799       at = mkUifU(mce, Ity_I32, at, va2);
   1800       at = mkUifU(mce, Ity_I32, at, va3);
   1801       at = mkUifU(mce, Ity_I32, at, va4);
   1802       at = mkPCastTo(mce, Ity_I32, at);
   1803       return at;
   1804    }
   1805 
   1806    if (1) {
   1807       VG_(printf)("mkLazy4: ");
   1808       ppIRType(t1);
   1809       VG_(printf)(" x ");
   1810       ppIRType(t2);
   1811       VG_(printf)(" x ");
   1812       ppIRType(t3);
   1813       VG_(printf)(" x ");
   1814       ppIRType(t4);
   1815       VG_(printf)(" -> ");
   1816       ppIRType(finalVty);
   1817       VG_(printf)("\n");
   1818    }
   1819 
   1820    tl_assert(0);
   1821 }
   1822 
   1823 
   1824 /* Do the lazy propagation game from a null-terminated vector of
   1825    atoms.  This is presumably the arguments to a helper call, so the
   1826    IRCallee info is also supplied in order that we can know which
   1827    arguments should be ignored (via the .mcx_mask field).
   1828 */
   1829 static
   1830 IRAtom* mkLazyN ( MCEnv* mce,
   1831                   IRAtom** exprvec, IRType finalVtype, IRCallee* cee )
   1832 {
   1833    Int     i;
   1834    IRAtom* here;
   1835    IRAtom* curr;
   1836    IRType  mergeTy;
   1837    Bool    mergeTy64 = True;
   1838 
   1839    /* Decide on the type of the merge intermediary.  If all relevant
   1840       args are I64, then it's I64.  In all other circumstances, use
   1841       I32. */
   1842    for (i = 0; exprvec[i]; i++) {
   1843       tl_assert(i < 32);
   1844       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1845       if (cee->mcx_mask & (1<<i))
   1846          continue;
   1847       if (typeOfIRExpr(mce->sb->tyenv, exprvec[i]) != Ity_I64)
   1848          mergeTy64 = False;
   1849    }
   1850 
   1851    mergeTy = mergeTy64  ? Ity_I64  : Ity_I32;
   1852    curr    = definedOfType(mergeTy);
   1853 
   1854    for (i = 0; exprvec[i]; i++) {
   1855       tl_assert(i < 32);
   1856       tl_assert(isOriginalAtom(mce, exprvec[i]));
   1857       /* Only take notice of this arg if the callee's mc-exclusion
   1858          mask does not say it is to be excluded. */
   1859       if (cee->mcx_mask & (1<<i)) {
   1860          /* the arg is to be excluded from definedness checking.  Do
   1861             nothing. */
   1862          if (0) VG_(printf)("excluding %s(%d)\n", cee->name, i);
   1863       } else {
   1864          /* calculate the arg's definedness, and pessimistically merge
   1865             it in. */
   1866          here = mkPCastTo( mce, mergeTy, expr2vbits(mce, exprvec[i]) );
   1867          curr = mergeTy64
   1868                    ? mkUifU64(mce, here, curr)
   1869                    : mkUifU32(mce, here, curr);
   1870       }
   1871    }
   1872    return mkPCastTo(mce, finalVtype, curr );
   1873 }
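/* Worked example with a hypothetical helper: for a call h(a0, a1, a2)
   whose IRCallee has mcx_mask == 0x2 (bit 1 set), argument a1 is excluded
   from the check, so the returned V bits are

      PCastTo( finalVtype, UifU( PCast(a0#), PCast(a2#) ) )

   with the merge done at I64 if both a0 and a2 are I64-typed, and at I32
   otherwise. */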
   1874 
   1875 
   1876 /*------------------------------------------------------------*/
   1877 /*--- Generating expensive sequences for exact carry-chain ---*/
   1878 /*--- propagation in add/sub and related operations.       ---*/
   1879 /*------------------------------------------------------------*/
   1880 
   1881 static
   1882 IRAtom* expensiveAddSub ( MCEnv*  mce,
   1883                           Bool    add,
   1884                           IRType  ty,
   1885                           IRAtom* qaa, IRAtom* qbb,
   1886                           IRAtom* aa,  IRAtom* bb )
   1887 {
   1888    IRAtom *a_min, *b_min, *a_max, *b_max;
   1889    IROp   opAND, opOR, opXOR, opNOT, opADD, opSUB;
   1890 
   1891    tl_assert(isShadowAtom(mce,qaa));
   1892    tl_assert(isShadowAtom(mce,qbb));
   1893    tl_assert(isOriginalAtom(mce,aa));
   1894    tl_assert(isOriginalAtom(mce,bb));
   1895    tl_assert(sameKindedAtoms(qaa,aa));
   1896    tl_assert(sameKindedAtoms(qbb,bb));
   1897 
   1898    switch (ty) {
   1899       case Ity_I32:
   1900          opAND = Iop_And32;
   1901          opOR  = Iop_Or32;
   1902          opXOR = Iop_Xor32;
   1903          opNOT = Iop_Not32;
   1904          opADD = Iop_Add32;
   1905          opSUB = Iop_Sub32;
   1906          break;
   1907       case Ity_I64:
   1908          opAND = Iop_And64;
   1909          opOR  = Iop_Or64;
   1910          opXOR = Iop_Xor64;
   1911          opNOT = Iop_Not64;
   1912          opADD = Iop_Add64;
   1913          opSUB = Iop_Sub64;
   1914          break;
   1915       default:
   1916          VG_(tool_panic)("expensiveAddSub");
   1917    }
   1918 
   1919    // a_min = aa & ~qaa
   1920    a_min = assignNew('V', mce,ty,
   1921                      binop(opAND, aa,
   1922                                   assignNew('V', mce,ty, unop(opNOT, qaa))));
   1923 
   1924    // b_min = bb & ~qbb
   1925    b_min = assignNew('V', mce,ty,
   1926                      binop(opAND, bb,
   1927                                   assignNew('V', mce,ty, unop(opNOT, qbb))));
   1928 
   1929    // a_max = aa | qaa
   1930    a_max = assignNew('V', mce,ty, binop(opOR, aa, qaa));
   1931 
   1932    // b_max = bb | qbb
   1933    b_max = assignNew('V', mce,ty, binop(opOR, bb, qbb));
   1934 
   1935    if (add) {
   1936       // result = (qaa | qbb) | ((a_min + b_min) ^ (a_max + b_max))
   1937       return
   1938       assignNew('V', mce,ty,
   1939          binop( opOR,
   1940                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1941                 assignNew('V', mce,ty,
   1942                    binop( opXOR,
   1943                           assignNew('V', mce,ty, binop(opADD, a_min, b_min)),
   1944                           assignNew('V', mce,ty, binop(opADD, a_max, b_max))
   1945                    )
   1946                 )
   1947          )
   1948       );
   1949    } else {
    1950       // result = (qaa | qbb) | ((a_min - b_max) ^ (a_max - b_min))
   1951       return
   1952       assignNew('V', mce,ty,
   1953          binop( opOR,
   1954                 assignNew('V', mce,ty, binop(opOR, qaa, qbb)),
   1955                 assignNew('V', mce,ty,
   1956                    binop( opXOR,
   1957                           assignNew('V', mce,ty, binop(opSUB, a_min, b_max)),
   1958                           assignNew('V', mce,ty, binop(opSUB, a_max, b_min))
   1959                    )
   1960                 )
   1961          )
   1962       );
   1963    }
   1964 
   1965 }
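/* Worked example for the add case, using 8-bit values for brevity (the
   function itself only handles I32/I64, but the arithmetic is identical).
   Suppose aa = 0b00000101 with qaa = 0b00000010 (bit 1 undefined), and
   bb = 0b00000011 with qbb = 0 (fully defined).  Then

      a_min = aa & ~qaa = 0b00000101     a_max = aa | qaa = 0b00000111
      b_min = b_max     = 0b00000011

      a_min + b_min = 0b00001000
      a_max + b_max = 0b00001010
      xor           = 0b00000010        -- bits the sum could disagree on

   so the result V bits are (qaa | qbb) | xor = 0b00000010: only bit 1 of
   the sum is reported undefined, rather than the whole value as the cheap
   UifU-then-PCast scheme would report. */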
   1966 
   1967 
   1968 static
   1969 IRAtom* expensiveCountTrailingZeroes ( MCEnv* mce, IROp czop,
   1970                                        IRAtom* atom, IRAtom* vatom )
   1971 {
   1972    IRType ty;
   1973    IROp xorOp, subOp, andOp;
   1974    IRExpr *one;
   1975    IRAtom *improver, *improved;
   1976    tl_assert(isShadowAtom(mce,vatom));
   1977    tl_assert(isOriginalAtom(mce,atom));
   1978    tl_assert(sameKindedAtoms(atom,vatom));
   1979 
   1980    switch (czop) {
   1981       case Iop_Ctz32:
   1982          ty = Ity_I32;
   1983          xorOp = Iop_Xor32;
   1984          subOp = Iop_Sub32;
   1985          andOp = Iop_And32;
   1986          one = mkU32(1);
   1987          break;
   1988       case Iop_Ctz64:
   1989          ty = Ity_I64;
   1990          xorOp = Iop_Xor64;
   1991          subOp = Iop_Sub64;
   1992          andOp = Iop_And64;
   1993          one = mkU64(1);
   1994          break;
   1995       default:
   1996          ppIROp(czop);
   1997          VG_(tool_panic)("memcheck:expensiveCountTrailingZeroes");
   1998    }
   1999 
   2000    // improver = atom ^ (atom - 1)
   2001    //
    2002    // That is, improver has its low ctz(atom)+1 bits equal to one;
   2003    // higher bits (if any) equal to zero.
   2004    improver = assignNew('V', mce,ty,
   2005                         binop(xorOp,
   2006                               atom,
   2007                               assignNew('V', mce, ty,
   2008                                         binop(subOp, atom, one))));
   2009 
   2010    // improved = vatom & improver
   2011    //
    2012    // That is, treat any V bits above the lowest set bit of atom as
    2013    // "defined", since they cannot affect the result of the count.
   2014    improved = assignNew('V', mce, ty,
   2015                         binop(andOp, vatom, improver));
   2016 
   2017    // Return pessimizing cast of improved.
   2018    return mkPCastTo(mce, ty, improved);
   2019 }
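/* Worked example with a hypothetical 8-bit value (the function itself
   only handles Ctz32/Ctz64, but the bit manipulation is identical).
   Suppose atom = 0b01001000 and vatom = 0b11110000, i.e. the top four
   bits are undefined.  Then

      atom - 1 = 0b01000111
      improver = atom ^ (atom - 1) = 0b00001111
      improved = vatom & improver  = 0b00000000

   so the count is reported fully defined: the undefined bits all lie
   above the lowest set bit of atom and cannot change the result.  Had
   bit 3 itself been undefined, 'improved' would be nonzero and the final
   PCast would mark the whole result undefined. */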
   2020 
   2021 
   2022 /*------------------------------------------------------------*/
   2023 /*--- Scalar shifts.                                       ---*/
   2024 /*------------------------------------------------------------*/
   2025 
   2026 /* Produce an interpretation for (aa << bb) (or >>s, >>u).  The basic
   2027    idea is to shift the definedness bits by the original shift amount.
   2028    This introduces 0s ("defined") in new positions for left shifts and
   2029    unsigned right shifts, and copies the top definedness bit for
   2030    signed right shifts.  So, conveniently, applying the original shift
   2031    operator to the definedness bits for the left arg is exactly the
   2032    right thing to do:
   2033 
   2034       (qaa << bb)
   2035 
   2036    However if the shift amount is undefined then the whole result
   2037    is undefined.  Hence need:
   2038 
   2039       (qaa << bb) `UifU` PCast(qbb)
   2040 
    2041    If the shift amount bb is a literal then qbb will say 'all defined'
   2042    and the UifU and PCast will get folded out by post-instrumentation
   2043    optimisation.
   2044 */
   2045 static IRAtom* scalarShift ( MCEnv*  mce,
   2046                              IRType  ty,
   2047                              IROp    original_op,
   2048                              IRAtom* qaa, IRAtom* qbb,
   2049                              IRAtom* aa,  IRAtom* bb )
   2050 {
   2051    tl_assert(isShadowAtom(mce,qaa));
   2052    tl_assert(isShadowAtom(mce,qbb));
   2053    tl_assert(isOriginalAtom(mce,aa));
   2054    tl_assert(isOriginalAtom(mce,bb));
   2055    tl_assert(sameKindedAtoms(qaa,aa));
   2056    tl_assert(sameKindedAtoms(qbb,bb));
   2057    return
   2058       assignNew(
   2059          'V', mce, ty,
   2060          mkUifU( mce, ty,
   2061                  assignNew('V', mce, ty, binop(original_op, qaa, bb)),
   2062                  mkPCastTo(mce, ty, qbb)
   2063          )
   2064    );
   2065 }
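/* Concrete instance (temporary names illustrative only): for Iop_Shl32
   with a variable shift amount bb, the function above produces

      result# = UifU32( Shl32(qaa, bb), PCastTo:I32(qbb) )

   If bb is instead a literal, qbb is the constant "all defined", the
   PCast folds to zero and the UifU folds away, leaving just
   Shl32(qaa, bb) -- the shifted V bits of the left operand. */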
   2066 
   2067 
   2068 /*------------------------------------------------------------*/
   2069 /*--- Helpers for dealing with vector primops.             ---*/
   2070 /*------------------------------------------------------------*/
   2071 
   2072 /* Vector pessimisation -- pessimise within each lane individually. */
   2073 
   2074 static IRAtom* mkPCast8x16 ( MCEnv* mce, IRAtom* at )
   2075 {
   2076    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ8x16, at));
   2077 }
   2078 
   2079 static IRAtom* mkPCast16x8 ( MCEnv* mce, IRAtom* at )
   2080 {
   2081    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ16x8, at));
   2082 }
   2083 
   2084 static IRAtom* mkPCast32x4 ( MCEnv* mce, IRAtom* at )
   2085 {
   2086    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ32x4, at));
   2087 }
   2088 
   2089 static IRAtom* mkPCast64x2 ( MCEnv* mce, IRAtom* at )
   2090 {
   2091    return assignNew('V', mce, Ity_V128, unop(Iop_CmpNEZ64x2, at));
   2092 }
   2093 
   2094 static IRAtom* mkPCast64x4 ( MCEnv* mce, IRAtom* at )
   2095 {
   2096    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ64x4, at));
   2097 }
   2098 
   2099 static IRAtom* mkPCast32x8 ( MCEnv* mce, IRAtom* at )
   2100 {
   2101    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ32x8, at));
   2102 }
   2103 
   2104 static IRAtom* mkPCast32x2 ( MCEnv* mce, IRAtom* at )
   2105 {
   2106    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ32x2, at));
   2107 }
   2108 
   2109 static IRAtom* mkPCast16x16 ( MCEnv* mce, IRAtom* at )
   2110 {
   2111    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ16x16, at));
   2112 }
   2113 
   2114 static IRAtom* mkPCast16x4 ( MCEnv* mce, IRAtom* at )
   2115 {
   2116    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ16x4, at));
   2117 }
   2118 
   2119 static IRAtom* mkPCast8x32 ( MCEnv* mce, IRAtom* at )
   2120 {
   2121    return assignNew('V', mce, Ity_V256, unop(Iop_CmpNEZ8x32, at));
   2122 }
   2123 
   2124 static IRAtom* mkPCast8x8 ( MCEnv* mce, IRAtom* at )
   2125 {
   2126    return assignNew('V', mce, Ity_I64, unop(Iop_CmpNEZ8x8, at));
   2127 }
   2128 
   2129 static IRAtom* mkPCast16x2 ( MCEnv* mce, IRAtom* at )
   2130 {
   2131    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ16x2, at));
   2132 }
   2133 
   2134 static IRAtom* mkPCast8x4 ( MCEnv* mce, IRAtom* at )
   2135 {
   2136    return assignNew('V', mce, Ity_I32, unop(Iop_CmpNEZ8x4, at));
   2137 }
   2138 
   2139 
   2140 /* Here's a simple scheme capable of handling ops derived from SSE1
    2141    code, while only generating ops that can be efficiently
   2142    implemented in SSE1. */
   2143 
   2144 /* All-lanes versions are straightforward:
   2145 
   2146    binary32Fx4(x,y)   ==> PCast32x4(UifUV128(x#,y#))
   2147 
    2148    unary32Fx4(x)      ==> PCast32x4(x#)
   2149 
   2150    Lowest-lane-only versions are more complex:
   2151 
   2152    binary32F0x4(x,y)  ==> SetV128lo32(
   2153                              x#,
   2154                              PCast32(V128to32(UifUV128(x#,y#)))
   2155                           )
   2156 
   2157    This is perhaps not so obvious.  In particular, it's faster to
   2158    do a V128-bit UifU and then take the bottom 32 bits than the more
   2159    obvious scheme of taking the bottom 32 bits of each operand
   2160    and doing a 32-bit UifU.  Basically since UifU is fast and
   2161    chopping lanes off vector values is slow.
   2162 
   2163    Finally:
   2164 
   2165    unary32F0x4(x)     ==> SetV128lo32(
   2166                              x#,
   2167                              PCast32(V128to32(x#))
   2168                           )
   2169 
   2170    Where:
   2171 
   2172    PCast32(v#)   = 1Sto32(CmpNE32(v#,0))
   2173    PCast32x4(v#) = CmpNEZ32x4(v#)
   2174 */
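/* As a concrete instance of the lowest-lane-only rule, Iop_Add32F0x4
   (add the lowest lanes, copy lanes 1..3 from the first operand) is
   instrumented by binary32F0x4 below as, roughly,

      t1 = UifUV128(x#, y#)
      t2 = V128to32(t1)
      t3 = PCast32(t2)
      r# = SetV128lo32(x#, t3)

   so lanes 1..3 of the result take their V bits directly from x#,
   matching the fact that the operation passes those lanes through
   unchanged.  (Temporary names here are illustrative only.) */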
   2175 
   2176 static
   2177 IRAtom* binary32Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2178 {
   2179    IRAtom* at;
   2180    tl_assert(isShadowAtom(mce, vatomX));
   2181    tl_assert(isShadowAtom(mce, vatomY));
   2182    at = mkUifUV128(mce, vatomX, vatomY);
   2183    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, at));
   2184    return at;
   2185 }
   2186 
   2187 static
   2188 IRAtom* unary32Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2189 {
   2190    IRAtom* at;
   2191    tl_assert(isShadowAtom(mce, vatomX));
   2192    at = assignNew('V', mce, Ity_V128, mkPCast32x4(mce, vatomX));
   2193    return at;
   2194 }
   2195 
   2196 static
   2197 IRAtom* binary32F0x4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2198 {
   2199    IRAtom* at;
   2200    tl_assert(isShadowAtom(mce, vatomX));
   2201    tl_assert(isShadowAtom(mce, vatomY));
   2202    at = mkUifUV128(mce, vatomX, vatomY);
   2203    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, at));
   2204    at = mkPCastTo(mce, Ity_I32, at);
   2205    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2206    return at;
   2207 }
   2208 
   2209 static
   2210 IRAtom* unary32F0x4 ( MCEnv* mce, IRAtom* vatomX )
   2211 {
   2212    IRAtom* at;
   2213    tl_assert(isShadowAtom(mce, vatomX));
   2214    at = assignNew('V', mce, Ity_I32, unop(Iop_V128to32, vatomX));
   2215    at = mkPCastTo(mce, Ity_I32, at);
   2216    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo32, vatomX, at));
   2217    return at;
   2218 }
   2219 
   2220 /* --- ... and ... 64Fx2 versions of the same ... --- */
   2221 
   2222 static
   2223 IRAtom* binary64Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2224 {
   2225    IRAtom* at;
   2226    tl_assert(isShadowAtom(mce, vatomX));
   2227    tl_assert(isShadowAtom(mce, vatomY));
   2228    at = mkUifUV128(mce, vatomX, vatomY);
   2229    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, at));
   2230    return at;
   2231 }
   2232 
   2233 static
   2234 IRAtom* unary64Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2235 {
   2236    IRAtom* at;
   2237    tl_assert(isShadowAtom(mce, vatomX));
   2238    at = assignNew('V', mce, Ity_V128, mkPCast64x2(mce, vatomX));
   2239    return at;
   2240 }
   2241 
   2242 static
   2243 IRAtom* binary64F0x2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2244 {
   2245    IRAtom* at;
   2246    tl_assert(isShadowAtom(mce, vatomX));
   2247    tl_assert(isShadowAtom(mce, vatomY));
   2248    at = mkUifUV128(mce, vatomX, vatomY);
   2249    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, at));
   2250    at = mkPCastTo(mce, Ity_I64, at);
   2251    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2252    return at;
   2253 }
   2254 
   2255 static
   2256 IRAtom* unary64F0x2 ( MCEnv* mce, IRAtom* vatomX )
   2257 {
   2258    IRAtom* at;
   2259    tl_assert(isShadowAtom(mce, vatomX));
   2260    at = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vatomX));
   2261    at = mkPCastTo(mce, Ity_I64, at);
   2262    at = assignNew('V', mce, Ity_V128, binop(Iop_SetV128lo64, vatomX, at));
   2263    return at;
   2264 }
   2265 
   2266 /* --- --- ... and ... 32Fx2 versions of the same --- --- */
   2267 
   2268 static
   2269 IRAtom* binary32Fx2 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2270 {
   2271    IRAtom* at;
   2272    tl_assert(isShadowAtom(mce, vatomX));
   2273    tl_assert(isShadowAtom(mce, vatomY));
   2274    at = mkUifU64(mce, vatomX, vatomY);
   2275    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, at));
   2276    return at;
   2277 }
   2278 
   2279 static
   2280 IRAtom* unary32Fx2 ( MCEnv* mce, IRAtom* vatomX )
   2281 {
   2282    IRAtom* at;
   2283    tl_assert(isShadowAtom(mce, vatomX));
   2284    at = assignNew('V', mce, Ity_I64, mkPCast32x2(mce, vatomX));
   2285    return at;
   2286 }
   2287 
   2288 /* --- ... and ... 64Fx4 versions of the same ... --- */
   2289 
   2290 static
   2291 IRAtom* binary64Fx4 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2292 {
   2293    IRAtom* at;
   2294    tl_assert(isShadowAtom(mce, vatomX));
   2295    tl_assert(isShadowAtom(mce, vatomY));
   2296    at = mkUifUV256(mce, vatomX, vatomY);
   2297    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, at));
   2298    return at;
   2299 }
   2300 
   2301 static
   2302 IRAtom* unary64Fx4 ( MCEnv* mce, IRAtom* vatomX )
   2303 {
   2304    IRAtom* at;
   2305    tl_assert(isShadowAtom(mce, vatomX));
   2306    at = assignNew('V', mce, Ity_V256, mkPCast64x4(mce, vatomX));
   2307    return at;
   2308 }
   2309 
   2310 /* --- ... and ... 32Fx8 versions of the same ... --- */
   2311 
   2312 static
   2313 IRAtom* binary32Fx8 ( MCEnv* mce, IRAtom* vatomX, IRAtom* vatomY )
   2314 {
   2315    IRAtom* at;
   2316    tl_assert(isShadowAtom(mce, vatomX));
   2317    tl_assert(isShadowAtom(mce, vatomY));
   2318    at = mkUifUV256(mce, vatomX, vatomY);
   2319    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, at));
   2320    return at;
   2321 }
   2322 
   2323 static
   2324 IRAtom* unary32Fx8 ( MCEnv* mce, IRAtom* vatomX )
   2325 {
   2326    IRAtom* at;
   2327    tl_assert(isShadowAtom(mce, vatomX));
   2328    at = assignNew('V', mce, Ity_V256, mkPCast32x8(mce, vatomX));
   2329    return at;
   2330 }
   2331 
   2332 /* --- 64Fx2 binary FP ops, with rounding mode --- */
   2333 
   2334 static
   2335 IRAtom* binary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM,
   2336                                        IRAtom* vatomX, IRAtom* vatomY )
   2337 {
   2338    /* This is the same as binary64Fx2, except that we subsequently
   2339       pessimise vRM (definedness of the rounding mode), widen to 128
   2340       bits and UifU it into the result.  As with the scalar cases, if
   2341       the RM is a constant then it is defined and so this extra bit
   2342       will get constant-folded out later. */
   2343    // "do" the vector args
   2344    IRAtom* t1 = binary64Fx2(mce, vatomX, vatomY);
   2345    // PCast the RM, and widen it to 128 bits
   2346    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2347    // Roll it into the result
   2348    t1 = mkUifUV128(mce, t1, t2);
   2349    return t1;
   2350 }
   2351 
   2352 /* --- ... and ... 32Fx4 versions of the same --- */
   2353 
   2354 static
   2355 IRAtom* binary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2356                                        IRAtom* vatomX, IRAtom* vatomY )
   2357 {
   2358    IRAtom* t1 = binary32Fx4(mce, vatomX, vatomY);
   2359    // PCast the RM, and widen it to 128 bits
   2360    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2361    // Roll it into the result
   2362    t1 = mkUifUV128(mce, t1, t2);
   2363    return t1;
   2364 }
   2365 
   2366 /* --- ... and ... 64Fx4 versions of the same --- */
   2367 
   2368 static
   2369 IRAtom* binary64Fx4_w_rm ( MCEnv* mce, IRAtom* vRM,
   2370                                        IRAtom* vatomX, IRAtom* vatomY )
   2371 {
   2372    IRAtom* t1 = binary64Fx4(mce, vatomX, vatomY);
   2373    // PCast the RM, and widen it to 256 bits
   2374    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2375    // Roll it into the result
   2376    t1 = mkUifUV256(mce, t1, t2);
   2377    return t1;
   2378 }
   2379 
   2380 /* --- ... and ... 32Fx8 versions of the same --- */
   2381 
   2382 static
   2383 IRAtom* binary32Fx8_w_rm ( MCEnv* mce, IRAtom* vRM,
   2384                                        IRAtom* vatomX, IRAtom* vatomY )
   2385 {
   2386    IRAtom* t1 = binary32Fx8(mce, vatomX, vatomY);
   2387    // PCast the RM, and widen it to 256 bits
   2388    IRAtom* t2 = mkPCastTo(mce, Ity_V256, vRM);
   2389    // Roll it into the result
   2390    t1 = mkUifUV256(mce, t1, t2);
   2391    return t1;
   2392 }
   2393 
   2394 /* --- 64Fx2 unary FP ops, with rounding mode --- */
   2395 
   2396 static
   2397 IRAtom* unary64Fx2_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
   2398 {
   2399    /* Same scheme as binary64Fx2_w_rm. */
   2400    // "do" the vector arg
   2401    IRAtom* t1 = unary64Fx2(mce, vatomX);
   2402    // PCast the RM, and widen it to 128 bits
   2403    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2404    // Roll it into the result
   2405    t1 = mkUifUV128(mce, t1, t2);
   2406    return t1;
   2407 }
   2408 
   2409 /* --- ... and ... 32Fx4 versions of the same --- */
   2410 
   2411 static
   2412 IRAtom* unary32Fx4_w_rm ( MCEnv* mce, IRAtom* vRM, IRAtom* vatomX )
   2413 {
    2414    /* Same scheme as unary64Fx2_w_rm. */
   2415    IRAtom* t1 = unary32Fx4(mce, vatomX);
   2416    // PCast the RM, and widen it to 128 bits
   2417    IRAtom* t2 = mkPCastTo(mce, Ity_V128, vRM);
   2418    // Roll it into the result
   2419    t1 = mkUifUV128(mce, t1, t2);
   2420    return t1;
   2421 }
   2422 
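/* Illustrative expansion (a sketch; temporary names are hypothetical):
   for one of the 2-lane 64-bit FP ops that carries an explicit rounding
   mode argument, binary64Fx2_w_rm above produces

      t1 = CmpNEZ64x2( UifUV128(x#, y#) )      -- binary64Fx2
      t2 = PCastTo:V128(rm#)                   -- all-0s or all-1s
      r# = UifUV128(t1, t2)

   With a literal rounding mode, rm# is the defined constant, t2 folds to
   zero and r# reduces to t1. */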
   2423 
   2424 /* --- --- Vector saturated narrowing --- --- */
   2425 
   2426 /* We used to do something very clever here, but on closer inspection
   2427    (2011-Jun-15), and in particular bug #279698, it turns out to be
   2428    wrong.  Part of the problem came from the fact that for a long
   2429    time, the IR primops to do with saturated narrowing were
   2430    underspecified and managed to confuse multiple cases which needed
   2431    to be separate: the op names had a signedness qualifier, but in
   2432    fact the source and destination signednesses needed to be specified
   2433    independently, so the op names really need two independent
   2434    signedness specifiers.
   2435 
   2436    As of 2011-Jun-15 (ish) the underspecification was sorted out
   2437    properly.  The incorrect instrumentation remained, though.  That
   2438    has now (2011-Oct-22) been fixed.
   2439 
   2440    What we now do is simple:
   2441 
   2442    Let the original narrowing op be QNarrowBinXtoYxZ, where Z is a
   2443    number of lanes, X is the source lane width and signedness, and Y
   2444    is the destination lane width and signedness.  In all cases the
   2445    destination lane width is half the source lane width, so the names
   2446    have a bit of redundancy, but are at least easy to read.
   2447 
   2448    For example, Iop_QNarrowBin32Sto16Ux8 narrows 8 lanes of signed 32s
   2449    to unsigned 16s.
   2450 
   2451    Let Vanilla(OP) be a function that takes OP, one of these
   2452    saturating narrowing ops, and produces the same "shaped" narrowing
   2453    op which is not saturating, but merely dumps the most significant
   2454    bits.  "same shape" means that the lane numbers and widths are the
   2455    same as with OP.
   2456 
   2457    For example, Vanilla(Iop_QNarrowBin32Sto16Ux8)
   2458                   = Iop_NarrowBin32to16x8,
   2459    that is, narrow 8 lanes of 32 bits to 8 lanes of 16 bits, by
   2460    dumping the top half of each lane.
   2461 
    2462    So, with that in place, the scheme is simple: pessimise each lane
    2463    individually and then apply Vanilla(OP) so as
   2464    to get the result in the right "shape".  If the original OP is
   2465    QNarrowBinXtoYxZ then we produce
   2466 
   2467    Vanilla(OP)( PCast-X-to-X-x-Z(vatom1), PCast-X-to-X-x-Z(vatom2) )
   2468 
   2469    or for the case when OP is unary (Iop_QNarrowUn*)
   2470 
   2471    Vanilla(OP)( PCast-X-to-X-x-Z(vatom) )
   2472 */
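/* For example, with OP = Iop_QNarrowBin32Sto16Ux8 the instrumentation
   generated by vectorNarrowBinV128 below is

      NarrowBin32to16x8( CmpNEZ32x4(vatom1), CmpNEZ32x4(vatom2) )

   Each 32-bit source lane is first pessimised to all-0s or all-1s, and
   the vanilla narrow then keeps the (identical) low half of each lane,
   giving an all-0s or all-1s verdict per 16-bit result lane. */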
   2473 static
   2474 IROp vanillaNarrowingOpOfShape ( IROp qnarrowOp )
   2475 {
   2476    switch (qnarrowOp) {
   2477       /* Binary: (128, 128) -> 128 */
   2478       case Iop_QNarrowBin16Sto8Ux16:
   2479       case Iop_QNarrowBin16Sto8Sx16:
   2480       case Iop_QNarrowBin16Uto8Ux16:
   2481       case Iop_QNarrowBin64Sto32Sx4:
   2482       case Iop_QNarrowBin64Uto32Ux4:
   2483          return Iop_NarrowBin16to8x16;
   2484       case Iop_QNarrowBin32Sto16Ux8:
   2485       case Iop_QNarrowBin32Sto16Sx8:
   2486       case Iop_QNarrowBin32Uto16Ux8:
   2487          return Iop_NarrowBin32to16x8;
   2488       /* Binary: (64, 64) -> 64 */
   2489       case Iop_QNarrowBin32Sto16Sx4:
   2490          return Iop_NarrowBin32to16x4;
   2491       case Iop_QNarrowBin16Sto8Ux8:
   2492       case Iop_QNarrowBin16Sto8Sx8:
   2493          return Iop_NarrowBin16to8x8;
   2494       /* Unary: 128 -> 64 */
   2495       case Iop_QNarrowUn64Uto32Ux2:
   2496       case Iop_QNarrowUn64Sto32Sx2:
   2497       case Iop_QNarrowUn64Sto32Ux2:
   2498          return Iop_NarrowUn64to32x2;
   2499       case Iop_QNarrowUn32Uto16Ux4:
   2500       case Iop_QNarrowUn32Sto16Sx4:
   2501       case Iop_QNarrowUn32Sto16Ux4:
   2502          return Iop_NarrowUn32to16x4;
   2503       case Iop_QNarrowUn16Uto8Ux8:
   2504       case Iop_QNarrowUn16Sto8Sx8:
   2505       case Iop_QNarrowUn16Sto8Ux8:
   2506          return Iop_NarrowUn16to8x8;
   2507       default:
   2508          ppIROp(qnarrowOp);
    2509          VG_(tool_panic)("vanillaNarrowingOpOfShape");
   2510    }
   2511 }
   2512 
   2513 static
   2514 IRAtom* vectorNarrowBinV128 ( MCEnv* mce, IROp narrow_op,
   2515                               IRAtom* vatom1, IRAtom* vatom2)
   2516 {
   2517    IRAtom *at1, *at2, *at3;
   2518    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2519    switch (narrow_op) {
   2520       case Iop_QNarrowBin64Sto32Sx4: pcast = mkPCast32x4; break;
   2521       case Iop_QNarrowBin64Uto32Ux4: pcast = mkPCast32x4; break;
   2522       case Iop_QNarrowBin32Sto16Sx8: pcast = mkPCast32x4; break;
   2523       case Iop_QNarrowBin32Uto16Ux8: pcast = mkPCast32x4; break;
   2524       case Iop_QNarrowBin32Sto16Ux8: pcast = mkPCast32x4; break;
   2525       case Iop_QNarrowBin16Sto8Sx16: pcast = mkPCast16x8; break;
   2526       case Iop_QNarrowBin16Uto8Ux16: pcast = mkPCast16x8; break;
   2527       case Iop_QNarrowBin16Sto8Ux16: pcast = mkPCast16x8; break;
   2528       default: VG_(tool_panic)("vectorNarrowBinV128");
   2529    }
   2530    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2531    tl_assert(isShadowAtom(mce,vatom1));
   2532    tl_assert(isShadowAtom(mce,vatom2));
   2533    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2534    at2 = assignNew('V', mce, Ity_V128, pcast(mce, vatom2));
   2535    at3 = assignNew('V', mce, Ity_V128, binop(vanilla_narrow, at1, at2));
   2536    return at3;
   2537 }
   2538 
   2539 static
   2540 IRAtom* vectorNarrowBin64 ( MCEnv* mce, IROp narrow_op,
   2541                             IRAtom* vatom1, IRAtom* vatom2)
   2542 {
   2543    IRAtom *at1, *at2, *at3;
   2544    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2545    switch (narrow_op) {
   2546       case Iop_QNarrowBin32Sto16Sx4: pcast = mkPCast32x2; break;
   2547       case Iop_QNarrowBin16Sto8Sx8:  pcast = mkPCast16x4; break;
   2548       case Iop_QNarrowBin16Sto8Ux8:  pcast = mkPCast16x4; break;
   2549       default: VG_(tool_panic)("vectorNarrowBin64");
   2550    }
   2551    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2552    tl_assert(isShadowAtom(mce,vatom1));
   2553    tl_assert(isShadowAtom(mce,vatom2));
   2554    at1 = assignNew('V', mce, Ity_I64, pcast(mce, vatom1));
   2555    at2 = assignNew('V', mce, Ity_I64, pcast(mce, vatom2));
   2556    at3 = assignNew('V', mce, Ity_I64, binop(vanilla_narrow, at1, at2));
   2557    return at3;
   2558 }
   2559 
   2560 static
   2561 IRAtom* vectorNarrowUnV128 ( MCEnv* mce, IROp narrow_op,
   2562                              IRAtom* vatom1)
   2563 {
   2564    IRAtom *at1, *at2;
   2565    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2566    tl_assert(isShadowAtom(mce,vatom1));
   2567    /* For vanilla narrowing (non-saturating), we can just apply
   2568       the op directly to the V bits. */
   2569    switch (narrow_op) {
   2570       case Iop_NarrowUn16to8x8:
   2571       case Iop_NarrowUn32to16x4:
   2572       case Iop_NarrowUn64to32x2:
   2573          at1 = assignNew('V', mce, Ity_I64, unop(narrow_op, vatom1));
   2574          return at1;
   2575       default:
   2576          break; /* Do Plan B */
   2577    }
   2578    /* Plan B: for ops that involve a saturation operation on the args,
   2579       we must PCast before the vanilla narrow. */
   2580    switch (narrow_op) {
   2581       case Iop_QNarrowUn16Sto8Sx8:  pcast = mkPCast16x8; break;
   2582       case Iop_QNarrowUn16Sto8Ux8:  pcast = mkPCast16x8; break;
   2583       case Iop_QNarrowUn16Uto8Ux8:  pcast = mkPCast16x8; break;
   2584       case Iop_QNarrowUn32Sto16Sx4: pcast = mkPCast32x4; break;
   2585       case Iop_QNarrowUn32Sto16Ux4: pcast = mkPCast32x4; break;
   2586       case Iop_QNarrowUn32Uto16Ux4: pcast = mkPCast32x4; break;
   2587       case Iop_QNarrowUn64Sto32Sx2: pcast = mkPCast64x2; break;
   2588       case Iop_QNarrowUn64Sto32Ux2: pcast = mkPCast64x2; break;
   2589       case Iop_QNarrowUn64Uto32Ux2: pcast = mkPCast64x2; break;
   2590       default: VG_(tool_panic)("vectorNarrowUnV128");
   2591    }
   2592    IROp vanilla_narrow = vanillaNarrowingOpOfShape(narrow_op);
   2593    at1 = assignNew('V', mce, Ity_V128, pcast(mce, vatom1));
   2594    at2 = assignNew('V', mce, Ity_I64, unop(vanilla_narrow, at1));
   2595    return at2;
   2596 }
   2597 
   2598 static
   2599 IRAtom* vectorWidenI64 ( MCEnv* mce, IROp longen_op,
   2600                          IRAtom* vatom1)
   2601 {
   2602    IRAtom *at1, *at2;
   2603    IRAtom* (*pcast)( MCEnv*, IRAtom* );
   2604    switch (longen_op) {
   2605       case Iop_Widen8Uto16x8:  pcast = mkPCast16x8; break;
   2606       case Iop_Widen8Sto16x8:  pcast = mkPCast16x8; break;
   2607       case Iop_Widen16Uto32x4: pcast = mkPCast32x4; break;
   2608       case Iop_Widen16Sto32x4: pcast = mkPCast32x4; break;
   2609       case Iop_Widen32Uto64x2: pcast = mkPCast64x2; break;
   2610       case Iop_Widen32Sto64x2: pcast = mkPCast64x2; break;
   2611       default: VG_(tool_panic)("vectorWidenI64");
   2612    }
   2613    tl_assert(isShadowAtom(mce,vatom1));
   2614    at1 = assignNew('V', mce, Ity_V128, unop(longen_op, vatom1));
   2615    at2 = assignNew('V', mce, Ity_V128, pcast(mce, at1));
   2616    return at2;
   2617 }
   2618 
   2619 
   2620 /* --- --- Vector integer arithmetic --- --- */
   2621 
   2622 /* Simple ... UifU the args and per-lane pessimise the results. */
   2623 
   2624 /* --- V256-bit versions --- */
   2625 
   2626 static
   2627 IRAtom* binary8Ix32 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2628 {
   2629    IRAtom* at;
   2630    at = mkUifUV256(mce, vatom1, vatom2);
   2631    at = mkPCast8x32(mce, at);
   2632    return at;
   2633 }
   2634 
   2635 static
   2636 IRAtom* binary16Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2637 {
   2638    IRAtom* at;
   2639    at = mkUifUV256(mce, vatom1, vatom2);
   2640    at = mkPCast16x16(mce, at);
   2641    return at;
   2642 }
   2643 
   2644 static
   2645 IRAtom* binary32Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2646 {
   2647    IRAtom* at;
   2648    at = mkUifUV256(mce, vatom1, vatom2);
   2649    at = mkPCast32x8(mce, at);
   2650    return at;
   2651 }
   2652 
   2653 static
   2654 IRAtom* binary64Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2655 {
   2656    IRAtom* at;
   2657    at = mkUifUV256(mce, vatom1, vatom2);
   2658    at = mkPCast64x4(mce, at);
   2659    return at;
   2660 }
   2661 
   2662 /* --- V128-bit versions --- */
   2663 
   2664 static
   2665 IRAtom* binary8Ix16 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2666 {
   2667    IRAtom* at;
   2668    at = mkUifUV128(mce, vatom1, vatom2);
   2669    at = mkPCast8x16(mce, at);
   2670    return at;
   2671 }
   2672 
   2673 static
   2674 IRAtom* binary16Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2675 {
   2676    IRAtom* at;
   2677    at = mkUifUV128(mce, vatom1, vatom2);
   2678    at = mkPCast16x8(mce, at);
   2679    return at;
   2680 }
   2681 
   2682 static
   2683 IRAtom* binary32Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2684 {
   2685    IRAtom* at;
   2686    at = mkUifUV128(mce, vatom1, vatom2);
   2687    at = mkPCast32x4(mce, at);
   2688    return at;
   2689 }
   2690 
   2691 static
   2692 IRAtom* binary64Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2693 {
   2694    IRAtom* at;
   2695    at = mkUifUV128(mce, vatom1, vatom2);
   2696    at = mkPCast64x2(mce, at);
   2697    return at;
   2698 }
   2699 
   2700 /* --- 64-bit versions --- */
   2701 
   2702 static
   2703 IRAtom* binary8Ix8 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2704 {
   2705    IRAtom* at;
   2706    at = mkUifU64(mce, vatom1, vatom2);
   2707    at = mkPCast8x8(mce, at);
   2708    return at;
   2709 }
   2710 
   2711 static
   2712 IRAtom* binary16Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2713 {
   2714    IRAtom* at;
   2715    at = mkUifU64(mce, vatom1, vatom2);
   2716    at = mkPCast16x4(mce, at);
   2717    return at;
   2718 }
   2719 
   2720 static
   2721 IRAtom* binary32Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2722 {
   2723    IRAtom* at;
   2724    at = mkUifU64(mce, vatom1, vatom2);
   2725    at = mkPCast32x2(mce, at);
   2726    return at;
   2727 }
   2728 
   2729 static
   2730 IRAtom* binary64Ix1 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2731 {
   2732    IRAtom* at;
   2733    at = mkUifU64(mce, vatom1, vatom2);
   2734    at = mkPCastTo(mce, Ity_I64, at);
   2735    return at;
   2736 }
   2737 
   2738 /* --- 32-bit versions --- */
   2739 
   2740 static
   2741 IRAtom* binary8Ix4 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2742 {
   2743    IRAtom* at;
   2744    at = mkUifU32(mce, vatom1, vatom2);
   2745    at = mkPCast8x4(mce, at);
   2746    return at;
   2747 }
   2748 
   2749 static
   2750 IRAtom* binary16Ix2 ( MCEnv* mce, IRAtom* vatom1, IRAtom* vatom2 )
   2751 {
   2752    IRAtom* at;
   2753    at = mkUifU32(mce, vatom1, vatom2);
   2754    at = mkPCast16x2(mce, at);
   2755    return at;
   2756 }
   2757 
   2758 
   2759 /*------------------------------------------------------------*/
   2760 /*--- Generate shadow values from all kinds of IRExprs.    ---*/
   2761 /*------------------------------------------------------------*/
   2762 
   2763 static
   2764 IRAtom* expr2vbits_Qop ( MCEnv* mce,
   2765                          IROp op,
   2766                          IRAtom* atom1, IRAtom* atom2,
   2767                          IRAtom* atom3, IRAtom* atom4 )
   2768 {
   2769    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2770    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2771    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2772    IRAtom* vatom4 = expr2vbits( mce, atom4 );
   2773 
   2774    tl_assert(isOriginalAtom(mce,atom1));
   2775    tl_assert(isOriginalAtom(mce,atom2));
   2776    tl_assert(isOriginalAtom(mce,atom3));
   2777    tl_assert(isOriginalAtom(mce,atom4));
   2778    tl_assert(isShadowAtom(mce,vatom1));
   2779    tl_assert(isShadowAtom(mce,vatom2));
   2780    tl_assert(isShadowAtom(mce,vatom3));
   2781    tl_assert(isShadowAtom(mce,vatom4));
   2782    tl_assert(sameKindedAtoms(atom1,vatom1));
   2783    tl_assert(sameKindedAtoms(atom2,vatom2));
   2784    tl_assert(sameKindedAtoms(atom3,vatom3));
   2785    tl_assert(sameKindedAtoms(atom4,vatom4));
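           /* The mkLazy* helpers below implement the pessimistic "lazy"
              scheme: the result shadow, at the stated type, is fully defined
              only if every argument shadow is fully defined, and is
              all-undefined otherwise. */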
   2786    switch (op) {
   2787       case Iop_MAddF64:
   2788       case Iop_MAddF64r32:
   2789       case Iop_MSubF64:
   2790       case Iop_MSubF64r32:
   2791          /* I32(rm) x F64 x F64 x F64 -> F64 */
   2792          return mkLazy4(mce, Ity_I64, vatom1, vatom2, vatom3, vatom4);
   2793 
   2794       case Iop_MAddF32:
   2795       case Iop_MSubF32:
   2796          /* I32(rm) x F32 x F32 x F32 -> F32 */
   2797          return mkLazy4(mce, Ity_I32, vatom1, vatom2, vatom3, vatom4);
   2798 
   2799       /* V256-bit data-steering */
   2800       case Iop_64x4toV256:
   2801          return assignNew('V', mce, Ity_V256,
   2802                           IRExpr_Qop(op, vatom1, vatom2, vatom3, vatom4));
   2803 
   2804       default:
   2805          ppIROp(op);
   2806          VG_(tool_panic)("memcheck:expr2vbits_Qop");
   2807    }
   2808 }
   2809 
   2810 
   2811 static
   2812 IRAtom* expr2vbits_Triop ( MCEnv* mce,
   2813                            IROp op,
   2814                            IRAtom* atom1, IRAtom* atom2, IRAtom* atom3 )
   2815 {
   2816    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2817    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2818    IRAtom* vatom3 = expr2vbits( mce, atom3 );
   2819 
   2820    tl_assert(isOriginalAtom(mce,atom1));
   2821    tl_assert(isOriginalAtom(mce,atom2));
   2822    tl_assert(isOriginalAtom(mce,atom3));
   2823    tl_assert(isShadowAtom(mce,vatom1));
   2824    tl_assert(isShadowAtom(mce,vatom2));
   2825    tl_assert(isShadowAtom(mce,vatom3));
   2826    tl_assert(sameKindedAtoms(atom1,vatom1));
   2827    tl_assert(sameKindedAtoms(atom2,vatom2));
   2828    tl_assert(sameKindedAtoms(atom3,vatom3));
   2829    switch (op) {
   2830       case Iop_AddF128:
   2831       case Iop_AddD128:
   2832       case Iop_SubF128:
   2833       case Iop_SubD128:
   2834       case Iop_MulF128:
   2835       case Iop_MulD128:
   2836       case Iop_DivF128:
   2837       case Iop_DivD128:
   2838       case Iop_QuantizeD128:
   2839          /* I32(rm) x F128/D128 x F128/D128 -> F128/D128 */
   2840          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2841       case Iop_AddF64:
   2842       case Iop_AddD64:
   2843       case Iop_AddF64r32:
   2844       case Iop_SubF64:
   2845       case Iop_SubD64:
   2846       case Iop_SubF64r32:
   2847       case Iop_MulF64:
   2848       case Iop_MulD64:
   2849       case Iop_MulF64r32:
   2850       case Iop_DivF64:
   2851       case Iop_DivD64:
   2852       case Iop_DivF64r32:
   2853       case Iop_ScaleF64:
   2854       case Iop_Yl2xF64:
   2855       case Iop_Yl2xp1F64:
   2856       case Iop_AtanF64:
   2857       case Iop_PRemF64:
   2858       case Iop_PRem1F64:
   2859       case Iop_QuantizeD64:
   2860          /* I32(rm) x F64/D64 x F64/D64 -> F64/D64 */
   2861          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2862       case Iop_PRemC3210F64:
   2863       case Iop_PRem1C3210F64:
   2864          /* I32(rm) x F64 x F64 -> I32 */
   2865          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
   2866       case Iop_AddF32:
   2867       case Iop_SubF32:
   2868       case Iop_MulF32:
   2869       case Iop_DivF32:
   2870          /* I32(rm) x F32 x F32 -> F32 */
   2871          return mkLazy3(mce, Ity_I32, vatom1, vatom2, vatom3);
   2872       case Iop_SignificanceRoundD64:
   2873          /* IRRoundingMode(I32) x I8 x D64 -> D64 */
   2874          return mkLazy3(mce, Ity_I64, vatom1, vatom2, vatom3);
   2875       case Iop_SignificanceRoundD128:
   2876          /* IRRoundingMode(I32) x I8 x D128 -> D128 */
   2877          return mkLazy3(mce, Ity_I128, vatom1, vatom2, vatom3);
   2878       case Iop_SliceV128:
   2879          /* (V128, V128, I8) -> V128 */
   2880          complainIfUndefined(mce, atom3, NULL);
   2881          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2882       case Iop_Slice64:
   2883          /* (I64, I64, I8) -> I64 */
   2884          complainIfUndefined(mce, atom3, NULL);
   2885          return assignNew('V', mce, Ity_I64, triop(op, vatom1, vatom2, atom3));
   2886       case Iop_SetElem8x8:
   2887       case Iop_SetElem16x4:
   2888       case Iop_SetElem32x2:
   2889          complainIfUndefined(mce, atom2, NULL);
   2890          return assignNew('V', mce, Ity_I64, triop(op, vatom1, atom2, vatom3));
   2891       /* BCDIops */
   2892       case Iop_BCDAdd:
   2893       case Iop_BCDSub:
   2894          complainIfUndefined(mce, atom3, NULL);
   2895          return assignNew('V', mce, Ity_V128, triop(op, vatom1, vatom2, atom3));
   2896 
   2897       /* Vector FP with rounding mode as the first arg */
   2898       case Iop_Add64Fx2:
   2899       case Iop_Sub64Fx2:
   2900       case Iop_Mul64Fx2:
   2901       case Iop_Div64Fx2:
   2902          return binary64Fx2_w_rm(mce, vatom1, vatom2, vatom3);
   2903 
   2904       case Iop_Add32Fx4:
   2905       case Iop_Sub32Fx4:
   2906       case Iop_Mul32Fx4:
   2907       case Iop_Div32Fx4:
   2908          return binary32Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2909 
   2910       case Iop_Add64Fx4:
   2911       case Iop_Sub64Fx4:
   2912       case Iop_Mul64Fx4:
   2913       case Iop_Div64Fx4:
   2914          return binary64Fx4_w_rm(mce, vatom1, vatom2, vatom3);
   2915 
   2916       case Iop_Add32Fx8:
   2917       case Iop_Sub32Fx8:
   2918       case Iop_Mul32Fx8:
   2919       case Iop_Div32Fx8:
   2920          return binary32Fx8_w_rm(mce, vatom1, vatom2, vatom3);
   2921 
   2922       default:
   2923          ppIROp(op);
   2924          VG_(tool_panic)("memcheck:expr2vbits_Triop");
   2925    }
   2926 }
   2927 
   2928 
   2929 static
   2930 IRAtom* expr2vbits_Binop ( MCEnv* mce,
   2931                            IROp op,
   2932                            IRAtom* atom1, IRAtom* atom2 )
   2933 {
   2934    IRType  and_or_ty;
   2935    IRAtom* (*uifu)    (MCEnv*, IRAtom*, IRAtom*);
   2936    IRAtom* (*difd)    (MCEnv*, IRAtom*, IRAtom*);
   2937    IRAtom* (*improve) (MCEnv*, IRAtom*, IRAtom*);
   2938 
   2939    IRAtom* vatom1 = expr2vbits( mce, atom1 );
   2940    IRAtom* vatom2 = expr2vbits( mce, atom2 );
   2941 
   2942    tl_assert(isOriginalAtom(mce,atom1));
   2943    tl_assert(isOriginalAtom(mce,atom2));
   2944    tl_assert(isShadowAtom(mce,vatom1));
   2945    tl_assert(isShadowAtom(mce,vatom2));
   2946    tl_assert(sameKindedAtoms(atom1,vatom1));
   2947    tl_assert(sameKindedAtoms(atom2,vatom2));
   2948    switch (op) {
   2949 
   2950       /* 32-bit SIMD */
   2951 
   2952       case Iop_Add16x2:
   2953       case Iop_HAdd16Ux2:
   2954       case Iop_HAdd16Sx2:
   2955       case Iop_Sub16x2:
   2956       case Iop_HSub16Ux2:
   2957       case Iop_HSub16Sx2:
   2958       case Iop_QAdd16Sx2:
   2959       case Iop_QSub16Sx2:
   2960       case Iop_QSub16Ux2:
   2961       case Iop_QAdd16Ux2:
   2962          return binary16Ix2(mce, vatom1, vatom2);
   2963 
   2964       case Iop_Add8x4:
   2965       case Iop_HAdd8Ux4:
   2966       case Iop_HAdd8Sx4:
   2967       case Iop_Sub8x4:
   2968       case Iop_HSub8Ux4:
   2969       case Iop_HSub8Sx4:
   2970       case Iop_QSub8Ux4:
   2971       case Iop_QAdd8Ux4:
   2972       case Iop_QSub8Sx4:
   2973       case Iop_QAdd8Sx4:
   2974          return binary8Ix4(mce, vatom1, vatom2);
   2975 
   2976       /* 64-bit SIMD */
   2977 
   2978       case Iop_ShrN8x8:
   2979       case Iop_ShrN16x4:
   2980       case Iop_ShrN32x2:
   2981       case Iop_SarN8x8:
   2982       case Iop_SarN16x4:
   2983       case Iop_SarN32x2:
   2984       case Iop_ShlN16x4:
   2985       case Iop_ShlN32x2:
   2986       case Iop_ShlN8x8:
   2987          /* Same scheme as with all other shifts. */
   2988          complainIfUndefined(mce, atom2, NULL);
   2989          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   2990 
   2991       case Iop_QNarrowBin32Sto16Sx4:
   2992       case Iop_QNarrowBin16Sto8Sx8:
   2993       case Iop_QNarrowBin16Sto8Ux8:
   2994          return vectorNarrowBin64(mce, op, vatom1, vatom2);
   2995 
   2996       case Iop_Min8Ux8:
   2997       case Iop_Min8Sx8:
   2998       case Iop_Max8Ux8:
   2999       case Iop_Max8Sx8:
   3000       case Iop_Avg8Ux8:
   3001       case Iop_QSub8Sx8:
   3002       case Iop_QSub8Ux8:
   3003       case Iop_Sub8x8:
   3004       case Iop_CmpGT8Sx8:
   3005       case Iop_CmpGT8Ux8:
   3006       case Iop_CmpEQ8x8:
   3007       case Iop_QAdd8Sx8:
   3008       case Iop_QAdd8Ux8:
   3009       case Iop_QSal8x8:
   3010       case Iop_QShl8x8:
   3011       case Iop_Add8x8:
   3012       case Iop_Mul8x8:
   3013       case Iop_PolynomialMul8x8:
   3014          return binary8Ix8(mce, vatom1, vatom2);
   3015 
   3016       case Iop_Min16Sx4:
   3017       case Iop_Min16Ux4:
   3018       case Iop_Max16Sx4:
   3019       case Iop_Max16Ux4:
   3020       case Iop_Avg16Ux4:
   3021       case Iop_QSub16Ux4:
   3022       case Iop_QSub16Sx4:
   3023       case Iop_Sub16x4:
   3024       case Iop_Mul16x4:
   3025       case Iop_MulHi16Sx4:
   3026       case Iop_MulHi16Ux4:
   3027       case Iop_CmpGT16Sx4:
   3028       case Iop_CmpGT16Ux4:
   3029       case Iop_CmpEQ16x4:
   3030       case Iop_QAdd16Sx4:
   3031       case Iop_QAdd16Ux4:
   3032       case Iop_QSal16x4:
   3033       case Iop_QShl16x4:
   3034       case Iop_Add16x4:
   3035       case Iop_QDMulHi16Sx4:
   3036       case Iop_QRDMulHi16Sx4:
   3037          return binary16Ix4(mce, vatom1, vatom2);
   3038 
   3039       case Iop_Sub32x2:
   3040       case Iop_Mul32x2:
   3041       case Iop_Max32Sx2:
   3042       case Iop_Max32Ux2:
   3043       case Iop_Min32Sx2:
   3044       case Iop_Min32Ux2:
   3045       case Iop_CmpGT32Sx2:
   3046       case Iop_CmpGT32Ux2:
   3047       case Iop_CmpEQ32x2:
   3048       case Iop_Add32x2:
   3049       case Iop_QAdd32Ux2:
   3050       case Iop_QAdd32Sx2:
   3051       case Iop_QSub32Ux2:
   3052       case Iop_QSub32Sx2:
   3053       case Iop_QSal32x2:
   3054       case Iop_QShl32x2:
   3055       case Iop_QDMulHi32Sx2:
   3056       case Iop_QRDMulHi32Sx2:
   3057          return binary32Ix2(mce, vatom1, vatom2);
   3058 
   3059       case Iop_QSub64Ux1:
   3060       case Iop_QSub64Sx1:
   3061       case Iop_QAdd64Ux1:
   3062       case Iop_QAdd64Sx1:
   3063       case Iop_QSal64x1:
   3064       case Iop_QShl64x1:
   3065       case Iop_Sal64x1:
   3066          return binary64Ix1(mce, vatom1, vatom2);
   3067 
   3068       case Iop_QShlNsatSU8x8:
   3069       case Iop_QShlNsatUU8x8:
   3070       case Iop_QShlNsatSS8x8:
   3071          complainIfUndefined(mce, atom2, NULL);
   3072          return mkPCast8x8(mce, vatom1);
   3073 
   3074       case Iop_QShlNsatSU16x4:
   3075       case Iop_QShlNsatUU16x4:
   3076       case Iop_QShlNsatSS16x4:
   3077          complainIfUndefined(mce, atom2, NULL);
   3078          return mkPCast16x4(mce, vatom1);
   3079 
   3080       case Iop_QShlNsatSU32x2:
   3081       case Iop_QShlNsatUU32x2:
   3082       case Iop_QShlNsatSS32x2:
   3083          complainIfUndefined(mce, atom2, NULL);
   3084          return mkPCast32x2(mce, vatom1);
   3085 
   3086       case Iop_QShlNsatSU64x1:
   3087       case Iop_QShlNsatUU64x1:
   3088       case Iop_QShlNsatSS64x1:
   3089          complainIfUndefined(mce, atom2, NULL);
   3090          return mkPCastTo(mce, Ity_I64, vatom1);
   3091 
   3092       case Iop_PwMax32Sx2:
   3093       case Iop_PwMax32Ux2:
   3094       case Iop_PwMin32Sx2:
   3095       case Iop_PwMin32Ux2:
   3096       case Iop_PwMax32Fx2:
   3097       case Iop_PwMin32Fx2:
   3098          return assignNew('V', mce, Ity_I64,
   3099                           binop(Iop_PwMax32Ux2,
   3100                                 mkPCast32x2(mce, vatom1),
   3101                                 mkPCast32x2(mce, vatom2)));
   3102 
   3103       case Iop_PwMax16Sx4:
   3104       case Iop_PwMax16Ux4:
   3105       case Iop_PwMin16Sx4:
   3106       case Iop_PwMin16Ux4:
   3107          return assignNew('V', mce, Ity_I64,
   3108                           binop(Iop_PwMax16Ux4,
   3109                                 mkPCast16x4(mce, vatom1),
   3110                                 mkPCast16x4(mce, vatom2)));
   3111 
   3112       case Iop_PwMax8Sx8:
   3113       case Iop_PwMax8Ux8:
   3114       case Iop_PwMin8Sx8:
   3115       case Iop_PwMin8Ux8:
   3116          return assignNew('V', mce, Ity_I64,
   3117                           binop(Iop_PwMax8Ux8,
   3118                                 mkPCast8x8(mce, vatom1),
   3119                                 mkPCast8x8(mce, vatom2)));
   3120 
   3121       case Iop_PwAdd32x2:
   3122       case Iop_PwAdd32Fx2:
   3123          return mkPCast32x2(mce,
   3124                assignNew('V', mce, Ity_I64,
   3125                          binop(Iop_PwAdd32x2,
   3126                                mkPCast32x2(mce, vatom1),
   3127                                mkPCast32x2(mce, vatom2))));
   3128 
   3129       case Iop_PwAdd16x4:
   3130          return mkPCast16x4(mce,
   3131                assignNew('V', mce, Ity_I64,
   3132                          binop(op, mkPCast16x4(mce, vatom1),
   3133                                    mkPCast16x4(mce, vatom2))));
   3134 
   3135       case Iop_PwAdd8x8:
   3136          return mkPCast8x8(mce,
   3137                assignNew('V', mce, Ity_I64,
   3138                          binop(op, mkPCast8x8(mce, vatom1),
   3139                                    mkPCast8x8(mce, vatom2))));
   3140 
   3141       case Iop_Shl8x8:
   3142       case Iop_Shr8x8:
   3143       case Iop_Sar8x8:
   3144       case Iop_Sal8x8:
   3145          return mkUifU64(mce,
   3146                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3147                    mkPCast8x8(mce,vatom2)
   3148                 );
   3149 
   3150       case Iop_Shl16x4:
   3151       case Iop_Shr16x4:
   3152       case Iop_Sar16x4:
   3153       case Iop_Sal16x4:
   3154          return mkUifU64(mce,
   3155                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3156                    mkPCast16x4(mce,vatom2)
   3157                 );
   3158 
   3159       case Iop_Shl32x2:
   3160       case Iop_Shr32x2:
   3161       case Iop_Sar32x2:
   3162       case Iop_Sal32x2:
   3163          return mkUifU64(mce,
   3164                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3165                    mkPCast32x2(mce,vatom2)
   3166                 );
   3167 
   3168       /* 64-bit data-steering */
   3169       case Iop_InterleaveLO32x2:
   3170       case Iop_InterleaveLO16x4:
   3171       case Iop_InterleaveLO8x8:
   3172       case Iop_InterleaveHI32x2:
   3173       case Iop_InterleaveHI16x4:
   3174       case Iop_InterleaveHI8x8:
   3175       case Iop_CatOddLanes8x8:
   3176       case Iop_CatEvenLanes8x8:
   3177       case Iop_CatOddLanes16x4:
   3178       case Iop_CatEvenLanes16x4:
   3179       case Iop_InterleaveOddLanes8x8:
   3180       case Iop_InterleaveEvenLanes8x8:
   3181       case Iop_InterleaveOddLanes16x4:
   3182       case Iop_InterleaveEvenLanes16x4:
   3183          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3184 
   3185       case Iop_GetElem8x8:
   3186          complainIfUndefined(mce, atom2, NULL);
   3187          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3188       case Iop_GetElem16x4:
   3189          complainIfUndefined(mce, atom2, NULL);
   3190          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3191       case Iop_GetElem32x2:
   3192          complainIfUndefined(mce, atom2, NULL);
   3193          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3194 
   3195       /* Perm8x8: rearrange values in left arg using steering values
   3196         from right arg.  So rearrange the vbits in the same way but
   3197         pessimise wrt steering values. */
   3198       case Iop_Perm8x8:
   3199          return mkUifU64(
   3200                    mce,
   3201                    assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2)),
   3202                    mkPCast8x8(mce, vatom2)
   3203                 );
   3204 
   3205       /* V128-bit SIMD */
   3206 
   3207       case Iop_Sqrt32Fx4:
   3208          return unary32Fx4_w_rm(mce, vatom1, vatom2);
   3209       case Iop_Sqrt64Fx2:
   3210          return unary64Fx2_w_rm(mce, vatom1, vatom2);
   3211 
   3212       case Iop_ShrN8x16:
   3213       case Iop_ShrN16x8:
   3214       case Iop_ShrN32x4:
   3215       case Iop_ShrN64x2:
   3216       case Iop_SarN8x16:
   3217       case Iop_SarN16x8:
   3218       case Iop_SarN32x4:
   3219       case Iop_SarN64x2:
   3220       case Iop_ShlN8x16:
   3221       case Iop_ShlN16x8:
   3222       case Iop_ShlN32x4:
   3223       case Iop_ShlN64x2:
   3224          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
   3225             this is wrong now, scalar shifts are done properly lazily.
   3226             Vector shifts should be fixed too. */
   3227          complainIfUndefined(mce, atom2, NULL);
   3228          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3229 
   3230       /* V x V shifts/rotates are done using the standard lazy scheme. */
   3231       /* For the non-rounding variants of bi-di vector x vector
   3232          shifts (the Iop_Sh.. ops, that is) we use the lazy scheme.
   3233          But note that this is overly pessimistic, because in fact only
   3234          the bottom 8 bits of each lane of the second argument are taken
   3235          into account when shifting.  So really we ought to ignore
   3236          undefinedness in bits 8 and above of each lane in the
   3237          second argument. */
   3238       case Iop_Shl8x16:
   3239       case Iop_Shr8x16:
   3240       case Iop_Sar8x16:
   3241       case Iop_Sal8x16:
   3242       case Iop_Rol8x16:
   3243       case Iop_Sh8Sx16:
   3244       case Iop_Sh8Ux16:
   3245          return mkUifUV128(mce,
   3246                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3247                    mkPCast8x16(mce,vatom2)
   3248                 );
   3249 
   3250       case Iop_Shl16x8:
   3251       case Iop_Shr16x8:
   3252       case Iop_Sar16x8:
   3253       case Iop_Sal16x8:
   3254       case Iop_Rol16x8:
   3255       case Iop_Sh16Sx8:
   3256       case Iop_Sh16Ux8:
   3257          return mkUifUV128(mce,
   3258                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3259                    mkPCast16x8(mce,vatom2)
   3260                 );
   3261 
   3262       case Iop_Shl32x4:
   3263       case Iop_Shr32x4:
   3264       case Iop_Sar32x4:
   3265       case Iop_Sal32x4:
   3266       case Iop_Rol32x4:
   3267       case Iop_Sh32Sx4:
   3268       case Iop_Sh32Ux4:
   3269          return mkUifUV128(mce,
   3270                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3271                    mkPCast32x4(mce,vatom2)
   3272                 );
   3273 
   3274       case Iop_Shl64x2:
   3275       case Iop_Shr64x2:
   3276       case Iop_Sar64x2:
   3277       case Iop_Sal64x2:
   3278       case Iop_Rol64x2:
   3279       case Iop_Sh64Sx2:
   3280       case Iop_Sh64Ux2:
   3281          return mkUifUV128(mce,
   3282                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3283                    mkPCast64x2(mce,vatom2)
   3284                 );
   3285 
   3286       /* For the rounding variants of bi-di vector x vector shifts, the
   3287          rounding adjustment can cause undefinedness to propagate through
   3288          the entire lane, in the worst case.  Too complex to handle
   3289          properly .. just UifU the arguments and then PCast them.
   3290          Suboptimal but safe. */
   3291       case Iop_Rsh8Sx16:
   3292       case Iop_Rsh8Ux16:
   3293          return binary8Ix16(mce, vatom1, vatom2);
   3294       case Iop_Rsh16Sx8:
   3295       case Iop_Rsh16Ux8:
   3296          return binary16Ix8(mce, vatom1, vatom2);
   3297       case Iop_Rsh32Sx4:
   3298       case Iop_Rsh32Ux4:
   3299          return binary32Ix4(mce, vatom1, vatom2);
   3300       case Iop_Rsh64Sx2:
   3301       case Iop_Rsh64Ux2:
   3302          return binary64Ix2(mce, vatom1, vatom2);
   3303 
   3304       case Iop_F32ToFixed32Ux4_RZ:
   3305       case Iop_F32ToFixed32Sx4_RZ:
   3306       case Iop_Fixed32UToF32x4_RN:
   3307       case Iop_Fixed32SToF32x4_RN:
   3308          complainIfUndefined(mce, atom2, NULL);
   3309          return mkPCast32x4(mce, vatom1);
   3310 
   3311       case Iop_F32ToFixed32Ux2_RZ:
   3312       case Iop_F32ToFixed32Sx2_RZ:
   3313       case Iop_Fixed32UToF32x2_RN:
   3314       case Iop_Fixed32SToF32x2_RN:
   3315          complainIfUndefined(mce, atom2, NULL);
   3316          return mkPCast32x2(mce, vatom1);
   3317 
   3318       case Iop_QSub8Ux16:
   3319       case Iop_QSub8Sx16:
   3320       case Iop_Sub8x16:
   3321       case Iop_Min8Ux16:
   3322       case Iop_Min8Sx16:
   3323       case Iop_Max8Ux16:
   3324       case Iop_Max8Sx16:
   3325       case Iop_CmpGT8Sx16:
   3326       case Iop_CmpGT8Ux16:
   3327       case Iop_CmpEQ8x16:
   3328       case Iop_Avg8Ux16:
   3329       case Iop_Avg8Sx16:
   3330       case Iop_QAdd8Ux16:
   3331       case Iop_QAdd8Sx16:
   3332       case Iop_QAddExtUSsatSS8x16:
   3333       case Iop_QAddExtSUsatUU8x16:
   3334       case Iop_QSal8x16:
   3335       case Iop_QShl8x16:
   3336       case Iop_Add8x16:
   3337       case Iop_Mul8x16:
   3338       case Iop_PolynomialMul8x16:
   3339       case Iop_PolynomialMulAdd8x16:
   3340          return binary8Ix16(mce, vatom1, vatom2);
   3341 
   3342       case Iop_QSub16Ux8:
   3343       case Iop_QSub16Sx8:
   3344       case Iop_Sub16x8:
   3345       case Iop_Mul16x8:
   3346       case Iop_MulHi16Sx8:
   3347       case Iop_MulHi16Ux8:
   3348       case Iop_Min16Sx8:
   3349       case Iop_Min16Ux8:
   3350       case Iop_Max16Sx8:
   3351       case Iop_Max16Ux8:
   3352       case Iop_CmpGT16Sx8:
   3353       case Iop_CmpGT16Ux8:
   3354       case Iop_CmpEQ16x8:
   3355       case Iop_Avg16Ux8:
   3356       case Iop_Avg16Sx8:
   3357       case Iop_QAdd16Ux8:
   3358       case Iop_QAdd16Sx8:
   3359       case Iop_QAddExtUSsatSS16x8:
   3360       case Iop_QAddExtSUsatUU16x8:
   3361       case Iop_QSal16x8:
   3362       case Iop_QShl16x8:
   3363       case Iop_Add16x8:
   3364       case Iop_QDMulHi16Sx8:
   3365       case Iop_QRDMulHi16Sx8:
   3366       case Iop_PolynomialMulAdd16x8:
   3367          return binary16Ix8(mce, vatom1, vatom2);
   3368 
   3369       case Iop_Sub32x4:
   3370       case Iop_CmpGT32Sx4:
   3371       case Iop_CmpGT32Ux4:
   3372       case Iop_CmpEQ32x4:
   3373       case Iop_QAdd32Sx4:
   3374       case Iop_QAdd32Ux4:
   3375       case Iop_QSub32Sx4:
   3376       case Iop_QSub32Ux4:
   3377       case Iop_QAddExtUSsatSS32x4:
   3378       case Iop_QAddExtSUsatUU32x4:
   3379       case Iop_QSal32x4:
   3380       case Iop_QShl32x4:
   3381       case Iop_Avg32Ux4:
   3382       case Iop_Avg32Sx4:
   3383       case Iop_Add32x4:
   3384       case Iop_Max32Ux4:
   3385       case Iop_Max32Sx4:
   3386       case Iop_Min32Ux4:
   3387       case Iop_Min32Sx4:
   3388       case Iop_Mul32x4:
   3389       case Iop_QDMulHi32Sx4:
   3390       case Iop_QRDMulHi32Sx4:
   3391       case Iop_PolynomialMulAdd32x4:
   3392          return binary32Ix4(mce, vatom1, vatom2);
   3393 
   3394       case Iop_Sub64x2:
   3395       case Iop_Add64x2:
   3396       case Iop_Max64Sx2:
   3397       case Iop_Max64Ux2:
   3398       case Iop_Min64Sx2:
   3399       case Iop_Min64Ux2:
   3400       case Iop_CmpEQ64x2:
   3401       case Iop_CmpGT64Sx2:
   3402       case Iop_CmpGT64Ux2:
   3403       case Iop_QSal64x2:
   3404       case Iop_QShl64x2:
   3405       case Iop_QAdd64Ux2:
   3406       case Iop_QAdd64Sx2:
   3407       case Iop_QSub64Ux2:
   3408       case Iop_QSub64Sx2:
   3409       case Iop_QAddExtUSsatSS64x2:
   3410       case Iop_QAddExtSUsatUU64x2:
   3411       case Iop_PolynomialMulAdd64x2:
   3412       case Iop_CipherV128:
   3413       case Iop_CipherLV128:
   3414       case Iop_NCipherV128:
   3415       case Iop_NCipherLV128:
   3416          return binary64Ix2(mce, vatom1, vatom2);
   3417 
   3418       case Iop_QNarrowBin64Sto32Sx4:
   3419       case Iop_QNarrowBin64Uto32Ux4:
   3420       case Iop_QNarrowBin32Sto16Sx8:
   3421       case Iop_QNarrowBin32Uto16Ux8:
   3422       case Iop_QNarrowBin32Sto16Ux8:
   3423       case Iop_QNarrowBin16Sto8Sx16:
   3424       case Iop_QNarrowBin16Uto8Ux16:
   3425       case Iop_QNarrowBin16Sto8Ux16:
   3426          return vectorNarrowBinV128(mce, op, vatom1, vatom2);
   3427 
   3428       case Iop_Min64Fx2:
   3429       case Iop_Max64Fx2:
   3430       case Iop_CmpLT64Fx2:
   3431       case Iop_CmpLE64Fx2:
   3432       case Iop_CmpEQ64Fx2:
   3433       case Iop_CmpUN64Fx2:
   3434       case Iop_RecipStep64Fx2:
   3435       case Iop_RSqrtStep64Fx2:
   3436          return binary64Fx2(mce, vatom1, vatom2);
   3437 
   3438       case Iop_Sub64F0x2:
   3439       case Iop_Mul64F0x2:
   3440       case Iop_Min64F0x2:
   3441       case Iop_Max64F0x2:
   3442       case Iop_Div64F0x2:
   3443       case Iop_CmpLT64F0x2:
   3444       case Iop_CmpLE64F0x2:
   3445       case Iop_CmpEQ64F0x2:
   3446       case Iop_CmpUN64F0x2:
   3447       case Iop_Add64F0x2:
   3448          return binary64F0x2(mce, vatom1, vatom2);
   3449 
   3450       case Iop_Min32Fx4:
   3451       case Iop_Max32Fx4:
   3452       case Iop_CmpLT32Fx4:
   3453       case Iop_CmpLE32Fx4:
   3454       case Iop_CmpEQ32Fx4:
   3455       case Iop_CmpUN32Fx4:
   3456       case Iop_CmpGT32Fx4:
   3457       case Iop_CmpGE32Fx4:
   3458       case Iop_RecipStep32Fx4:
   3459       case Iop_RSqrtStep32Fx4:
   3460          return binary32Fx4(mce, vatom1, vatom2);
   3461 
   3462       case Iop_Sub32Fx2:
   3463       case Iop_Mul32Fx2:
   3464       case Iop_Min32Fx2:
   3465       case Iop_Max32Fx2:
   3466       case Iop_CmpEQ32Fx2:
   3467       case Iop_CmpGT32Fx2:
   3468       case Iop_CmpGE32Fx2:
   3469       case Iop_Add32Fx2:
   3470       case Iop_RecipStep32Fx2:
   3471       case Iop_RSqrtStep32Fx2:
   3472          return binary32Fx2(mce, vatom1, vatom2);
   3473 
   3474       case Iop_Sub32F0x4:
   3475       case Iop_Mul32F0x4:
   3476       case Iop_Min32F0x4:
   3477       case Iop_Max32F0x4:
   3478       case Iop_Div32F0x4:
   3479       case Iop_CmpLT32F0x4:
   3480       case Iop_CmpLE32F0x4:
   3481       case Iop_CmpEQ32F0x4:
   3482       case Iop_CmpUN32F0x4:
   3483       case Iop_Add32F0x4:
   3484          return binary32F0x4(mce, vatom1, vatom2);
   3485 
   3486       case Iop_QShlNsatSU8x16:
   3487       case Iop_QShlNsatUU8x16:
   3488       case Iop_QShlNsatSS8x16:
   3489          complainIfUndefined(mce, atom2, NULL);
   3490          return mkPCast8x16(mce, vatom1);
   3491 
   3492       case Iop_QShlNsatSU16x8:
   3493       case Iop_QShlNsatUU16x8:
   3494       case Iop_QShlNsatSS16x8:
   3495          complainIfUndefined(mce, atom2, NULL);
   3496          return mkPCast16x8(mce, vatom1);
   3497 
   3498       case Iop_QShlNsatSU32x4:
   3499       case Iop_QShlNsatUU32x4:
   3500       case Iop_QShlNsatSS32x4:
   3501          complainIfUndefined(mce, atom2, NULL);
   3502          return mkPCast32x4(mce, vatom1);
   3503 
   3504       case Iop_QShlNsatSU64x2:
   3505       case Iop_QShlNsatUU64x2:
   3506       case Iop_QShlNsatSS64x2:
   3507          complainIfUndefined(mce, atom2, NULL);
   3508          return mkPCast64x2(mce, vatom1);
   3509 
   3510       /* Q-and-Qshift-by-imm-and-narrow of the form (V128, I8) -> V128.
   3511          To make this simpler, do the following:
   3512          * complain if the shift amount (the I8) is undefined
   3513          * pcast each lane at the wide width
   3514          * truncate each lane to half width
   3515          * pcast the resulting 64-bit value to a single bit and use
   3516            that as the least significant bit of the upper half of the
   3517            result. */
   3518       case Iop_QandQShrNnarrow64Uto32Ux2:
   3519       case Iop_QandQSarNnarrow64Sto32Sx2:
   3520       case Iop_QandQSarNnarrow64Sto32Ux2:
   3521       case Iop_QandQRShrNnarrow64Uto32Ux2:
   3522       case Iop_QandQRSarNnarrow64Sto32Sx2:
   3523       case Iop_QandQRSarNnarrow64Sto32Ux2:
   3524       case Iop_QandQShrNnarrow32Uto16Ux4:
   3525       case Iop_QandQSarNnarrow32Sto16Sx4:
   3526       case Iop_QandQSarNnarrow32Sto16Ux4:
   3527       case Iop_QandQRShrNnarrow32Uto16Ux4:
   3528       case Iop_QandQRSarNnarrow32Sto16Sx4:
   3529       case Iop_QandQRSarNnarrow32Sto16Ux4:
   3530       case Iop_QandQShrNnarrow16Uto8Ux8:
   3531       case Iop_QandQSarNnarrow16Sto8Sx8:
   3532       case Iop_QandQSarNnarrow16Sto8Ux8:
   3533       case Iop_QandQRShrNnarrow16Uto8Ux8:
   3534       case Iop_QandQRSarNnarrow16Sto8Sx8:
   3535       case Iop_QandQRSarNnarrow16Sto8Ux8:
   3536       {
   3537          IRAtom* (*fnPessim) (MCEnv*, IRAtom*) = NULL;
   3538          IROp opNarrow = Iop_INVALID;
   3539          switch (op) {
   3540             case Iop_QandQShrNnarrow64Uto32Ux2:
   3541             case Iop_QandQSarNnarrow64Sto32Sx2:
   3542             case Iop_QandQSarNnarrow64Sto32Ux2:
   3543             case Iop_QandQRShrNnarrow64Uto32Ux2:
   3544             case Iop_QandQRSarNnarrow64Sto32Sx2:
   3545             case Iop_QandQRSarNnarrow64Sto32Ux2:
   3546                fnPessim = mkPCast64x2;
   3547                opNarrow = Iop_NarrowUn64to32x2;
   3548                break;
   3549             case Iop_QandQShrNnarrow32Uto16Ux4:
   3550             case Iop_QandQSarNnarrow32Sto16Sx4:
   3551             case Iop_QandQSarNnarrow32Sto16Ux4:
   3552             case Iop_QandQRShrNnarrow32Uto16Ux4:
   3553             case Iop_QandQRSarNnarrow32Sto16Sx4:
   3554             case Iop_QandQRSarNnarrow32Sto16Ux4:
   3555                fnPessim = mkPCast32x4;
   3556                opNarrow = Iop_NarrowUn32to16x4;
   3557                break;
   3558             case Iop_QandQShrNnarrow16Uto8Ux8:
   3559             case Iop_QandQSarNnarrow16Sto8Sx8:
   3560             case Iop_QandQSarNnarrow16Sto8Ux8:
   3561             case Iop_QandQRShrNnarrow16Uto8Ux8:
   3562             case Iop_QandQRSarNnarrow16Sto8Sx8:
   3563             case Iop_QandQRSarNnarrow16Sto8Ux8:
   3564                fnPessim = mkPCast16x8;
   3565                opNarrow = Iop_NarrowUn16to8x8;
   3566                break;
   3567             default:
   3568                tl_assert(0);
   3569          }
   3570          complainIfUndefined(mce, atom2, NULL);
   3571          // Pessimised shift result
   3572          IRAtom* shV
   3573             = fnPessim(mce, vatom1);
   3574          // Narrowed, pessimised shift result
   3575          IRAtom* shVnarrowed
   3576             = assignNew('V', mce, Ity_I64, unop(opNarrow, shV));
   3577          // Generates: Def--(63)--Def PCast-to-I1(narrowed)
   3578          IRAtom* qV = mkPCastXXtoXXlsb(mce, shVnarrowed, Ity_I64);
   3579          // and assemble the result
   3580          return assignNew('V', mce, Ity_V128,
   3581                           binop(Iop_64HLtoV128, qV, shVnarrowed));
   3582       }
   3583 
   3584       case Iop_Mull32Sx2:
   3585       case Iop_Mull32Ux2:
   3586       case Iop_QDMull32Sx2:
   3587          return vectorWidenI64(mce, Iop_Widen32Sto64x2,
   3588                                     mkUifU64(mce, vatom1, vatom2));
   3589 
   3590       case Iop_Mull16Sx4:
   3591       case Iop_Mull16Ux4:
   3592       case Iop_QDMull16Sx4:
   3593          return vectorWidenI64(mce, Iop_Widen16Sto32x4,
   3594                                     mkUifU64(mce, vatom1, vatom2));
   3595 
   3596       case Iop_Mull8Sx8:
   3597       case Iop_Mull8Ux8:
   3598       case Iop_PolynomialMull8x8:
   3599          return vectorWidenI64(mce, Iop_Widen8Sto16x8,
   3600                                     mkUifU64(mce, vatom1, vatom2));
   3601 
   3602       case Iop_PwAdd32x4:
   3603          return mkPCast32x4(mce,
   3604                assignNew('V', mce, Ity_V128, binop(op, mkPCast32x4(mce, vatom1),
   3605                      mkPCast32x4(mce, vatom2))));
   3606 
   3607       case Iop_PwAdd16x8:
   3608          return mkPCast16x8(mce,
   3609                assignNew('V', mce, Ity_V128, binop(op, mkPCast16x8(mce, vatom1),
   3610                      mkPCast16x8(mce, vatom2))));
   3611 
   3612       case Iop_PwAdd8x16:
   3613          return mkPCast8x16(mce,
   3614                assignNew('V', mce, Ity_V128, binop(op, mkPCast8x16(mce, vatom1),
   3615                      mkPCast8x16(mce, vatom2))));
   3616 
   3617       /* V128-bit data-steering */
   3618       case Iop_SetV128lo32:
   3619       case Iop_SetV128lo64:
   3620       case Iop_64HLtoV128:
   3621       case Iop_InterleaveLO64x2:
   3622       case Iop_InterleaveLO32x4:
   3623       case Iop_InterleaveLO16x8:
   3624       case Iop_InterleaveLO8x16:
   3625       case Iop_InterleaveHI64x2:
   3626       case Iop_InterleaveHI32x4:
   3627       case Iop_InterleaveHI16x8:
   3628       case Iop_InterleaveHI8x16:
   3629       case Iop_CatOddLanes8x16:
   3630       case Iop_CatOddLanes16x8:
   3631       case Iop_CatOddLanes32x4:
   3632       case Iop_CatEvenLanes8x16:
   3633       case Iop_CatEvenLanes16x8:
   3634       case Iop_CatEvenLanes32x4:
   3635       case Iop_InterleaveOddLanes8x16:
   3636       case Iop_InterleaveOddLanes16x8:
   3637       case Iop_InterleaveOddLanes32x4:
   3638       case Iop_InterleaveEvenLanes8x16:
   3639       case Iop_InterleaveEvenLanes16x8:
   3640       case Iop_InterleaveEvenLanes32x4:
   3641          return assignNew('V', mce, Ity_V128, binop(op, vatom1, vatom2));
   3642 
   3643       case Iop_GetElem8x16:
   3644          complainIfUndefined(mce, atom2, NULL);
   3645          return assignNew('V', mce, Ity_I8, binop(op, vatom1, atom2));
   3646       case Iop_GetElem16x8:
   3647          complainIfUndefined(mce, atom2, NULL);
   3648          return assignNew('V', mce, Ity_I16, binop(op, vatom1, atom2));
   3649       case Iop_GetElem32x4:
   3650          complainIfUndefined(mce, atom2, NULL);
   3651          return assignNew('V', mce, Ity_I32, binop(op, vatom1, atom2));
   3652       case Iop_GetElem64x2:
   3653          complainIfUndefined(mce, atom2, NULL);
   3654          return assignNew('V', mce, Ity_I64, binop(op, vatom1, atom2));
   3655 
   3656      /* Perm8x16: rearrange values in left arg using steering values
   3657         from right arg.  So rearrange the vbits in the same way but
   3658         pessimise wrt steering values.  Perm32x4 ditto. */
   3659       case Iop_Perm8x16:
   3660          return mkUifUV128(
   3661                    mce,
   3662                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3663                    mkPCast8x16(mce, vatom2)
   3664                 );
   3665       case Iop_Perm32x4:
   3666          return mkUifUV128(
   3667                    mce,
   3668                    assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2)),
   3669                    mkPCast32x4(mce, vatom2)
   3670                 );
   3671 
   3672      /* These two take the lower 16-bit half of each 32-bit lane,
   3673         sign/zero extend it to 32 bits, and multiply the halves together,
   3674         producing a 32x4 result (and implicitly ignoring half the operand
   3675         bits).  So treat it as a bunch of independent 16x8 operations, but
   3676         then do a 32-bit shift left followed by an arithmetic shift right
   3677         to copy the lower-half results (which are all 0s or all 1s due to
   3678         PCasting in binary16Ix8) into the upper half of each result lane. */
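     /* For example, a 32-bit shadow lane 0x0000FFFF (lower half undefined)
        becomes 0xFFFF0000 after the shift left by 16 and 0xFFFFFFFF after
        the arithmetic shift right by 16, so the whole result lane is marked
        undefined; an all-defined lane (0x00000000) stays all-defined. */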
   3679       case Iop_MullEven16Ux8:
   3680       case Iop_MullEven16Sx8: {
   3681          IRAtom* at;
   3682          at = binary16Ix8(mce,vatom1,vatom2);
   3683          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN32x4, at, mkU8(16)));
   3684          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN32x4, at, mkU8(16)));
   3685          return at;
   3686       }
   3687 
   3688       /* Same deal as Iop_MullEven16{S,U}x8 */
   3689       case Iop_MullEven8Ux16:
   3690       case Iop_MullEven8Sx16: {
   3691          IRAtom* at;
   3692          at = binary8Ix16(mce,vatom1,vatom2);
   3693          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN16x8, at, mkU8(8)));
   3694          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN16x8, at, mkU8(8)));
   3695          return at;
   3696       }
   3697 
   3698       /* Same deal as Iop_MullEven16{S,U}x8 */
   3699       case Iop_MullEven32Ux4:
   3700       case Iop_MullEven32Sx4: {
   3701          IRAtom* at;
   3702          at = binary32Ix4(mce,vatom1,vatom2);
   3703          at = assignNew('V', mce, Ity_V128, binop(Iop_ShlN64x2, at, mkU8(32)));
   3704          at = assignNew('V', mce, Ity_V128, binop(Iop_SarN64x2, at, mkU8(32)));
   3705          return at;
   3706       }
   3707 
   3708       /* narrow 2xV128 into 1xV128, hi half from left arg, in a 2 x
   3709          32x4 -> 16x8 laneage, discarding the upper half of each lane.
   3710          Simply apply the same op to the V bits, since this is really no more
   3711          than a data steering operation. */
   3712       case Iop_NarrowBin32to16x8:
   3713       case Iop_NarrowBin16to8x16:
   3714       case Iop_NarrowBin64to32x4:
   3715          return assignNew('V', mce, Ity_V128,
   3716                                     binop(op, vatom1, vatom2));
   3717 
   3718       case Iop_ShrV128:
   3719       case Iop_ShlV128:
   3720          /* Same scheme as with all other shifts.  Note: 10 Nov 05:
   3721             this is wrong now, scalar shifts are done properly lazily.
   3722             Vector shifts should be fixed too. */
   3723          complainIfUndefined(mce, atom2, NULL);
   3724          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3725 
   3726       /* SHA Iops */
   3727       case Iop_SHA256:
   3728       case Iop_SHA512:
   3729          complainIfUndefined(mce, atom2, NULL);
   3730          return assignNew('V', mce, Ity_V128, binop(op, vatom1, atom2));
   3731 
   3732       /* I128-bit data-steering */
   3733       case Iop_64HLto128:
   3734          return assignNew('V', mce, Ity_I128, binop(op, vatom1, vatom2));
   3735 
   3736       /* V256-bit SIMD */
   3737 
   3738       case Iop_Max64Fx4:
   3739       case Iop_Min64Fx4:
   3740          return binary64Fx4(mce, vatom1, vatom2);
   3741 
   3742       case Iop_Max32Fx8:
   3743       case Iop_Min32Fx8:
   3744          return binary32Fx8(mce, vatom1, vatom2);
   3745 
   3746       /* V256-bit data-steering */
   3747       case Iop_V128HLtoV256:
   3748          return assignNew('V', mce, Ity_V256, binop(op, vatom1, vatom2));
   3749 
   3750       /* Scalar floating point */
   3751 
   3752       case Iop_F32toI64S:
   3753       case Iop_F32toI64U:
   3754          /* I32(rm) x F32 -> I64 */
   3755          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3756 
   3757       case Iop_I64StoF32:
   3758          /* I32(rm) x I64 -> F32 */
   3759          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3760 
   3761       case Iop_RoundF64toInt:
   3762       case Iop_RoundF64toF32:
   3763       case Iop_F64toI64S:
   3764       case Iop_F64toI64U:
   3765       case Iop_I64StoF64:
   3766       case Iop_I64UtoF64:
   3767       case Iop_SinF64:
   3768       case Iop_CosF64:
   3769       case Iop_TanF64:
   3770       case Iop_2xm1F64:
   3771       case Iop_SqrtF64:
   3772       case Iop_RecpExpF64:
   3773          /* I32(rm) x I64/F64 -> I64/F64 */
   3774          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3775 
   3776       case Iop_ShlD64:
   3777       case Iop_ShrD64:
   3778       case Iop_RoundD64toInt:
   3779          /* I32(rm) x D64 -> D64 */
   3780          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3781 
   3782       case Iop_ShlD128:
   3783       case Iop_ShrD128:
   3784       case Iop_RoundD128toInt:
   3785          /* I32(rm) x D128 -> D128 */
   3786          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3787 
   3788       case Iop_RoundF128toInt:
   3789          /* I32(rm) x F128 -> F128 */
   3790          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3791 
   3792       case Iop_D64toI64S:
   3793       case Iop_D64toI64U:
   3794       case Iop_I64StoD64:
   3795       case Iop_I64UtoD64:
   3796          /* I32(rm) x I64/D64 -> D64/I64 */
   3797          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3798 
   3799       case Iop_F32toD32:
   3800       case Iop_F64toD32:
   3801       case Iop_F128toD32:
   3802       case Iop_D32toF32:
   3803       case Iop_D64toF32:
   3804       case Iop_D128toF32:
   3805          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D32/F32 */
   3806          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3807 
   3808       case Iop_F32toD64:
   3809       case Iop_F64toD64:
   3810       case Iop_F128toD64:
   3811       case Iop_D32toF64:
   3812       case Iop_D64toF64:
   3813       case Iop_D128toF64:
   3814          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D64/F64 */
   3815          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3816 
   3817       case Iop_F32toD128:
   3818       case Iop_F64toD128:
   3819       case Iop_F128toD128:
   3820       case Iop_D32toF128:
   3821       case Iop_D64toF128:
   3822       case Iop_D128toF128:
   3823          /* I32(rm) x F32/F64/F128/D32/D64/D128 -> D128/F128 */
   3824          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3825 
   3826       case Iop_RoundF32toInt:
   3827       case Iop_SqrtF32:
   3828       case Iop_RecpExpF32:
   3829          /* I32(rm) x I32/F32 -> I32/F32 */
   3830          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3831 
   3832       case Iop_SqrtF128:
   3833          /* I32(rm) x F128 -> F128 */
   3834          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3835 
   3836       case Iop_I32StoF32:
   3837       case Iop_I32UtoF32:
   3838       case Iop_F32toI32S:
   3839       case Iop_F32toI32U:
   3840          /* First arg is I32 (rounding mode), second is F32/I32 (data). */
   3841          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3842 
   3843       case Iop_F64toF16:
   3844       case Iop_F32toF16:
   3845          /* First arg is I32 (rounding mode), second is F64/F32 (data). */
   3846          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3847 
   3848       case Iop_F128toI32S: /* IRRoundingMode(I32) x F128 -> signed I32  */
   3849       case Iop_F128toI32U: /* IRRoundingMode(I32) x F128 -> unsigned I32  */
   3850       case Iop_F128toF32:  /* IRRoundingMode(I32) x F128 -> F32         */
   3851       case Iop_D128toI32S: /* IRRoundingMode(I32) x D128 -> signed I32  */
   3852       case Iop_D128toI32U: /* IRRoundingMode(I32) x D128 -> unsigned I32  */
   3853          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3854 
   3855       case Iop_F128toI64S: /* IRRoundingMode(I32) x F128 -> signed I64  */
   3856       case Iop_F128toI64U: /* IRRoundingMode(I32) x F128 -> unsigned I64  */
   3857       case Iop_F128toF64:  /* IRRoundingMode(I32) x F128 -> F64         */
   3858       case Iop_D128toD64:  /* IRRoundingMode(I32) x D128 -> D64 */
   3859       case Iop_D128toI64S: /* IRRoundingMode(I32) x D128 -> signed I64  */
   3860       case Iop_D128toI64U: /* IRRoundingMode(I32) x D128 -> unsigned I64  */
   3861          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3862 
   3863       case Iop_F64HLtoF128:
   3864       case Iop_D64HLtoD128:
   3865          return assignNew('V', mce, Ity_I128,
   3866                           binop(Iop_64HLto128, vatom1, vatom2));
   3867 
   3868       case Iop_F64toI32U:
   3869       case Iop_F64toI32S:
   3870       case Iop_F64toF32:
   3871       case Iop_I64UtoF32:
   3872       case Iop_D64toI32U:
   3873       case Iop_D64toI32S:
   3874          /* First arg is I32 (rounding mode), second is F64/D64/I64 (data). */
   3875          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3876 
   3877       case Iop_D64toD32:
   3878          /* First arg is I32 (rounding mode), second is D64 (data). */
   3879          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3880 
   3881       case Iop_F64toI16S:
   3882          /* First arg is I32 (rounding mode), second is F64 (data). */
   3883          return mkLazy2(mce, Ity_I16, vatom1, vatom2);
   3884 
   3885       case Iop_InsertExpD64:
   3886          /*  I64 x D64 -> D64 */
   3887          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3888 
   3889       case Iop_InsertExpD128:
   3890          /*  I64 x D128 -> D128 */
   3891          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3892 
   3893       case Iop_CmpF32:
   3894       case Iop_CmpF64:
   3895       case Iop_CmpF128:
   3896       case Iop_CmpD64:
   3897       case Iop_CmpD128:
   3898       case Iop_CmpExpD64:
   3899       case Iop_CmpExpD128:
   3900          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3901 
   3902       /* non-FP after here */
   3903 
   3904       case Iop_DivModU64to32:
   3905       case Iop_DivModS64to32:
   3906          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3907 
   3908       case Iop_DivModU128to64:
   3909       case Iop_DivModS128to64:
   3910          return mkLazy2(mce, Ity_I128, vatom1, vatom2);
   3911 
   3912       case Iop_8HLto16:
   3913          return assignNew('V', mce, Ity_I16, binop(op, vatom1, vatom2));
   3914       case Iop_16HLto32:
   3915          return assignNew('V', mce, Ity_I32, binop(op, vatom1, vatom2));
   3916       case Iop_32HLto64:
   3917          return assignNew('V', mce, Ity_I64, binop(op, vatom1, vatom2));
   3918 
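              /* These produce a 128-bit result from two 64-bit arguments.
                 Shadow the low 64 bits with mkLeft64(mkUifU64(v1,v2)), and set
                 the high 64 bits to the PCast of that, i.e. all-undefined
                 whenever the low-half shadow is anywhere undefined. */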
   3919       case Iop_DivModS64to64:
   3920       case Iop_MullS64:
   3921       case Iop_MullU64: {
   3922          IRAtom* vLo64 = mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   3923          IRAtom* vHi64 = mkPCastTo(mce, Ity_I64, vLo64);
   3924          return assignNew('V', mce, Ity_I128,
   3925                           binop(Iop_64HLto128, vHi64, vLo64));
   3926       }
   3927 
   3928       case Iop_MullS32:
   3929       case Iop_MullU32: {
   3930          IRAtom* vLo32 = mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3931          IRAtom* vHi32 = mkPCastTo(mce, Ity_I32, vLo32);
   3932          return assignNew('V', mce, Ity_I64,
   3933                           binop(Iop_32HLto64, vHi32, vLo32));
   3934       }
   3935 
   3936       case Iop_MullS16:
   3937       case Iop_MullU16: {
   3938          IRAtom* vLo16 = mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   3939          IRAtom* vHi16 = mkPCastTo(mce, Ity_I16, vLo16);
   3940          return assignNew('V', mce, Ity_I32,
   3941                           binop(Iop_16HLto32, vHi16, vLo16));
   3942       }
   3943 
   3944       case Iop_MullS8:
   3945       case Iop_MullU8: {
   3946          IRAtom* vLo8 = mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   3947          IRAtom* vHi8 = mkPCastTo(mce, Ity_I8, vLo8);
   3948          return assignNew('V', mce, Ity_I16, binop(Iop_8HLto16, vHi8, vLo8));
   3949       }
   3950 
   3951       case Iop_Sad8Ux4: /* maybe we could do better?  ftm, do mkLazy2. */
   3952       case Iop_DivS32:
   3953       case Iop_DivU32:
   3954       case Iop_DivU32E:
   3955       case Iop_DivS32E:
   3956       case Iop_QAdd32S: /* could probably do better */
   3957       case Iop_QSub32S: /* could probably do better */
   3958          return mkLazy2(mce, Ity_I32, vatom1, vatom2);
   3959 
   3960       case Iop_DivS64:
   3961       case Iop_DivU64:
   3962       case Iop_DivS64E:
   3963       case Iop_DivU64E:
   3964          return mkLazy2(mce, Ity_I64, vatom1, vatom2);
   3965 
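              /* For Add/Sub/Mul the cheap scheme is Left(UifU(v1,v2)):
                 undefinedness is smeared towards the most significant bit, to
                 model carry propagation out of undefined bit positions.  When
                 bogusLiterals (or, for Add, the LLVM workarounds) are in
                 force, the more precise expensiveAddSub scheme is used
                 instead. */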
   3966       case Iop_Add32:
   3967          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   3968             return expensiveAddSub(mce,True,Ity_I32,
   3969                                    vatom1,vatom2, atom1,atom2);
   3970          else
   3971             goto cheap_AddSub32;
   3972       case Iop_Sub32:
   3973          if (mce->bogusLiterals)
   3974             return expensiveAddSub(mce,False,Ity_I32,
   3975                                    vatom1,vatom2, atom1,atom2);
   3976          else
   3977             goto cheap_AddSub32;
   3978 
   3979       cheap_AddSub32:
   3980       case Iop_Mul32:
   3981          return mkLeft32(mce, mkUifU32(mce, vatom1,vatom2));
   3982 
   3983       case Iop_CmpORD32S:
   3984       case Iop_CmpORD32U:
   3985       case Iop_CmpORD64S:
   3986       case Iop_CmpORD64U:
   3987          return doCmpORD(mce, op, vatom1,vatom2, atom1,atom2);
   3988 
   3989       case Iop_Add64:
   3990          if (mce->bogusLiterals || mce->useLLVMworkarounds)
   3991             return expensiveAddSub(mce,True,Ity_I64,
   3992                                    vatom1,vatom2, atom1,atom2);
   3993          else
   3994             goto cheap_AddSub64;
   3995       case Iop_Sub64:
   3996          if (mce->bogusLiterals)
   3997             return expensiveAddSub(mce,False,Ity_I64,
   3998                                    vatom1,vatom2, atom1,atom2);
   3999          else
   4000             goto cheap_AddSub64;
   4001 
   4002       cheap_AddSub64:
   4003       case Iop_Mul64:
   4004          return mkLeft64(mce, mkUifU64(mce, vatom1,vatom2));
   4005 
   4006       case Iop_Mul16:
   4007       case Iop_Add16:
   4008       case Iop_Sub16:
   4009          return mkLeft16(mce, mkUifU16(mce, vatom1,vatom2));
   4010 
   4011       case Iop_Mul8:
   4012       case Iop_Sub8:
   4013       case Iop_Add8:
   4014          return mkLeft8(mce, mkUifU8(mce, vatom1,vatom2));
   4015 
   4016       case Iop_CmpEQ64:
   4017       case Iop_CmpNE64:
   4018          if (mce->bogusLiterals)
   4019             goto expensive_cmp64;
   4020          else
   4021             goto cheap_cmp64;
   4022 
   4023       expensive_cmp64:
   4024       case Iop_ExpCmpNE64:
   4025          return expensiveCmpEQorNE(mce,Ity_I64, vatom1,vatom2, atom1,atom2 );
   4026 
   4027       cheap_cmp64:
   4028       case Iop_CmpLE64S: case Iop_CmpLE64U:
   4029       case Iop_CmpLT64U: case Iop_CmpLT64S:
   4030          return mkPCastTo(mce, Ity_I1, mkUifU64(mce, vatom1,vatom2));
   4031 
   4032       case Iop_CmpEQ32:
   4033       case Iop_CmpNE32:
   4034          if (mce->bogusLiterals)
   4035             goto expensive_cmp32;
   4036          else
   4037             goto cheap_cmp32;
   4038 
   4039       expensive_cmp32:
   4040       case Iop_ExpCmpNE32:
   4041          return expensiveCmpEQorNE(mce,Ity_I32, vatom1,vatom2, atom1,atom2 );
   4042 
   4043       cheap_cmp32:
   4044       case Iop_CmpLE32S: case Iop_CmpLE32U:
   4045       case Iop_CmpLT32U: case Iop_CmpLT32S:
   4046          return mkPCastTo(mce, Ity_I1, mkUifU32(mce, vatom1,vatom2));
   4047 
   4048       case Iop_CmpEQ16: case Iop_CmpNE16:
   4049          return mkPCastTo(mce, Ity_I1, mkUifU16(mce, vatom1,vatom2));
   4050 
   4051       case Iop_ExpCmpNE16:
   4052          return expensiveCmpEQorNE(mce,Ity_I16, vatom1,vatom2, atom1,atom2 );
   4053 
   4054       case Iop_CmpEQ8: case Iop_CmpNE8:
   4055          return mkPCastTo(mce, Ity_I1, mkUifU8(mce, vatom1,vatom2));
   4056 
   4057       case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   4058       case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   4059       case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   4060       case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   4061          /* Just say these all produce a defined result, regardless
   4062             of their arguments.  See COMMENT_ON_CasCmpEQ in this file. */
   4063          return assignNew('V', mce, Ity_I1, definedOfType(Ity_I1));
   4064 
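              /* Scalar shifts are handled lazily by scalarShift: in outline,
                 the value's V bits are shifted by the original shift amount,
                 and the amount's definedness is then folded into the whole
                 result, so an undefined amount makes the entire result
                 undefined rather than being complained about up front
                 (contrast the vector-shift cases above). */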
   4065       case Iop_Shl64: case Iop_Shr64: case Iop_Sar64:
   4066          return scalarShift( mce, Ity_I64, op, vatom1,vatom2, atom1,atom2 );
   4067 
   4068       case Iop_Shl32: case Iop_Shr32: case Iop_Sar32:
   4069          return scalarShift( mce, Ity_I32, op, vatom1,vatom2, atom1,atom2 );
   4070 
   4071       case Iop_Shl16: case Iop_Shr16: case Iop_Sar16:
   4072          return scalarShift( mce, Ity_I16, op, vatom1,vatom2, atom1,atom2 );
   4073 
   4074       case Iop_Shl8: case Iop_Shr8: case Iop_Sar8:
   4075          return scalarShift( mce, Ity_I8, op, vatom1,vatom2, atom1,atom2 );
   4076 
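              /* For And/Or, a result bit is regarded as defined not only when
                 both argument bits are defined, but also when either argument
                 has a defined bit whose value alone forces the result (0 for
                 And, 1 for Or).  The do_And_Or code below computes
                 DifD(UifU(v1,v2), DifD(improve(a1,v1), improve(a2,v2)));
                 the 'improve' terms contribute exactly those forcing, defined
                 bits. */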
   4077       case Iop_AndV256:
   4078          uifu = mkUifUV256; difd = mkDifDV256;
   4079          and_or_ty = Ity_V256; improve = mkImproveANDV256; goto do_And_Or;
   4080       case Iop_AndV128:
   4081          uifu = mkUifUV128; difd = mkDifDV128;
   4082          and_or_ty = Ity_V128; improve = mkImproveANDV128; goto do_And_Or;
   4083       case Iop_And64:
   4084          uifu = mkUifU64; difd = mkDifD64;
   4085          and_or_ty = Ity_I64; improve = mkImproveAND64; goto do_And_Or;
   4086       case Iop_And32:
   4087          uifu = mkUifU32; difd = mkDifD32;
   4088          and_or_ty = Ity_I32; improve = mkImproveAND32; goto do_And_Or;
   4089       case Iop_And16:
   4090          uifu = mkUifU16; difd = mkDifD16;
   4091          and_or_ty = Ity_I16; improve = mkImproveAND16; goto do_And_Or;
   4092       case Iop_And8:
   4093          uifu = mkUifU8; difd = mkDifD8;
   4094          and_or_ty = Ity_I8; improve = mkImproveAND8; goto do_And_Or;
   4095 
   4096       case Iop_OrV256:
   4097          uifu = mkUifUV256; difd = mkDifDV256;
   4098          and_or_ty = Ity_V256; improve = mkImproveORV256; goto do_And_Or;
   4099       case Iop_OrV128:
   4100          uifu = mkUifUV128; difd = mkDifDV128;
   4101          and_or_ty = Ity_V128; improve = mkImproveORV128; goto do_And_Or;
   4102       case Iop_Or64:
   4103          uifu = mkUifU64; difd = mkDifD64;
   4104          and_or_ty = Ity_I64; improve = mkImproveOR64; goto do_And_Or;
   4105       case Iop_Or32:
   4106          uifu = mkUifU32; difd = mkDifD32;
   4107          and_or_ty = Ity_I32; improve = mkImproveOR32; goto do_And_Or;
   4108       case Iop_Or16:
   4109          uifu = mkUifU16; difd = mkDifD16;
   4110          and_or_ty = Ity_I16; improve = mkImproveOR16; goto do_And_Or;
   4111       case Iop_Or8:
   4112          uifu = mkUifU8; difd = mkDifD8;
   4113          and_or_ty = Ity_I8; improve = mkImproveOR8; goto do_And_Or;
   4114 
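               /* The And/Or scheme (do_And_Or below) combines three terms:
                  the UifU of the two argument V bit sets, improved by the
                  observation that a defined 0 in either arg forces an And
                  result bit to 0, and a defined 1 in either arg forces an Or
                  result bit to 1, regardless of the other arg.  The 'improve'
                  terms supply exactly those forcing bits, and DifD folds them
                  back in as 'defined'. */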
   4115       do_And_Or:
   4116          return
   4117          assignNew(
   4118             'V', mce,
   4119             and_or_ty,
   4120             difd(mce, uifu(mce, vatom1, vatom2),
   4121                       difd(mce, improve(mce, atom1, vatom1),
   4122                                 improve(mce, atom2, vatom2) ) ) );
   4123 
   4124       case Iop_Xor8:
   4125          return mkUifU8(mce, vatom1, vatom2);
   4126       case Iop_Xor16:
   4127          return mkUifU16(mce, vatom1, vatom2);
   4128       case Iop_Xor32:
   4129          return mkUifU32(mce, vatom1, vatom2);
   4130       case Iop_Xor64:
   4131          return mkUifU64(mce, vatom1, vatom2);
   4132       case Iop_XorV128:
   4133          return mkUifUV128(mce, vatom1, vatom2);
   4134       case Iop_XorV256:
   4135          return mkUifUV256(mce, vatom1, vatom2);
   4136 
   4137       /* V256-bit SIMD */
   4138 
   4139       case Iop_ShrN16x16:
   4140       case Iop_ShrN32x8:
   4141       case Iop_ShrN64x4:
   4142       case Iop_SarN16x16:
   4143       case Iop_SarN32x8:
   4144       case Iop_ShlN16x16:
   4145       case Iop_ShlN32x8:
   4146       case Iop_ShlN64x4:
   4147          /* Same scheme as with all other shifts.  Note: 22 Oct 05:
   4148             this is wrong now, scalar shifts are done properly lazily.
   4149             Vector shifts should be fixed too. */
   4150          complainIfUndefined(mce, atom2, NULL);
   4151          return assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2));
   4152 
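               /* For these lane-wise integer ops, the binaryNIxM helpers UifU
                  the two argument V bit vectors and then pessimise (PCast)
                  each N-bit lane of the result. */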
   4153       case Iop_QSub8Ux32:
   4154       case Iop_QSub8Sx32:
   4155       case Iop_Sub8x32:
   4156       case Iop_Min8Ux32:
   4157       case Iop_Min8Sx32:
   4158       case Iop_Max8Ux32:
   4159       case Iop_Max8Sx32:
   4160       case Iop_CmpGT8Sx32:
   4161       case Iop_CmpEQ8x32:
   4162       case Iop_Avg8Ux32:
   4163       case Iop_QAdd8Ux32:
   4164       case Iop_QAdd8Sx32:
   4165       case Iop_Add8x32:
   4166          return binary8Ix32(mce, vatom1, vatom2);
   4167 
   4168       case Iop_QSub16Ux16:
   4169       case Iop_QSub16Sx16:
   4170       case Iop_Sub16x16:
   4171       case Iop_Mul16x16:
   4172       case Iop_MulHi16Sx16:
   4173       case Iop_MulHi16Ux16:
   4174       case Iop_Min16Sx16:
   4175       case Iop_Min16Ux16:
   4176       case Iop_Max16Sx16:
   4177       case Iop_Max16Ux16:
   4178       case Iop_CmpGT16Sx16:
   4179       case Iop_CmpEQ16x16:
   4180       case Iop_Avg16Ux16:
   4181       case Iop_QAdd16Ux16:
   4182       case Iop_QAdd16Sx16:
   4183       case Iop_Add16x16:
   4184          return binary16Ix16(mce, vatom1, vatom2);
   4185 
   4186       case Iop_Sub32x8:
   4187       case Iop_CmpGT32Sx8:
   4188       case Iop_CmpEQ32x8:
   4189       case Iop_Add32x8:
   4190       case Iop_Max32Ux8:
   4191       case Iop_Max32Sx8:
   4192       case Iop_Min32Ux8:
   4193       case Iop_Min32Sx8:
   4194       case Iop_Mul32x8:
   4195          return binary32Ix8(mce, vatom1, vatom2);
   4196 
   4197       case Iop_Sub64x4:
   4198       case Iop_Add64x4:
   4199       case Iop_CmpEQ64x4:
   4200       case Iop_CmpGT64Sx4:
   4201          return binary64Ix4(mce, vatom1, vatom2);
   4202 
   4203      /* Perm32x8: rearrange values in left arg using steering values
   4204         from right arg.  So rearrange the vbits in the same way but
   4205         pessimise wrt steering values. */
   4206       case Iop_Perm32x8:
   4207          return mkUifUV256(
   4208                    mce,
   4209                    assignNew('V', mce, Ity_V256, binop(op, vatom1, atom2)),
   4210                    mkPCast32x8(mce, vatom2)
   4211                 );
   4212 
   4213       /* Q-and-Qshift-by-vector of the form (V128, V128) -> V256.
   4214          Handle the shifted results in the same way that other
   4215          binary Q ops are handled, eg QSub: UifU the two args,
   4216          then pessimise -- which is binaryNIxM.  But for the upper
    4217          V128, we need to generate just 1 bit, which is the
   4218          pessimised shift result, with 127 defined zeroes above it.
   4219 
    4220          Note that this is overly pessimistic, in that in fact only the
   4221          bottom 8 bits of each lane of the second arg determine the shift
   4222          amount.  Really we ought to ignore any undefinedness in the
   4223          rest of the lanes of the second arg. */
   4224       case Iop_QandSQsh64x2:  case Iop_QandUQsh64x2:
   4225       case Iop_QandSQRsh64x2: case Iop_QandUQRsh64x2:
   4226       case Iop_QandSQsh32x4:  case Iop_QandUQsh32x4:
   4227       case Iop_QandSQRsh32x4: case Iop_QandUQRsh32x4:
   4228       case Iop_QandSQsh16x8:  case Iop_QandUQsh16x8:
   4229       case Iop_QandSQRsh16x8: case Iop_QandUQRsh16x8:
   4230       case Iop_QandSQsh8x16:  case Iop_QandUQsh8x16:
   4231       case Iop_QandSQRsh8x16: case Iop_QandUQRsh8x16:
   4232       {
   4233          // The function to generate the pessimised shift result
   4234          IRAtom* (*binaryNIxM)(MCEnv*,IRAtom*,IRAtom*) = NULL;
   4235          switch (op) {
   4236             case Iop_QandSQsh64x2:
   4237             case Iop_QandUQsh64x2:
   4238             case Iop_QandSQRsh64x2:
   4239             case Iop_QandUQRsh64x2:
   4240                binaryNIxM = binary64Ix2;
   4241                break;
   4242             case Iop_QandSQsh32x4:
   4243             case Iop_QandUQsh32x4:
   4244             case Iop_QandSQRsh32x4:
   4245             case Iop_QandUQRsh32x4:
   4246                binaryNIxM = binary32Ix4;
   4247                break;
   4248             case Iop_QandSQsh16x8:
   4249             case Iop_QandUQsh16x8:
   4250             case Iop_QandSQRsh16x8:
   4251             case Iop_QandUQRsh16x8:
   4252                binaryNIxM = binary16Ix8;
   4253                break;
   4254             case Iop_QandSQsh8x16:
   4255             case Iop_QandUQsh8x16:
   4256             case Iop_QandSQRsh8x16:
   4257             case Iop_QandUQRsh8x16:
   4258                binaryNIxM = binary8Ix16;
   4259                break;
   4260             default:
   4261                tl_assert(0);
   4262          }
   4263          tl_assert(binaryNIxM);
   4264          // Pessimised shift result, shV[127:0]
   4265          IRAtom* shV = binaryNIxM(mce, vatom1, vatom2);
   4266          // Generates: Def--(127)--Def PCast-to-I1(shV)
   4267          IRAtom* qV = mkPCastXXtoXXlsb(mce, shV, Ity_V128);
   4268          // and assemble the result
   4269          return assignNew('V', mce, Ity_V256,
   4270                           binop(Iop_V128HLtoV256, qV, shV));
   4271       }
   4272 
   4273       default:
   4274          ppIROp(op);
   4275          VG_(tool_panic)("memcheck:expr2vbits_Binop");
   4276    }
   4277 }
   4278 
   4279 
   4280 static
   4281 IRExpr* expr2vbits_Unop ( MCEnv* mce, IROp op, IRAtom* atom )
   4282 {
   4283    /* For the widening operations {8,16,32}{U,S}to{16,32,64}, the
   4284       selection of shadow operation implicitly duplicates the logic in
   4285       do_shadow_LoadG and should be kept in sync (in the very unlikely
   4286       event that the interpretation of such widening ops changes in
   4287       future).  See comment in do_shadow_LoadG. */
   4288    IRAtom* vatom = expr2vbits( mce, atom );
   4289    tl_assert(isOriginalAtom(mce,atom));
   4290    switch (op) {
   4291 
   4292       case Iop_Abs64Fx2:
   4293       case Iop_Neg64Fx2:
   4294       case Iop_RSqrtEst64Fx2:
   4295       case Iop_RecipEst64Fx2:
   4296          return unary64Fx2(mce, vatom);
   4297 
   4298       case Iop_Sqrt64F0x2:
   4299          return unary64F0x2(mce, vatom);
   4300 
   4301       case Iop_Sqrt32Fx8:
   4302       case Iop_RSqrtEst32Fx8:
   4303       case Iop_RecipEst32Fx8:
   4304          return unary32Fx8(mce, vatom);
   4305 
   4306       case Iop_Sqrt64Fx4:
   4307          return unary64Fx4(mce, vatom);
   4308 
   4309       case Iop_RecipEst32Fx4:
   4310       case Iop_I32UtoFx4:
   4311       case Iop_I32StoFx4:
   4312       case Iop_QFtoI32Ux4_RZ:
   4313       case Iop_QFtoI32Sx4_RZ:
   4314       case Iop_RoundF32x4_RM:
   4315       case Iop_RoundF32x4_RP:
   4316       case Iop_RoundF32x4_RN:
   4317       case Iop_RoundF32x4_RZ:
   4318       case Iop_RecipEst32Ux4:
   4319       case Iop_Abs32Fx4:
   4320       case Iop_Neg32Fx4:
   4321       case Iop_RSqrtEst32Fx4:
   4322          return unary32Fx4(mce, vatom);
   4323 
   4324       case Iop_I32UtoFx2:
   4325       case Iop_I32StoFx2:
   4326       case Iop_RecipEst32Fx2:
   4327       case Iop_RecipEst32Ux2:
   4328       case Iop_Abs32Fx2:
   4329       case Iop_Neg32Fx2:
   4330       case Iop_RSqrtEst32Fx2:
   4331          return unary32Fx2(mce, vatom);
   4332 
   4333       case Iop_Sqrt32F0x4:
   4334       case Iop_RSqrtEst32F0x4:
   4335       case Iop_RecipEst32F0x4:
   4336          return unary32F0x4(mce, vatom);
   4337 
   4338       case Iop_32UtoV128:
   4339       case Iop_64UtoV128:
   4340       case Iop_Dup8x16:
   4341       case Iop_Dup16x8:
   4342       case Iop_Dup32x4:
   4343       case Iop_Reverse1sIn8_x16:
   4344       case Iop_Reverse8sIn16_x8:
   4345       case Iop_Reverse8sIn32_x4:
   4346       case Iop_Reverse16sIn32_x4:
   4347       case Iop_Reverse8sIn64_x2:
   4348       case Iop_Reverse16sIn64_x2:
   4349       case Iop_Reverse32sIn64_x2:
   4350       case Iop_V256toV128_1: case Iop_V256toV128_0:
   4351       case Iop_ZeroHI64ofV128:
   4352       case Iop_ZeroHI96ofV128:
   4353       case Iop_ZeroHI112ofV128:
   4354       case Iop_ZeroHI120ofV128:
   4355          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4356 
   4357       case Iop_F128HItoF64:  /* F128 -> high half of F128 */
   4358       case Iop_D128HItoD64:  /* D128 -> high half of D128 */
   4359          return assignNew('V', mce, Ity_I64, unop(Iop_128HIto64, vatom));
   4360       case Iop_F128LOtoF64:  /* F128 -> low  half of F128 */
   4361       case Iop_D128LOtoD64:  /* D128 -> low  half of D128 */
   4362          return assignNew('V', mce, Ity_I64, unop(Iop_128to64, vatom));
   4363 
   4364       case Iop_NegF128:
   4365       case Iop_AbsF128:
   4366          return mkPCastTo(mce, Ity_I128, vatom);
   4367 
   4368       case Iop_I32StoF128: /* signed I32 -> F128 */
   4369       case Iop_I64StoF128: /* signed I64 -> F128 */
   4370       case Iop_I32UtoF128: /* unsigned I32 -> F128 */
   4371       case Iop_I64UtoF128: /* unsigned I64 -> F128 */
   4372       case Iop_F32toF128:  /* F32 -> F128 */
   4373       case Iop_F64toF128:  /* F64 -> F128 */
    4374       case Iop_I32StoD128: /* signed I32 -> D128 */
   4375       case Iop_I64StoD128: /* signed I64 -> D128 */
   4376       case Iop_I32UtoD128: /* unsigned I32 -> D128 */
   4377       case Iop_I64UtoD128: /* unsigned I64 -> D128 */
   4378          return mkPCastTo(mce, Ity_I128, vatom);
   4379 
   4380       case Iop_F16toF64:
   4381       case Iop_F32toF64:
   4382       case Iop_I32StoF64:
   4383       case Iop_I32UtoF64:
   4384       case Iop_NegF64:
   4385       case Iop_AbsF64:
   4386       case Iop_RSqrtEst5GoodF64:
   4387       case Iop_RoundF64toF64_NEAREST:
   4388       case Iop_RoundF64toF64_NegINF:
   4389       case Iop_RoundF64toF64_PosINF:
   4390       case Iop_RoundF64toF64_ZERO:
   4391       case Iop_Clz64:
   4392       case Iop_D32toD64:
   4393       case Iop_I32StoD64:
   4394       case Iop_I32UtoD64:
   4395       case Iop_ExtractExpD64:    /* D64  -> I64 */
   4396       case Iop_ExtractExpD128:   /* D128 -> I64 */
   4397       case Iop_ExtractSigD64:    /* D64  -> I64 */
   4398       case Iop_ExtractSigD128:   /* D128 -> I64 */
   4399       case Iop_DPBtoBCD:
   4400       case Iop_BCDtoDPB:
   4401          return mkPCastTo(mce, Ity_I64, vatom);
   4402 
   4403       case Iop_D64toD128:
   4404          return mkPCastTo(mce, Ity_I128, vatom);
   4405 
   4406       case Iop_Clz32:
   4407       case Iop_TruncF64asF32:
   4408       case Iop_NegF32:
   4409       case Iop_AbsF32:
   4410       case Iop_F16toF32:
   4411          return mkPCastTo(mce, Ity_I32, vatom);
   4412 
   4413       case Iop_Ctz32:
   4414       case Iop_Ctz64:
   4415          return expensiveCountTrailingZeroes(mce, op, atom, vatom);
   4416 
   4417       case Iop_1Uto64:
   4418       case Iop_1Sto64:
   4419       case Iop_8Uto64:
   4420       case Iop_8Sto64:
   4421       case Iop_16Uto64:
   4422       case Iop_16Sto64:
   4423       case Iop_32Sto64:
   4424       case Iop_32Uto64:
   4425       case Iop_V128to64:
   4426       case Iop_V128HIto64:
   4427       case Iop_128HIto64:
   4428       case Iop_128to64:
   4429       case Iop_Dup8x8:
   4430       case Iop_Dup16x4:
   4431       case Iop_Dup32x2:
   4432       case Iop_Reverse8sIn16_x4:
   4433       case Iop_Reverse8sIn32_x2:
   4434       case Iop_Reverse16sIn32_x2:
   4435       case Iop_Reverse8sIn64_x1:
   4436       case Iop_Reverse16sIn64_x1:
   4437       case Iop_Reverse32sIn64_x1:
   4438       case Iop_V256to64_0: case Iop_V256to64_1:
   4439       case Iop_V256to64_2: case Iop_V256to64_3:
   4440          return assignNew('V', mce, Ity_I64, unop(op, vatom));
   4441 
   4442       case Iop_64to32:
   4443       case Iop_64HIto32:
   4444       case Iop_1Uto32:
   4445       case Iop_1Sto32:
   4446       case Iop_8Uto32:
   4447       case Iop_16Uto32:
   4448       case Iop_16Sto32:
   4449       case Iop_8Sto32:
   4450       case Iop_V128to32:
   4451          return assignNew('V', mce, Ity_I32, unop(op, vatom));
   4452 
   4453       case Iop_8Sto16:
   4454       case Iop_8Uto16:
   4455       case Iop_32to16:
   4456       case Iop_32HIto16:
   4457       case Iop_64to16:
   4458       case Iop_GetMSBs8x16:
   4459          return assignNew('V', mce, Ity_I16, unop(op, vatom));
   4460 
   4461       case Iop_1Uto8:
   4462       case Iop_1Sto8:
   4463       case Iop_16to8:
   4464       case Iop_16HIto8:
   4465       case Iop_32to8:
   4466       case Iop_64to8:
   4467       case Iop_GetMSBs8x8:
   4468          return assignNew('V', mce, Ity_I8, unop(op, vatom));
   4469 
   4470       case Iop_32to1:
   4471          return assignNew('V', mce, Ity_I1, unop(Iop_32to1, vatom));
   4472 
   4473       case Iop_64to1:
   4474          return assignNew('V', mce, Ity_I1, unop(Iop_64to1, vatom));
   4475 
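               /* Bit-for-bit reinterpretations and bitwise NOTs leave the
                  definedness of each individual bit unchanged, so the shadow
                  value is passed through as-is. */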
   4476       case Iop_ReinterpF64asI64:
   4477       case Iop_ReinterpI64asF64:
   4478       case Iop_ReinterpI32asF32:
   4479       case Iop_ReinterpF32asI32:
   4480       case Iop_ReinterpI64asD64:
   4481       case Iop_ReinterpD64asI64:
   4482       case Iop_NotV256:
   4483       case Iop_NotV128:
   4484       case Iop_Not64:
   4485       case Iop_Not32:
   4486       case Iop_Not16:
   4487       case Iop_Not8:
   4488       case Iop_Not1:
   4489          return vatom;
   4490 
   4491       case Iop_CmpNEZ8x8:
   4492       case Iop_Cnt8x8:
   4493       case Iop_Clz8x8:
   4494       case Iop_Cls8x8:
   4495       case Iop_Abs8x8:
   4496          return mkPCast8x8(mce, vatom);
   4497 
   4498       case Iop_CmpNEZ8x16:
   4499       case Iop_Cnt8x16:
   4500       case Iop_Clz8x16:
   4501       case Iop_Cls8x16:
   4502       case Iop_Abs8x16:
   4503          return mkPCast8x16(mce, vatom);
   4504 
   4505       case Iop_CmpNEZ16x4:
   4506       case Iop_Clz16x4:
   4507       case Iop_Cls16x4:
   4508       case Iop_Abs16x4:
   4509          return mkPCast16x4(mce, vatom);
   4510 
   4511       case Iop_CmpNEZ16x8:
   4512       case Iop_Clz16x8:
   4513       case Iop_Cls16x8:
   4514       case Iop_Abs16x8:
   4515          return mkPCast16x8(mce, vatom);
   4516 
   4517       case Iop_CmpNEZ32x2:
   4518       case Iop_Clz32x2:
   4519       case Iop_Cls32x2:
   4520       case Iop_FtoI32Ux2_RZ:
   4521       case Iop_FtoI32Sx2_RZ:
   4522       case Iop_Abs32x2:
   4523          return mkPCast32x2(mce, vatom);
   4524 
   4525       case Iop_CmpNEZ32x4:
   4526       case Iop_Clz32x4:
   4527       case Iop_Cls32x4:
   4528       case Iop_FtoI32Ux4_RZ:
   4529       case Iop_FtoI32Sx4_RZ:
   4530       case Iop_Abs32x4:
   4531       case Iop_RSqrtEst32Ux4:
   4532          return mkPCast32x4(mce, vatom);
   4533 
   4534       case Iop_CmpwNEZ32:
   4535          return mkPCastTo(mce, Ity_I32, vatom);
   4536 
   4537       case Iop_CmpwNEZ64:
   4538          return mkPCastTo(mce, Ity_I64, vatom);
   4539 
   4540       case Iop_CmpNEZ64x2:
   4541       case Iop_CipherSV128:
   4542       case Iop_Clz64x2:
   4543       case Iop_Abs64x2:
   4544          return mkPCast64x2(mce, vatom);
   4545 
   4546       case Iop_PwBitMtxXpose64x2:
   4547          return assignNew('V', mce, Ity_V128, unop(op, vatom));
   4548 
   4549       case Iop_NarrowUn16to8x8:
   4550       case Iop_NarrowUn32to16x4:
   4551       case Iop_NarrowUn64to32x2:
   4552       case Iop_QNarrowUn16Sto8Sx8:
   4553       case Iop_QNarrowUn16Sto8Ux8:
   4554       case Iop_QNarrowUn16Uto8Ux8:
   4555       case Iop_QNarrowUn32Sto16Sx4:
   4556       case Iop_QNarrowUn32Sto16Ux4:
   4557       case Iop_QNarrowUn32Uto16Ux4:
   4558       case Iop_QNarrowUn64Sto32Sx2:
   4559       case Iop_QNarrowUn64Sto32Ux2:
   4560       case Iop_QNarrowUn64Uto32Ux2:
   4561          return vectorNarrowUnV128(mce, op, vatom);
   4562 
   4563       case Iop_Widen8Sto16x8:
   4564       case Iop_Widen8Uto16x8:
   4565       case Iop_Widen16Sto32x4:
   4566       case Iop_Widen16Uto32x4:
   4567       case Iop_Widen32Sto64x2:
   4568       case Iop_Widen32Uto64x2:
   4569          return vectorWidenI64(mce, op, vatom);
   4570 
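               /* Pairwise-widening adds: pessimise (PCast) each source lane,
                  apply the original op so the undefinedness lands in the
                  corresponding destination lanes, then pessimise again at the
                  destination lane size (or, for the 32x2 case, across the
                  whole 64-bit result). */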
   4571       case Iop_PwAddL32Ux2:
   4572       case Iop_PwAddL32Sx2:
   4573          return mkPCastTo(mce, Ity_I64,
   4574                assignNew('V', mce, Ity_I64, unop(op, mkPCast32x2(mce, vatom))));
   4575 
   4576       case Iop_PwAddL16Ux4:
   4577       case Iop_PwAddL16Sx4:
   4578          return mkPCast32x2(mce,
   4579                assignNew('V', mce, Ity_I64, unop(op, mkPCast16x4(mce, vatom))));
   4580 
   4581       case Iop_PwAddL8Ux8:
   4582       case Iop_PwAddL8Sx8:
   4583          return mkPCast16x4(mce,
   4584                assignNew('V', mce, Ity_I64, unop(op, mkPCast8x8(mce, vatom))));
   4585 
   4586       case Iop_PwAddL32Ux4:
   4587       case Iop_PwAddL32Sx4:
   4588          return mkPCast64x2(mce,
   4589                assignNew('V', mce, Ity_V128, unop(op, mkPCast32x4(mce, vatom))));
   4590 
   4591       case Iop_PwAddL16Ux8:
   4592       case Iop_PwAddL16Sx8:
   4593          return mkPCast32x4(mce,
   4594                assignNew('V', mce, Ity_V128, unop(op, mkPCast16x8(mce, vatom))));
   4595 
   4596       case Iop_PwAddL8Ux16:
   4597       case Iop_PwAddL8Sx16:
   4598          return mkPCast16x8(mce,
   4599                assignNew('V', mce, Ity_V128, unop(op, mkPCast8x16(mce, vatom))));
   4600 
   4601       case Iop_I64UtoF32:
   4602       default:
   4603          ppIROp(op);
   4604          VG_(tool_panic)("memcheck:expr2vbits_Unop");
   4605    }
   4606 }
   4607 
   4608 
   4609 /* Worker function -- do not call directly.  See comments on
   4610    expr2vbits_Load for the meaning of |guard|.
   4611 
   4612    Generates IR to (1) perform a definedness test of |addr|, (2)
   4613    perform a validity test of |addr|, and (3) return the Vbits for the
   4614    location indicated by |addr|.  All of this only happens when
   4615    |guard| is NULL or |guard| evaluates to True at run time.
   4616 
   4617    If |guard| evaluates to False at run time, the returned value is
    4618    the IR-mandated 0x55..55 value, and neither checks nor shadow loads are
   4619    performed.
   4620 
   4621    The definedness of |guard| itself is not checked.  That is assumed
   4622    to have been done before this point, by the caller. */
   4623 static
   4624 IRAtom* expr2vbits_Load_WRK ( MCEnv* mce,
   4625                               IREndness end, IRType ty,
   4626                               IRAtom* addr, UInt bias, IRAtom* guard )
   4627 {
   4628    tl_assert(isOriginalAtom(mce,addr));
   4629    tl_assert(end == Iend_LE || end == Iend_BE);
   4630 
   4631    /* First, emit a definedness test for the address.  This also sets
   4632       the address (shadow) to 'defined' following the test. */
   4633    complainIfUndefined( mce, addr, guard );
   4634 
   4635    /* Now cook up a call to the relevant helper function, to read the
   4636       data V bits from shadow memory. */
   4637    ty = shadowTypeV(ty);
   4638 
   4639    void*        helper           = NULL;
   4640    const HChar* hname            = NULL;
   4641    Bool         ret_via_outparam = False;
   4642 
   4643    if (end == Iend_LE) {
   4644       switch (ty) {
   4645          case Ity_V256: helper = &MC_(helperc_LOADV256le);
   4646                         hname = "MC_(helperc_LOADV256le)";
   4647                         ret_via_outparam = True;
   4648                         break;
   4649          case Ity_V128: helper = &MC_(helperc_LOADV128le);
   4650                         hname = "MC_(helperc_LOADV128le)";
   4651                         ret_via_outparam = True;
   4652                         break;
   4653          case Ity_I64:  helper = &MC_(helperc_LOADV64le);
   4654                         hname = "MC_(helperc_LOADV64le)";
   4655                         break;
   4656          case Ity_I32:  helper = &MC_(helperc_LOADV32le);
   4657                         hname = "MC_(helperc_LOADV32le)";
   4658                         break;
   4659          case Ity_I16:  helper = &MC_(helperc_LOADV16le);
   4660                         hname = "MC_(helperc_LOADV16le)";
   4661                         break;
   4662          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4663                         hname = "MC_(helperc_LOADV8)";
   4664                         break;
   4665          default:       ppIRType(ty);
   4666                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(LE)");
   4667       }
   4668    } else {
   4669       switch (ty) {
   4670          case Ity_V256: helper = &MC_(helperc_LOADV256be);
   4671                         hname = "MC_(helperc_LOADV256be)";
   4672                         ret_via_outparam = True;
   4673                         break;
   4674          case Ity_V128: helper = &MC_(helperc_LOADV128be);
   4675                         hname = "MC_(helperc_LOADV128be)";
   4676                         ret_via_outparam = True;
   4677                         break;
   4678          case Ity_I64:  helper = &MC_(helperc_LOADV64be);
   4679                         hname = "MC_(helperc_LOADV64be)";
   4680                         break;
   4681          case Ity_I32:  helper = &MC_(helperc_LOADV32be);
   4682                         hname = "MC_(helperc_LOADV32be)";
   4683                         break;
   4684          case Ity_I16:  helper = &MC_(helperc_LOADV16be);
   4685                         hname = "MC_(helperc_LOADV16be)";
   4686                         break;
   4687          case Ity_I8:   helper = &MC_(helperc_LOADV8);
   4688                         hname = "MC_(helperc_LOADV8)";
   4689                         break;
   4690          default:       ppIRType(ty);
   4691                         VG_(tool_panic)("memcheck:expr2vbits_Load_WRK(BE)");
   4692       }
   4693    }
   4694 
   4695    tl_assert(helper);
   4696    tl_assert(hname);
   4697 
   4698    /* Generate the actual address into addrAct. */
   4699    IRAtom* addrAct;
   4700    if (bias == 0) {
   4701       addrAct = addr;
   4702    } else {
   4703       IROp    mkAdd;
   4704       IRAtom* eBias;
   4705       IRType  tyAddr  = mce->hWordTy;
   4706       tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   4707       mkAdd   = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   4708       eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   4709       addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias) );
   4710    }
   4711 
   4712    /* We need to have a place to park the V bits we're just about to
   4713       read. */
   4714    IRTemp datavbits = newTemp(mce, ty, VSh);
   4715 
   4716    /* Here's the call. */
   4717    IRDirty* di;
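            /* A V128 or V256 value can't be returned in an integer register,
               so for those types the helper instead writes its result through
               the IRExpr_VECRET() out-parameter -- hence ret_via_outparam
               above. */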
   4718    if (ret_via_outparam) {
   4719       di = unsafeIRDirty_1_N( datavbits,
   4720                               2/*regparms*/,
   4721                               hname, VG_(fnptr_to_fnentry)( helper ),
   4722                               mkIRExprVec_2( IRExpr_VECRET(), addrAct ) );
   4723    } else {
   4724       di = unsafeIRDirty_1_N( datavbits,
   4725                               1/*regparms*/,
   4726                               hname, VG_(fnptr_to_fnentry)( helper ),
   4727                               mkIRExprVec_1( addrAct ) );
   4728    }
   4729 
   4730    setHelperAnns( mce, di );
   4731    if (guard) {
   4732       di->guard = guard;
   4733       /* Ideally the didn't-happen return value here would be all-ones
   4734          (all-undefined), so it'd be obvious if it got used
   4735          inadvertently.  We can get by with the IR-mandated default
   4736          value (0b01 repeating, 0x55 etc) as that'll still look pretty
   4737          undefined if it ever leaks out. */
   4738    }
   4739    stmt( 'V', mce, IRStmt_Dirty(di) );
   4740 
   4741    return mkexpr(datavbits);
   4742 }
   4743 
   4744 
   4745 /* Generate IR to do a shadow load.  The helper is expected to check
   4746    the validity of the address and return the V bits for that address.
   4747    This can optionally be controlled by a guard, which is assumed to
   4748    be True if NULL.  In the case where the guard is False at runtime,
   4749    the helper will return the didn't-do-the-call value of 0x55..55.
   4750    Since that means "completely undefined result", the caller of
   4751    this function will need to fix up the result somehow in that
   4752    case.
   4753 
   4754    Caller of this function is also expected to have checked the
   4755    definedness of |guard| before this point.
   4756 */
   4757 static
   4758 IRAtom* expr2vbits_Load ( MCEnv* mce,
   4759                           IREndness end, IRType ty,
   4760                           IRAtom* addr, UInt bias,
   4761                           IRAtom* guard )
   4762 {
   4763    tl_assert(end == Iend_LE || end == Iend_BE);
   4764    switch (shadowTypeV(ty)) {
   4765       case Ity_I8:
   4766       case Ity_I16:
   4767       case Ity_I32:
   4768       case Ity_I64:
   4769       case Ity_V128:
   4770       case Ity_V256:
   4771          return expr2vbits_Load_WRK(mce, end, ty, addr, bias, guard);
   4772       default:
   4773          VG_(tool_panic)("expr2vbits_Load");
   4774    }
   4775 }
   4776 
   4777 
   4778 /* The most general handler for guarded loads.  Assumes the
   4779    definedness of GUARD has already been checked by the caller.  A
   4780    GUARD of NULL is assumed to mean "always True".  Generates code to
   4781    check the definedness and validity of ADDR.
   4782 
   4783    Generate IR to do a shadow load from ADDR and return the V bits.
   4784    The loaded type is TY.  The loaded data is then (shadow) widened by
   4785    using VWIDEN, which can be Iop_INVALID to denote a no-op.  If GUARD
   4786    evaluates to False at run time then the returned Vbits are simply
   4787    VALT instead.  Note therefore that the argument type of VWIDEN must
   4788    be TY and the result type of VWIDEN must equal the type of VALT.
   4789 */
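         /* Roughly, ignoring the address checks, the generated IR is:
               ITE(GUARD, VWIDEN(shadow-load of TY at ADDR+BIAS), VALT). */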
   4790 static
   4791 IRAtom* expr2vbits_Load_guarded_General ( MCEnv* mce,
   4792                                           IREndness end, IRType ty,
   4793                                           IRAtom* addr, UInt bias,
   4794                                           IRAtom* guard,
   4795                                           IROp vwiden, IRAtom* valt )
   4796 {
   4797    /* Sanity check the conversion operation, and also set TYWIDE. */
   4798    IRType tyWide = Ity_INVALID;
   4799    switch (vwiden) {
   4800       case Iop_INVALID:
   4801          tyWide = ty;
   4802          break;
   4803       case Iop_16Uto32: case Iop_16Sto32: case Iop_8Uto32: case Iop_8Sto32:
   4804          tyWide = Ity_I32;
   4805          break;
   4806       default:
   4807          VG_(tool_panic)("memcheck:expr2vbits_Load_guarded_General");
   4808    }
   4809 
   4810    /* If the guard evaluates to True, this will hold the loaded V bits
    4811       at TY.  If the guard evaluates to False, this will instead be the
    4812       IR-mandated didn't-happen value (0x55..55, mostly undefined), in
    4813       which case we will have to replace it using an ITE below. */
   4814    IRAtom* iftrue1
   4815       = assignNew('V', mce, ty,
   4816                   expr2vbits_Load(mce, end, ty, addr, bias, guard));
   4817    /* Now (shadow-) widen the loaded V bits to the desired width.  In
   4818       the guard-is-False case, the allowable widening operators will
   4819       in the worst case (unsigned widening) at least leave the
   4820       pre-widened part as being marked all-undefined, and in the best
   4821       case (signed widening) mark the whole widened result as
   4822       undefined.  Anyway, it doesn't matter really, since in this case
   4823       we will replace said value with the default value |valt| using an
   4824       ITE. */
   4825    IRAtom* iftrue2
   4826       = vwiden == Iop_INVALID
   4827            ? iftrue1
   4828            : assignNew('V', mce, tyWide, unop(vwiden, iftrue1));
   4829    /* These are the V bits we will return if the load doesn't take
   4830       place. */
   4831    IRAtom* iffalse
   4832       = valt;
   4833    /* Prepare the cond for the ITE.  Convert a NULL cond into
   4834       something that iropt knows how to fold out later. */
   4835    IRAtom* cond
   4836       = guard == NULL  ? mkU1(1)  : guard;
   4837    /* And assemble the final result. */
   4838    return assignNew('V', mce, tyWide, IRExpr_ITE(cond, iftrue2, iffalse));
   4839 }
   4840 
   4841 
   4842 /* A simpler handler for guarded loads, in which there is no
   4843    conversion operation, and the default V bit return (when the guard
   4844    evaluates to False at runtime) is "all defined".  If there is no
   4845    guard expression or the guard is always TRUE this function behaves
   4846    like expr2vbits_Load.  It is assumed that definedness of GUARD has
   4847    already been checked at the call site. */
   4848 static
   4849 IRAtom* expr2vbits_Load_guarded_Simple ( MCEnv* mce,
   4850                                          IREndness end, IRType ty,
   4851                                          IRAtom* addr, UInt bias,
   4852                                          IRAtom *guard )
   4853 {
   4854    return expr2vbits_Load_guarded_General(
   4855              mce, end, ty, addr, bias, guard, Iop_INVALID, definedOfType(ty)
   4856           );
   4857 }
   4858 
   4859 
   4860 static
   4861 IRAtom* expr2vbits_ITE ( MCEnv* mce,
   4862                          IRAtom* cond, IRAtom* iftrue, IRAtom* iffalse )
   4863 {
   4864    IRAtom *vbitsC, *vbits0, *vbits1;
   4865    IRType ty;
   4866    /* Given ITE(cond, iftrue,  iffalse),  generate
   4867             ITE(cond, iftrue#, iffalse#) `UifU` PCast(cond#)
   4868       That is, steer the V bits like the originals, but trash the
   4869       result if the steering value is undefined.  This gives
   4870       lazy propagation. */
   4871    tl_assert(isOriginalAtom(mce, cond));
   4872    tl_assert(isOriginalAtom(mce, iftrue));
   4873    tl_assert(isOriginalAtom(mce, iffalse));
   4874 
   4875    vbitsC = expr2vbits(mce, cond);
   4876    vbits1 = expr2vbits(mce, iftrue);
   4877    vbits0 = expr2vbits(mce, iffalse);
   4878    ty = typeOfIRExpr(mce->sb->tyenv, vbits0);
   4879 
   4880    return
   4881       mkUifU(mce, ty, assignNew('V', mce, ty,
   4882                                      IRExpr_ITE(cond, vbits1, vbits0)),
   4883                       mkPCastTo(mce, ty, vbitsC) );
   4884 }
   4885 
   4886 /* --------- This is the main expression-handling function. --------- */
   4887 
   4888 static
   4889 IRExpr* expr2vbits ( MCEnv* mce, IRExpr* e )
   4890 {
   4891    switch (e->tag) {
   4892 
   4893       case Iex_Get:
   4894          return shadow_GET( mce, e->Iex.Get.offset, e->Iex.Get.ty );
   4895 
   4896       case Iex_GetI:
   4897          return shadow_GETI( mce, e->Iex.GetI.descr,
   4898                                   e->Iex.GetI.ix, e->Iex.GetI.bias );
   4899 
   4900       case Iex_RdTmp:
   4901          return IRExpr_RdTmp( findShadowTmpV(mce, e->Iex.RdTmp.tmp) );
   4902 
   4903       case Iex_Const:
   4904          return definedOfType(shadowTypeV(typeOfIRExpr(mce->sb->tyenv, e)));
   4905 
   4906       case Iex_Qop:
   4907          return expr2vbits_Qop(
   4908                    mce,
   4909                    e->Iex.Qop.details->op,
   4910                    e->Iex.Qop.details->arg1, e->Iex.Qop.details->arg2,
   4911                    e->Iex.Qop.details->arg3, e->Iex.Qop.details->arg4
   4912                 );
   4913 
   4914       case Iex_Triop:
   4915          return expr2vbits_Triop(
   4916                    mce,
   4917                    e->Iex.Triop.details->op,
   4918                    e->Iex.Triop.details->arg1, e->Iex.Triop.details->arg2,
   4919                    e->Iex.Triop.details->arg3
   4920                 );
   4921 
   4922       case Iex_Binop:
   4923          return expr2vbits_Binop(
   4924                    mce,
   4925                    e->Iex.Binop.op,
   4926                    e->Iex.Binop.arg1, e->Iex.Binop.arg2
   4927                 );
   4928 
   4929       case Iex_Unop:
   4930          return expr2vbits_Unop( mce, e->Iex.Unop.op, e->Iex.Unop.arg );
   4931 
   4932       case Iex_Load:
   4933          return expr2vbits_Load( mce, e->Iex.Load.end,
   4934                                       e->Iex.Load.ty,
   4935                                       e->Iex.Load.addr, 0/*addr bias*/,
   4936                                       NULL/* guard == "always True"*/ );
   4937 
   4938       case Iex_CCall:
   4939          return mkLazyN( mce, e->Iex.CCall.args,
   4940                               e->Iex.CCall.retty,
   4941                               e->Iex.CCall.cee );
   4942 
   4943       case Iex_ITE:
   4944          return expr2vbits_ITE( mce, e->Iex.ITE.cond, e->Iex.ITE.iftrue,
   4945                                      e->Iex.ITE.iffalse);
   4946 
   4947       default:
   4948          VG_(printf)("\n");
   4949          ppIRExpr(e);
   4950          VG_(printf)("\n");
   4951          VG_(tool_panic)("memcheck: expr2vbits");
   4952    }
   4953 }
   4954 
   4955 /*------------------------------------------------------------*/
   4956 /*--- Generate shadow stmts from all kinds of IRStmts.     ---*/
   4957 /*------------------------------------------------------------*/
   4958 
   4959 /* Widen a value to the host word size. */
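         /* (Unsigned widening of the V bits.  Used by do_shadow_Store below
            to pass sub-word V bits to the STOREV* helpers, which take them as
            a host-word-sized argument.) */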
   4960 
   4961 static
   4962 IRExpr* zwidenToHostWord ( MCEnv* mce, IRAtom* vatom )
   4963 {
   4964    IRType ty, tyH;
   4965 
   4966    /* vatom is vbits-value and as such can only have a shadow type. */
   4967    tl_assert(isShadowAtom(mce,vatom));
   4968 
   4969    ty  = typeOfIRExpr(mce->sb->tyenv, vatom);
   4970    tyH = mce->hWordTy;
   4971 
   4972    if (tyH == Ity_I32) {
   4973       switch (ty) {
   4974          case Ity_I32:
   4975             return vatom;
   4976          case Ity_I16:
   4977             return assignNew('V', mce, tyH, unop(Iop_16Uto32, vatom));
   4978          case Ity_I8:
   4979             return assignNew('V', mce, tyH, unop(Iop_8Uto32, vatom));
   4980          default:
   4981             goto unhandled;
   4982       }
   4983    } else
   4984    if (tyH == Ity_I64) {
   4985       switch (ty) {
   4986          case Ity_I32:
   4987             return assignNew('V', mce, tyH, unop(Iop_32Uto64, vatom));
   4988          case Ity_I16:
   4989             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   4990                    assignNew('V', mce, Ity_I32, unop(Iop_16Uto32, vatom))));
   4991          case Ity_I8:
   4992             return assignNew('V', mce, tyH, unop(Iop_32Uto64,
   4993                    assignNew('V', mce, Ity_I32, unop(Iop_8Uto32, vatom))));
   4994          default:
   4995             goto unhandled;
   4996       }
   4997    } else {
   4998       goto unhandled;
   4999    }
   5000   unhandled:
   5001    VG_(printf)("\nty = "); ppIRType(ty); VG_(printf)("\n");
   5002    VG_(tool_panic)("zwidenToHostWord");
   5003 }
   5004 
   5005 
   5006 /* Generate a shadow store.  |addr| is always the original address
   5007    atom.  You can pass in either originals or V-bits for the data
   5008    atom, but obviously not both.  This function generates a check for
   5009    the definedness and (indirectly) the validity of |addr|, but only
   5010    when |guard| evaluates to True at run time (or is NULL).
   5011 
   5012    |guard| :: Ity_I1 controls whether the store really happens; NULL
   5013    means it unconditionally does.  Note that |guard| itself is not
   5014    checked for definedness; the caller of this function must do that
   5015    if necessary.
   5016 */
   5017 static
   5018 void do_shadow_Store ( MCEnv* mce,
   5019                        IREndness end,
   5020                        IRAtom* addr, UInt bias,
   5021                        IRAtom* data, IRAtom* vdata,
   5022                        IRAtom* guard )
   5023 {
   5024    IROp     mkAdd;
   5025    IRType   ty, tyAddr;
   5026    void*    helper = NULL;
   5027    const HChar* hname = NULL;
   5028    IRConst* c;
   5029 
   5030    tyAddr = mce->hWordTy;
   5031    mkAdd  = tyAddr==Ity_I32 ? Iop_Add32 : Iop_Add64;
   5032    tl_assert( tyAddr == Ity_I32 || tyAddr == Ity_I64 );
   5033    tl_assert( end == Iend_LE || end == Iend_BE );
   5034 
   5035    if (data) {
   5036       tl_assert(!vdata);
   5037       tl_assert(isOriginalAtom(mce, data));
   5038       tl_assert(bias == 0);
   5039       vdata = expr2vbits( mce, data );
   5040    } else {
   5041       tl_assert(vdata);
   5042    }
   5043 
   5044    tl_assert(isOriginalAtom(mce,addr));
   5045    tl_assert(isShadowAtom(mce,vdata));
   5046 
   5047    if (guard) {
   5048       tl_assert(isOriginalAtom(mce, guard));
   5049       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   5050    }
   5051 
   5052    ty = typeOfIRExpr(mce->sb->tyenv, vdata);
   5053 
   5054    // If we're not doing undefined value checking, pretend that this value
   5055    // is "all valid".  That lets Vex's optimiser remove some of the V bit
   5056    // shadow computation ops that precede it.
   5057    if (MC_(clo_mc_level) == 1) {
   5058       switch (ty) {
   5059          case Ity_V256: // V256 weirdness -- used four times
   5060                         c = IRConst_V256(V_BITS32_DEFINED); break;
   5061          case Ity_V128: // V128 weirdness -- used twice
   5062                         c = IRConst_V128(V_BITS16_DEFINED); break;
   5063          case Ity_I64:  c = IRConst_U64 (V_BITS64_DEFINED); break;
   5064          case Ity_I32:  c = IRConst_U32 (V_BITS32_DEFINED); break;
   5065          case Ity_I16:  c = IRConst_U16 (V_BITS16_DEFINED); break;
   5066          case Ity_I8:   c = IRConst_U8  (V_BITS8_DEFINED);  break;
   5067          default:       VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   5068       }
   5069       vdata = IRExpr_Const( c );
   5070    }
   5071 
   5072    /* First, emit a definedness test for the address.  This also sets
   5073       the address (shadow) to 'defined' following the test.  Both of
   5074       those actions are gated on |guard|. */
   5075    complainIfUndefined( mce, addr, guard );
   5076 
   5077    /* Now decide which helper function to call to write the data V
   5078       bits into shadow memory. */
   5079    if (end == Iend_LE) {
   5080       switch (ty) {
   5081          case Ity_V256: /* we'll use the helper four times */
   5082          case Ity_V128: /* we'll use the helper twice */
   5083          case Ity_I64: helper = &MC_(helperc_STOREV64le);
   5084                        hname = "MC_(helperc_STOREV64le)";
   5085                        break;
   5086          case Ity_I32: helper = &MC_(helperc_STOREV32le);
   5087                        hname = "MC_(helperc_STOREV32le)";
   5088                        break;
   5089          case Ity_I16: helper = &MC_(helperc_STOREV16le);
   5090                        hname = "MC_(helperc_STOREV16le)";
   5091                        break;
   5092          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   5093                        hname = "MC_(helperc_STOREV8)";
   5094                        break;
   5095          default:      VG_(tool_panic)("memcheck:do_shadow_Store(LE)");
   5096       }
   5097    } else {
   5098       switch (ty) {
   5099          case Ity_V128: /* we'll use the helper twice */
   5100          case Ity_I64: helper = &MC_(helperc_STOREV64be);
   5101                        hname = "MC_(helperc_STOREV64be)";
   5102                        break;
   5103          case Ity_I32: helper = &MC_(helperc_STOREV32be);
   5104                        hname = "MC_(helperc_STOREV32be)";
   5105                        break;
   5106          case Ity_I16: helper = &MC_(helperc_STOREV16be);
   5107                        hname = "MC_(helperc_STOREV16be)";
   5108                        break;
   5109          case Ity_I8:  helper = &MC_(helperc_STOREV8);
   5110                        hname = "MC_(helperc_STOREV8)";
   5111                        break;
    5112          /* Note: no V256 case here, because no big-endian target that
    5113             we support has 256-bit vectors. */
   5114          default:      VG_(tool_panic)("memcheck:do_shadow_Store(BE)");
   5115       }
   5116    }
   5117 
   5118    if (UNLIKELY(ty == Ity_V256)) {
   5119 
   5120       /* V256-bit case -- phrased in terms of 64 bit units (Qs), with
   5121          Q3 being the most significant lane. */
   5122       /* These are the offsets of the Qs in memory. */
   5123       Int     offQ0, offQ1, offQ2, offQ3;
   5124 
   5125       /* Various bits for constructing the 4 lane helper calls */
   5126       IRDirty *diQ0,    *diQ1,    *diQ2,    *diQ3;
   5127       IRAtom  *addrQ0,  *addrQ1,  *addrQ2,  *addrQ3;
   5128       IRAtom  *vdataQ0, *vdataQ1, *vdataQ2, *vdataQ3;
   5129       IRAtom  *eBiasQ0, *eBiasQ1, *eBiasQ2, *eBiasQ3;
   5130 
   5131       if (end == Iend_LE) {
   5132          offQ0 = 0; offQ1 = 8; offQ2 = 16; offQ3 = 24;
   5133       } else {
   5134          offQ3 = 0; offQ2 = 8; offQ1 = 16; offQ0 = 24;
   5135       }
   5136 
   5137       eBiasQ0 = tyAddr==Ity_I32 ? mkU32(bias+offQ0) : mkU64(bias+offQ0);
   5138       addrQ0  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ0) );
   5139       vdataQ0 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_0, vdata));
   5140       diQ0    = unsafeIRDirty_0_N(
   5141                    1/*regparms*/,
   5142                    hname, VG_(fnptr_to_fnentry)( helper ),
   5143                    mkIRExprVec_2( addrQ0, vdataQ0 )
   5144                 );
   5145 
   5146       eBiasQ1 = tyAddr==Ity_I32 ? mkU32(bias+offQ1) : mkU64(bias+offQ1);
   5147       addrQ1  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ1) );
   5148       vdataQ1 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_1, vdata));
   5149       diQ1    = unsafeIRDirty_0_N(
   5150                    1/*regparms*/,
   5151                    hname, VG_(fnptr_to_fnentry)( helper ),
   5152                    mkIRExprVec_2( addrQ1, vdataQ1 )
   5153                 );
   5154 
   5155       eBiasQ2 = tyAddr==Ity_I32 ? mkU32(bias+offQ2) : mkU64(bias+offQ2);
   5156       addrQ2  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ2) );
   5157       vdataQ2 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_2, vdata));
   5158       diQ2    = unsafeIRDirty_0_N(
   5159                    1/*regparms*/,
   5160                    hname, VG_(fnptr_to_fnentry)( helper ),
   5161                    mkIRExprVec_2( addrQ2, vdataQ2 )
   5162                 );
   5163 
   5164       eBiasQ3 = tyAddr==Ity_I32 ? mkU32(bias+offQ3) : mkU64(bias+offQ3);
   5165       addrQ3  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasQ3) );
   5166       vdataQ3 = assignNew('V', mce, Ity_I64, unop(Iop_V256to64_3, vdata));
   5167       diQ3    = unsafeIRDirty_0_N(
   5168                    1/*regparms*/,
   5169                    hname, VG_(fnptr_to_fnentry)( helper ),
   5170                    mkIRExprVec_2( addrQ3, vdataQ3 )
   5171                 );
   5172 
   5173       if (guard)
   5174          diQ0->guard = diQ1->guard = diQ2->guard = diQ3->guard = guard;
   5175 
   5176       setHelperAnns( mce, diQ0 );
   5177       setHelperAnns( mce, diQ1 );
   5178       setHelperAnns( mce, diQ2 );
   5179       setHelperAnns( mce, diQ3 );
   5180       stmt( 'V', mce, IRStmt_Dirty(diQ0) );
   5181       stmt( 'V', mce, IRStmt_Dirty(diQ1) );
   5182       stmt( 'V', mce, IRStmt_Dirty(diQ2) );
   5183       stmt( 'V', mce, IRStmt_Dirty(diQ3) );
   5184 
   5185    }
   5186    else if (UNLIKELY(ty == Ity_V128)) {
   5187 
   5188       /* V128-bit case */
   5189       /* See comment in next clause re 64-bit regparms */
   5190       /* also, need to be careful about endianness */
   5191 
   5192       Int     offLo64, offHi64;
   5193       IRDirty *diLo64, *diHi64;
   5194       IRAtom  *addrLo64, *addrHi64;
   5195       IRAtom  *vdataLo64, *vdataHi64;
   5196       IRAtom  *eBiasLo64, *eBiasHi64;
   5197 
   5198       if (end == Iend_LE) {
   5199          offLo64 = 0;
   5200          offHi64 = 8;
   5201       } else {
   5202          offLo64 = 8;
   5203          offHi64 = 0;
   5204       }
   5205 
   5206       eBiasLo64 = tyAddr==Ity_I32 ? mkU32(bias+offLo64) : mkU64(bias+offLo64);
   5207       addrLo64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasLo64) );
   5208       vdataLo64 = assignNew('V', mce, Ity_I64, unop(Iop_V128to64, vdata));
   5209       diLo64    = unsafeIRDirty_0_N(
   5210                      1/*regparms*/,
   5211                      hname, VG_(fnptr_to_fnentry)( helper ),
   5212                      mkIRExprVec_2( addrLo64, vdataLo64 )
   5213                   );
   5214       eBiasHi64 = tyAddr==Ity_I32 ? mkU32(bias+offHi64) : mkU64(bias+offHi64);
   5215       addrHi64  = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBiasHi64) );
   5216       vdataHi64 = assignNew('V', mce, Ity_I64, unop(Iop_V128HIto64, vdata));
   5217       diHi64    = unsafeIRDirty_0_N(
   5218                      1/*regparms*/,
   5219                      hname, VG_(fnptr_to_fnentry)( helper ),
   5220                      mkIRExprVec_2( addrHi64, vdataHi64 )
   5221                   );
   5222       if (guard) diLo64->guard = guard;
   5223       if (guard) diHi64->guard = guard;
   5224       setHelperAnns( mce, diLo64 );
   5225       setHelperAnns( mce, diHi64 );
   5226       stmt( 'V', mce, IRStmt_Dirty(diLo64) );
   5227       stmt( 'V', mce, IRStmt_Dirty(diHi64) );
   5228 
   5229    } else {
   5230 
   5231       IRDirty *di;
   5232       IRAtom  *addrAct;
   5233 
   5234       /* 8/16/32/64-bit cases */
   5235       /* Generate the actual address into addrAct. */
   5236       if (bias == 0) {
   5237          addrAct = addr;
   5238       } else {
   5239          IRAtom* eBias   = tyAddr==Ity_I32 ? mkU32(bias) : mkU64(bias);
   5240          addrAct = assignNew('V', mce, tyAddr, binop(mkAdd, addr, eBias));
   5241       }
   5242 
   5243       if (ty == Ity_I64) {
   5244          /* We can't do this with regparm 2 on 32-bit platforms, since
   5245             the back ends aren't clever enough to handle 64-bit
   5246             regparm args.  Therefore be different. */
   5247          di = unsafeIRDirty_0_N(
   5248                  1/*regparms*/,
   5249                  hname, VG_(fnptr_to_fnentry)( helper ),
   5250                  mkIRExprVec_2( addrAct, vdata )
   5251               );
   5252       } else {
   5253          di = unsafeIRDirty_0_N(
   5254                  2/*regparms*/,
   5255                  hname, VG_(fnptr_to_fnentry)( helper ),
   5256                  mkIRExprVec_2( addrAct,
   5257                                 zwidenToHostWord( mce, vdata ))
   5258               );
   5259       }
   5260       if (guard) di->guard = guard;
   5261       setHelperAnns( mce, di );
   5262       stmt( 'V', mce, IRStmt_Dirty(di) );
   5263    }
   5264 
   5265 }
   5266 
   5267 
   5268 /* Do lazy pessimistic propagation through a dirty helper call, by
   5269    looking at the annotations on it.  This is the most complex part of
   5270    Memcheck. */
   5271 
   5272 static IRType szToITy ( Int n )
   5273 {
   5274    switch (n) {
   5275       case 1: return Ity_I8;
   5276       case 2: return Ity_I16;
   5277       case 4: return Ity_I32;
   5278       case 8: return Ity_I64;
   5279       default: VG_(tool_panic)("szToITy(memcheck)");
   5280    }
   5281 }
   5282 
   5283 static
   5284 void do_shadow_Dirty ( MCEnv* mce, IRDirty* d )
   5285 {
   5286    Int       i, k, n, toDo, gSz, gOff;
   5287    IRAtom    *src, *here, *curr;
   5288    IRType    tySrc, tyDst;
   5289    IRTemp    dst;
   5290    IREndness end;
   5291 
   5292    /* What's the native endianness?  We need to know this. */
   5293 #  if defined(VG_BIGENDIAN)
   5294    end = Iend_BE;
   5295 #  elif defined(VG_LITTLEENDIAN)
   5296    end = Iend_LE;
   5297 #  else
   5298 #    error "Unknown endianness"
   5299 #  endif
   5300 
   5301    /* First check the guard. */
   5302    complainIfUndefined(mce, d->guard, NULL);
   5303 
   5304    /* Now round up all inputs and PCast over them. */
   5305    curr = definedOfType(Ity_I32);
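            /* 'curr' accumulates, via UifU, a single 32-bit pessimistic
               summary of all the inputs; it is later PCast to the width of
               each output (destination temp, guest state, memory) below. */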
   5306 
   5307    /* Inputs: unmasked args
   5308       Note: arguments are evaluated REGARDLESS of the guard expression */
   5309    for (i = 0; d->args[i]; i++) {
   5310       IRAtom* arg = d->args[i];
   5311       if ( (d->cee->mcx_mask & (1<<i))
   5312            || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
   5313          /* ignore this arg */
   5314       } else {
   5315          here = mkPCastTo( mce, Ity_I32, expr2vbits(mce, arg) );
   5316          curr = mkUifU32(mce, here, curr);
   5317       }
   5318    }
   5319 
   5320    /* Inputs: guest state that we read. */
   5321    for (i = 0; i < d->nFxState; i++) {
   5322       tl_assert(d->fxState[i].fx != Ifx_None);
   5323       if (d->fxState[i].fx == Ifx_Write)
   5324          continue;
   5325 
   5326       /* Enumerate the described state segments */
   5327       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5328          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5329          gSz  = d->fxState[i].size;
   5330 
   5331          /* Ignore any sections marked as 'always defined'. */
   5332          if (isAlwaysDefd(mce, gOff, gSz)) {
   5333             if (0)
   5334             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   5335                         gOff, gSz);
   5336             continue;
   5337          }
   5338 
   5339          /* This state element is read or modified.  So we need to
   5340             consider it.  If larger than 8 bytes, deal with it in
   5341             8-byte chunks. */
   5342          while (True) {
   5343             tl_assert(gSz >= 0);
   5344             if (gSz == 0) break;
   5345             n = gSz <= 8 ? gSz : 8;
   5346             /* update 'curr' with UifU of the state slice
   5347                gOff .. gOff+n-1 */
   5348             tySrc = szToITy( n );
   5349 
   5350             /* Observe the guard expression. If it is false use an
   5351                all-bits-defined bit pattern */
   5352             IRAtom *cond, *iffalse, *iftrue;
   5353 
   5354             cond    = assignNew('V', mce, Ity_I1, d->guard);
   5355             iftrue  = assignNew('V', mce, tySrc, shadow_GET(mce, gOff, tySrc));
   5356             iffalse = assignNew('V', mce, tySrc, definedOfType(tySrc));
   5357             src     = assignNew('V', mce, tySrc,
   5358                                 IRExpr_ITE(cond, iftrue, iffalse));
   5359 
   5360             here = mkPCastTo( mce, Ity_I32, src );
   5361             curr = mkUifU32(mce, here, curr);
   5362             gSz -= n;
   5363             gOff += n;
   5364          }
   5365       }
   5366    }
   5367 
   5368    /* Inputs: memory.  First set up some info needed regardless of
   5369       whether we're doing reads or writes. */
   5370 
   5371    if (d->mFx != Ifx_None) {
   5372       /* Because we may do multiple shadow loads/stores from the same
   5373          base address, it's best to do a single test of its
   5374          definedness right now.  Post-instrumentation optimisation
   5375          should remove all but this test. */
   5376       IRType tyAddr;
   5377       tl_assert(d->mAddr);
   5378       complainIfUndefined(mce, d->mAddr, d->guard);
   5379 
   5380       tyAddr = typeOfIRExpr(mce->sb->tyenv, d->mAddr);
   5381       tl_assert(tyAddr == Ity_I32 || tyAddr == Ity_I64);
   5382       tl_assert(tyAddr == mce->hWordTy); /* not really right */
   5383    }
   5384 
   5385    /* Deal with memory inputs (reads or modifies) */
   5386    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   5387       toDo   = d->mSize;
   5388       /* chew off 32-bit chunks.  We don't care about the endianness
   5389          since it's all going to be condensed down to a single bit,
   5390          but nevertheless choose an endianness which is hopefully
   5391          native to the platform. */
   5392       while (toDo >= 4) {
   5393          here = mkPCastTo(
   5394                    mce, Ity_I32,
   5395                    expr2vbits_Load_guarded_Simple(
   5396                       mce, end, Ity_I32, d->mAddr, d->mSize - toDo, d->guard )
   5397                 );
   5398          curr = mkUifU32(mce, here, curr);
   5399          toDo -= 4;
   5400       }
   5401       /* chew off 16-bit chunks */
   5402       while (toDo >= 2) {
   5403          here = mkPCastTo(
   5404                    mce, Ity_I32,
   5405                    expr2vbits_Load_guarded_Simple(
   5406                       mce, end, Ity_I16, d->mAddr, d->mSize - toDo, d->guard )
   5407                 );
   5408          curr = mkUifU32(mce, here, curr);
   5409          toDo -= 2;
   5410       }
   5411       /* chew off the remaining 8-bit chunk, if any */
   5412       if (toDo == 1) {
   5413          here = mkPCastTo(
   5414                    mce, Ity_I32,
   5415                    expr2vbits_Load_guarded_Simple(
   5416                       mce, end, Ity_I8, d->mAddr, d->mSize - toDo, d->guard )
   5417                 );
   5418          curr = mkUifU32(mce, here, curr);
   5419          toDo -= 1;
   5420       }
   5421       tl_assert(toDo == 0);
   5422    }
   5423 
   5424    /* Whew!  So curr is a 32-bit V-value summarising pessimistically
   5425       all the inputs to the helper.  Now we need to re-distribute the
   5426       results to all destinations. */
   5427 
   5428    /* Outputs: the destination temporary, if there is one. */
   5429    if (d->tmp != IRTemp_INVALID) {
   5430       dst   = findShadowTmpV(mce, d->tmp);
   5431       tyDst = typeOfIRTemp(mce->sb->tyenv, d->tmp);
   5432       assign( 'V', mce, dst, mkPCastTo( mce, tyDst, curr) );
   5433    }
   5434 
   5435    /* Outputs: guest state that we write or modify. */
   5436    for (i = 0; i < d->nFxState; i++) {
   5437       tl_assert(d->fxState[i].fx != Ifx_None);
   5438       if (d->fxState[i].fx == Ifx_Read)
   5439          continue;
   5440 
   5441       /* Enumerate the described state segments */
   5442       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   5443          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   5444          gSz  = d->fxState[i].size;
   5445 
   5446          /* Ignore any sections marked as 'always defined'. */
   5447          if (isAlwaysDefd(mce, gOff, gSz))
   5448             continue;
   5449 
   5450          /* This state element is written or modified.  So we need to
   5451             consider it.  If larger than 8 bytes, deal with it in
   5452             8-byte chunks. */
   5453          while (True) {
   5454             tl_assert(gSz >= 0);
   5455             if (gSz == 0) break;
   5456             n = gSz <= 8 ? gSz : 8;
   5457             /* Write suitably-casted 'curr' to the state slice
   5458                gOff .. gOff+n-1 */
   5459             tyDst = szToITy( n );
   5460             do_shadow_PUT( mce, gOff,
   5461                                 NULL, /* original atom */
   5462                                 mkPCastTo( mce, tyDst, curr ), d->guard );
   5463             gSz -= n;
   5464             gOff += n;
   5465          }
   5466       }
   5467    }
   5468 
   5469    /* Outputs: memory that we write or modify.  Same comments about
   5470       endianness as above apply. */
   5471    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   5472       toDo   = d->mSize;
   5473       /* chew off 32-bit chunks */
   5474       while (toDo >= 4) {
   5475          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5476                           NULL, /* original data */
   5477                           mkPCastTo( mce, Ity_I32, curr ),
   5478                           d->guard );
   5479          toDo -= 4;
   5480       }
   5481       /* chew off 16-bit chunks */
   5482       while (toDo >= 2) {
   5483          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5484                           NULL, /* original data */
   5485                           mkPCastTo( mce, Ity_I16, curr ),
   5486                           d->guard );
   5487          toDo -= 2;
   5488       }
   5489       /* chew off the remaining 8-bit chunk, if any */
   5490       if (toDo == 1) {
   5491          do_shadow_Store( mce, end, d->mAddr, d->mSize - toDo,
   5492                           NULL, /* original data */
   5493                           mkPCastTo( mce, Ity_I8, curr ),
   5494                           d->guard );
   5495          toDo -= 1;
   5496       }
   5497       tl_assert(toDo == 0);
   5498    }
   5499 
   5500 }
   5501 
   5502 
   5503 /* We have an ABI hint telling us that [base .. base+len-1] is to
   5504    become undefined ("writable").  Generate code to call a helper to
   5505    notify the A/V bit machinery of this fact.
   5506 
   5507    We call
   5508    void MC_(helperc_MAKE_STACK_UNINIT) ( Addr base, UWord len,
   5509                                                     Addr nia );
   5510 */
   5511 static
   5512 void do_AbiHint ( MCEnv* mce, IRExpr* base, Int len, IRExpr* nia )
   5513 {
   5514    IRDirty* di;
   5515    /* Minor optimisation: if not doing origin tracking, ignore the
   5516       supplied nia and pass zero instead.  This is on the basis that
   5517       MC_(helperc_MAKE_STACK_UNINIT) will ignore it anyway, and we can
   5518       almost always generate a shorter instruction to put zero into a
   5519       register than any other value. */
   5520    if (MC_(clo_mc_level) < 3)
   5521       nia = mkIRExpr_HWord(0);
   5522 
   5523    di = unsafeIRDirty_0_N(
   5524            0/*regparms*/,
   5525            "MC_(helperc_MAKE_STACK_UNINIT)",
   5526            VG_(fnptr_to_fnentry)( &MC_(helperc_MAKE_STACK_UNINIT) ),
   5527            mkIRExprVec_3( base, mkIRExpr_HWord( (UInt)len), nia )
   5528         );
   5529    stmt( 'V', mce, IRStmt_Dirty(di) );
   5530 }
   5531 
   5532 
   5533 /* ------ Dealing with IRCAS (big and complex) ------ */
   5534 
   5535 /* FWDS */
   5536 static IRAtom* gen_load_b  ( MCEnv* mce, Int szB,
   5537                              IRAtom* baseaddr, Int offset );
   5538 static IRAtom* gen_maxU32  ( MCEnv* mce, IRAtom* b1, IRAtom* b2 );
   5539 static void    gen_store_b ( MCEnv* mce, Int szB,
   5540                              IRAtom* baseaddr, Int offset, IRAtom* dataB,
   5541                              IRAtom* guard );
   5542 
   5543 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas );
   5544 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas );
   5545 
   5546 
   5547 /* Either ORIG and SHADOW are both IRExpr.RdTmps, or they are both
   5548    IRExpr.Consts, else this asserts.  If they are both Consts, it
   5549    doesn't do anything.  So that just leaves the RdTmp case.
   5550 
   5551    In which case: this assigns the shadow value SHADOW to the IR
   5552    shadow temporary associated with ORIG.  That is, ORIG, being an
   5553    original temporary, will have a shadow temporary associated with
   5554    it.  However, in the case envisaged here, there will so far have
   5555    been no IR emitted to actually write a shadow value into that
   5556    temporary.  What this routine does is to (emit IR to) copy the
   5557    value in SHADOW into said temporary, so that after this call,
   5558    IRExpr.RdTmps of ORIG's shadow temp will correctly pick up the
   5559    value in SHADOW.
   5560 
   5561    Point is to allow callers to compute "by hand" a shadow value for
   5562    ORIG, and force it to be associated with ORIG.
   5563 
   5564    How do we know that that shadow associated with ORIG has not so far
   5565    been assigned to?  Well, we don't per se know that, but supposing
   5566    it had.  Then this routine would create a second assignment to it,
   5567    and later the IR sanity checker would barf.  But that never
   5568    happens.  QED.
   5569 */
   5570 static void bind_shadow_tmp_to_orig ( UChar how,
   5571                                       MCEnv* mce,
   5572                                       IRAtom* orig, IRAtom* shadow )
   5573 {
   5574    tl_assert(isOriginalAtom(mce, orig));
   5575    tl_assert(isShadowAtom(mce, shadow));
   5576    switch (orig->tag) {
   5577       case Iex_Const:
   5578          tl_assert(shadow->tag == Iex_Const);
   5579          break;
   5580       case Iex_RdTmp:
   5581          tl_assert(shadow->tag == Iex_RdTmp);
   5582          if (how == 'V') {
   5583             assign('V', mce, findShadowTmpV(mce,orig->Iex.RdTmp.tmp),
   5584                    shadow);
   5585          } else {
   5586             tl_assert(how == 'B');
   5587             assign('B', mce, findShadowTmpB(mce,orig->Iex.RdTmp.tmp),
   5588                    shadow);
   5589          }
   5590          break;
   5591       default:
   5592          tl_assert(0);
   5593    }
   5594 }
   5595 
   5596 
   5597 static
   5598 void do_shadow_CAS ( MCEnv* mce, IRCAS* cas )
   5599 {
   5600    /* Scheme is (both single- and double- cases):
   5601 
   5602       1. fetch data#,dataB (the proposed new value)
   5603 
   5604       2. fetch expd#,expdB (what we expect to see at the address)
   5605 
   5606       3. check definedness of address
   5607 
   5608       4. load old#,oldB from shadow memory; this also checks
   5609          addressability of the address
   5610 
   5611       5. the CAS itself
   5612 
   5613       6. compute "expected == old".  See COMMENT_ON_CasCmpEQ below.
   5614 
   5615       7. if "expected == old" (as computed by (6))
   5616             store data#,dataB to shadow memory
   5617 
   5618       Note that 5 reads 'old' but 4 reads 'old#'.  Similarly, 5 stores
   5619       'data' but 7 stores 'data#'.  Hence it is possible for the
   5620       shadow data to be incorrectly checked and/or updated:
   5621 
   5622       * 7 is at least gated correctly, since the 'expected == old'
   5623         condition is derived from outputs of 5.  However, the shadow
   5624         write could happen too late: imagine after 5 we are
   5625         descheduled, a different thread runs, writes a different
   5626         (shadow) value at the address, and then we resume, hence
   5627         overwriting the shadow value written by the other thread.
   5628 
   5629       Because the original memory access is atomic, there's no way to
   5630       make both the original and shadow accesses into a single atomic
   5631       thing, hence this is unavoidable.
   5632 
   5633       At least as Valgrind stands, I don't think it's a problem, since
   5634       we're single threaded *and* we guarantee that there are no
   5635       context switches during the execution of any specific superblock
   5636       -- context switches can only happen at superblock boundaries.
   5637 
   5638       If Valgrind ever becomes MT in the future, then it might be more
   5639       of a problem.  A possible kludge would be to artificially
   5640       associate a lock with the location, which we must acquire and
   5641       release around the transaction as a whole.  Hmm, that probably
   5642       wouldn't work properly since it only guards us against other
   5643       threads doing CASs on the same location, not against other
   5644       threads doing normal reads and writes.
   5645 
   5646       ------------------------------------------------------------
   5647 
   5648       COMMENT_ON_CasCmpEQ:
   5649 
   5650       Note two things.  Firstly, in the sequence above, we compute
   5651       "expected == old", but we don't check definedness of it.  Why
   5652       not?  Also, the x86 and amd64 front ends use
   5653       Iop_CasCmp{EQ,NE}{8,16,32,64} comparisons to make the equivalent
   5654       determination (expected == old ?) for themselves, and we also
   5655       don't check definedness for those primops; we just say that the
   5656       result is defined.  Why?  Details follow.
   5657 
   5658       x86/amd64 contains various forms of locked insns:
   5659       * lock prefix before all basic arithmetic insns;
   5660         eg lock xorl %reg1,(%reg2)
   5661       * atomic exchange reg-mem
   5662       * compare-and-swaps
   5663 
   5664       Rather than attempt to represent them all, which would be a
   5665       royal PITA, I used a result from Maurice Herlihy
   5666       (http://en.wikipedia.org/wiki/Maurice_Herlihy), in which he
   5667       demonstrates that compare-and-swap is a primitive more general
   5668       than the other two, and so can be used to represent all of them.
   5669       So the translation scheme for (eg) lock incl (%reg) is as
   5670       follows:
   5671 
   5672         again:
   5673          old = * %reg
   5674          new = old + 1
   5675          atomically { if (* %reg == old) { * %reg = new } else { goto again } }
   5676 
   5677       The "atomically" is the CAS bit.  The scheme is always the same:
   5678       get old value from memory, compute new value, atomically stuff
   5679       new value back in memory iff the old value has not changed (iow,
   5680       no other thread modified it in the meantime).  If it has changed
   5681       then we've been out-raced and we have to start over.
   5682 
   5683       Now that's all very neat, but it has the bad side effect of
   5684       introducing an explicit equality test into the translation.
   5685       Consider the behaviour of said code on a memory location which
   5686       is uninitialised.  We will wind up doing a comparison on
   5687       uninitialised data, and mc duly complains.
   5688 
   5689       What's difficult about this is, the common case is that the
   5690       location is uncontended, and so we're usually comparing the same
   5691       value (* %reg) with itself.  So we shouldn't complain even if it
   5692       is undefined.  But mc doesn't know that.
   5693 
   5694       My solution is to mark the == in the IR specially, so as to tell
   5695       mc that it almost certainly compares a value with itself, and we
   5696       should just regard the result as always defined.  Rather than
   5697       add a bit to all IROps, I just cloned Iop_CmpEQ{8,16,32,64} into
   5698       Iop_CasCmpEQ{8,16,32,64} so as not to disturb anything else.
   5699 
   5700       So there's always the question of, can this give a false
   5701       negative?  eg, imagine that initially, * %reg is defined; and we
   5702       read that; but then in the gap between the read and the CAS, a
   5703       different thread writes an undefined (and different) value at
   5704       the location.  Then the CAS in this thread will fail and we will
   5705       go back to "again:", but without knowing that the trip back
   5706       there was based on an undefined comparison.  No matter; at least
   5707       the other thread won the race and the location is correctly
   5708       marked as undefined.  What if it wrote an uninitialised version
   5709       of the same value that was there originally, though?
   5710 
   5711       etc etc.  Seems like there's a small corner case in which we
   5712       might lose the fact that something's defined -- we're out-raced
   5713       in between the "old = * reg" and the "atomically {", _and_ the
   5714       other thread is writing in an undefined version of what's
   5715       already there.  Well, that seems pretty unlikely.
   5716 
   5717       ---
   5718 
   5719       If we ever need to reinstate it .. code which generates a
   5720       definedness test for "expected == old" was removed at r10432 of
   5721       this file.
   5722    */
   5723    if (cas->oldHi == IRTemp_INVALID) {
   5724       do_shadow_CAS_single( mce, cas );
   5725    } else {
   5726       do_shadow_CAS_double( mce, cas );
   5727    }
   5728 }
   5729 
   5730 
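        /* A sketch (illustrative only, in rough pseudo-IR) of the net effect
           of do_shadow_CAS_single on a little-endian 32-bit CAS
           "oldLo = CASle(addr :: expdLo -> dataLo)":

              vdataLo        = V bits of dataLo
              vexpdLo        = V bits of expdLo
              voldLo         = 32-bit V-bit load from the shadow of *addr
                               (this also checks the address, as per 3/4)
              shadow(oldLo) := voldLo
              <the CAS itself, copied to the output block>
              expd_eq_old    = CasCmpEQ32(expdLo, oldLo)
              if (expd_eq_old) { store vdataLo to the shadow of *addr }

           plus the corresponding B (origin) loads/stores when origin
           tracking is enabled. */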
   5731 static void do_shadow_CAS_single ( MCEnv* mce, IRCAS* cas )
   5732 {
   5733    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5734    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5735    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5736    IRAtom *expd_eq_old = NULL;
   5737    IROp   opCasCmpEQ;
   5738    Int    elemSzB;
   5739    IRType elemTy;
   5740    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5741 
   5742    /* single CAS */
   5743    tl_assert(cas->oldHi == IRTemp_INVALID);
   5744    tl_assert(cas->expdHi == NULL);
   5745    tl_assert(cas->dataHi == NULL);
   5746 
   5747    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5748    switch (elemTy) {
   5749       case Ity_I8:  elemSzB = 1; opCasCmpEQ = Iop_CasCmpEQ8;  break;
   5750       case Ity_I16: elemSzB = 2; opCasCmpEQ = Iop_CasCmpEQ16; break;
   5751       case Ity_I32: elemSzB = 4; opCasCmpEQ = Iop_CasCmpEQ32; break;
   5752       case Ity_I64: elemSzB = 8; opCasCmpEQ = Iop_CasCmpEQ64; break;
   5753       default: tl_assert(0); /* IR defn disallows any other types */
   5754    }
   5755 
   5756    /* 1. fetch data# (the proposed new value) */
   5757    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5758    vdataLo
   5759       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5760    tl_assert(isShadowAtom(mce, vdataLo));
   5761    if (otrak) {
   5762       bdataLo
   5763          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5764       tl_assert(isShadowAtom(mce, bdataLo));
   5765    }
   5766 
   5767    /* 2. fetch expected# (what we expect to see at the address) */
   5768    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5769    vexpdLo
   5770       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5771    tl_assert(isShadowAtom(mce, vexpdLo));
   5772    if (otrak) {
   5773       bexpdLo
   5774          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5775       tl_assert(isShadowAtom(mce, bexpdLo));
   5776    }
   5777 
   5778    /* 3. check definedness of address */
   5779    /* 4. fetch old# from shadow memory; this also checks
   5780          addressability of the address */
   5781    voldLo
   5782       = assignNew(
   5783            'V', mce, elemTy,
   5784            expr2vbits_Load(
   5785               mce,
   5786               cas->end, elemTy, cas->addr, 0/*Addr bias*/,
   5787               NULL/*always happens*/
   5788         ));
   5789    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5790    if (otrak) {
   5791       boldLo
   5792          = assignNew('B', mce, Ity_I32,
   5793                      gen_load_b(mce, elemSzB, cas->addr, 0/*addr bias*/));
   5794       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5795    }
   5796 
   5797    /* 5. the CAS itself */
   5798    stmt( 'C', mce, IRStmt_CAS(cas) );
   5799 
   5800    /* 6. compute "expected == old" */
   5801    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5802    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5803       tree, but it's not copied from the input block. */
   5804    expd_eq_old
   5805       = assignNew('C', mce, Ity_I1,
   5806                   binop(opCasCmpEQ, cas->expdLo, mkexpr(cas->oldLo)));
   5807 
   5808    /* 7. if "expected == old"
   5809             store data# to shadow memory */
   5810    do_shadow_Store( mce, cas->end, cas->addr, 0/*bias*/,
   5811                     NULL/*data*/, vdataLo/*vdata*/,
   5812                     expd_eq_old/*guard for store*/ );
   5813    if (otrak) {
   5814       gen_store_b( mce, elemSzB, cas->addr, 0/*offset*/,
   5815                    bdataLo/*bdata*/,
   5816                    expd_eq_old/*guard for store*/ );
   5817    }
   5818 }
   5819 
   5820 
   5821 static void do_shadow_CAS_double ( MCEnv* mce, IRCAS* cas )
   5822 {
   5823    IRAtom *vdataHi = NULL, *bdataHi = NULL;
   5824    IRAtom *vdataLo = NULL, *bdataLo = NULL;
   5825    IRAtom *vexpdHi = NULL, *bexpdHi = NULL;
   5826    IRAtom *vexpdLo = NULL, *bexpdLo = NULL;
   5827    IRAtom *voldHi  = NULL, *boldHi  = NULL;
   5828    IRAtom *voldLo  = NULL, *boldLo  = NULL;
   5829    IRAtom *xHi = NULL, *xLo = NULL, *xHL = NULL;
   5830    IRAtom *expd_eq_old = NULL, *zero = NULL;
   5831    IROp   opCasCmpEQ, opOr, opXor;
   5832    Int    elemSzB, memOffsLo, memOffsHi;
   5833    IRType elemTy;
   5834    Bool   otrak = MC_(clo_mc_level) >= 3; /* a shorthand */
   5835 
   5836    /* double CAS */
   5837    tl_assert(cas->oldHi != IRTemp_INVALID);
   5838    tl_assert(cas->expdHi != NULL);
   5839    tl_assert(cas->dataHi != NULL);
   5840 
   5841    elemTy = typeOfIRExpr(mce->sb->tyenv, cas->expdLo);
   5842    switch (elemTy) {
   5843       case Ity_I8:
   5844          opCasCmpEQ = Iop_CasCmpEQ8; opOr = Iop_Or8; opXor = Iop_Xor8;
   5845          elemSzB = 1; zero = mkU8(0);
   5846          break;
   5847       case Ity_I16:
   5848          opCasCmpEQ = Iop_CasCmpEQ16; opOr = Iop_Or16; opXor = Iop_Xor16;
   5849          elemSzB = 2; zero = mkU16(0);
   5850          break;
   5851       case Ity_I32:
   5852          opCasCmpEQ = Iop_CasCmpEQ32; opOr = Iop_Or32; opXor = Iop_Xor32;
   5853          elemSzB = 4; zero = mkU32(0);
   5854          break;
   5855       case Ity_I64:
   5856          opCasCmpEQ = Iop_CasCmpEQ64; opOr = Iop_Or64; opXor = Iop_Xor64;
   5857          elemSzB = 8; zero = mkU64(0);
   5858          break;
   5859       default:
   5860          tl_assert(0); /* IR defn disallows any other types */
   5861    }
   5862 
   5863    /* 1. fetch data# (the proposed new value) */
   5864    tl_assert(isOriginalAtom(mce, cas->dataHi));
   5865    tl_assert(isOriginalAtom(mce, cas->dataLo));
   5866    vdataHi
   5867       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataHi));
   5868    vdataLo
   5869       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->dataLo));
   5870    tl_assert(isShadowAtom(mce, vdataHi));
   5871    tl_assert(isShadowAtom(mce, vdataLo));
   5872    if (otrak) {
   5873       bdataHi
   5874          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataHi));
   5875       bdataLo
   5876          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->dataLo));
   5877       tl_assert(isShadowAtom(mce, bdataHi));
   5878       tl_assert(isShadowAtom(mce, bdataLo));
   5879    }
   5880 
   5881    /* 2. fetch expected# (what we expect to see at the address) */
   5882    tl_assert(isOriginalAtom(mce, cas->expdHi));
   5883    tl_assert(isOriginalAtom(mce, cas->expdLo));
   5884    vexpdHi
   5885       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdHi));
   5886    vexpdLo
   5887       = assignNew('V', mce, elemTy, expr2vbits(mce, cas->expdLo));
   5888    tl_assert(isShadowAtom(mce, vexpdHi));
   5889    tl_assert(isShadowAtom(mce, vexpdLo));
   5890    if (otrak) {
   5891       bexpdHi
   5892          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdHi));
   5893       bexpdLo
   5894          = assignNew('B', mce, Ity_I32, schemeE(mce, cas->expdLo));
   5895       tl_assert(isShadowAtom(mce, bexpdHi));
   5896       tl_assert(isShadowAtom(mce, bexpdLo));
   5897    }
   5898 
   5899    /* 3. check definedness of address */
   5900    /* 4. fetch old# from shadow memory; this also checks
   5901          addressability of the address */
   5902    if (cas->end == Iend_LE) {
   5903       memOffsLo = 0;
   5904       memOffsHi = elemSzB;
   5905    } else {
   5906       tl_assert(cas->end == Iend_BE);
   5907       memOffsLo = elemSzB;
   5908       memOffsHi = 0;
   5909    }
   5910    voldHi
   5911       = assignNew(
   5912            'V', mce, elemTy,
   5913            expr2vbits_Load(
   5914               mce,
   5915               cas->end, elemTy, cas->addr, memOffsHi/*Addr bias*/,
   5916               NULL/*always happens*/
   5917         ));
   5918    voldLo
   5919       = assignNew(
   5920            'V', mce, elemTy,
   5921            expr2vbits_Load(
   5922               mce,
   5923               cas->end, elemTy, cas->addr, memOffsLo/*Addr bias*/,
   5924               NULL/*always happens*/
   5925         ));
   5926    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldHi), voldHi);
   5927    bind_shadow_tmp_to_orig('V', mce, mkexpr(cas->oldLo), voldLo);
   5928    if (otrak) {
   5929       boldHi
   5930          = assignNew('B', mce, Ity_I32,
   5931                      gen_load_b(mce, elemSzB, cas->addr,
   5932                                 memOffsHi/*addr bias*/));
   5933       boldLo
   5934          = assignNew('B', mce, Ity_I32,
   5935                      gen_load_b(mce, elemSzB, cas->addr,
   5936                                 memOffsLo/*addr bias*/));
   5937       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldHi), boldHi);
   5938       bind_shadow_tmp_to_orig('B', mce, mkexpr(cas->oldLo), boldLo);
   5939    }
   5940 
   5941    /* 5. the CAS itself */
   5942    stmt( 'C', mce, IRStmt_CAS(cas) );
   5943 
   5944    /* 6. compute "expected == old" */
   5945    /* See COMMENT_ON_CasCmpEQ in this file for background/rationale. */
   5946    /* Note that 'C' is kinda faking it; it is indeed a non-shadow
   5947       tree, but it's not copied from the input block. */
   5948    /*
   5949       xHi = oldHi ^ expdHi;
   5950       xLo = oldLo ^ expdLo;
   5951       xHL = xHi | xLo;
   5952       expd_eq_old = xHL == 0;
   5953    */
   5954    xHi = assignNew('C', mce, elemTy,
   5955                    binop(opXor, cas->expdHi, mkexpr(cas->oldHi)));
   5956    xLo = assignNew('C', mce, elemTy,
   5957                    binop(opXor, cas->expdLo, mkexpr(cas->oldLo)));
   5958    xHL = assignNew('C', mce, elemTy,
   5959                    binop(opOr, xHi, xLo));
   5960    expd_eq_old
   5961       = assignNew('C', mce, Ity_I1,
   5962                   binop(opCasCmpEQ, xHL, zero));
   5963 
   5964    /* 7. if "expected == old"
   5965             store data# to shadow memory */
   5966    do_shadow_Store( mce, cas->end, cas->addr, memOffsHi/*bias*/,
   5967                     NULL/*data*/, vdataHi/*vdata*/,
   5968                     expd_eq_old/*guard for store*/ );
   5969    do_shadow_Store( mce, cas->end, cas->addr, memOffsLo/*bias*/,
   5970                     NULL/*data*/, vdataLo/*vdata*/,
   5971                     expd_eq_old/*guard for store*/ );
   5972    if (otrak) {
   5973       gen_store_b( mce, elemSzB, cas->addr, memOffsHi/*offset*/,
   5974                    bdataHi/*bdata*/,
   5975                    expd_eq_old/*guard for store*/ );
   5976       gen_store_b( mce, elemSzB, cas->addr, memOffsLo/*offset*/,
   5977                    bdataLo/*bdata*/,
   5978                    expd_eq_old/*guard for store*/ );
   5979    }
   5980 }
   5981 
   5982 
   5983 /* ------ Dealing with LL/SC (not difficult) ------ */
   5984 
   5985 static void do_shadow_LLSC ( MCEnv*    mce,
   5986                              IREndness stEnd,
   5987                              IRTemp    stResult,
   5988                              IRExpr*   stAddr,
   5989                              IRExpr*   stStoredata )
   5990 {
   5991    /* In short: treat a load-linked like a normal load followed by an
   5992       assignment of the loaded (shadow) data to the result temporary.
   5993       Treat a store-conditional like a normal store, and mark the
   5994       result temporary as defined. */
   5995    IRType resTy  = typeOfIRTemp(mce->sb->tyenv, stResult);
   5996    IRTemp resTmp = findShadowTmpV(mce, stResult);
   5997 
   5998    tl_assert(isIRAtom(stAddr));
   5999    if (stStoredata)
   6000       tl_assert(isIRAtom(stStoredata));
   6001 
   6002    if (stStoredata == NULL) {
   6003       /* Load Linked */
   6004       /* Just treat this as a normal load, followed by an assignment of
   6005          the value to .result. */
   6006       /* Stay sane */
   6007       tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   6008                 || resTy == Ity_I16 || resTy == Ity_I8);
   6009       assign( 'V', mce, resTmp,
   6010                    expr2vbits_Load(
   6011                       mce, stEnd, resTy, stAddr, 0/*addr bias*/,
   6012                       NULL/*always happens*/) );
   6013    } else {
   6014       /* Store Conditional */
   6015       /* Stay sane */
   6016       IRType dataTy = typeOfIRExpr(mce->sb->tyenv,
   6017                                    stStoredata);
   6018       tl_assert(dataTy == Ity_I64 || dataTy == Ity_I32
   6019                 || dataTy == Ity_I16 || dataTy == Ity_I8);
   6020       do_shadow_Store( mce, stEnd,
   6021                             stAddr, 0/* addr bias */,
   6022                             stStoredata,
   6023                             NULL /* shadow data */,
   6024                             NULL/*guard*/ );
   6025       /* This is a store conditional, so it writes to .result a value
   6026          indicating whether or not the store succeeded.  Just claim
   6027          this value is always defined.  In the PowerPC interpretation
   6028          of store-conditional, definedness of the success indication
   6029          depends on whether the address of the store matches the
   6030          reservation address.  But we can't tell that here (and
   6031          anyway, we're not being PowerPC-specific).  At least we are
   6032          guaranteed that the definedness of the store address, and its
   6033          addressability, will be checked as per normal.  So it seems
   6034          pretty safe to just say that the success indication is always
   6035          defined.
   6036 
   6037          In schemeS, for origin tracking, we must correspondingly set
   6038          a no-origin value for the origin shadow of .result.
   6039       */
   6040       tl_assert(resTy == Ity_I1);
   6041       assign( 'V', mce, resTmp, definedOfType(resTy) );
   6042    }
   6043 }
   6044 
   6045 
   6046 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   6047 
   6048 static void do_shadow_StoreG ( MCEnv* mce, IRStoreG* sg )
   6049 {
   6050    complainIfUndefined(mce, sg->guard, NULL);
   6051    /* do_shadow_Store will generate code to check the definedness and
   6052       validity of sg->addr, in the case where sg->guard evaluates to
   6053       True at run-time. */
   6054    do_shadow_Store( mce, sg->end,
   6055                     sg->addr, 0/* addr bias */,
   6056                     sg->data,
   6057                     NULL /* shadow data */,
   6058                     sg->guard );
   6059 }
   6060 
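        /* A sketch (illustrative only, in rough pseudo-IR) of the effect of
           do_shadow_LoadG on a guarded converting load along the lines of
           "t_dst = if (guard) 8Uto32(LD:I8(addr)) else alt":

              complain if the V bits of guard are undefined
              v_alt          = V bits of alt
              v_final        = if guard
                                  then 8Uto32 of the V bits loaded from the
                                       shadow of *addr (addr checked too)
                                  else v_alt
              shadow(t_dst) := v_final

           The names t_dst, v_alt and v_final are illustrative only. */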
   6061 static void do_shadow_LoadG ( MCEnv* mce, IRLoadG* lg )
   6062 {
   6063    complainIfUndefined(mce, lg->guard, NULL);
   6064    /* expr2vbits_Load_guarded_General will generate code to check the
   6065       definedness and validity of lg->addr, in the case where
   6066       lg->guard evaluates to True at run-time. */
   6067 
   6068    /* Look at the LoadG's built-in conversion operation, to determine
   6069       the source (actual loaded data) type, and the equivalent IROp.
   6070       NOTE that implicitly we are taking a widening operation to be
   6071       applied to original atoms and producing one that applies to V
   6072       bits.  Since signed and unsigned widening are self-shadowing,
   6073       this is a straight copy of the op (modulo swapping from the
   6074       IRLoadGOp form to the IROp form).  Note also therefore that this
   6075       implicitly duplicates the logic to do with said widening ops in
   6076       expr2vbits_Unop.  See comment at the start of expr2vbits_Unop. */
   6077    IROp   vwiden   = Iop_INVALID;
   6078    IRType loadedTy = Ity_INVALID;
   6079    switch (lg->cvt) {
   6080       case ILGop_IdentV128: loadedTy = Ity_V128; vwiden = Iop_INVALID; break;
   6081       case ILGop_Ident64:   loadedTy = Ity_I64;  vwiden = Iop_INVALID; break;
   6082       case ILGop_Ident32:   loadedTy = Ity_I32;  vwiden = Iop_INVALID; break;
   6083       case ILGop_16Uto32:   loadedTy = Ity_I16;  vwiden = Iop_16Uto32; break;
   6084       case ILGop_16Sto32:   loadedTy = Ity_I16;  vwiden = Iop_16Sto32; break;
   6085       case ILGop_8Uto32:    loadedTy = Ity_I8;   vwiden = Iop_8Uto32;  break;
   6086       case ILGop_8Sto32:    loadedTy = Ity_I8;   vwiden = Iop_8Sto32;  break;
   6087       default: VG_(tool_panic)("do_shadow_LoadG");
   6088    }
   6089 
   6090    IRAtom* vbits_alt
   6091       = expr2vbits( mce, lg->alt );
   6092    IRAtom* vbits_final
   6093       = expr2vbits_Load_guarded_General(mce, lg->end, loadedTy,
   6094                                         lg->addr, 0/*addr bias*/,
   6095                                         lg->guard, vwiden, vbits_alt );
   6096    /* And finally, bind the V bits to the destination temporary. */
   6097    assign( 'V', mce, findShadowTmpV(mce, lg->dst), vbits_final );
   6098 }
   6099 
   6100 
   6101 /*------------------------------------------------------------*/
   6102 /*--- Memcheck main                                        ---*/
   6103 /*------------------------------------------------------------*/
   6104 
   6105 static void schemeS ( MCEnv* mce, IRStmt* st );
   6106 
   6107 static Bool isBogusAtom ( IRAtom* at )
   6108 {
   6109    ULong n = 0;
   6110    IRConst* con;
   6111    tl_assert(isIRAtom(at));
   6112    if (at->tag == Iex_RdTmp)
   6113       return False;
   6114    tl_assert(at->tag == Iex_Const);
   6115    con = at->Iex.Const.con;
   6116    switch (con->tag) {
   6117       case Ico_U1:   return False;
   6118       case Ico_U8:   n = (ULong)con->Ico.U8; break;
   6119       case Ico_U16:  n = (ULong)con->Ico.U16; break;
   6120       case Ico_U32:  n = (ULong)con->Ico.U32; break;
   6121       case Ico_U64:  n = (ULong)con->Ico.U64; break;
   6122       case Ico_F32:  return False;
   6123       case Ico_F64:  return False;
   6124       case Ico_F32i: return False;
   6125       case Ico_F64i: return False;
   6126       case Ico_V128: return False;
   6127       case Ico_V256: return False;
   6128       default: ppIRExpr(at); tl_assert(0);
   6129    }
   6130    /* VG_(printf)("%llx\n", n); */
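           /* These look like the magic constants used by word-at-a-time
              zero-byte-detection tricks in optimised string routines (eg the
              classic "(x + 0xFEFEFEFF) & ~x & 0x80808080" test) and their
              64-bit analogues.  Such code does partially-defined arithmetic
              on purpose, which is why spotting these literals selects the
              more expensive interpretations elsewhere in this file. */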
   6131    return (/*32*/    n == 0xFEFEFEFFULL
   6132            /*32*/ || n == 0x80808080ULL
   6133            /*32*/ || n == 0x7F7F7F7FULL
   6134            /*32*/ || n == 0x7EFEFEFFULL
   6135            /*32*/ || n == 0x81010100ULL
   6136            /*64*/ || n == 0xFFFFFFFFFEFEFEFFULL
   6137            /*64*/ || n == 0xFEFEFEFEFEFEFEFFULL
   6138            /*64*/ || n == 0x0000000000008080ULL
   6139            /*64*/ || n == 0x8080808080808080ULL
   6140            /*64*/ || n == 0x0101010101010101ULL
   6141           );
   6142 }
   6143 
   6144 static Bool checkForBogusLiterals ( /*FLAT*/ IRStmt* st )
   6145 {
   6146    Int      i;
   6147    IRExpr*  e;
   6148    IRDirty* d;
   6149    IRCAS*   cas;
   6150    switch (st->tag) {
   6151       case Ist_WrTmp:
   6152          e = st->Ist.WrTmp.data;
   6153          switch (e->tag) {
   6154             case Iex_Get:
   6155             case Iex_RdTmp:
   6156                return False;
   6157             case Iex_Const:
   6158                return isBogusAtom(e);
   6159             case Iex_Unop:
   6160                return isBogusAtom(e->Iex.Unop.arg)
   6161                       || e->Iex.Unop.op == Iop_GetMSBs8x16;
   6162             case Iex_GetI:
   6163                return isBogusAtom(e->Iex.GetI.ix);
   6164             case Iex_Binop:
   6165                return isBogusAtom(e->Iex.Binop.arg1)
   6166                       || isBogusAtom(e->Iex.Binop.arg2);
   6167             case Iex_Triop:
   6168                return isBogusAtom(e->Iex.Triop.details->arg1)
   6169                       || isBogusAtom(e->Iex.Triop.details->arg2)
   6170                       || isBogusAtom(e->Iex.Triop.details->arg3);
   6171             case Iex_Qop:
   6172                return isBogusAtom(e->Iex.Qop.details->arg1)
   6173                       || isBogusAtom(e->Iex.Qop.details->arg2)
   6174                       || isBogusAtom(e->Iex.Qop.details->arg3)
   6175                       || isBogusAtom(e->Iex.Qop.details->arg4);
   6176             case Iex_ITE:
   6177                return isBogusAtom(e->Iex.ITE.cond)
   6178                       || isBogusAtom(e->Iex.ITE.iftrue)
   6179                       || isBogusAtom(e->Iex.ITE.iffalse);
   6180             case Iex_Load:
   6181                return isBogusAtom(e->Iex.Load.addr);
   6182             case Iex_CCall:
   6183                for (i = 0; e->Iex.CCall.args[i]; i++)
   6184                   if (isBogusAtom(e->Iex.CCall.args[i]))
   6185                      return True;
   6186                return False;
   6187             default:
   6188                goto unhandled;
   6189          }
   6190       case Ist_Dirty:
   6191          d = st->Ist.Dirty.details;
   6192          for (i = 0; d->args[i]; i++) {
   6193             IRAtom* atom = d->args[i];
   6194             if (LIKELY(!is_IRExpr_VECRET_or_BBPTR(atom))) {
   6195                if (isBogusAtom(atom))
   6196                   return True;
   6197             }
   6198          }
   6199          if (isBogusAtom(d->guard))
   6200             return True;
   6201          if (d->mAddr && isBogusAtom(d->mAddr))
   6202             return True;
   6203          return False;
   6204       case Ist_Put:
   6205          return isBogusAtom(st->Ist.Put.data);
   6206       case Ist_PutI:
   6207          return isBogusAtom(st->Ist.PutI.details->ix)
   6208                 || isBogusAtom(st->Ist.PutI.details->data);
   6209       case Ist_Store:
   6210          return isBogusAtom(st->Ist.Store.addr)
   6211                 || isBogusAtom(st->Ist.Store.data);
   6212       case Ist_StoreG: {
   6213          IRStoreG* sg = st->Ist.StoreG.details;
   6214          return isBogusAtom(sg->addr) || isBogusAtom(sg->data)
   6215                 || isBogusAtom(sg->guard);
   6216       }
   6217       case Ist_LoadG: {
   6218          IRLoadG* lg = st->Ist.LoadG.details;
   6219          return isBogusAtom(lg->addr) || isBogusAtom(lg->alt)
   6220                 || isBogusAtom(lg->guard);
   6221       }
   6222       case Ist_Exit:
   6223          return isBogusAtom(st->Ist.Exit.guard);
   6224       case Ist_AbiHint:
   6225          return isBogusAtom(st->Ist.AbiHint.base)
   6226                 || isBogusAtom(st->Ist.AbiHint.nia);
   6227       case Ist_NoOp:
   6228       case Ist_IMark:
   6229       case Ist_MBE:
   6230          return False;
   6231       case Ist_CAS:
   6232          cas = st->Ist.CAS.details;
   6233          return isBogusAtom(cas->addr)
   6234                 || (cas->expdHi ? isBogusAtom(cas->expdHi) : False)
   6235                 || isBogusAtom(cas->expdLo)
   6236                 || (cas->dataHi ? isBogusAtom(cas->dataHi) : False)
   6237                 || isBogusAtom(cas->dataLo);
   6238       case Ist_LLSC:
   6239          return isBogusAtom(st->Ist.LLSC.addr)
   6240                 || (st->Ist.LLSC.storedata
   6241                        ? isBogusAtom(st->Ist.LLSC.storedata)
   6242                        : False);
   6243       default:
   6244       unhandled:
   6245          ppIRStmt(st);
   6246          VG_(tool_panic)("hasBogusLiterals");
   6247    }
   6248 }
   6249 
   6250 
   6251 IRSB* MC_(instrument) ( VgCallbackClosure* closure,
   6252                         IRSB* sb_in,
   6253                         const VexGuestLayout* layout,
   6254                         const VexGuestExtents* vge,
   6255                         const VexArchInfo* archinfo_host,
   6256                         IRType gWordTy, IRType hWordTy )
   6257 {
   6258    Bool    verboze = 0||False;
   6259    Int     i, j, first_stmt;
   6260    IRStmt* st;
   6261    MCEnv   mce;
   6262    IRSB*   sb_out;
   6263 
   6264    if (gWordTy != hWordTy) {
   6265       /* We don't currently support this case. */
   6266       VG_(tool_panic)("host/guest word size mismatch");
   6267    }
   6268 
   6269    /* Check we're not completely nuts */
   6270    tl_assert(sizeof(UWord)  == sizeof(void*));
   6271    tl_assert(sizeof(Word)   == sizeof(void*));
   6272    tl_assert(sizeof(Addr)   == sizeof(void*));
   6273    tl_assert(sizeof(ULong)  == 8);
   6274    tl_assert(sizeof(Long)   == 8);
   6275    tl_assert(sizeof(UInt)   == 4);
   6276    tl_assert(sizeof(Int)    == 4);
   6277 
   6278    tl_assert(MC_(clo_mc_level) >= 1 && MC_(clo_mc_level) <= 3);
   6279 
   6280    /* Set up SB */
   6281    sb_out = deepCopyIRSBExceptStmts(sb_in);
   6282 
   6283    /* Set up the running environment.  Both .sb and .tmpMap are
   6284       modified as we go along.  Note that tmps are added to both
   6285       .sb->tyenv and .tmpMap together, so the valid index-set for
   6286       those two arrays should always be identical. */
   6287    VG_(memset)(&mce, 0, sizeof(mce));
   6288    mce.sb             = sb_out;
   6289    mce.trace          = verboze;
   6290    mce.layout         = layout;
   6291    mce.hWordTy        = hWordTy;
   6292    mce.bogusLiterals  = False;
   6293 
   6294    /* Do expensive interpretation for Iop_Add32 and Iop_Add64 on
   6295       Darwin.  10.7 is mostly built with LLVM, which uses these for
   6296       bitfield inserts, and we get a lot of false errors if the cheap
   6297       interpretation is used, alas.  Could solve this much better if
   6298       we knew which of such adds came from x86/amd64 LEA instructions,
   6299       since these are the only ones really needing the expensive
   6300       interpretation, but that would require some way to tag them in
   6301       the _toIR.c front ends, which is a lot of faffing around.  So
   6302       for now just use the slow and blunt-instrument solution. */
   6303    mce.useLLVMworkarounds = False;
   6304 #  if defined(VGO_darwin)
   6305    mce.useLLVMworkarounds = True;
   6306 #  endif
   6307 
   6308    mce.tmpMap = VG_(newXA)( VG_(malloc), "mc.MC_(instrument).1", VG_(free),
   6309                             sizeof(TempMapEnt));
   6310    VG_(hintSizeXA) (mce.tmpMap, sb_in->tyenv->types_used);
   6311    for (i = 0; i < sb_in->tyenv->types_used; i++) {
   6312       TempMapEnt ent;
   6313       ent.kind    = Orig;
   6314       ent.shadowV = IRTemp_INVALID;
   6315       ent.shadowB = IRTemp_INVALID;
   6316       VG_(addToXA)( mce.tmpMap, &ent );
   6317    }
   6318    tl_assert( VG_(sizeXA)( mce.tmpMap ) == sb_in->tyenv->types_used );
   6319 
   6320    if (MC_(clo_expensive_definedness_checks)) {
   6321       /* For expensive definedness checking skip looking for bogus
   6322          literals. */
   6323       mce.bogusLiterals = True;
   6324    } else {
   6325       /* Make a preliminary inspection of the statements, to see if there
   6326          are any dodgy-looking literals.  If there are, we generate
   6327          extra-detailed (hence extra-expensive) instrumentation in
   6328          places.  Scan the stmts in order, applying the flatness
   6329          assertion to each, and stop as soon as dodginess is found. */
   6330       Bool bogus = False;
   6331 
   6332       for (i = 0; i < sb_in->stmts_used; i++) {
   6333          st = sb_in->stmts[i];
   6334          tl_assert(st);
   6335          tl_assert(isFlatIRStmt(st));
   6336 
   6337          if (!bogus) {
   6338             bogus = checkForBogusLiterals(st);
   6339             if (0 && bogus) {
   6340                VG_(printf)("bogus: ");
   6341                ppIRStmt(st);
   6342                VG_(printf)("\n");
   6343             }
   6344             if (bogus) break;
   6345          }
   6346       }
   6347       mce.bogusLiterals = bogus;
   6348    }
   6349 
   6350    /* Copy verbatim any IR preamble preceding the first IMark */
   6351 
   6352    tl_assert(mce.sb == sb_out);
   6353    tl_assert(mce.sb != sb_in);
   6354 
   6355    i = 0;
   6356    while (i < sb_in->stmts_used && sb_in->stmts[i]->tag != Ist_IMark) {
   6357 
   6358       st = sb_in->stmts[i];
   6359       tl_assert(st);
   6360       tl_assert(isFlatIRStmt(st));
   6361 
   6362       stmt( 'C', &mce, sb_in->stmts[i] );
   6363       i++;
   6364    }
   6365 
   6366    /* Nasty problem.  IR optimisation of the pre-instrumented IR may
   6367       cause the IR following the preamble to contain references to IR
   6368       temporaries defined in the preamble.  Because the preamble isn't
   6369       instrumented, these temporaries don't have any shadows.
   6370       Nevertheless uses of them following the preamble will cause
   6371       memcheck to generate references to their shadows.  End effect is
   6372       to cause IR sanity check failures, due to references to
   6373       non-existent shadows.  This is only evident for the complex
   6374       preambles used for function wrapping on TOC-afflicted platforms
   6375       (ppc64-linux).
   6376 
   6377       The following loop therefore scans the preamble looking for
   6378       assignments to temporaries.  For each one found it creates an
   6379       assignment to the corresponding (V) shadow temp, marking it as
   6380       'defined'.  This is the same resulting IR as if the main
   6381       instrumentation loop before had been applied to the statement
   6382       'tmp = CONSTANT'.
   6383 
   6384       Similarly, if origin tracking is enabled, we must generate an
   6385       assignment for the corresponding origin (B) shadow, claiming
   6386       no-origin, as appropriate for a defined value.
   6387    */
   6388    for (j = 0; j < i; j++) {
   6389       if (sb_in->stmts[j]->tag == Ist_WrTmp) {
   6390          /* findShadowTmpV checks its arg is an original tmp;
   6391             no need to assert that here. */
   6392          IRTemp tmp_o = sb_in->stmts[j]->Ist.WrTmp.tmp;
   6393          IRTemp tmp_v = findShadowTmpV(&mce, tmp_o);
   6394          IRType ty_v  = typeOfIRTemp(sb_out->tyenv, tmp_v);
   6395          assign( 'V', &mce, tmp_v, definedOfType( ty_v ) );
   6396          if (MC_(clo_mc_level) == 3) {
   6397             IRTemp tmp_b = findShadowTmpB(&mce, tmp_o);
   6398             tl_assert(typeOfIRTemp(sb_out->tyenv, tmp_b) == Ity_I32);
   6399             assign( 'B', &mce, tmp_b, mkU32(0)/* UNKNOWN ORIGIN */);
   6400          }
   6401          if (0) {
   6402             VG_(printf)("create shadow tmp(s) for preamble tmp [%d] ty ", j);
   6403             ppIRType( ty_v );
   6404             VG_(printf)("\n");
   6405          }
   6406       }
   6407    }
   6408 
   6409    /* Iterate over the remaining stmts to generate instrumentation. */
   6410 
   6411    tl_assert(sb_in->stmts_used > 0);
   6412    tl_assert(i >= 0);
   6413    tl_assert(i < sb_in->stmts_used);
   6414    tl_assert(sb_in->stmts[i]->tag == Ist_IMark);
   6415 
   6416    for (/* use current i*/; i < sb_in->stmts_used; i++) {
   6417 
   6418       st = sb_in->stmts[i];
   6419       first_stmt = sb_out->stmts_used;
   6420 
   6421       if (verboze) {
   6422          VG_(printf)("\n");
   6423          ppIRStmt(st);
   6424          VG_(printf)("\n");
   6425       }
   6426 
   6427       if (MC_(clo_mc_level) == 3) {
   6428          /* See comments on case Ist_CAS below. */
   6429          if (st->tag != Ist_CAS)
   6430             schemeS( &mce, st );
   6431       }
   6432 
   6433       /* Generate instrumentation code for each stmt ... */
   6434 
   6435       switch (st->tag) {
   6436 
   6437          case Ist_WrTmp:
   6438             assign( 'V', &mce, findShadowTmpV(&mce, st->Ist.WrTmp.tmp),
   6439                                expr2vbits( &mce, st->Ist.WrTmp.data) );
   6440             break;
   6441 
   6442          case Ist_Put:
   6443             do_shadow_PUT( &mce,
   6444                            st->Ist.Put.offset,
   6445                            st->Ist.Put.data,
   6446                            NULL /* shadow atom */, NULL /* guard */ );
   6447             break;
   6448 
   6449          case Ist_PutI:
   6450             do_shadow_PUTI( &mce, st->Ist.PutI.details);
   6451             break;
   6452 
   6453          case Ist_Store:
   6454             do_shadow_Store( &mce, st->Ist.Store.end,
   6455                                    st->Ist.Store.addr, 0/* addr bias */,
   6456                                    st->Ist.Store.data,
   6457                                    NULL /* shadow data */,
   6458                                    NULL/*guard*/ );
   6459             break;
   6460 
   6461          case Ist_StoreG:
   6462             do_shadow_StoreG( &mce, st->Ist.StoreG.details );
   6463             break;
   6464 
   6465          case Ist_LoadG:
   6466             do_shadow_LoadG( &mce, st->Ist.LoadG.details );
   6467             break;
   6468 
   6469          case Ist_Exit:
   6470             complainIfUndefined( &mce, st->Ist.Exit.guard, NULL );
   6471             break;
   6472 
   6473          case Ist_IMark:
   6474             break;
   6475 
   6476          case Ist_NoOp:
   6477          case Ist_MBE:
   6478             break;
   6479 
   6480          case Ist_Dirty:
   6481             do_shadow_Dirty( &mce, st->Ist.Dirty.details );
   6482             break;
   6483 
   6484          case Ist_AbiHint:
   6485             do_AbiHint( &mce, st->Ist.AbiHint.base,
   6486                               st->Ist.AbiHint.len,
   6487                               st->Ist.AbiHint.nia );
   6488             break;
   6489 
   6490          case Ist_CAS:
   6491             do_shadow_CAS( &mce, st->Ist.CAS.details );
   6492             /* Note, do_shadow_CAS copies the CAS itself to the output
   6493                block, because it needs to add instrumentation both
   6494                before and after it.  Hence skip the copy below.  Also
   6495                skip the origin-tracking stuff (call to schemeS) above,
   6496                since that's all tangled up with it too; do_shadow_CAS
   6497                does it all. */
   6498             break;
   6499 
   6500          case Ist_LLSC:
   6501             do_shadow_LLSC( &mce,
   6502                             st->Ist.LLSC.end,
   6503                             st->Ist.LLSC.result,
   6504                             st->Ist.LLSC.addr,
   6505                             st->Ist.LLSC.storedata );
   6506             break;
   6507 
   6508          default:
   6509             VG_(printf)("\n");
   6510             ppIRStmt(st);
   6511             VG_(printf)("\n");
   6512             VG_(tool_panic)("memcheck: unhandled IRStmt");
   6513 
   6514       } /* switch (st->tag) */
   6515 
   6516       if (0 && verboze) {
   6517          for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6518             VG_(printf)("   ");
   6519             ppIRStmt(sb_out->stmts[j]);
   6520             VG_(printf)("\n");
   6521          }
   6522          VG_(printf)("\n");
   6523       }
   6524 
   6525       /* ... and finally copy the stmt itself to the output.  Except,
   6526          skip the copy of IRCASs; see comments on case Ist_CAS
   6527          above. */
   6528       if (st->tag != Ist_CAS)
   6529          stmt('C', &mce, st);
   6530    }
   6531 
   6532    /* Now we need to complain if the jump target is undefined. */
   6533    first_stmt = sb_out->stmts_used;
   6534 
   6535    if (verboze) {
   6536       VG_(printf)("sb_in->next = ");
   6537       ppIRExpr(sb_in->next);
   6538       VG_(printf)("\n\n");
   6539    }
   6540 
   6541    complainIfUndefined( &mce, sb_in->next, NULL );
   6542 
   6543    if (0 && verboze) {
   6544       for (j = first_stmt; j < sb_out->stmts_used; j++) {
   6545          VG_(printf)("   ");
   6546          ppIRStmt(sb_out->stmts[j]);
   6547          VG_(printf)("\n");
   6548       }
   6549       VG_(printf)("\n");
   6550    }
   6551 
   6552    /* If this fails, there's been some serious snafu with tmp management,
   6553       which should be investigated. */
   6554    tl_assert( VG_(sizeXA)( mce.tmpMap ) == mce.sb->tyenv->types_used );
   6555    VG_(deleteXA)( mce.tmpMap );
   6556 
   6557    tl_assert(mce.sb == sb_out);
   6558    return sb_out;
   6559 }
   6560 
   6561 /*------------------------------------------------------------*/
   6562 /*--- Post-tree-build final tidying                        ---*/
   6563 /*------------------------------------------------------------*/
   6564 
   6565 /* This exploits the observation that Memcheck often produces
   6566    repeated conditional calls of the form
   6567 
   6568    Dirty G MC_(helperc_value_check0/1/4/8_fail)(UInt otag)
   6569 
   6570    with the same guard expression G guarding the same helper call.
   6571    The second and subsequent calls are redundant.  This usually
   6572    results from instrumentation of guest code containing multiple
   6573    memory references at different constant offsets from the same base
   6574    register.  After optimisation of the instrumentation, you get a
   6575    test for the definedness of the base register for each memory
   6576    reference, which is kinda pointless.  MC_(final_tidy) therefore
   6577    looks for such repeated calls and removes all but the first. */
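
        /* For instance (a made-up illustration of the pattern, not real
           output): after instrumenting two loads at different constant
           offsets from the same base register, the optimised IR can contain
           two statements of the form

              if (t_guard) MC_(helperc_value_check8_fail_no_o)()

           with the same guard temporary t_guard.  The second (and any later)
           such call is turned into an IRStmt_NoOp below. */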
   6578 
   6579 /* A struct for recording which (helper, guard) pairs we have already
   6580    seen. */
   6581 typedef
   6582    struct { void* entry; IRExpr* guard; }
   6583    Pair;
   6584 
   6585 /* Return True if e1 and e2 definitely denote the same value (used to
   6586    compare guards).  Return False if unknown; False is the safe
   6587    answer.  Since guest registers and guest memory do not have the
   6588    SSA property we must return False if any Gets or Loads appear in
   6589    the expression. */
   6590 
   6591 static Bool sameIRValue ( IRExpr* e1, IRExpr* e2 )
   6592 {
   6593    if (e1->tag != e2->tag)
   6594       return False;
   6595    switch (e1->tag) {
   6596       case Iex_Const:
   6597          return eqIRConst( e1->Iex.Const.con, e2->Iex.Const.con );
   6598       case Iex_Binop:
   6599          return e1->Iex.Binop.op == e2->Iex.Binop.op
   6600                 && sameIRValue(e1->Iex.Binop.arg1, e2->Iex.Binop.arg1)
   6601                 && sameIRValue(e1->Iex.Binop.arg2, e2->Iex.Binop.arg2);
   6602       case Iex_Unop:
   6603          return e1->Iex.Unop.op == e2->Iex.Unop.op
   6604                 && sameIRValue(e1->Iex.Unop.arg, e2->Iex.Unop.arg);
   6605       case Iex_RdTmp:
   6606          return e1->Iex.RdTmp.tmp == e2->Iex.RdTmp.tmp;
   6607       case Iex_ITE:
   6608          return sameIRValue( e1->Iex.ITE.cond, e2->Iex.ITE.cond )
   6609                 && sameIRValue( e1->Iex.ITE.iftrue,  e2->Iex.ITE.iftrue )
   6610                 && sameIRValue( e1->Iex.ITE.iffalse, e2->Iex.ITE.iffalse );
   6611       case Iex_Qop:
   6612       case Iex_Triop:
   6613       case Iex_CCall:
   6614          /* be lazy.  Could define equality for these, but they never
   6615             appear to be used. */
   6616          return False;
   6617       case Iex_Get:
   6618       case Iex_GetI:
   6619       case Iex_Load:
   6620          /* be conservative - these may not give the same value each
   6621             time */
   6622          return False;
   6623       case Iex_Binder:
   6624          /* should never see this */
   6625          /* fallthrough */
   6626       default:
   6627          VG_(printf)("mc_translate.c: sameIRValue: unhandled: ");
   6628          ppIRExpr(e1);
   6629          VG_(tool_panic)("memcheck:sameIRValue");
   6630          return False;
   6631    }
   6632 }
   6633 
   6634 /* See if 'pairs' already has an entry for (entry, guard).  Return
   6635    True if so.  If not, add an entry. */
   6636 
   6637 static
   6638 Bool check_or_add ( XArray* /*of Pair*/ pairs, IRExpr* guard, void* entry )
   6639 {
   6640    Pair  p;
   6641    Pair* pp;
   6642    Int   i, n = VG_(sizeXA)( pairs );
   6643    for (i = 0; i < n; i++) {
   6644       pp = VG_(indexXA)( pairs, i );
   6645       if (pp->entry == entry && sameIRValue(pp->guard, guard))
   6646          return True;
   6647    }
   6648    p.guard = guard;
   6649    p.entry = entry;
   6650    VG_(addToXA)( pairs, &p );
   6651    return False;
   6652 }
   6653 
   6654 static Bool is_helperc_value_checkN_fail ( const HChar* name )
   6655 {
   6656    return
   6657       0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_no_o)")
   6658       || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_no_o)")
   6659       || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_no_o)")
   6660       || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_no_o)")
   6661       || 0==VG_(strcmp)(name, "MC_(helperc_value_check0_fail_w_o)")
   6662       || 0==VG_(strcmp)(name, "MC_(helperc_value_check1_fail_w_o)")
   6663       || 0==VG_(strcmp)(name, "MC_(helperc_value_check4_fail_w_o)")
   6664       || 0==VG_(strcmp)(name, "MC_(helperc_value_check8_fail_w_o)");
   6665 }
   6666 
   6667 IRSB* MC_(final_tidy) ( IRSB* sb_in )
   6668 {
   6669    Int i;
   6670    IRStmt*   st;
   6671    IRDirty*  di;
   6672    IRExpr*   guard;
   6673    IRCallee* cee;
   6674    Bool      alreadyPresent;
   6675    XArray*   pairs = VG_(newXA)( VG_(malloc), "mc.ft.1",
   6676                                  VG_(free), sizeof(Pair) );
   6677    /* Scan forwards through the statements.  Each time a call to one
   6678       of the relevant helpers is seen, check if we have made a
   6679       previous call to the same helper using the same guard
   6680       expression, and if so, delete the call. */
   6681    for (i = 0; i < sb_in->stmts_used; i++) {
   6682       st = sb_in->stmts[i];
   6683       tl_assert(st);
   6684       if (st->tag != Ist_Dirty)
   6685          continue;
   6686       di = st->Ist.Dirty.details;
   6687       guard = di->guard;
   6688       tl_assert(guard);
   6689       if (0) { ppIRExpr(guard); VG_(printf)("\n"); }
   6690       cee = di->cee;
   6691       if (!is_helperc_value_checkN_fail( cee->name ))
   6692          continue;
   6693       /* Ok, we have a call to helperc_value_check0/1/4/8_fail with
   6694          guard 'guard'.  Check if we have already seen a call to this
   6695          function with the same guard.  If so, delete it.  If not,
   6696          add it to the set of calls we do know about. */
   6697       alreadyPresent = check_or_add( pairs, guard, cee->addr );
   6698       if (alreadyPresent) {
   6699          sb_in->stmts[i] = IRStmt_NoOp();
   6700          if (0) VG_(printf)("XX\n");
   6701       }
   6702    }
   6703    VG_(deleteXA)( pairs );
   6704    return sb_in;
   6705 }
   6706 
   6707 
   6708 /*------------------------------------------------------------*/
   6709 /*--- Origin tracking stuff                                ---*/
   6710 /*------------------------------------------------------------*/
   6711 
   6712 /* Almost identical to findShadowTmpV. */
   6713 static IRTemp findShadowTmpB ( MCEnv* mce, IRTemp orig )
   6714 {
   6715    TempMapEnt* ent;
   6716    /* VG_(indexXA) range-checks 'orig', hence no need to check
   6717       here. */
   6718    ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6719    tl_assert(ent->kind == Orig);
   6720    if (ent->shadowB == IRTemp_INVALID) {
   6721       IRTemp tmpB
   6722         = newTemp( mce, Ity_I32, BSh );
   6723       /* newTemp may cause mce->tmpMap to resize, hence previous results
   6724          from VG_(indexXA) are invalid. */
   6725       ent = (TempMapEnt*)VG_(indexXA)( mce->tmpMap, (Word)orig );
   6726       tl_assert(ent->kind == Orig);
   6727       tl_assert(ent->shadowB == IRTemp_INVALID);
   6728       ent->shadowB = tmpB;
   6729    }
   6730    return ent->shadowB;
   6731 }
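
        /* Note that the B ("origin") shadow of a temporary is always a
           32-bit otag, irrespective of the original temporary's type --
           hence the fixed Ity_I32 in the newTemp call above. */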
   6732 
   6733 static IRAtom* gen_maxU32 ( MCEnv* mce, IRAtom* b1, IRAtom* b2 )
   6734 {
   6735    return assignNew( 'B', mce, Ity_I32, binop(Iop_Max32U, b1, b2) );
   6736 }
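
        /* Otags are merged with unsigned max.  Since an otag of zero means
           "unknown origin", zero is the identity for the merge:
           Max32U(0, b) == b; when both inputs are valid otags, the
           numerically larger one is (arbitrarily) kept. */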
   6737 
   6738 
   6739 /* Make a guarded origin load, with no special handling in the
   6740    didn't-happen case.  A GUARD of NULL is assumed to mean "always
   6741    True".
   6742 
   6743    Generate IR to do a shadow origins load from BASEADDR+OFFSET and
   6744    return the otag.  The loaded size is SZB.  If GUARD evaluates to
   6745    False at run time then the returned otag is bogus (see comment below).
   6746 */
   6747 static IRAtom* gen_guarded_load_b ( MCEnv* mce, Int szB,
   6748                                     IRAtom* baseaddr,
   6749                                     Int offset, IRExpr* guard )
   6750 {
   6751    void*    hFun;
   6752    const HChar* hName;
   6753    IRTemp   bTmp;
   6754    IRDirty* di;
   6755    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6756    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6757    IRAtom*  ea    = baseaddr;
   6758    if (offset != 0) {
   6759       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6760                                    : mkU64( (Long)(Int)offset );
   6761       ea = assignNew( 'B', mce, aTy, binop(opAdd, ea, off));
   6762    }
   6763    bTmp = newTemp(mce, mce->hWordTy, BSh);
   6764 
   6765    switch (szB) {
   6766       case 1: hFun  = (void*)&MC_(helperc_b_load1);
   6767               hName = "MC_(helperc_b_load1)";
   6768               break;
   6769       case 2: hFun  = (void*)&MC_(helperc_b_load2);
   6770               hName = "MC_(helperc_b_load2)";
   6771               break;
   6772       case 4: hFun  = (void*)&MC_(helperc_b_load4);
   6773               hName = "MC_(helperc_b_load4)";
   6774               break;
   6775       case 8: hFun  = (void*)&MC_(helperc_b_load8);
   6776               hName = "MC_(helperc_b_load8)";
   6777               break;
   6778       case 16: hFun  = (void*)&MC_(helperc_b_load16);
   6779                hName = "MC_(helperc_b_load16)";
   6780                break;
   6781       case 32: hFun  = (void*)&MC_(helperc_b_load32);
   6782                hName = "MC_(helperc_b_load32)";
   6783                break;
   6784       default:
   6785          VG_(printf)("mc_translate.c: gen_load_b: unhandled szB == %d\n", szB);
   6786          tl_assert(0);
   6787    }
   6788    di = unsafeIRDirty_1_N(
   6789            bTmp, 1/*regparms*/, hName, VG_(fnptr_to_fnentry)( hFun ),
   6790            mkIRExprVec_1( ea )
   6791         );
   6792    if (guard) {
   6793       di->guard = guard;
   6794       /* Ideally the didn't-happen return value here would be
   6795          all-zeroes (unknown-origin), so it'd be harmless if it got
   6796          used inadvertently.  We slum it out with the IR-mandated
   6797          default value (0b01 repeating, 0x55 etc) as that'll probably
   6798          trump all legitimate otags via Max32, and it's pretty
   6799          obviously bogus. */
   6800    }
   6801    /* no need to mess with any annotations.  This call accesses
   6802       neither guest state nor guest memory. */
   6803    stmt( 'B', mce, IRStmt_Dirty(di) );
   6804    if (mce->hWordTy == Ity_I64) {
   6805       /* 64-bit host */
   6806       IRTemp bTmp32 = newTemp(mce, Ity_I32, BSh);
   6807       assign( 'B', mce, bTmp32, unop(Iop_64to32, mkexpr(bTmp)) );
   6808       return mkexpr(bTmp32);
   6809    } else {
   6810       /* 32-bit host */
   6811       return mkexpr(bTmp);
   6812    }
   6813 }
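
        /* Illustrative sketch (temporary numbers invented): on a 64-bit
           host, a call gen_guarded_load_b(mce, 4, baseaddr, 8, guard) with
           baseaddr in t2 and guard in t7 emits roughly

              t30 = Add64(t2, 0x8:I64)
              t31 = <dirty call, guarded by t7, to MC_(helperc_b_load4)(t30)>
              t32 = 64to32(t31)

           and returns t32 as the loaded otag. */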
   6814 
   6815 
   6816 /* Generate IR to do a shadow origins load from BASEADDR+OFFSET.  The
   6817    loaded size is SZB.  The load is regarded as unconditional (always
   6818    happens).
   6819 */
   6820 static IRAtom* gen_load_b ( MCEnv* mce, Int szB, IRAtom* baseaddr,
   6821                             Int offset )
   6822 {
   6823    return gen_guarded_load_b(mce, szB, baseaddr, offset, NULL/*guard*/);
   6824 }
   6825 
   6826 
   6827 /* The most general handler for guarded origin loads.  A GUARD of NULL
   6828    is assumed to mean "always True".
   6829 
   6830    Generate IR to do a shadow origin load from ADDR+BIAS and return
   6831    the B bits.  The loaded type is TY.  If GUARD evaluates to False at
   6832    run time then the returned B bits are simply BALT instead.
   6833 */
   6834 static
   6835 IRAtom* expr2ori_Load_guarded_General ( MCEnv* mce,
   6836                                         IRType ty,
   6837                                         IRAtom* addr, UInt bias,
   6838                                         IRAtom* guard, IRAtom* balt )
   6839 {
   6840    /* If the guard evaluates to True, this will hold the loaded
   6841       origin.  If the guard evaluates to False, this will be zero,
   6842       meaning "unknown origin", in which case we will have to replace
   6843       it using an ITE below. */
   6844    IRAtom* iftrue
   6845       = assignNew('B', mce, Ity_I32,
   6846                   gen_guarded_load_b(mce, sizeofIRType(ty),
   6847                                      addr, bias, guard));
   6848    /* These are the bits we will return if the load doesn't take
   6849       place. */
   6850    IRAtom* iffalse
   6851       = balt;
   6852    /* Prepare the cond for the ITE.  Convert a NULL cond into
   6853       something that iropt knows how to fold out later. */
   6854    IRAtom* cond
   6855       = guard == NULL  ? mkU1(1)  : guard;
   6856    /* And assemble the final result. */
   6857    return assignNew('B', mce, Ity_I32, IRExpr_ITE(cond, iftrue, iffalse));
   6858 }
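
        /* Net effect, sketched: with a non-NULL guard this produces

              iftrue = <guarded origin load of sizeofIRType(ty) bytes
                        at addr+bias>
              result = ITE(guard, iftrue, balt)

           so when the guard is false, the caller-supplied alternative
           origin BALT is used and the bogus didn't-happen value from the
           guarded load is discarded. */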
   6859 
   6860 
   6861 /* Generate a shadow origins store.  guard :: Ity_I1 controls whether
   6862    the store really happens; NULL means it unconditionally does. */
   6863 static void gen_store_b ( MCEnv* mce, Int szB,
   6864                           IRAtom* baseaddr, Int offset, IRAtom* dataB,
   6865                           IRAtom* guard )
   6866 {
   6867    void*    hFun;
   6868    const HChar* hName;
   6869    IRDirty* di;
   6870    IRType   aTy   = typeOfIRExpr( mce->sb->tyenv, baseaddr );
   6871    IROp     opAdd = aTy == Ity_I32 ? Iop_Add32 : Iop_Add64;
   6872    IRAtom*  ea    = baseaddr;
   6873    if (guard) {
   6874       tl_assert(isOriginalAtom(mce, guard));
   6875       tl_assert(typeOfIRExpr(mce->sb->tyenv, guard) == Ity_I1);
   6876    }
   6877    if (offset != 0) {
   6878       IRAtom* off = aTy == Ity_I32 ? mkU32( offset )
   6879                                    : mkU64( (Long)(Int)offset );
   6880       ea = assignNew(  'B', mce, aTy, binop(opAdd, ea, off));
   6881    }
   6882    if (mce->hWordTy == Ity_I64)
   6883       dataB = assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, dataB));
   6884 
   6885    switch (szB) {
   6886       case 1: hFun  = (void*)&MC_(helperc_b_store1);
   6887               hName = "MC_(helperc_b_store1)";
   6888               break;
   6889       case 2: hFun  = (void*)&MC_(helperc_b_store2);
   6890               hName = "MC_(helperc_b_store2)";
   6891               break;
   6892       case 4: hFun  = (void*)&MC_(helperc_b_store4);
   6893               hName = "MC_(helperc_b_store4)";
   6894               break;
   6895       case 8: hFun  = (void*)&MC_(helperc_b_store8);
   6896               hName = "MC_(helperc_b_store8)";
   6897               break;
   6898       case 16: hFun  = (void*)&MC_(helperc_b_store16);
   6899                hName = "MC_(helperc_b_store16)";
   6900                break;
   6901       case 32: hFun  = (void*)&MC_(helperc_b_store32);
   6902                hName = "MC_(helperc_b_store32)";
   6903                break;
   6904       default:
   6905          tl_assert(0);
   6906    }
   6907    di = unsafeIRDirty_0_N( 2/*regparms*/,
   6908            hName, VG_(fnptr_to_fnentry)( hFun ),
   6909            mkIRExprVec_2( ea, dataB )
   6910         );
   6911    /* no need to mess with any annotations.  This call accesses
   6912       neither guest state nor guest memory. */
   6913    if (guard) di->guard = guard;
   6914    stmt( 'B', mce, IRStmt_Dirty(di) );
   6915 }
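
        /* Illustrative sketch (temporary numbers invented): on a 64-bit
           host, gen_store_b(mce, 8, baseaddr, 0, dataB, NULL) with baseaddr
           in t2 and dataB in t5 emits roughly

              t40 = 32Uto64(t5)
              <dirty call to MC_(helperc_b_store8)(t2, t40)>

           i.e. the 32-bit otag is zero-widened to the host word size
           before being handed to the store helper. */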
   6916 
   6917 static IRAtom* narrowTo32 ( MCEnv* mce, IRAtom* e ) {
   6918    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   6919    if (eTy == Ity_I64)
   6920       return assignNew( 'B', mce, Ity_I32, unop(Iop_64to32, e) );
   6921    if (eTy == Ity_I32)
   6922       return e;
   6923    tl_assert(0);
   6924 }
   6925 
   6926 static IRAtom* zWidenFrom32 ( MCEnv* mce, IRType dstTy, IRAtom* e ) {
   6927    IRType eTy = typeOfIRExpr(mce->sb->tyenv, e);
   6928    tl_assert(eTy == Ity_I32);
   6929    if (dstTy == Ity_I64)
   6930       return assignNew( 'B', mce, Ity_I64, unop(Iop_32Uto64, e) );
   6931    tl_assert(0);
   6932 }
   6933 
   6934 
   6935 static IRAtom* schemeE ( MCEnv* mce, IRExpr* e )
   6936 {
   6937    tl_assert(MC_(clo_mc_level) == 3);
   6938 
   6939    switch (e->tag) {
   6940 
   6941       case Iex_GetI: {
   6942          IRRegArray* descr_b;
   6943          IRAtom      *t1, *t2, *t3, *t4;
   6944          IRRegArray* descr      = e->Iex.GetI.descr;
   6945          IRType equivIntTy
   6946             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   6947          /* If this array is unshadowable for whatever reason, use the
   6948             usual approximation. */
   6949          if (equivIntTy == Ity_INVALID)
   6950             return mkU32(0);
   6951          tl_assert(sizeofIRType(equivIntTy) >= 4);
   6952          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   6953          descr_b = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   6954                                  equivIntTy, descr->nElems );
   6955          /* Do a shadow indexed get of the same size, giving t1.  Take
   6956             the bottom 32 bits of it, giving t2.  Compute into t3 the
   6957             origin for the index (almost certainly zero, but there's
   6958             no harm in being completely general here, since iropt will
   6959             remove any useless code), and fold it in, giving a final
   6960             value t4. */
   6961          t1 = assignNew( 'B', mce, equivIntTy,
   6962                           IRExpr_GetI( descr_b, e->Iex.GetI.ix,
   6963                                                 e->Iex.GetI.bias ));
   6964          t2 = narrowTo32( mce, t1 );
   6965          t3 = schemeE( mce, e->Iex.GetI.ix );
   6966          t4 = gen_maxU32( mce, t2, t3 );
   6967          return t4;
   6968       }
   6969       case Iex_CCall: {
   6970          Int i;
   6971          IRAtom*  here;
   6972          IRExpr** args = e->Iex.CCall.args;
   6973          IRAtom*  curr = mkU32(0);
   6974          for (i = 0; args[i]; i++) {
   6975             tl_assert(i < 32);
   6976             tl_assert(isOriginalAtom(mce, args[i]));
   6977             /* Only take notice of this arg if the callee's
   6978                mc-exclusion mask does not say it is to be excluded. */
   6979             if (e->Iex.CCall.cee->mcx_mask & (1<<i)) {
   6980                /* the arg is to be excluded from definedness checking.
   6981                   Do nothing. */
   6982                if (0) VG_(printf)("excluding %s(%d)\n",
   6983                                   e->Iex.CCall.cee->name, i);
   6984             } else {
   6985                /* calculate the arg's definedness, and pessimistically
   6986                   merge it in. */
   6987                here = schemeE( mce, args[i] );
   6988                curr = gen_maxU32( mce, curr, here );
   6989             }
   6990          }
   6991          return curr;
   6992       }
   6993       case Iex_Load: {
   6994          Int dszB;
   6995          dszB = sizeofIRType(e->Iex.Load.ty);
   6996          /* assert that the B value for the address is already
   6997             available (somewhere) */
   6998          tl_assert(isIRAtom(e->Iex.Load.addr));
   6999          tl_assert(mce->hWordTy == Ity_I32 || mce->hWordTy == Ity_I64);
   7000          return gen_load_b( mce, dszB, e->Iex.Load.addr, 0 );
   7001       }
   7002       case Iex_ITE: {
   7003          IRAtom* b1 = schemeE( mce, e->Iex.ITE.cond );
   7004          IRAtom* b3 = schemeE( mce, e->Iex.ITE.iftrue );
   7005          IRAtom* b2 = schemeE( mce, e->Iex.ITE.iffalse );
   7006          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ));
   7007       }
   7008       case Iex_Qop: {
   7009          IRAtom* b1 = schemeE( mce, e->Iex.Qop.details->arg1 );
   7010          IRAtom* b2 = schemeE( mce, e->Iex.Qop.details->arg2 );
   7011          IRAtom* b3 = schemeE( mce, e->Iex.Qop.details->arg3 );
   7012          IRAtom* b4 = schemeE( mce, e->Iex.Qop.details->arg4 );
   7013          return gen_maxU32( mce, gen_maxU32( mce, b1, b2 ),
   7014                                  gen_maxU32( mce, b3, b4 ) );
   7015       }
   7016       case Iex_Triop: {
   7017          IRAtom* b1 = schemeE( mce, e->Iex.Triop.details->arg1 );
   7018          IRAtom* b2 = schemeE( mce, e->Iex.Triop.details->arg2 );
   7019          IRAtom* b3 = schemeE( mce, e->Iex.Triop.details->arg3 );
   7020          return gen_maxU32( mce, b1, gen_maxU32( mce, b2, b3 ) );
   7021       }
   7022       case Iex_Binop: {
   7023          switch (e->Iex.Binop.op) {
   7024             case Iop_CasCmpEQ8:  case Iop_CasCmpNE8:
   7025             case Iop_CasCmpEQ16: case Iop_CasCmpNE16:
   7026             case Iop_CasCmpEQ32: case Iop_CasCmpNE32:
   7027             case Iop_CasCmpEQ64: case Iop_CasCmpNE64:
   7028                /* Just say these all produce a defined result,
   7029                   regardless of their arguments.  See
   7030                   COMMENT_ON_CasCmpEQ in this file. */
   7031                return mkU32(0);
   7032             default: {
   7033                IRAtom* b1 = schemeE( mce, e->Iex.Binop.arg1 );
   7034                IRAtom* b2 = schemeE( mce, e->Iex.Binop.arg2 );
   7035                return gen_maxU32( mce, b1, b2 );
   7036             }
   7037          }
   7038          tl_assert(0);
   7039          /*NOTREACHED*/
   7040       }
   7041       case Iex_Unop: {
   7042          IRAtom* b1 = schemeE( mce, e->Iex.Unop.arg );
   7043          return b1;
   7044       }
   7045       case Iex_Const:
   7046          return mkU32(0);
   7047       case Iex_RdTmp:
   7048          return mkexpr( findShadowTmpB( mce, e->Iex.RdTmp.tmp ));
   7049       case Iex_Get: {
   7050          Int b_offset = MC_(get_otrack_shadow_offset)(
   7051                            e->Iex.Get.offset,
   7052                            sizeofIRType(e->Iex.Get.ty)
   7053                         );
   7054          tl_assert(b_offset >= -1
   7055                    && b_offset <= mce->layout->total_sizeB -4);
   7056          if (b_offset >= 0) {
   7057             /* FIXME: this isn't an atom! */
   7058             return IRExpr_Get( b_offset + 2*mce->layout->total_sizeB,
   7059                                Ity_I32 );
   7060          }
   7061          return mkU32(0);
   7062       }
   7063       default:
   7064          VG_(printf)("mc_translate.c: schemeE: unhandled: ");
   7065          ppIRExpr(e);
   7066          VG_(tool_panic)("memcheck:schemeE");
   7067    }
   7068 }
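
        /* Worked example (temporary numbers invented): for the expression
           Add32(t11, LDle:I32(t13)), schemeE produces

              b_load = <unconditional 4-byte origin load at t13>
                                                   -- the Iex_Load case
              result = Max32U(B(t11), b_load)      -- the Iex_Binop default

           where B(t11) is t11's shadow-B temporary (findShadowTmpB).
           Constants contribute mkU32(0), i.e. "no known origin". */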
   7069 
   7070 
   7071 static void do_origins_Dirty ( MCEnv* mce, IRDirty* d )
   7072 {
   7073    // This is a hacked version of do_shadow_Dirty
   7074    Int       i, k, n, toDo, gSz, gOff;
   7075    IRAtom    *here, *curr;
   7076    IRTemp    dst;
   7077 
   7078    /* First check the guard. */
   7079    curr = schemeE( mce, d->guard );
   7080 
   7081    /* Now round up all inputs and maxU32 over them. */
   7082 
   7083    /* Inputs: unmasked args
   7084       Note: arguments are evaluated REGARDLESS of the guard expression */
   7085    for (i = 0; d->args[i]; i++) {
   7086       IRAtom* arg = d->args[i];
   7087       if ( (d->cee->mcx_mask & (1<<i))
   7088            || UNLIKELY(is_IRExpr_VECRET_or_BBPTR(arg)) ) {
   7089          /* ignore this arg */
   7090       } else {
   7091          here = schemeE( mce, arg );
   7092          curr = gen_maxU32( mce, curr, here );
   7093       }
   7094    }
   7095 
   7096    /* Inputs: guest state that we read. */
   7097    for (i = 0; i < d->nFxState; i++) {
   7098       tl_assert(d->fxState[i].fx != Ifx_None);
   7099       if (d->fxState[i].fx == Ifx_Write)
   7100          continue;
   7101 
   7102       /* Enumerate the described state segments */
   7103       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   7104          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   7105          gSz  = d->fxState[i].size;
   7106 
   7107          /* Ignore any sections marked as 'always defined'. */
   7108          if (isAlwaysDefd(mce, gOff, gSz)) {
   7109             if (0)
   7110             VG_(printf)("memcheck: Dirty gst: ignored off %d, sz %d\n",
   7111                         gOff, gSz);
   7112             continue;
   7113          }
   7114 
   7115          /* This state element is read or modified.  So we need to
   7116             consider it.  If larger than 4 bytes, deal with it in
   7117             4-byte chunks. */
   7118          while (True) {
   7119             Int b_offset;
   7120             tl_assert(gSz >= 0);
   7121             if (gSz == 0) break;
   7122             n = gSz <= 4 ? gSz : 4;
   7123             /* update 'curr' with maxU32 of the state slice
   7124                gOff .. gOff+n-1 */
   7125             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   7126             if (b_offset != -1) {
   7127                /* Observe the guard expression. If it is false use 0, i.e.
   7128                   nothing is known about the origin */
   7129                IRAtom *cond, *iffalse, *iftrue;
   7130 
   7131                cond = assignNew( 'B', mce, Ity_I1, d->guard);
   7132                iffalse = mkU32(0);
   7133                iftrue  = assignNew( 'B', mce, Ity_I32,
   7134                                     IRExpr_Get(b_offset
   7135                                                  + 2*mce->layout->total_sizeB,
   7136                                                Ity_I32));
   7137                here = assignNew( 'B', mce, Ity_I32,
   7138                                  IRExpr_ITE(cond, iftrue, iffalse));
   7139                curr = gen_maxU32( mce, curr, here );
   7140             }
   7141             gSz -= n;
   7142             gOff += n;
   7143          }
   7144       }
   7145    }
   7146 
   7147    /* Inputs: memory */
   7148 
   7149    if (d->mFx != Ifx_None) {
   7150       /* Because we may do multiple shadow loads/stores from the same
   7151          base address, it's best to do a single test of its
   7152          definedness right now.  Post-instrumentation optimisation
   7153          should remove all but this test. */
   7154       tl_assert(d->mAddr);
   7155       here = schemeE( mce, d->mAddr );
   7156       curr = gen_maxU32( mce, curr, here );
   7157    }
   7158 
   7159    /* Deal with memory inputs (reads or modifies) */
   7160    if (d->mFx == Ifx_Read || d->mFx == Ifx_Modify) {
   7161       toDo   = d->mSize;
   7162       /* chew off 32-bit chunks.  We don't care about the endianness
   7163          since it's all going to be condensed down to a single otag,
   7164          but nevertheless choose an endianness which is hopefully
   7165          native to the platform. */
   7166       while (toDo >= 4) {
   7167          here = gen_guarded_load_b( mce, 4, d->mAddr, d->mSize - toDo,
   7168                                     d->guard );
   7169          curr = gen_maxU32( mce, curr, here );
   7170          toDo -= 4;
   7171       }
   7172       /* handle possible 16-bit excess */
   7173       while (toDo >= 2) {
   7174          here = gen_guarded_load_b( mce, 2, d->mAddr, d->mSize - toDo,
   7175                                     d->guard );
   7176          curr = gen_maxU32( mce, curr, here );
   7177          toDo -= 2;
   7178       }
   7179       /* chew off the remaining 8-bit chunk, if any */
   7180       if (toDo == 1) {
   7181          here = gen_guarded_load_b( mce, 1, d->mAddr, d->mSize - toDo,
   7182                                     d->guard );
   7183          curr = gen_maxU32( mce, curr, here );
   7184          toDo -= 1;
   7185       }
   7186       tl_assert(toDo == 0);
   7187    }
   7188 
   7189    /* Whew!  So curr is a 32-bit B-value which should give an origin
   7190       of some use if any of the inputs to the helper are undefined.
   7191       Now we need to re-distribute the results to all destinations. */
   7192 
   7193    /* Outputs: the destination temporary, if there is one. */
   7194    if (d->tmp != IRTemp_INVALID) {
   7195       dst   = findShadowTmpB(mce, d->tmp);
   7196       assign( 'V', mce, dst, curr );
   7197    }
   7198 
   7199    /* Outputs: guest state that we write or modify. */
   7200    for (i = 0; i < d->nFxState; i++) {
   7201       tl_assert(d->fxState[i].fx != Ifx_None);
   7202       if (d->fxState[i].fx == Ifx_Read)
   7203          continue;
   7204 
   7205       /* Enumerate the described state segments */
   7206       for (k = 0; k < 1 + d->fxState[i].nRepeats; k++) {
   7207          gOff = d->fxState[i].offset + k * d->fxState[i].repeatLen;
   7208          gSz  = d->fxState[i].size;
   7209 
   7210          /* Ignore any sections marked as 'always defined'. */
   7211          if (isAlwaysDefd(mce, gOff, gSz))
   7212             continue;
   7213 
   7214          /* This state element is written or modified.  So we need to
   7215             consider it.  If larger than 4 bytes, deal with it in
   7216             4-byte chunks. */
   7217          while (True) {
   7218             Int b_offset;
   7219             tl_assert(gSz >= 0);
   7220             if (gSz == 0) break;
   7221             n = gSz <= 4 ? gSz : 4;
   7222             /* Write 'curr' to the state slice gOff .. gOff+n-1 */
   7223             b_offset = MC_(get_otrack_shadow_offset)(gOff, 4);
   7224             if (b_offset != -1) {
   7225 
   7226                /* If the guard expression evaluates to false we simply Put
   7227                   the value that is already stored in the guest state slot */
   7228                IRAtom *cond, *iffalse;
   7229 
   7230                cond    = assignNew('B', mce, Ity_I1,
   7231                                    d->guard);
   7232                iffalse = assignNew('B', mce, Ity_I32,
   7233                                    IRExpr_Get(b_offset +
   7234                                               2*mce->layout->total_sizeB,
   7235                                               Ity_I32));
   7236                curr = assignNew('V', mce, Ity_I32,
   7237                                 IRExpr_ITE(cond, curr, iffalse));
   7238 
   7239                stmt( 'B', mce, IRStmt_Put(b_offset
   7240                                           + 2*mce->layout->total_sizeB,
   7241                                           curr ));
   7242             }
   7243             gSz -= n;
   7244             gOff += n;
   7245          }
   7246       }
   7247    }
   7248 
   7249    /* Outputs: memory that we write or modify.  Same comments about
   7250       endianness as above apply. */
   7251    if (d->mFx == Ifx_Write || d->mFx == Ifx_Modify) {
   7252       toDo   = d->mSize;
   7253       /* chew off 32-bit chunks */
   7254       while (toDo >= 4) {
   7255          gen_store_b( mce, 4, d->mAddr, d->mSize - toDo, curr,
   7256                       d->guard );
   7257          toDo -= 4;
   7258       }
   7259       /* handle possible 16-bit excess */
   7260       while (toDo >= 2) {
   7261          gen_store_b( mce, 2, d->mAddr, d->mSize - toDo, curr,
   7262                       d->guard );
   7263          toDo -= 2;
   7264       }
   7265       /* chew off the remaining 8-bit chunk, if any */
   7266       if (toDo == 1) {
   7267          gen_store_b( mce, 1, d->mAddr, d->mSize - toDo, curr,
   7268                       d->guard );
   7269          toDo -= 1;
   7270       }
   7271       tl_assert(toDo == 0);
   7272    }
   7273 }
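
        /* Example of the memory chunking above: a dirty helper with
           d->mSize == 7 gives rise to guarded origin loads (for Ifx_Read /
           Ifx_Modify) or stores (for Ifx_Write / Ifx_Modify) of 4 bytes at
           offset 0, 2 bytes at offset 4 and 1 byte at offset 6, so every
           byte of the described range is covered exactly once. */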
   7274 
   7275 
   7276 /* Generate IR for origin shadowing for a general guarded store. */
   7277 static void do_origins_Store_guarded ( MCEnv* mce,
   7278                                        IREndness stEnd,
   7279                                        IRExpr* stAddr,
   7280                                        IRExpr* stData,
   7281                                        IRExpr* guard )
   7282 {
   7283    Int     dszB;
   7284    IRAtom* dataB;
   7285    /* assert that the B value for the address is already available
   7286       (somewhere), since the call to schemeE will want to see it.
   7287       XXXX how does this actually ensure that?? */
   7288    tl_assert(isIRAtom(stAddr));
   7289    tl_assert(isIRAtom(stData));
   7290    dszB  = sizeofIRType( typeOfIRExpr(mce->sb->tyenv, stData ) );
   7291    dataB = schemeE( mce, stData );
   7292    gen_store_b( mce, dszB, stAddr, 0/*offset*/, dataB, guard );
   7293 }
   7294 
   7295 
   7296 /* Generate IR for origin shadowing for a plain store. */
   7297 static void do_origins_Store_plain ( MCEnv* mce,
   7298                                      IREndness stEnd,
   7299                                      IRExpr* stAddr,
   7300                                      IRExpr* stData )
   7301 {
   7302    do_origins_Store_guarded ( mce, stEnd, stAddr, stData,
   7303                               NULL/*guard*/ );
   7304 }
   7305 
   7306 
   7307 /* ---- Dealing with LoadG/StoreG (not entirely simple) ---- */
   7308 
   7309 static void do_origins_StoreG ( MCEnv* mce, IRStoreG* sg )
   7310 {
   7311    do_origins_Store_guarded( mce, sg->end, sg->addr,
   7312                              sg->data, sg->guard );
   7313 }
   7314 
   7315 static void do_origins_LoadG ( MCEnv* mce, IRLoadG* lg )
   7316 {
   7317    IRType loadedTy = Ity_INVALID;
   7318    switch (lg->cvt) {
   7319       case ILGop_IdentV128: loadedTy = Ity_V128; break;
   7320       case ILGop_Ident64:   loadedTy = Ity_I64;  break;
   7321       case ILGop_Ident32:   loadedTy = Ity_I32;  break;
   7322       case ILGop_16Uto32:   loadedTy = Ity_I16;  break;
   7323       case ILGop_16Sto32:   loadedTy = Ity_I16;  break;
   7324       case ILGop_8Uto32:    loadedTy = Ity_I8;   break;
   7325       case ILGop_8Sto32:    loadedTy = Ity_I8;   break;
   7326       default: VG_(tool_panic)("schemeS.IRLoadG");
   7327    }
   7328    IRAtom* ori_alt
   7329       = schemeE( mce, lg->alt );
   7330    IRAtom* ori_final
   7331       = expr2ori_Load_guarded_General(mce, loadedTy,
   7332                                       lg->addr, 0/*addr bias*/,
   7333                                       lg->guard, ori_alt );
   7334    /* And finally, bind the origin to the destination temporary. */
   7335    assign( 'B', mce, findShadowTmpB(mce, lg->dst), ori_final );
   7336 }
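
        /* Example: for a guarded load with lg->cvt == ILGop_16Uto32,
           loadedTy is Ity_I16, so a 2-byte guarded origin load is generated
           at lg->addr, and the origin bound to lg->dst is
           ITE(lg->guard, <loaded otag>, <otag of lg->alt>). */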
   7337 
   7338 
   7339 static void schemeS ( MCEnv* mce, IRStmt* st )
   7340 {
   7341    tl_assert(MC_(clo_mc_level) == 3);
   7342 
   7343    switch (st->tag) {
   7344 
   7345       case Ist_AbiHint:
   7346          /* The value-check instrumenter handles this - by arranging
   7347             to pass the address of the next instruction to
   7348             MC_(helperc_MAKE_STACK_UNINIT).  This is all that needs to
   7349             happen for origin tracking w.r.t. AbiHints.  So there is
   7350             nothing to do here. */
   7351          break;
   7352 
   7353       case Ist_PutI: {
   7354          IRPutI *puti = st->Ist.PutI.details;
   7355          IRRegArray* descr_b;
   7356          IRAtom      *t1, *t2, *t3, *t4;
   7357          IRRegArray* descr = puti->descr;
   7358          IRType equivIntTy
   7359             = MC_(get_otrack_reg_array_equiv_int_type)(descr);
   7360          /* If this array is unshadowable for whatever reason,
   7361             generate no code. */
   7362          if (equivIntTy == Ity_INVALID)
   7363             break;
   7364          tl_assert(sizeofIRType(equivIntTy) >= 4);
   7365          tl_assert(sizeofIRType(equivIntTy) == sizeofIRType(descr->elemTy));
   7366          descr_b
   7367             = mkIRRegArray( descr->base + 2*mce->layout->total_sizeB,
   7368                             equivIntTy, descr->nElems );
   7369          /* Compute a value to Put - the conjoinment of the origin for
   7370             the data to be Put-ted (obviously) and of the index value
   7371             (not so obviously). */
   7372          t1 = schemeE( mce, puti->data );
   7373          t2 = schemeE( mce, puti->ix );
   7374          t3 = gen_maxU32( mce, t1, t2 );
   7375          t4 = zWidenFrom32( mce, equivIntTy, t3 );
   7376          stmt( 'B', mce, IRStmt_PutI( mkIRPutI(descr_b, puti->ix,
   7377                                                puti->bias, t4) ));
   7378          break;
   7379       }
   7380 
   7381       case Ist_Dirty:
   7382          do_origins_Dirty( mce, st->Ist.Dirty.details );
   7383          break;
   7384 
   7385       case Ist_Store:
   7386          do_origins_Store_plain( mce, st->Ist.Store.end,
   7387                                       st->Ist.Store.addr,
   7388                                       st->Ist.Store.data );
   7389          break;
   7390 
   7391       case Ist_StoreG:
   7392          do_origins_StoreG( mce, st->Ist.StoreG.details );
   7393          break;
   7394 
   7395       case Ist_LoadG:
   7396          do_origins_LoadG( mce, st->Ist.LoadG.details );
   7397          break;
   7398 
   7399       case Ist_LLSC: {
   7400          /* In short: treat a load-linked like a normal load followed
   7401             by an assignment of the loaded (shadow) data to the result
   7402             temporary.  Treat a store-conditional like a normal store,
   7403             and mark the result temporary as defined. */
   7404          if (st->Ist.LLSC.storedata == NULL) {
   7405             /* Load Linked */
   7406             IRType resTy
   7407                = typeOfIRTemp(mce->sb->tyenv, st->Ist.LLSC.result);
   7408             IRExpr* vanillaLoad
   7409                = IRExpr_Load(st->Ist.LLSC.end, resTy, st->Ist.LLSC.addr);
   7410             tl_assert(resTy == Ity_I64 || resTy == Ity_I32
   7411                       || resTy == Ity_I16 || resTy == Ity_I8);
   7412             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7413                               schemeE(mce, vanillaLoad));
   7414          } else {
   7415             /* Store conditional */
   7416             do_origins_Store_plain( mce, st->Ist.LLSC.end,
   7417                                     st->Ist.LLSC.addr,
   7418                                     st->Ist.LLSC.storedata );
   7419             /* For the rationale behind this, see comments at the
   7420                place where the V-shadow for .result is constructed, in
   7421                do_shadow_LLSC.  In short, we regard .result as
   7422                always-defined. */
   7423             assign( 'B', mce, findShadowTmpB(mce, st->Ist.LLSC.result),
   7424                               mkU32(0) );
   7425          }
   7426          break;
   7427       }
   7428 
   7429       case Ist_Put: {
   7430          Int b_offset
   7431             = MC_(get_otrack_shadow_offset)(
   7432                  st->Ist.Put.offset,
   7433                  sizeofIRType(typeOfIRExpr(mce->sb->tyenv, st->Ist.Put.data))
   7434               );
   7435          if (b_offset >= 0) {
   7436             /* FIXME: this isn't an atom! */
   7437             stmt( 'B', mce, IRStmt_Put(b_offset + 2*mce->layout->total_sizeB,
   7438                                        schemeE( mce, st->Ist.Put.data )) );
   7439          }
   7440          break;
   7441       }
   7442 
   7443       case Ist_WrTmp:
   7444          assign( 'B', mce, findShadowTmpB(mce, st->Ist.WrTmp.tmp),
   7445                            schemeE(mce, st->Ist.WrTmp.data) );
   7446          break;
   7447 
   7448       case Ist_MBE:
   7449       case Ist_NoOp:
   7450       case Ist_Exit:
   7451       case Ist_IMark:
   7452          break;
   7453 
   7454       default:
   7455          VG_(printf)("mc_translate.c: schemeS: unhandled: ");
   7456          ppIRStmt(st);
   7457          VG_(tool_panic)("memcheck:schemeS");
   7458    }
   7459 }
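
        /* Worked example (temporary numbers invented): the statement
           t7 = Add32(t1,t2) is shadowed, for origin tracking, by

              B(t7) = Max32U(B(t1), B(t2))

           via the Ist_WrTmp case and schemeE's Iex_Binop handling.  A
           subsequent Put of t7 to the guest state then copies B(t7) into
           the corresponding slot of the B shadow area, provided
           MC_(get_otrack_shadow_offset) reports the offset as trackable
           (>= 0). */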
   7460 
   7461 
   7462 /*--------------------------------------------------------------------*/
   7463 /*--- end                                           mc_translate.c ---*/
   7464 /*--------------------------------------------------------------------*/
   7465